x86_64: Remove redundant REX bytes from memrchr.S
Commit Message
By the x86-64 specification, 32-bit destination registers are zero-extended
to 64 bits. There is no need to use 64-bit registers when only the lower
32 bits are non-zero. Also, 2 instructions in:
mov %rdi, %rcx
and $15, %rcx
jz L(length_less16_offset0)
mov %rdi, %rcx <<< redundant
and $15, %rcx <<< redundant
are redundant.
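The size win is the dropped REX prefix byte. Roughly, the encodings GAS
should emit for the legacy registers are:

mov %rdi, %rcx   # 48 89 f9  (REX.W + 89 /r, 3 bytes)
mov %edi, %ecx   # 89 f9     (2 bytes; upper 32 bits of %rcx are cleared)
and $15, %rcx    # 48 83 e1 0f
and $15, %ecx    # 83 e1 0f
xor %rax, %rax   # 48 31 c0
xor %eax, %eax   # 31 c0

(%r8-%r15 always need a REX prefix, even with 32-bit operand size, which
is why the scratch register also moves from %r8 to %rsi/%esi.)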
Any comments?
H.J.
--
* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
the lower 32 bits. Remove redundant instructions.
---
sysdeps/x86_64/memrchr.S | 36 +++++++++++++++++-------------------
1 file changed, 17 insertions(+), 19 deletions(-)
Comments
On Tue, May 30, 2017 at 1:04 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> By the x86-64 specification, 32-bit destination registers are zero-extended
> to 64 bits. There is no need to use 64-bit registers when only the lower
> 32 bits are non-zero. Also, 2 instructions in:
>
> mov %rdi, %rcx
> and $15, %rcx
> jz L(length_less16_offset0)
>
> mov %rdi, %rcx <<< redundant
> and $15, %rcx <<< redundant
>
> are redundant.
>
> Any comments?
I will check it next week.
> H.J.
> --
> * sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
> the lower 32 bits. Remove redundant instructions.
> ---
> sysdeps/x86_64/memrchr.S | 36 +++++++++++++++++-------------------
> 1 file changed, 17 insertions(+), 19 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index aab1a4a..5fa0fe9 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -22,7 +22,7 @@
>
> .text
> ENTRY (__memrchr)
> - movd %rsi, %xmm1
> + movd %esi, %xmm1
>
> sub $16, %rdx
> jbe L(length_less16)
> @@ -42,8 +42,8 @@ ENTRY (__memrchr)
> jnz L(matches0)
>
> sub $64, %rdi
> - mov %rdi, %rcx
> - and $15, %rcx
> + mov %edi, %ecx
> + and $15, %ecx
> jz L(loop_prolog)
>
> add $16, %rdi
> @@ -108,8 +108,8 @@ L(loop_prolog):
> test %eax, %eax
> jnz L(matches0)
>
> - mov %rdi, %rcx
> - and $63, %rcx
> + mov %edi, %ecx
> + and $63, %ecx
> jz L(align64_loop)
>
> add $64, %rdi
> @@ -166,8 +166,8 @@ L(align64_loop):
>
> .p2align 4
> L(exit_loop):
> - add $64, %rdx
> - cmp $32, %rdx
> + add $64, %edx
> + cmp $32, %edx
> jbe L(exit_loop_32)
>
> movdqa 48(%rdi), %xmm0
> @@ -187,7 +187,7 @@ L(exit_loop):
> pmovmskb %xmm3, %eax
> test %eax, %eax
> jnz L(matches16_1)
> - cmp $48, %rdx
> + cmp $48, %edx
> jbe L(return_null)
>
> pcmpeqb (%rdi), %xmm1
> @@ -204,7 +204,7 @@ L(exit_loop_32):
> pmovmskb %xmm0, %eax
> test %eax, %eax
> jnz L(matches48_1)
> - cmp $16, %rdx
> + cmp $16, %edx
> jbe L(return_null)
>
> pcmpeqb 32(%rdi), %xmm1
> @@ -276,7 +276,7 @@ L(matches48_1):
>
> .p2align 4
> L(return_null):
> - xor %rax, %rax
> + xor %eax, %eax
> ret
>
> .p2align 4
> @@ -306,18 +306,16 @@ L(length_less16):
> punpcklbw %xmm1, %xmm1
> punpcklbw %xmm1, %xmm1
>
> - add $16, %rdx
> + add $16, %edx
>
> pshufd $0, %xmm1, %xmm1
>
> - mov %rdi, %rcx
> - and $15, %rcx
> + mov %edi, %ecx
> + and $15, %ecx
> jz L(length_less16_offset0)
>
> - mov %rdi, %rcx
> - and $15, %rcx
> mov %cl, %dh
> - mov %rcx, %r8
> + mov %ecx, %esi
> add %dl, %dh
> and $-16, %rdi
>
> @@ -340,7 +338,7 @@ L(length_less16):
>
> bsr %eax, %eax
> add %rdi, %rax
> - add %r8, %rax
> + add %rsi, %rax
> ret
>
> .p2align 4
> @@ -362,14 +360,14 @@ L(length_less16_part2):
> pcmpeqb (%rdi), %xmm1
> pmovmskb %xmm1, %eax
>
> - mov %r8, %rcx
> + mov %esi, %ecx
> sar %cl, %eax
> test %eax, %eax
> jz L(return_null)
>
> bsr %eax, %eax
> add %rdi, %rax
> - add %r8, %rax
> + add %rsi, %rax
> ret
>
> .p2align 4
> --
> 2.9.4
>