i586: Use a jump table in strcpy.S {BZ #22353]

Message ID CAMe9rOqXQ-O85Z3yCVJtNC=TOWfa=CLJSUgPeeBPh9AF0kqF_w@mail.gmail.com
State New, archived
Headers

Commit Message

H.J. Lu Oct. 30, 2017, 3:46 p.m. UTC
  On Mon, Oct 30, 2017 at 6:20 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 10/30/2017 02:12 PM, H.J. Lu wrote:
>
>>> What's the performance impact of the change?  Is it even worth to use a
>>> jump
>>> table here?
>>>
>>
>> I added 2 strcpy implementations to hjl/pr22353/master.  Jump table is
>> faster
>> than other non-SSE strcpy implementations because it can copy up to 4
>> bytes
>> at a time.
>
>
> To clarify, I meant replacing the jump table with conditional branches,
> basically getting rid of Duff's device.
>

I changed:

         BRANCH_TO_JMPTBL_ENTRY (L(SrcTable), %ecx, 4)

          .p2align 4
L(Src0):

to

        cmpb    $2, %cl
        je      L(Src2)
        ja      L(Src3)
        cmpb    $1, %cl
        je      L(Src1)

L(Src0):

Their performances are equivalent on Haswell.

Here is the updated patch I am going to check in.

Thanks.
  

Comments

Florian Weimer Oct. 30, 2017, 3:48 p.m. UTC | #1
On 10/30/2017 04:46 PM, H.J. Lu wrote:
> Their performances are equivalent on Haswell.
> 
> Here is the updated patch I am going to check in.

Looks reasonable.  I'm glad the performance numbers work out.

Thanks,
Florian
  

Patch

From 9208134988aecc786aad96e996dab013b656b22e Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 27 Oct 2017 02:31:13 -0700
Subject: [PATCH] i586: Use conditional branches in strcpy.S [BZ #22353]

i586 strcpy.S used a clever trick with LEA to implement jump table:

/* ECX has the last 2 bits of the address of source - 1.  */
	andl	$3, %ecx

        call    2f
2:      popl    %edx
	/* 0xb is the distance between 2: and 1:.  */
        leal    0xb(%edx,%ecx,8), %ecx
        jmp     *%ecx

        .align 8
1:  /* ECX == 0 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 1 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 2 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 3 */
L(1):   movl    (%esi), %ecx
        leal    4(%esi),%esi

This fails if there are instruction length changes before L(1):.  This
patch replaces it with conditional branches:

	cmpb	$2, %cl
	je	L(Src2)
	ja	L(Src3)
	cmpb	$1, %cl
	je	L(Src1)

L(Src0):

which have similar performance and work with any instruction lengths.

Tested on i586 and i686 with and without --disable-multi-arch.

	[BZ #22353]
	* sysdeps/i386/i586/strcpy.S (STRCPY): Use conditional branches.
	(1): Renamed to ...
	(L(Src0)): This.
	(L(Src1)): New.
	(L(Src2)): Likewise.
	(L(1)): Renamed to ...
	(L(Src3)): This.
---
 sysdeps/i386/i586/strcpy.S | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/sysdeps/i386/i586/strcpy.S b/sysdeps/i386/i586/strcpy.S
index a444604f4f..bb73ca4ef3 100644
--- a/sysdeps/i386/i586/strcpy.S
+++ b/sysdeps/i386/i586/strcpy.S
@@ -53,41 +53,35 @@  ENTRY (STRCPY)
 	cfi_rel_offset (ebx, 0)
 	andl	$3, %ecx
 
-#ifdef PIC
-	call	2f
-	cfi_adjust_cfa_offset (4)
-2:	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	/* 0xb is the distance between 2: and 1: but we avoid writing
-	   1f-2b because the assembler generates worse code.  */
-	leal	0xb(%edx,%ecx,8), %ecx
-#else
-	leal	1f(,%ecx,8), %ecx
-#endif
-
-	jmp	*%ecx
+	cmpb	$2, %cl
+	je	L(Src2)
+	ja	L(Src3)
+	cmpb	$1, %cl
+	je	L(Src1)
 
-	.align 8
-1:
+L(Src0):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
+L(Src1):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
+L(Src2):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
-L(1):	movl	(%esi), %ecx
+L(Src3):
+	movl	(%esi), %ecx
 	leal	4(%esi),%esi
 
 	subl	%ecx, %eax
@@ -107,7 +101,7 @@  L(1):	movl	(%esi), %ecx
 	movl	%edx, (%edi)
 	leal	4(%edi),%edi
 
-	jmp	L(1)
+	jmp	L(Src3)
 
 L(3):	movl	%ecx, %edx
 
-- 
2.13.6