i586: Use a jump table in strcpy.S {BZ #22353]

Message ID 20171027210714.GA15539@gmail.com
State New, archived
Headers

Commit Message

Lu, Hongjiu Oct. 27, 2017, 9:07 p.m. UTC
  i586 strcpy.S used a clever trick with LEA to avoid jump table:

/* ECX has the last 2 bits of the address of source - 1.  */
	andl	$3, %ecx

        call    2f
2:      popl    %edx
	/* 0xb is the distance between 2: and 1:.  */
        leal    0xb(%edx,%ecx,8), %ecx
        jmp     *%ecx

        .align 8
1:  /* ECX == 0 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 1 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 2 */
        orb     (%esi), %al
        jz      L(end)
        stosb
        xorl    %eax, %eax
        incl    %esi
    /* ECX == 3 */
L(1):   movl    (%esi), %ecx
        leal    4(%esi),%esi

This may fail if there are instruction changes before L(1):.  This patch
replaces it with a jump table which works with any instruction changes.

Tested on i586 and i686 with and without --disable-multi-arch.

Any objections or comments?

H.J.
	{BZ #22353]
	* sysdeps/i386/i586/strcpy.S (JMPTBL): New.
	(BRANCH_TO_JMPTBL_ENTRY): Likewise.
	(STRCPY): Use it.
	(1): Renamed to ...
	(L(Src0)): This.
	(L(Src1)): New.
	(L(Src2)): Likewise.
	(L(1)): Renamed to ...
	(L(Src3)): This.
	(L(SrcTable)): New.

---
 sysdeps/i386/i586/strcpy.S | 62 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 17 deletions(-)
  

Comments

Florian Weimer Oct. 30, 2017, 11:18 a.m. UTC | #1
On 10/27/2017 11:07 PM, H.J. Lu wrote:
> This may fail if there are instruction changes before L(1):.  This patch
> replaces it with a jump table which works with any instruction changes.

I think you should say instruction *length* changes.

What's the performance impact of the change?  Is it even worth to use a 
jump table here?

Thanks,
Florian
  
H.J. Lu Oct. 30, 2017, 1:12 p.m. UTC | #2
On Mon, Oct 30, 2017 at 4:18 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 10/27/2017 11:07 PM, H.J. Lu wrote:
>>
>> This may fail if there are instruction changes before L(1):.  This patch
>> replaces it with a jump table which works with any instruction changes.
>
>
> I think you should say instruction *length* changes.

Done.  The updated patch is at

https://sourceware.org/git/?p=glibc.git;a=patch;h=9a89973274f901f4c3313e6e2b84d6b2108c7924

> What's the performance impact of the change?  Is it even worth to use a jump
> table here?
>

I added 2 strcpy implementations to hjl/pr22353/master.  Jump table is faster
than other non-SSE strcpy implementations because it can copy up to 4 bytes
at a time.
  
Florian Weimer Oct. 30, 2017, 1:20 p.m. UTC | #3
On 10/30/2017 02:12 PM, H.J. Lu wrote:

>> What's the performance impact of the change?  Is it even worth to use a jump
>> table here?
>>
> 
> I added 2 strcpy implementations to hjl/pr22353/master.  Jump table is faster
> than other non-SSE strcpy implementations because it can copy up to 4 bytes
> at a time.

To clarify, I meant replacing the jump table with conditional branches, 
basically getting rid of Duff's device.

Thanks,
Florian
  

Patch

diff --git a/sysdeps/i386/i586/strcpy.S b/sysdeps/i386/i586/strcpy.S
index a444604f4f..ff3a6cbe7f 100644
--- a/sysdeps/i386/i586/strcpy.S
+++ b/sysdeps/i386/i586/strcpy.S
@@ -29,6 +29,34 @@ 
 # define STRCPY strcpy
 #endif
 
+#ifdef PIC
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EDX and branch to it. TABLE is a
+   jump table with relative offsets.  INDEX is a register contains the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	/* We first load PC into EDX.  */			\
+	SETUP_PIC_REG(dx);					\
+	/* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %edx;				\
+	/* Get the entry and convert the relative offset to the \
+	   absolute address.  */				\
+	addl	(%edx,INDEX,SCALE), %edx;			\
+	/* We loaded the jump table and adjusted EDX. Go.  */  \
+	jmp	*%edx
+#else
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
+
 #define magic 0xfefefeff
 
 	.text
@@ -53,41 +81,32 @@  ENTRY (STRCPY)
 	cfi_rel_offset (ebx, 0)
 	andl	$3, %ecx
 
-#ifdef PIC
-	call	2f
-	cfi_adjust_cfa_offset (4)
-2:	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	/* 0xb is the distance between 2: and 1: but we avoid writing
-	   1f-2b because the assembler generates worse code.  */
-	leal	0xb(%edx,%ecx,8), %ecx
-#else
-	leal	1f(,%ecx,8), %ecx
-#endif
-
-	jmp	*%ecx
+	BRANCH_TO_JMPTBL_ENTRY (L(SrcTable), %ecx, 4)
 
-	.align 8
-1:
+	.p2align 4
+L(Src0):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
+L(Src1):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
+L(Src2):
 	orb	(%esi), %al
 	jz	L(end)
 	stosb
 	xorl	%eax, %eax
 	incl	%esi
 
-L(1):	movl	(%esi), %ecx
+L(Src3):
+	movl	(%esi), %ecx
 	leal	4(%esi),%esi
 
 	subl	%ecx, %eax
@@ -107,7 +126,7 @@  L(1):	movl	(%esi), %ecx
 	movl	%edx, (%edi)
 	leal	4(%edi),%edi
 
-	jmp	L(1)
+	jmp	L(Src3)
 
 L(3):	movl	%ecx, %edx
 
@@ -164,6 +183,15 @@  L(end2):
 
 	ret
 END (STRCPY)
+
+	.p2align 2
+	.section .rodata
+L(SrcTable):
+	.int	JMPTBL (L(Src0), L(SrcTable))
+	.int	JMPTBL (L(Src1), L(SrcTable))
+	.int	JMPTBL (L(Src2), L(SrcTable))
+	.int	JMPTBL (L(Src3), L(SrcTable))
+
 #ifndef USE_AS_STPCPY
 libc_hidden_builtin_def (strcpy)
 #endif