i586: Use a jump table in strcpy.S {BZ #22353]
Commit Message
i586 strcpy.S used a clever trick with LEA to avoid jump table:
/* ECX has the last 2 bits of the address of source - 1. */
andl $3, %ecx
call 2f
2: popl %edx
/* 0xb is the distance between 2: and 1:. */
leal 0xb(%edx,%ecx,8), %ecx
jmp *%ecx
.align 8
1: /* ECX == 0 */
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
/* ECX == 1 */
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
/* ECX == 2 */
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
/* ECX == 3 */
L(1): movl (%esi), %ecx
leal 4(%esi),%esi
This may fail if there are instruction changes before L(1):. This patch
replaces it with a jump table which works with any instruction changes.
Tested on i586 and i686 with and without --disable-multi-arch.
Any objections or comments?
H.J.
{BZ #22353]
* sysdeps/i386/i586/strcpy.S (JMPTBL): New.
(BRANCH_TO_JMPTBL_ENTRY): Likewise.
(STRCPY): Use it.
(1): Renamed to ...
(L(Src0)): This.
(L(Src1)): New.
(L(Src2)): Likewise.
(L(1)): Renamed to ...
(L(Src3)): This.
(L(SrcTable)): New.
---
sysdeps/i386/i586/strcpy.S | 62 +++++++++++++++++++++++++++++++++-------------
1 file changed, 45 insertions(+), 17 deletions(-)
Comments
On 10/27/2017 11:07 PM, H.J. Lu wrote:
> This may fail if there are instruction changes before L(1):. This patch
> replaces it with a jump table which works with any instruction changes.
I think you should say instruction *length* changes.
What's the performance impact of the change? Is it even worth to use a
jump table here?
Thanks,
Florian
On Mon, Oct 30, 2017 at 4:18 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 10/27/2017 11:07 PM, H.J. Lu wrote:
>>
>> This may fail if there are instruction changes before L(1):. This patch
>> replaces it with a jump table which works with any instruction changes.
>
>
> I think you should say instruction *length* changes.
Done. The updated patch is at
https://sourceware.org/git/?p=glibc.git;a=patch;h=9a89973274f901f4c3313e6e2b84d6b2108c7924
> What's the performance impact of the change? Is it even worth to use a jump
> table here?
>
I added 2 strcpy implementations to hjl/pr22353/master. Jump table is faster
than other non-SSE strcpy implementations because it can copy up to 4 bytes
at a time.
On 10/30/2017 02:12 PM, H.J. Lu wrote:
>> What's the performance impact of the change? Is it even worth to use a jump
>> table here?
>>
>
> I added 2 strcpy implementations to hjl/pr22353/master. Jump table is faster
> than other non-SSE strcpy implementations because it can copy up to 4 bytes
> at a time.
To clarify, I meant replacing the jump table with conditional branches,
basically getting rid of Duff's device.
Thanks,
Florian
@@ -29,6 +29,34 @@
# define STRCPY strcpy
#endif
+#ifdef PIC
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EDX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EDX. */ \
+ SETUP_PIC_REG(dx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %edx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%edx,INDEX,SCALE), %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%edx
+#else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
#define magic 0xfefefeff
.text
@@ -53,41 +81,32 @@ ENTRY (STRCPY)
cfi_rel_offset (ebx, 0)
andl $3, %ecx
-#ifdef PIC
- call 2f
- cfi_adjust_cfa_offset (4)
-2: popl %edx
- cfi_adjust_cfa_offset (-4)
- /* 0xb is the distance between 2: and 1: but we avoid writing
- 1f-2b because the assembler generates worse code. */
- leal 0xb(%edx,%ecx,8), %ecx
-#else
- leal 1f(,%ecx,8), %ecx
-#endif
-
- jmp *%ecx
+ BRANCH_TO_JMPTBL_ENTRY (L(SrcTable), %ecx, 4)
- .align 8
-1:
+ .p2align 4
+L(Src0):
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
+L(Src1):
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
+L(Src2):
orb (%esi), %al
jz L(end)
stosb
xorl %eax, %eax
incl %esi
-L(1): movl (%esi), %ecx
+L(Src3):
+ movl (%esi), %ecx
leal 4(%esi),%esi
subl %ecx, %eax
@@ -107,7 +126,7 @@ L(1): movl (%esi), %ecx
movl %edx, (%edi)
leal 4(%edi),%edi
- jmp L(1)
+ jmp L(Src3)
L(3): movl %ecx, %edx
@@ -164,6 +183,15 @@ L(end2):
ret
END (STRCPY)
+
+ .p2align 2
+ .section .rodata
+L(SrcTable):
+ .int JMPTBL (L(Src0), L(SrcTable))
+ .int JMPTBL (L(Src1), L(SrcTable))
+ .int JMPTBL (L(Src2), L(SrcTable))
+ .int JMPTBL (L(Src3), L(SrcTable))
+
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif