[3/3] Add i386 memset and memcpy assembly functions

Message ID 20150826134631.GC19484@gmail.com
State Committed
Headers

Commit Message

H.J. Lu Aug. 26, 2015, 1:46 p.m. UTC
  Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
instructions.  They will be used to implement i386 multi-arch memcpy.

OK for master?

H.J.
--
	* sysdeps/i386/bcopy.S: New file.
	* sysdeps/i386/bzero.S: Likewise.
	* sysdeps/i386/memcpy.S: Likewise.
	* sysdeps/i386/memmove.S: Likewise.
	* sysdeps/i386/mempcpy.S: Likewise.
	* sysdeps/i386/memset.S: Likewise.
	* sysdeps/i386/bzero.c: Removed.
	* sysdeps/i386/memset.c: Likewise.
	* sysdeps/i386/i586/memcpy_chk.S: Likewise.
	* sysdeps/i386/i586/mempcpy_chk.S: Likewise.
	* sysdeps/i386/i586/memset_chk.S: Likewise.
	* sysdeps/i386/i686/memcpy_chk.S: Moved to ...
	* sysdeps/i386/memcpy_chk.S: Here.
	* sysdeps/i386/i686/memmove_chk.S: Moved to ...
	* sysdeps/i386/memmove_chk.S: Here.
	* sysdeps/i386/i686/mempcpy_chk.S: Moved to ...
	* sysdeps/i386/mempcpy_chk.S: Here.
	* sysdeps/i386/i686/memset_chk.S: Moved to ...
	* sysdeps/i386/memset_chk.S: Here.
---
 sysdeps/i386/bcopy.S                  |  4 ++
 sysdeps/i386/bzero.S                  |  5 ++
 sysdeps/i386/bzero.c                  | 82 ------------------------------
 sysdeps/i386/i586/memcpy_chk.S        |  1 -
 sysdeps/i386/i586/mempcpy_chk.S       |  1 -
 sysdeps/i386/i586/memset_chk.S        |  1 -
 sysdeps/i386/memcpy.S                 | 95 +++++++++++++++++++++++++++++++++++
 sysdeps/i386/{i686 => }/memcpy_chk.S  |  8 +--
 sysdeps/i386/memmove.S                |  4 ++
 sysdeps/i386/{i686 => }/memmove_chk.S | 15 +++---
 sysdeps/i386/mempcpy.S                |  7 +++
 sysdeps/i386/{i686 => }/mempcpy_chk.S | 15 +++---
 sysdeps/i386/memset.S                 | 68 +++++++++++++++++++++++++
 sysdeps/i386/memset.c                 | 85 -------------------------------
 sysdeps/i386/{i686 => }/memset_chk.S  | 15 +++---
 15 files changed, 208 insertions(+), 198 deletions(-)
 create mode 100644 sysdeps/i386/bcopy.S
 create mode 100644 sysdeps/i386/bzero.S
 delete mode 100644 sysdeps/i386/bzero.c
 delete mode 100644 sysdeps/i386/i586/memcpy_chk.S
 delete mode 100644 sysdeps/i386/i586/mempcpy_chk.S
 delete mode 100644 sysdeps/i386/i586/memset_chk.S
 create mode 100644 sysdeps/i386/memcpy.S
 rename sysdeps/i386/{i686 => }/memcpy_chk.S (92%)
 create mode 100644 sysdeps/i386/memmove.S
 rename sysdeps/i386/{i686 => }/memmove_chk.S (78%)
 create mode 100644 sysdeps/i386/mempcpy.S
 rename sysdeps/i386/{i686 => }/mempcpy_chk.S (78%)
 create mode 100644 sysdeps/i386/memset.S
 delete mode 100644 sysdeps/i386/memset.c
 rename sysdeps/i386/{i686 => }/memset_chk.S (79%)
  

Comments

Ondrej Bilka Aug. 26, 2015, 2:29 p.m. UTC | #1
On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
> instructions.  They will be used to implement i386 multi-arch memcpy.
> 
> OK for master?
>
No, as rep stosb has terrible performance on most machines; on Ivy
Bridge it's around six times slower than rep stosq. I wouldn't be
surprised if, when you test it, it turns out to be at least three times
slower than rep stosl on the affected machines.

The only exception I know of where you should use rep stosb is Haswell.

Perhaps you could adapt this implementation that I used for rep stosq
and change to rep stosl?

/* memset_rep8 -- fill a buffer with a byte using REP STOSQ (x86-64, AT&T syntax).

   Register use, as read from the code below:
     In:   %rdi = destination, %sil = fill byte, %rdx = length in bytes
           (this matches the SysV AMD64 argument order of
           memset (dst, c, n) -- presumably the intended prototype).
     Out:  %rax = the original destination pointer.
     Scratch: %rcx, %rsi, %rdi, flags.

   Strategy: replicate the byte into all 8 bytes of %rax, then
     - n >= 8: store one unaligned qword at the start and one at the end,
       and fill the 8-byte-aligned middle with REP STOSQ;
     - n <= 7: use at most two overlapping narrower stores, no loop.  */
.text ;.globl memset_rep8; .type memset_rep8, @function;memset_rep8:; .cfi_startproc
 movzbl  %sil, %eax			/* rax = (unsigned char) c */
 lea (%rdi, %rdx), %rcx			/* rcx = dst + n (one past the end) */
 movabsq $72340172838076673, %rsi	/* 0x0101010101010101 */
 imulq %rsi, %rax 			/* rax = fill byte repeated 8 times */

 cmp $7, %rdx 				/* fewer than 8 bytes?  (unsigned compare) */
 jbe .Lless_16_bytes
 movq %rax, (%rdi)			/* head: first 8 bytes, possibly unaligned */
 movq %rdi, %rsi			/* keep the original dst for the return value */
 leaq 8(%rdi), %rdi
 movq %rax, -8(%rcx)			/* tail: last 8 bytes, possibly unaligned */
 andq $-8, %rdi				/* rdi = (dst + 8) rounded down to 8-byte alignment */
 subq %rdi, %rcx			/* bytes from the aligned start to the end */
 shrq $3, %rcx				/* whole qwords; leftover < 8 bytes already written by the tail store */
 rep stosq
 movq %rsi, %rax			/* return dst */
 ret

/* Small sizes.  NOTE: despite the label name, this path is reached only
   when n <= 7 (see the `cmp $7' above).  */
.p2align 4
.Lless_16_bytes:
 movq %rax, %rsi			/* rsi = replicated fill pattern */
 movq %rdi, %rax			/* return value = dst */
 testb $4, %dl; jne .Lbetween_4_7_bytes	/* n <= 7, so bit 2 set means 4 <= n <= 7 */
 cmp $1, %dl; jbe .Lbetween_0_1_byte	/* flags from this cmp are reused at the target */
 /* n is 2 or 3: a word ending at dst + n plus the first byte cover everything.  */
 movw %si, -2(%rcx)
 movb %sil, (%rdi)
 ret

/* 4 <= n <= 7: two dword stores, one at each end; they overlap when n < 8.  */
.p2align 3
.Lbetween_4_7_bytes:
 movl %esi, (%rdi)
 movl %esi, -4(%rcx)
 ret

/* n is 0 or 1.  The flags still come from `cmp $1, %dl' above (the
   intervening jbe does not modify them): below means n == 0.  */
.Lbetween_0_1_byte:
 jb .Lzero_byte
 movb %sil, (%rdi)
.Lzero_byte:
 ret

.cfi_endproc ; .size memset_rep8, .-memset_rep8
  
H.J. Lu Aug. 26, 2015, 2:49 p.m. UTC | #2
On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
>> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
>> instructions.  They will be used to implement i386 multi-arch memcpy.
>>
>> OK for master?
>>
> No, as rep stosb has terrible performance on most of machines, on ivy
> bridge its around six times slower than rep stosq. I wouldn't be
>

I added them for i386 memcpy family multiarch functions.  We have
memcpy for i586 and i686:

sysdeps/i386/i586/memcpy.S
sysdeps/i386/i686/memcpy.S

But we don't have it for i486.  I add them so that I can implement
i386 memset and memcpy family multiarch functions for i486,
i586 and i686 targets.  i386 memset and memcpy are used only
when

1. Building glibc for i486 with --disable-multi-arch.  Or
2. The processor supports neither i686 nor SSE2.

I believe these are very rare cases.
  
Ondrej Bilka Aug. 26, 2015, 3:15 p.m. UTC | #3
On Wed, Aug 26, 2015 at 07:49:51AM -0700, H.J. Lu wrote:
> On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
> >> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
> >> instructions.  They will be used to implement i386 multi-arch memcpy.
> >>
> >> OK for master?
> >>
> > No, as rep stosb has terrible performance on most of machines, on ivy
> > bridge its around six times slower than rep stosq. I wouldn't be
> >
> 
> I added them for i386 memcpy family multiarch functions.  We have
> memcpy for i586 and i686:
> 
> sysdeps/i386/i586/memcpy.S
> sysdeps/i386/i686/memcpy.S
> 
> But we don't have it for i486.  I add them so that I can implement
> i386 memset and memcpy family multiarch functions for i486,
> i586 and i686 targets.  i386 memset and memcpy are used only
> when
> 
> 1. Building glibc for i486 with --disable-multi-arch.  Or
> 2. Processor doesn't support i686 nor SSE2.
> 
> I believe these are a very rare cases.
>
While true, the existing implementation looked better. So if you need to
use assembly, could you take these files compiled with gcc -S, or
something like that?
  
H.J. Lu Aug. 26, 2015, 3:31 p.m. UTC | #4
On Wed, Aug 26, 2015 at 8:15 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Wed, Aug 26, 2015 at 07:49:51AM -0700, H.J. Lu wrote:
>> On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
>> >> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
>> >> instructions.  They will be used to implement i386 multi-arch memcpy.
>> >>
>> >> OK for master?
>> >>
>> > No, as rep stosb has terrible performance on most of machines, on ivy
>> > bridge its around six times slower than rep stosq. I wouldn't be
>> >
>>
>> I added them for i386 memcpy family multiarch functions.  We have
>> memcpy for i586 and i686:
>>
>> sysdeps/i386/i586/memcpy.S
>> sysdeps/i386/i686/memcpy.S
>>
>> But we don't have it for i486.  I add them so that I can implement
>> i386 memset and memcpy family multiarch functions for i486,
>> i586 and i686 targets.  i386 memset and memcpy are used only
>> when
>>
>> 1. Building glibc for i486 with --disable-multi-arch.  Or
>> 2. Processor doesn't support i686 nor SSE2.
>>
>> I believe these are a very rare cases.
>>
> While true a existing implementation looked better. So if you need use
> assembly could you pick these files compiled with gcc -S or something
> like that?

We don't know if they are better than REP MOVSB/STOSB in cases of

1. Building glibc for i486 with --disable-multi-arch.  Or
2. Processor doesn't support i686 nor SSE2.

and on Haswell/Skylake, REP MOVSB/STOSB aren't too bad.
  
Ondrej Bilka Aug. 27, 2015, 11:46 a.m. UTC | #5
On Wed, Aug 26, 2015 at 08:31:30AM -0700, H.J. Lu wrote:
> On Wed, Aug 26, 2015 at 8:15 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Wed, Aug 26, 2015 at 07:49:51AM -0700, H.J. Lu wrote:
> >> On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> >> > On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
> >> >> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
> >> >> instructions.  They will be used to implement i386 multi-arch memcpy.
> >> >>
> >> >> OK for master?
> >> >>
> >> > No, as rep stosb has terrible performance on most of machines, on ivy
> >> > bridge its around six times slower than rep stosq. I wouldn't be
> >> >
> >>
> >> I added them for i386 memcpy family multiarch functions.  We have
> >> memcpy for i586 and i686:
> >>
> >> sysdeps/i386/i586/memcpy.S
> >> sysdeps/i386/i686/memcpy.S
> >>
> >> But we don't have it for i486.  I add them so that I can implement
> >> i386 memset and memcpy family multiarch functions for i486,
> >> i586 and i686 targets.  i386 memset and memcpy are used only
> >> when
> >>
> >> 1. Building glibc for i486 with --disable-multi-arch.  Or
> >> 2. Processor doesn't support i686 nor SSE2.
> >>
> >> I believe these are a very rare cases.
> >>
> > While true a existing implementation looked better. So if you need use
> > assembly could you pick these files compiled with gcc -S or something
> > like that?
> 
> We don't know if they are better than REP MOVSB/STOSB in cases of
> 
> 1. Building glibc for i486 with --disable-multi-arch.  Or
> 2. Processor doesn't support i686 nor SSE2.
> 
> and on Haswell/Skylake, REP MOVSB/STOSB aren't too bad.
> 
I could accept that if we decide that we don't care about performance in
these cases. As for 1, the user already doesn't care, as we need to use
very slow implementations with --disable-multi-arch.

As for 2, it's about whether we care about the performance of old machines
or not. I would be for not optimizing for machines without SSE2, as we
don't have any to test on.

With these arguments the change would be acceptable, but of course I would
prefer one that is better on Sandy Bridge with --disable-multi-arch.
  
H.J. Lu Aug. 27, 2015, 12:37 p.m. UTC | #6
On Thu, Aug 27, 2015 at 4:46 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Wed, Aug 26, 2015 at 08:31:30AM -0700, H.J. Lu wrote:
>> On Wed, Aug 26, 2015 at 8:15 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > On Wed, Aug 26, 2015 at 07:49:51AM -0700, H.J. Lu wrote:
>> >> On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> >> > On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
>> >> >> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
>> >> >> instructions.  They will be used to implement i386 multi-arch memcpy.
>> >> >>
>> >> >> OK for master?
>> >> >>
>> >> > No, as rep stosb has terrible performance on most of machines, on ivy
>> >> > bridge its around six times slower than rep stosq. I wouldn't be
>> >> >
>> >>
>> >> I added them for i386 memcpy family multiarch functions.  We have
>> >> memcpy for i586 and i686:
>> >>
>> >> sysdeps/i386/i586/memcpy.S
>> >> sysdeps/i386/i686/memcpy.S
>> >>
>> >> But we don't have it for i486.  I add them so that I can implement
>> >> i386 memset and memcpy family multiarch functions for i486,
>> >> i586 and i686 targets.  i386 memset and memcpy are used only
>> >> when
>> >>
>> >> 1. Building glibc for i486 with --disable-multi-arch.  Or
>> >> 2. Processor doesn't support i686 nor SSE2.
>> >>
>> >> I believe these are a very rare cases.
>> >>
>> > While true a existing implementation looked better. So if you need use
>> > assembly could you pick these files compiled with gcc -S or something
>> > like that?
>>
>> We don't know if they are better than REP MOVSB/STOSB in cases of
>>
>> 1. Building glibc for i486 with --disable-multi-arch.  Or
>> 2. Processor doesn't support i686 nor SSE2.
>>
>> and on Haswell/Skylake, REP MOVSB/STOSB aren't too bad.
>>
> I could accept that if we decide that we dont care about performance in
> these cases. As 1. user already doesn't care as we need to use very slow
> implementations with --disable-multi-arch.
>
> As for 2 its about if we care about performance of old machines or not.
> I would be for not optimizing for machines without sse2 as we don't have
> these to test that.
>
> With these arguments a change would be acceptable but of course I would
> prefer one that is better on sandy bridge with disable-multiarch.

If people want better performance on Sandy Bridge, they shouldn't
configure glibc as i486 with --disable-multi-arch :-(.
  
H.J. Lu Aug. 27, 2015, 3:52 p.m. UTC | #7
On Thu, Aug 27, 2015 at 5:37 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Aug 27, 2015 at 4:46 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> On Wed, Aug 26, 2015 at 08:31:30AM -0700, H.J. Lu wrote:
>>> On Wed, Aug 26, 2015 at 8:15 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>>> > On Wed, Aug 26, 2015 at 07:49:51AM -0700, H.J. Lu wrote:
>>> >> On Wed, Aug 26, 2015 at 7:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>>> >> > On Wed, Aug 26, 2015 at 06:46:31AM -0700, H.J. Lu wrote:
>>> >> >> Add i386 memset and memcpy assembly functions with REP MOVSB/STOSB
>>> >> >> instructions.  They will be used to implement i386 multi-arch memcpy.
>>> >> >>
>>> >> >> OK for master?
>>> >> >>
>>> >> > No, as rep stosb has terrible performance on most of machines, on ivy
>>> >> > bridge its around six times slower than rep stosq. I wouldn't be
>>> >> >
>>> >>
>>> >> I added them for i386 memcpy family multiarch functions.  We have
>>> >> memcpy for i586 and i686:
>>> >>
>>> >> sysdeps/i386/i586/memcpy.S
>>> >> sysdeps/i386/i686/memcpy.S
>>> >>
>>> >> But we don't have it for i486.  I add them so that I can implement
>>> >> i386 memset and memcpy family multiarch functions for i486,
>>> >> i586 and i686 targets.  i386 memset and memcpy are used only
>>> >> when
>>> >>
>>> >> 1. Building glibc for i486 with --disable-multi-arch.  Or
>>> >> 2. Processor doesn't support i686 nor SSE2.
>>> >>
>>> >> I believe these are a very rare cases.
>>> >>
>>> > While true a existing implementation looked better. So if you need use
>>> > assembly could you pick these files compiled with gcc -S or something
>>> > like that?
>>>
>>> We don't know if they are better than REP MOVSB/STOSB in cases of
>>>
>>> 1. Building glibc for i486 with --disable-multi-arch.  Or
>>> 2. Processor doesn't support i686 nor SSE2.
>>>
>>> and on Haswell/Skylake, REP MOVSB/STOSB aren't too bad.
>>>
>> I could accept that if we decide that we dont care about performance in
>> these cases. As 1. user already doesn't care as we need to use very slow
>> implementations with --disable-multi-arch.
>>
>> As for 2 its about if we care about performance of old machines or not.
>> I would be for not optimizing for machines without sse2 as we don't have
>> these to test that.
>>
>> With these arguments a change would be acceptable but of course I would
>> prefer one that is better on sandy bridge with disable-multiarch.
>
> If people want better performance on Sandy Bridge, they shouldn't
> configure glibc as i486 with --disable-multi-arch :-(.
>
> --
> H.J.

I am checking it in now.
  

Patch

diff --git a/sysdeps/i386/bcopy.S b/sysdeps/i386/bcopy.S
new file mode 100644
index 0000000..12b8ddb
--- /dev/null
+++ b/sysdeps/i386/bcopy.S
@@ -0,0 +1,4 @@ 
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY		bcopy
+#include "memcpy.S"
diff --git a/sysdeps/i386/bzero.S b/sysdeps/i386/bzero.S
new file mode 100644
index 0000000..c8dd47b
--- /dev/null
+++ b/sysdeps/i386/bzero.S
@@ -0,0 +1,5 @@ 
+#define USE_AS_BZERO
+#define memset __bzero
+#include "memset.S"
+
+weak_alias (__bzero, bzero)
diff --git a/sysdeps/i386/bzero.c b/sysdeps/i386/bzero.c
deleted file mode 100644
index 1a89444..0000000
--- a/sysdeps/i386/bzero.c
+++ /dev/null
@@ -1,82 +0,0 @@ 
-/* bzero -- set a block of memory to zero.  For Intel 80x86, x>=3.
-   This file is part of the GNU C Library.
-   Copyright (C) 1991-2015 Free Software Foundation, Inc.
-   Contributed by Torbjorn Granlund (tege@sics.se).
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <memcopy.h>
-
-#undef	bzero
-#undef	__bzero
-
-#ifdef	__GNUC__
-
-void
-__bzero (dstpp, len)
-     void *dstpp;
-     size_t len;
-{
-  /* N.B.: This code is almost verbatim from memset.c.  */
-  int d0;
-  unsigned long int dstp = (unsigned long int) dstpp;
-
-  /* This explicit register allocation
-     improves code very much indeed.  */
-  register op_t x asm ("ax");
-
-  x = 0;
-
-  /* Clear the direction flag, so filling will move forward.  */
-  asm volatile ("cld");
-
-  /* This threshold value is optimal.  */
-  if (len >= 12)
-    {
-      /* Adjust LEN for the bytes handled in the first loop.  */
-      len -= (-dstp) % OPSIZ;
-
-      /* There are at least some bytes to set.
-	 No need to test for LEN == 0 in this alignment loop.  */
-
-      /* Fill bytes until DSTP is aligned on a longword boundary.  */
-      asm volatile ("rep\n"
-		    "stosb" /* %0, %2, %3 */ :
-		    "=D" (dstp), "=c" (d0) :
-		    "0" (dstp), "1" ((-dstp) % OPSIZ), "a" (x) :
-		    "memory");
-
-      /* Fill longwords.  */
-      asm volatile ("rep\n"
-		    "stosl" /* %0, %2, %3 */ :
-		    "=D" (dstp), "=c" (d0) :
-		    "0" (dstp), "1" (len / OPSIZ), "a" (x) :
-		    "memory");
-      len %= OPSIZ;
-    }
-
-  /* Write the last few bytes.  */
-  asm volatile ("rep\n"
-		"stosb" /* %0, %2, %3 */ :
-		"=D" (dstp), "=c" (d0) :
-		"0" (dstp), "c" (len), "a" (x) :
-		"memory");
-}
-weak_alias (__bzero, bzero)
-
-#else
-#include <string/bzero.c>
-#endif
diff --git a/sysdeps/i386/i586/memcpy_chk.S b/sysdeps/i386/i586/memcpy_chk.S
deleted file mode 100644
index ab8a95c..0000000
--- a/sysdeps/i386/i586/memcpy_chk.S
+++ /dev/null
@@ -1 +0,0 @@ 
-#include <sysdeps/i386/i686/memcpy_chk.S>
diff --git a/sysdeps/i386/i586/mempcpy_chk.S b/sysdeps/i386/i586/mempcpy_chk.S
deleted file mode 100644
index 9a1de1d..0000000
--- a/sysdeps/i386/i586/mempcpy_chk.S
+++ /dev/null
@@ -1 +0,0 @@ 
-#include <sysdeps/i386/i686/mempcpy_chk.S>
diff --git a/sysdeps/i386/i586/memset_chk.S b/sysdeps/i386/i586/memset_chk.S
deleted file mode 100644
index 09f9d42..0000000
--- a/sysdeps/i386/i586/memset_chk.S
+++ /dev/null
@@ -1 +0,0 @@ 
-#include <sysdeps/i386/i686/memset_chk.S>
diff --git a/sysdeps/i386/memcpy.S b/sysdeps/i386/memcpy.S
new file mode 100644
index 0000000..5f0196e
--- /dev/null
+++ b/sysdeps/i386/memcpy.S
@@ -0,0 +1,95 @@ 
+/* memcpy with REP MOVSB/STOSB
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+# define MEMCPY_CHK	__memcpy_chk
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2		12
+# define STR1		STR2+4
+# define N     		STR1+4
+#else
+# define STR1		12
+# define STR2		STR1+4
+# define N     		STR2+4
+#endif
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	PUSH	(%esi)
+	PUSH	(%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+	movl	STR2(%esp), %esi
+	mov	%edi, %eax
+#ifdef USE_AS_MEMPCPY
+	add	%ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%esi, %edi
+	ja	L(copy_backward)
+	je	L(bwd_write_0bytes)
+#endif
+
+	rep	movsb
+	POP	(%edi)
+	POP	(%esi)
+	ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+	lea	-1(%edi,%ecx), %edi
+	lea	-1(%esi,%ecx), %esi
+	std
+	rep	movsb
+	cld
+L(bwd_write_0bytes):
+	POP	(%edi)
+	POP	(%esi)
+	ret
+#endif
+
+END (MEMCPY)
+
+#ifndef USE_AS_BCOPY
+libc_hidden_builtin_def (MEMCPY)
+#endif
diff --git a/sysdeps/i386/i686/memcpy_chk.S b/sysdeps/i386/memcpy_chk.S
similarity index 92%
rename from sysdeps/i386/i686/memcpy_chk.S
rename to sysdeps/i386/memcpy_chk.S
index cdf807f..b3b25de 100644
--- a/sysdeps/i386/i686/memcpy_chk.S
+++ b/sysdeps/i386/memcpy_chk.S
@@ -1,4 +1,4 @@ 
-/* Checking memcpy for i686.
+/* Checking memcpy for i386.
    Copyright (C) 2004-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,10 +16,10 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
 
-#ifndef PIC
 	/* For libc.so this is defined in memcpy.S.
 	   For libc.a, this is a separate source to avoid
 	   memcpy bringing in __chk_fail and all routines
diff --git a/sysdeps/i386/memmove.S b/sysdeps/i386/memmove.S
new file mode 100644
index 0000000..60a45d2
--- /dev/null
+++ b/sysdeps/i386/memmove.S
@@ -0,0 +1,4 @@ 
+#define USE_AS_MEMMOVE
+#define MEMCPY		memmove
+#define MEMCPY_CHK	__memmove_chk
+#include "memcpy.S"
diff --git a/sysdeps/i386/i686/memmove_chk.S b/sysdeps/i386/memmove_chk.S
similarity index 78%
rename from sysdeps/i386/i686/memmove_chk.S
rename to sysdeps/i386/memmove_chk.S
index 64bf9e0..26d2abd 100644
--- a/sysdeps/i386/i686/memmove_chk.S
+++ b/sysdeps/i386/memmove_chk.S
@@ -1,4 +1,4 @@ 
-/* Checking memmove for x86-64.
+/* Checking memmove for i386
    Copyright (C) 2004-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,14 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
 
-#ifndef PIC
-	/* For libc.so this is defined in memmove.S.
-	   For libc.a, this is a separate source to avoid
-	   memmove bringing in __chk_fail and all routines
-	   it calls.  */
+/* For libc.so this is defined in memmove.S.  For libc.a, this is a
+   separate source to avoid memmove bringing in __chk_fail and all
+   routines it calls.  */
         .text
 ENTRY (__memmove_chk)
 	movl	12(%esp), %eax
diff --git a/sysdeps/i386/mempcpy.S b/sysdeps/i386/mempcpy.S
new file mode 100644
index 0000000..61addb7
--- /dev/null
+++ b/sysdeps/i386/mempcpy.S
@@ -0,0 +1,7 @@ 
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy
+#define MEMCPY_CHK	__mempcpy_chk
+#include "memcpy.S"
+
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
diff --git a/sysdeps/i386/i686/mempcpy_chk.S b/sysdeps/i386/mempcpy_chk.S
similarity index 78%
rename from sysdeps/i386/i686/mempcpy_chk.S
rename to sysdeps/i386/mempcpy_chk.S
index a61757b..05f86c3 100644
--- a/sysdeps/i386/i686/mempcpy_chk.S
+++ b/sysdeps/i386/mempcpy_chk.S
@@ -1,4 +1,4 @@ 
-/* Checking mempcpy for x86-64.
+/* Checking mempcpy for i386
    Copyright (C) 2004-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,14 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+#ifndef SHARED
+# include <sysdep.h>
+# include "asm-syntax.h"
 
-#ifndef PIC
-	/* For libc.so this is defined in mempcpy.S.
-	   For libc.a, this is a separate source to avoid
-	   mempcpy bringing in __chk_fail and all routines
-	   it calls.  */
+/* For libc.so this is defined in mempcpy.S.  For libc.a, this is a
+   separate source to avoid mempcpy bringing in __chk_fail and all
+   routines it calls.  */
         .text
 ENTRY (__mempcpy_chk)
 	movl	12(%esp), %eax
diff --git a/sysdeps/i386/memset.S b/sysdeps/i386/memset.S
new file mode 100644
index 0000000..21b3430
--- /dev/null
+++ b/sysdeps/i386/memset.S
@@ -0,0 +1,68 @@ 
+/* memset with REP MOVSB/STOSB
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG)						\
+  cfi_adjust_cfa_offset (4);					\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)						\
+  cfi_adjust_cfa_offset (-4);					\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define STR1  8
+#ifdef USE_AS_BZERO
+#define N     STR1+4
+#else
+#define STR2  STR1+4
+#define N     STR2+4
+#endif
+
+	.text
+#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO
+ENTRY (__memset_chk)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk)
+#endif
+ENTRY (memset)
+	PUSH    (%edi)
+	movl	N(%esp), %ecx
+	movl	STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+	xor	%eax, %eax
+#else
+	movzbl	STR2(%esp), %eax
+	mov	%edi, %edx
+#endif
+	rep	stosb
+#ifndef USE_AS_BZERO
+	mov	%edx, %eax
+#endif
+	POP     (%edi)
+	ret
+END (memset)
+
+#ifndef USE_AS_BZERO
+libc_hidden_builtin_def (memset)
+#endif
diff --git a/sysdeps/i386/memset.c b/sysdeps/i386/memset.c
deleted file mode 100644
index bf11590..0000000
--- a/sysdeps/i386/memset.c
+++ /dev/null
@@ -1,85 +0,0 @@ 
-/* Set a block of memory to some byte value.
-   For Intel 80x86, x>=3.
-   Copyright (C) 1991-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Torbjorn Granlund (tege@sics.se).
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <memcopy.h>
-
-#ifdef	__GNUC__
-
-#undef memset
-
-void *
-memset (void *dstpp, int c, size_t len)
-{
-  int d0;
-  unsigned long int dstp = (unsigned long int) dstpp;
-
-  /* This explicit register allocation
-     improves code very much indeed.  */
-  register op_t x asm("ax");
-
-  x = (unsigned char) c;
-
-  /* Clear the direction flag, so filling will move forward.  */
-  asm volatile("cld");
-
-  /* This threshold value is optimal.  */
-  if (len >= 12)
-    {
-      /* Fill X with four copies of the char we want to fill with.  */
-      x |= (x << 8);
-      x |= (x << 16);
-
-      /* Adjust LEN for the bytes handled in the first loop.  */
-      len -= (-dstp) % OPSIZ;
-
-      /* There are at least some bytes to set.
-	 No need to test for LEN == 0 in this alignment loop.  */
-
-      /* Fill bytes until DSTP is aligned on a longword boundary.  */
-      asm volatile("rep\n"
-		   "stosb" /* %0, %2, %3 */ :
-		   "=D" (dstp), "=c" (d0) :
-		   "0" (dstp), "1" ((-dstp) % OPSIZ), "a" (x) :
-		   "memory");
-
-      /* Fill longwords.  */
-      asm volatile("rep\n"
-		   "stosl" /* %0, %2, %3 */ :
-		   "=D" (dstp), "=c" (d0) :
-		   "0" (dstp), "1" (len / OPSIZ), "a" (x) :
-		   "memory");
-      len %= OPSIZ;
-    }
-
-  /* Write the last few bytes.  */
-  asm volatile("rep\n"
-	       "stosb" /* %0, %2, %3 */ :
-	       "=D" (dstp), "=c" (d0) :
-	       "0" (dstp), "1" (len), "a" (x) :
-	       "memory");
-
-  return dstpp;
-}
-libc_hidden_builtin_def (memset)
-
-#else
-#include <string/memset.c>
-#endif
diff --git a/sysdeps/i386/i686/memset_chk.S b/sysdeps/i386/memset_chk.S
similarity index 79%
rename from sysdeps/i386/i686/memset_chk.S
rename to sysdeps/i386/memset_chk.S
index da982fd..2312d32 100644
--- a/sysdeps/i386/i686/memset_chk.S
+++ b/sysdeps/i386/memset_chk.S
@@ -1,4 +1,4 @@ 
-/* Checking memset for i686.
+/* Checking memset for i386.
    Copyright (C) 2004-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,14 +16,13 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
-
 #ifndef SHARED
-	/* For libc.so this is defined in memset.S.
-	   For libc.a, this is a separate source to avoid
-	   memset bringing in __chk_fail and all routines
-	   it calls.  */
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+/* For libc.so this is defined in memset.S.  For libc.a, this is a
+   separate source to avoid memset bringing in __chk_fail and all
+   routines it calls.  */
         .text
 ENTRY (__memset_chk)
 	movl	12(%esp), %eax