[16/21] mips: libc: Add improved C implementation of memcpy/memset

Message ID 20241031054937.68189-17-arikalo@gmail.com
State New
Headers
Series: A series of updates related to MIPS

Commit Message

Aleksandar Rikalo Oct. 31, 2024, 5:49 a.m. UTC
  From: Faraz Shahbazker <fshahbazker@wavecomp.com>

newlib/
	* libc/machine/mips/memcpy.c: New file.
	* libc/machine/mips/memset.c: New file.
	* Makefile.in: Add new source files.

Signed-off-by: Faraz Shahbazker <fshahbazker@wavecomp.com>
Signed-off-by: Aleksandar Rikalo <arikalo@gmail.com>
---
 newlib/Makefile.in                |  30 +--
 newlib/libc/machine/mips/memcpy.c | 405 ++++++++++++++++++++++++++++++
 newlib/libc/machine/mips/memset.c | 176 +++++++++++++
 3 files changed, 582 insertions(+), 29 deletions(-)
 create mode 100644 newlib/libc/machine/mips/memcpy.c
 create mode 100644 newlib/libc/machine/mips/memset.c
  

Patch

diff --git a/newlib/Makefile.in b/newlib/Makefile.in
index efb642473..3ef3d8b95 100644
--- a/newlib/Makefile.in
+++ b/newlib/Makefile.in
@@ -798,7 +798,7 @@  check_PROGRAMS =
 @HAVE_LIBC_MACHINE_M88K_TRUE@am__append_95 = libc/machine/m88k/setjmp.S
 @HAVE_LIBC_MACHINE_MEP_TRUE@am__append_96 = libc/machine/mep/setjmp.S
 @HAVE_LIBC_MACHINE_MICROBLAZE_TRUE@am__append_97 = libc/machine/microblaze/strlen.c libc/machine/microblaze/strcmp.c libc/machine/microblaze/strcpy.c libc/machine/microblaze/setjmp.S libc/machine/microblaze/longjmp.S
-@HAVE_LIBC_MACHINE_MIPS_TRUE@am__append_98 = libc/machine/mips/setjmp.S libc/machine/mips/strlen.c libc/machine/mips/strcmp.S libc/machine/mips/strncpy.c libc/machine/mips/memset.S libc/machine/mips/memcpy.S
+@HAVE_LIBC_MACHINE_MIPS_TRUE@am__append_98 = libc/machine/mips/setjmp.S libc/machine/mips/strlen.c libc/machine/mips/strcmp.S libc/machine/mips/strncpy.c libc/machine/mips/memset.c libc/machine/mips/memcpy.c
 @HAVE_LIBC_MACHINE_MN10200_TRUE@am__append_99 = libc/machine/mn10200/setjmp.S
 @HAVE_LIBC_MACHINE_MN10300_TRUE@am__append_100 = \
 @HAVE_LIBC_MACHINE_MN10300_TRUE@	libc/machine/mn10300/setjmp.S libc/machine/mn10300/memchr.S libc/machine/mn10300/memcmp.S libc/machine/mn10300/memcpy.S libc/machine/mn10300/memset.S libc/machine/mn10300/strchr.S \
@@ -18846,34 +18846,6 @@  libc/machine/mips/libc_a-strcmp.obj: libc/machine/mips/strcmp.S
 @AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/mips/libc_a-strcmp.obj `if test -f 'libc/machine/mips/strcmp.S'; then $(CYGPATH_W) 'libc/machine/mips/strcmp.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/mips/strcmp.S'; fi`
 
-libc/machine/mips/libc_a-memset.o: libc/machine/mips/memset.S
-@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/mips/libc_a-memset.o -MD -MP -MF libc/machine/mips/$(DEPDIR)/libc_a-memset.Tpo -c -o libc/machine/mips/libc_a-memset.o `test -f 'libc/machine/mips/memset.S' || echo '$(srcdir)/'`libc/machine/mips/memset.S
-@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) libc/machine/mips/$(DEPDIR)/libc_a-memset.Tpo libc/machine/mips/$(DEPDIR)/libc_a-memset.Po
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='libc/machine/mips/memset.S' object='libc/machine/mips/libc_a-memset.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/mips/libc_a-memset.o `test -f 'libc/machine/mips/memset.S' || echo '$(srcdir)/'`libc/machine/mips/memset.S
-
-libc/machine/mips/libc_a-memset.obj: libc/machine/mips/memset.S
-@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/mips/libc_a-memset.obj -MD -MP -MF libc/machine/mips/$(DEPDIR)/libc_a-memset.Tpo -c -o libc/machine/mips/libc_a-memset.obj `if test -f 'libc/machine/mips/memset.S'; then $(CYGPATH_W) 'libc/machine/mips/memset.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/mips/memset.S'; fi`
-@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) libc/machine/mips/$(DEPDIR)/libc_a-memset.Tpo libc/machine/mips/$(DEPDIR)/libc_a-memset.Po
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='libc/machine/mips/memset.S' object='libc/machine/mips/libc_a-memset.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/mips/libc_a-memset.obj `if test -f 'libc/machine/mips/memset.S'; then $(CYGPATH_W) 'libc/machine/mips/memset.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/mips/memset.S'; fi`
-
-libc/machine/mips/libc_a-memcpy.o: libc/machine/mips/memcpy.S
-@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/mips/libc_a-memcpy.o -MD -MP -MF libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Tpo -c -o libc/machine/mips/libc_a-memcpy.o `test -f 'libc/machine/mips/memcpy.S' || echo '$(srcdir)/'`libc/machine/mips/memcpy.S
-@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Tpo libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Po
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='libc/machine/mips/memcpy.S' object='libc/machine/mips/libc_a-memcpy.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/mips/libc_a-memcpy.o `test -f 'libc/machine/mips/memcpy.S' || echo '$(srcdir)/'`libc/machine/mips/memcpy.S
-
-libc/machine/mips/libc_a-memcpy.obj: libc/machine/mips/memcpy.S
-@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/mips/libc_a-memcpy.obj -MD -MP -MF libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Tpo -c -o libc/machine/mips/libc_a-memcpy.obj `if test -f 'libc/machine/mips/memcpy.S'; then $(CYGPATH_W) 'libc/machine/mips/memcpy.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/mips/memcpy.S'; fi`
-@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Tpo libc/machine/mips/$(DEPDIR)/libc_a-memcpy.Po
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='libc/machine/mips/memcpy.S' object='libc/machine/mips/libc_a-memcpy.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -c -o libc/machine/mips/libc_a-memcpy.obj `if test -f 'libc/machine/mips/memcpy.S'; then $(CYGPATH_W) 'libc/machine/mips/memcpy.S'; else $(CYGPATH_W) '$(srcdir)/libc/machine/mips/memcpy.S'; fi`
-
 libc/machine/mn10200/libc_a-setjmp.o: libc/machine/mn10200/setjmp.S
 @am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CCASFLAGS) $(CCASFLAGS) -MT libc/machine/mn10200/libc_a-setjmp.o -MD -MP -MF libc/machine/mn10200/$(DEPDIR)/libc_a-setjmp.Tpo -c -o libc/machine/mn10200/libc_a-setjmp.o `test -f 'libc/machine/mn10200/setjmp.S' || echo '$(srcdir)/'`libc/machine/mn10200/setjmp.S
 @am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) libc/machine/mn10200/$(DEPDIR)/libc_a-setjmp.Tpo libc/machine/mn10200/$(DEPDIR)/libc_a-setjmp.Po
diff --git a/newlib/libc/machine/mips/memcpy.c b/newlib/libc/machine/mips/memcpy.c
new file mode 100644
index 000000000..2d5031814
--- /dev/null
+++ b/newlib/libc/machine/mips/memcpy.c
@@ -0,0 +1,405 @@ 
+/*
+ * Copyright (C) 2018 MIPS Tech, LLC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Typical observed latency in cycles in fetching from DRAM.  */
+#define LATENCY_CYCLES 63
+
+/* Pre-fetch performance is subject to accurate prefetch ahead,
+   which in turn depends on both the cache-line size and the amount
+   of look-ahead.  Since cache-line size is not nominally fixed in
+   a typically library built for multiple platforms, we make conservative
+   assumptions in the default case.  This code will typically operate
+   on such conservative assumptions, but if compiled with the correct
+   -mtune=xx options, will perform even better on those specific
+   platforms.  */
+#if defined(_MIPS_TUNE_OCTEON2) || defined(_MIPS_TUNE_OCTEON3)
+ #define CACHE_LINE 128
+ #define BLOCK_CYCLES 30
+ #undef LATENCY_CYCLES
+ #define LATENCY_CYCLES 150
+#elif defined(_MIPS_TUNE_I6400) || defined(_MIPS_TUNE_I6500)
+ #define CACHE_LINE 64
+ #define BLOCK_CYCLES 16
+#elif defined(_MIPS_TUNE_P6600)
+ #define CACHE_LINE 32
+ #define BLOCK_CYCLES 12
+#elif defined(_MIPS_TUNE_INTERAPTIV) ||  defined(_MIPS_TUNE_INTERAPTIV_MR2)
+ #define CACHE_LINE 32
+ #define BLOCK_CYCLES 30
+#else
+ #define CACHE_LINE 32
+ #define BLOCK_CYCLES 11
+#endif
+
+/* Pre-fetch look ahead = ceil (latency / block-cycles)  */
+#define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES			\
+		    + ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))
+
+/* Unroll-factor, controls how many words at a time in the core loop.  */
+#define BLOCK (CACHE_LINE == 128 ? 16 : 8)
+
+#define __overloadable
+#ifndef UNALIGNED_INSTR_SUPPORT
+/* does target have unaligned lw/ld/ualw/uald instructions? */
+ #define UNALIGNED_INSTR_SUPPORT 0
+ #if (__mips_isa_rev < 6 && !__mips1)
+  #undef UNALIGNED_INSTR_SUPPORT
+  #define UNALIGNED_INSTR_SUPPORT 1
+ #endif
+#endif
+#if !defined(HW_UNALIGNED_SUPPORT)
+/* Does target have hardware support for unaligned accesses?  */
+ #define HW_UNALIGNED_SUPPORT 0
+ #if __mips_isa_rev >= 6
+  #undef HW_UNALIGNED_SUPPORT
+  #define HW_UNALIGNED_SUPPORT 1
+ #endif
+#endif
+#define ENABLE_PREFETCH     1
+#if ENABLE_PREFETCH
+ #define PREFETCH(addr)  __builtin_prefetch (addr, 0, 0)
+#else
+ #define PREFETCH(addr)
+#endif
+
+#include <string.h>
+
+#ifdef __mips64
+typedef unsigned long long reg_t;
+typedef struct
+{
+  reg_t B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
+} bits_t;
+#else
+typedef unsigned long reg_t;
+typedef struct
+{
+  reg_t B0:8, B1:8, B2:8, B3:8;
+} bits_t;
+#endif
+
+#define CACHE_LINES_PER_BLOCK ((BLOCK * sizeof (reg_t) > CACHE_LINE) ?	\
+			       (BLOCK * sizeof (reg_t) / CACHE_LINE)	\
+			       : 1)
+
+typedef union
+{
+  reg_t v;
+  bits_t b;
+} bitfields_t;
+
+#define DO_BYTE(a, i)   \
+  a[i] = bw.b.B##i;     \
+  len--;                \
+  if(!len) return ret;  \
+
+/* This code is called when aligning a pointer, there are remaining bytes
+   after doing word copies, or architecture does not have some form
+   of unaligned support.  Copies LEN bytes from B to A one byte at a
+   time; no alignment requirements on either pointer.  */
+static inline void * __attribute__ ((always_inline))
+do_bytes (void *a, const void *b, unsigned long len, void *ret)
+{
+  unsigned char *x = (unsigned char *) a;
+  unsigned char *y = (unsigned char *) b;
+  unsigned long i;
+  /* 'len' might be zero here, so preloading the first two values
+     before the loop may access unallocated memory.  */
+  for (i = 0; i < len; i++)
+    {
+      *x = *y;
+      x++;
+      y++;
+    }
+  /* 'ret' is the caller's original destination, passed through unchanged
+     so this can be the tail call of memcpy.  */
+  return ret;
+}
+
+/* This code is called to copy only remaining bytes within word or
+   doubleword.  Preconditions: B is reg_t-aligned and 0 <= LEN <
+   sizeof (reg_t).  Note that a full word is loaded from B, so up to
+   sizeof (reg_t) - 1 bytes past LEN may be read; the access stays
+   within the same aligned word.  */
+static inline void * __attribute__ ((always_inline))
+do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
+{
+  unsigned char *x = (unsigned char *) a;
+  bitfields_t bw;
+  if(len > 0)
+    {
+      /* Load the source word once, then scatter its bytes; DO_BYTE
+	 decrements 'len' and returns 'ret' as soon as it reaches zero.  */
+      bw.v = *(reg_t *)b;
+      DO_BYTE(x, 0);
+      DO_BYTE(x, 1);
+      DO_BYTE(x, 2);
+#ifdef __mips64
+      DO_BYTE(x, 3);
+      DO_BYTE(x, 4);
+      DO_BYTE(x, 5);
+      DO_BYTE(x, 6);
+#endif
+    }
+  return ret;
+}
+
+/* Copy the 0 <= WORDS < BLOCK aligned words left over after the unrolled
+   main loop, then hand the final sub-word BYTES to do_bytes_remaining.  */
+static inline void * __attribute__ ((always_inline))
+do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
+		    unsigned long bytes, void *ret)
+{
+  /* Use a set-back so that load/stores have incremented addresses in
+     order to promote bonding.  */
+  int off = (BLOCK - words);
+  a -= off;
+  b -= off;
+  /* Every case deliberately falls through: entering the switch at 'off'
+     executes exactly 'words' copies.  */
+  switch (off)
+    {
+      case 1: a[1] = b[1];	/* Fall through.  */
+      case 2: a[2] = b[2];	/* Fall through.  */
+      case 3: a[3] = b[3];	/* Fall through.  */
+      case 4: a[4] = b[4];	/* Fall through.  */
+      case 5: a[5] = b[5];	/* Fall through.  */
+      case 6: a[6] = b[6];	/* Fall through.  */
+      case 7: a[7] = b[7];	/* Fall through.  */
+#if BLOCK==16
+      case 8: a[8] = b[8];	/* Fall through.  */
+      case 9: a[9] = b[9];	/* Fall through.  */
+      case 10: a[10] = b[10];	/* Fall through.  */
+      case 11: a[11] = b[11];	/* Fall through.  */
+      case 12: a[12] = b[12];	/* Fall through.  */
+      case 13: a[13] = b[13];	/* Fall through.  */
+      case 14: a[14] = b[14];	/* Fall through.  */
+      case 15: a[15] = b[15];
+#endif
+    }
+  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+}
+
+#if !HW_UNALIGNED_SUPPORT
+#if UNALIGNED_INSTR_SUPPORT
+/* For MIPS GCC, there are no unaligned builtins - so this struct forces
+   the compiler to treat the pointer access as unaligned.  */
+struct ulw
+{
+  reg_t uli;
+} __attribute__ ((packed));
+
+/* Unaligned-destination variant of do_words_remaining: copy the
+   0 <= WORDS < BLOCK trailing words via packed (unaligned) stores,
+   then the final sub-word BYTES.  B must be reg_t-aligned.  */
+static inline void * __attribute__ ((always_inline))
+do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
+		     unsigned long bytes, void *ret)
+{
+  /* Use a set-back so that load/stores have incremented addresses in
+     order to promote bonding.  */
+  int off = (BLOCK - words);
+  a -= off;
+  b -= off;
+  /* Every case deliberately falls through: entering the switch at 'off'
+     executes exactly 'words' copies.  */
+  switch (off)
+    {
+      case 1: a[1].uli = b[1];	/* Fall through.  */
+      case 2: a[2].uli = b[2];	/* Fall through.  */
+      case 3: a[3].uli = b[3];	/* Fall through.  */
+      case 4: a[4].uli = b[4];	/* Fall through.  */
+      case 5: a[5].uli = b[5];	/* Fall through.  */
+      case 6: a[6].uli = b[6];	/* Fall through.  */
+      case 7: a[7].uli = b[7];	/* Fall through.  */
+#if BLOCK==16
+      case 8: a[8].uli = b[8];	/* Fall through.  */
+      case 9: a[9].uli = b[9];	/* Fall through.  */
+      case 10: a[10].uli = b[10];	/* Fall through.  */
+      case 11: a[11].uli = b[11];	/* Fall through.  */
+      case 12: a[12].uli = b[12];	/* Fall through.  */
+      case 13: a[13].uli = b[13];	/* Fall through.  */
+      case 14: a[14].uli = b[14];	/* Fall through.  */
+      case 15: a[15].uli = b[15];
+#endif
+    }
+  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+}
+
+/* The first pointer is not aligned while second pointer is.  Copies
+   WORDS reg_t-sized words in unrolled blocks of BLOCK, using packed
+   (unaligned) stores for the destination, then mops up the trailing
+   words and BYTES.  */
+static void *
+unaligned_words (struct ulw *a, const reg_t * b,
+		 unsigned long words, unsigned long bytes, void *ret)
+{
+  unsigned long i, words_by_block, words_by_1;
+  words_by_1 = words % BLOCK;
+  words_by_block = words / BLOCK;
+  for (; words_by_block > 0; words_by_block--)
+    {
+      /* Prefetch PREF_AHEAD blocks ahead, but only while enough blocks
+	 remain that the prefetched lines will actually be consumed.  */
+      if (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
+	  PREFETCH (b + (BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i));
+
+      /* Load the whole block into temporaries before storing, giving the
+	 pipeline independent load/store streams that can be bonded.  */
+      reg_t y0 = b[0], y1 = b[1], y2 = b[2], y3 = b[3];
+      reg_t y4 = b[4], y5 = b[5], y6 = b[6], y7 = b[7];
+      a[0].uli = y0;
+      a[1].uli = y1;
+      a[2].uli = y2;
+      a[3].uli = y3;
+      a[4].uli = y4;
+      a[5].uli = y5;
+      a[6].uli = y6;
+      a[7].uli = y7;
+#if BLOCK==16
+      y0 = b[8], y1 = b[9], y2 = b[10], y3 = b[11];
+      y4 = b[12], y5 = b[13], y6 = b[14], y7 = b[15];
+      a[8].uli = y0;
+      a[9].uli = y1;
+      a[10].uli = y2;
+      a[11].uli = y3;
+      a[12].uli = y4;
+      a[13].uli = y5;
+      a[14].uli = y6;
+      a[15].uli = y7;
+#endif
+      a += BLOCK;
+      b += BLOCK;
+  }
+
+  /* Mop up any remaining bytes.  */
+  return do_uwords_remaining (a, b, words_by_1, bytes, ret);
+}
+
+#else
+
+/* No HW support or unaligned lw/ld/ualw/uald instructions.  Copies WORDS
+   reg_t-sized words from aligned B to (possibly) unaligned A by loading
+   a full word and storing it byte-by-byte, then mops up BYTES.  */
+static void *
+unaligned_words (reg_t * a, const reg_t * b,
+		 unsigned long words, unsigned long bytes, void *ret)
+{
+  unsigned long i;
+  unsigned char *x;
+  for (i = 0; i < words; i++)
+    {
+      bitfields_t bw;
+      /* B is reg_t-aligned, so a whole-word load is safe; the destination
+	 is written one byte at a time because A may be misaligned.  */
+      bw.v = *((reg_t*) b);
+      x = (unsigned char *) a;
+      x[0] = bw.b.B0;
+      x[1] = bw.b.B1;
+      x[2] = bw.b.B2;
+      x[3] = bw.b.B3;
+#if __mips64
+      x[4] = bw.b.B4;
+      x[5] = bw.b.B5;
+      x[6] = bw.b.B6;
+      x[7] = bw.b.B7;
+#endif
+      a += 1;
+      b += 1;
+    }
+  /* Mop up any remaining bytes.  */
+  return do_bytes_remaining (a, b, bytes, ret);
+}
+
+#endif /* UNALIGNED_INSTR_SUPPORT */
+#endif /* HW_UNALIGNED_SUPPORT */
+
+/* Both pointers are aligned, or first isn't and HW support for unaligned
+   accesses exists.  Copies WORDS reg_t-sized words in unrolled blocks of
+   BLOCK with prefetch, then mops up trailing words and BYTES.  */
+static void *
+aligned_words (reg_t * a, const reg_t * b,
+	       unsigned long words, unsigned long bytes, void *ret)
+{
+  unsigned long i, words_by_block, words_by_1;
+  words_by_1 = words % BLOCK;
+  words_by_block = words / BLOCK;
+  for (; words_by_block > 0; words_by_block--)
+    {
+      /* Prefetch PREF_AHEAD blocks ahead, but only while enough blocks
+	 remain that the prefetched lines will actually be consumed.  */
+      if(words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
+	  PREFETCH (b + ((BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i)));
+
+      /* Load the whole block into temporaries before storing, giving the
+	 pipeline independent load/store streams that can be bonded.  */
+      reg_t x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
+      reg_t x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
+      a[0] = x0;
+      a[1] = x1;
+      a[2] = x2;
+      a[3] = x3;
+      a[4] = x4;
+      a[5] = x5;
+      a[6] = x6;
+      a[7] = x7;
+#if BLOCK==16
+      x0 = b[8], x1 = b[9], x2 = b[10], x3 = b[11];
+      x4 = b[12], x5 = b[13], x6 = b[14], x7 = b[15];
+      a[8] = x0;
+      a[9] = x1;
+      a[10] = x2;
+      a[11] = x3;
+      a[12] = x4;
+      a[13] = x5;
+      a[14] = x6;
+      a[15] = x7;
+#endif
+      a += BLOCK;
+      b += BLOCK;
+    }
+
+  /* mop up any remaining bytes.  */
+  return do_words_remaining (a, b, words_by_1, bytes, ret);
+}
+
+/* ISO C memcpy: copy LEN bytes from B to A, returning A.  Regions must
+   not overlap.  Strategy: byte-copy short requests; otherwise start
+   prefetching, align the *source* pointer to reg_t, and dispatch to the
+   word-copy loops based on the destination's alignment.  */
+void *
+memcpy (void *a, const void *b, size_t len) __overloadable
+{
+  unsigned long bytes, words, i;
+  void *ret = a;
+  /* shouldn't hit that often.  */
+  if (len <= 8)
+    return do_bytes (a, b, len, a);
+
+  /* Start pre-fetches ahead of time, capped by the copy's own length so
+     we never prefetch lines we will not touch.  */
+  if (len > CACHE_LINE * (PREF_AHEAD - 1))
+    for (i = 1; i < PREF_AHEAD - 1; i++)
+      PREFETCH ((char *)b + CACHE_LINE * i);
+  else
+    for (i = 1; i < len / CACHE_LINE; i++)
+      PREFETCH ((char *)b + CACHE_LINE * i);
+
+  /* Align the second pointer to word/dword alignment.
+     Note that the pointer is only 32-bits for o32/n32 ABIs.  For
+     n32, loads are done as 64-bit while address remains 32-bit.   */
+  bytes = ((unsigned long) b) % (sizeof (reg_t));
+
+  if (bytes)
+    {
+      bytes = (sizeof (reg_t)) - bytes;
+      if (bytes > len)
+	bytes = len;
+      do_bytes (a, b, bytes, ret);
+      if (len == bytes)
+	return ret;
+      len -= bytes;
+      a = (void *) (((unsigned char *) a) + bytes);
+      b = (const void *) (((unsigned char *) b) + bytes);
+    }
+
+  /* Second pointer now aligned.  */
+  words = len / sizeof (reg_t);
+  bytes = len % sizeof (reg_t);
+
+#if HW_UNALIGNED_SUPPORT
+  /* treat possible unaligned first pointer as aligned.  */
+  return aligned_words (a, b, words, bytes, ret);
+#else
+  if (((unsigned long) a) % sizeof (reg_t) == 0)
+    return aligned_words (a, b, words, bytes, ret);
+  /* need to use unaligned instructions on first pointer.  */
+  return unaligned_words (a, b, words, bytes, ret);
+#endif
+}
diff --git a/newlib/libc/machine/mips/memset.c b/newlib/libc/machine/mips/memset.c
new file mode 100644
index 000000000..9f07ef52b
--- /dev/null
+++ b/newlib/libc/machine/mips/memset.c
@@ -0,0 +1,176 @@ 
+/*
+ * Copyright (C) 2018 MIPS Tech, LLC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string.h>
+
+#if _MIPS_SIM == _ABIO32
+#define SIZEOF_reg_t 4
+typedef unsigned long reg_t;
+#else
+#define SIZEOF_reg_t 8
+typedef unsigned long long reg_t;
+#endif
+
+typedef struct bits8
+{
+  reg_t B0:8, B1:8, B2:8, B3:8;
+#if SIZEOF_reg_t == 8
+  reg_t B4:8, B5:8, B6:8, B7:8;
+#endif
+} bits8_t;
+typedef struct bits16
+{
+  reg_t B0:16, B1:16;
+#if SIZEOF_reg_t == 8
+  reg_t B2:16, B3:16;
+#endif
+} bits16_t;
+typedef struct bits32
+{
+  reg_t B0:32;
+#if SIZEOF_reg_t == 8
+  reg_t B1:32;
+#endif
+} bits32_t;
+
+/* This union assumes that small structures can be in registers.  If
+   not, then memory accesses will be done - not optimal, but ok.  */
+typedef union
+{
+  reg_t v;
+  bits8_t b8;
+  bits16_t b16;
+  bits32_t b32;
+} bitfields_t;
+
+/* This code is called when aligning a pointer or there are remaining bytes
+   after doing word sets.  Stores LEN copies of FILL at A, one byte at a
+   time; RETVAL (the caller's original destination) is passed through
+   unchanged so this can be the tail call of memset.  */
+static inline void * __attribute__ ((always_inline))
+do_bytes (void *a, void *retval, unsigned char fill, const unsigned long len)
+{
+  unsigned char *x = ((unsigned char *) a);
+  unsigned long i;
+
+  for (i = 0; i < len; i++)
+    *x++ = fill;
+
+  return retval;
+}
+
+/* Pointer is aligned.  Stores WORDS reg_t-sized copies of FILL at A in
+   unrolled groups of 16, then the leftover words, then the final
+   sub-word BYTES.  */
+static void *
+do_aligned_words (reg_t * a, void * retval, reg_t fill,
+	 unsigned long words, unsigned long bytes)
+{
+  unsigned long i, words_by_1, words_by_16;
+
+  words_by_1 = words % 16;
+  words_by_16 = words / 16;
+
+  /*
+   * Note: prefetching the store memory is not beneficial on most
+   * cores since the ls/st unit has store buffers that will be filled
+   * before the cache line is actually needed.
+   *
+   * Also, using prepare-for-store cache op is problematic since we
+   * don't know the implementation-defined cache line length and we
+   * don't want to touch unintended memory.
+   */
+  for (i = 0; i < words_by_16; i++)
+    {
+      a[0] = fill;
+      a[1] = fill;
+      a[2] = fill;
+      a[3] = fill;
+      a[4] = fill;
+      a[5] = fill;
+      a[6] = fill;
+      a[7] = fill;
+      a[8] = fill;
+      a[9] = fill;
+      a[10] = fill;
+      a[11] = fill;
+      a[12] = fill;
+      a[13] = fill;
+      a[14] = fill;
+      a[15] = fill;
+      a += 16;
+    }
+
+  /* do remaining words.  */
+  for (i = 0; i < words_by_1; i++)
+    *a++ = fill;
+
+  /* mop up any remaining bytes.  'fill' narrows to its low byte here;
+     all of its bytes are equal, so that is the correct value.  */
+  return do_bytes (a, retval, fill, bytes);
+}
+
+/* ISO C memset: store LEN copies of (unsigned char) IFILL at A,
+   returning A.  Strategy: byte-set short requests; otherwise align the
+   pointer, replicate the fill byte across a reg_t, and do wide stores.  */
+void *
+memset (void *a, int ifill, size_t len)
+{
+  unsigned long bytes, words;
+  bitfields_t fill;
+  void *retval = (void *) a;
+
+  /* shouldn't hit that often.  */
+  if (len < 16)
+    return do_bytes (a, retval, ifill, len);
+
+  /* Align the pointer to word/dword alignment.
+     Note that the pointer is only 32-bits for o32/n32 ABIs. For
+     n32, loads are done as 64-bit while address remains 32-bit.
+     NOTE(review): the alignment used is 2 * sizeof (reg_t), i.e.
+     stricter (paired-store friendly) than this comment suggests.  */
+  bytes = ((unsigned long) a) % (sizeof (reg_t) * 2);
+  if (bytes)
+    {
+      bytes = (sizeof (reg_t) * 2 - bytes);
+      if (bytes > len)
+	bytes = len;
+      do_bytes (a, retval, ifill, bytes);
+      if (len == bytes)
+	return retval;
+      len -= bytes;
+      a = (void *) (((unsigned char *) a) + bytes);
+    }
+
+  /* Create correct fill value for reg_t sized variable: replicate the
+     low byte through successively wider bit-fields of the union.  All
+     bytes end up equal, so the result is endian-neutral.  */
+  if (ifill != 0)
+    {
+      fill.b8.B0 = (unsigned char) ifill;
+      fill.b8.B1 = fill.b8.B0;
+      fill.b16.B1 = fill.b16.B0;
+#if SIZEOF_reg_t == 8
+      fill.b32.B1 = fill.b32.B0;
+#endif
+    }
+  else
+    fill.v = 0;
+
+  words = len / sizeof (reg_t);
+  bytes = len % sizeof (reg_t);
+  return do_aligned_words (a, retval, fill.v, words, bytes);
+}