@@ -1,37 +1,29 @@
-/*
- * Copyright (C) 2024 MIPS Tech, LLC
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
-*/
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Wave Computing
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+   <https://www.gnu.org/licenses/>.  */
#ifdef __GNUC__
#undef memcpy
/* Typical observed latency in cycles in fetching from DRAM. */
-#define LATENCY_CYCLES 63
+#ifndef LATENCY_CYCLES
+ #define LATENCY_CYCLES 63
+#endif
/* Pre-fetch performance is subject to accurate prefetch ahead,
which in turn depends on both the cache-line size and the amount
@@ -48,30 +40,42 @@
#define LATENCY_CYCLES 150
#elif defined(_MIPS_TUNE_I6400) || defined(_MIPS_TUNE_I6500)
#define CACHE_LINE 64
- #define BLOCK_CYCLES 16
+ #define BLOCK_CYCLES 15
#elif defined(_MIPS_TUNE_P6600)
#define CACHE_LINE 32
- #define BLOCK_CYCLES 12
+ #define BLOCK_CYCLES 15
#elif defined(_MIPS_TUNE_INTERAPTIV) || defined(_MIPS_TUNE_INTERAPTIV_MR2)
#define CACHE_LINE 32
#define BLOCK_CYCLES 30
#else
- #define CACHE_LINE 32
- #define BLOCK_CYCLES 11
+ #ifndef CACHE_LINE
+ #define CACHE_LINE 32
+ #endif
+ #ifndef BLOCK_CYCLES
+ #ifdef __nanomips__
+ #define BLOCK_CYCLES 20
+ #else
+ #define BLOCK_CYCLES 11
+ #endif
+ #endif
#endif
/* Pre-fetch look ahead = ceil (latency / block-cycles) */
#define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES \
+ ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))
-/* Unroll-factor, controls how many words at a time in the core loop. */
-#define BLOCK (CACHE_LINE == 128 ? 16 : 8)
+/* The unroll-factor controls how many words at a time in the core loop. */
+#ifndef BLOCK_SIZE
+ #define BLOCK_SIZE (CACHE_LINE == 128 ? 16 : 8)
+#elif BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+ #error "BLOCK_SIZE must be 8 or 16"
+#endif
#define __overloadable
#if !defined(UNALIGNED_INSTR_SUPPORT)
/* does target have unaligned lw/ld/ualw/uald instructions? */
#define UNALIGNED_INSTR_SUPPORT 0
-#if (__mips_isa_rev < 6 && !defined(__mips1))
+#if (__mips_isa_rev < 6 && !defined(__mips1)) || defined(__nanomips__)
#undef UNALIGNED_INSTR_SUPPORT
#define UNALIGNED_INSTR_SUPPORT 1
#endif
@@ -79,17 +83,35 @@
#if !defined(HW_UNALIGNED_SUPPORT)
/* Does target have hardware support for unaligned accesses? */
#define HW_UNALIGNED_SUPPORT 0
- #if __mips_isa_rev >= 6
+ #if __mips_isa_rev >= 6 && !defined(__nanomips__)
#undef HW_UNALIGNED_SUPPORT
#define HW_UNALIGNED_SUPPORT 1
#endif
#endif
-#define ENABLE_PREFETCH 1
+
+#ifndef ENABLE_PREFETCH
+ #define ENABLE_PREFETCH 1
+#endif
+
+#ifndef ENABLE_PREFETCH_CHECK
+ #define ENABLE_PREFETCH_CHECK 0
+#endif
+
#if ENABLE_PREFETCH
- #define PREFETCH(addr) __builtin_prefetch (addr, 0, 0)
-#else
+ #if ENABLE_PREFETCH_CHECK
+#include <assert.h>
+static char *limit;
+#define PREFETCH(addr) \
+ do { \
+ assert ((char *)(addr) < limit); \
+ __builtin_prefetch ((addr), 0, 1); \
+ } while (0)
+#else /* ENABLE_PREFETCH_CHECK */
+ #define PREFETCH(addr) __builtin_prefetch (addr, 0, 1)
+ #endif /* ENABLE_PREFETCH_CHECK */
+#else /* ENABLE_PREFETCH */
#define PREFETCH(addr)
-#endif
+#endif /* ENABLE_PREFETCH */
#include <string.h>
@@ -99,17 +121,18 @@ typedef struct
{
reg_t B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
} bits_t;
-#else
+#else /* __mips64 */
typedef unsigned long reg_t;
typedef struct
{
reg_t B0:8, B1:8, B2:8, B3:8;
} bits_t;
-#endif
+#endif /* __mips64 */
-#define CACHE_LINES_PER_BLOCK ((BLOCK * sizeof (reg_t) > CACHE_LINE) ? \
- (BLOCK * sizeof (reg_t) / CACHE_LINE) \
- : 1)
+#define CACHE_LINES_PER_BLOCK \
+ ((BLOCK_SIZE * sizeof (reg_t) > CACHE_LINE) \
+ ? (BLOCK_SIZE * sizeof (reg_t) / CACHE_LINE) \
+ : 1)
typedef union
{
@@ -120,7 +143,7 @@ typedef union
#define DO_BYTE(a, i) \
a[i] = bw.b.B##i; \
len--; \
- if(!len) return ret; \
+ if (!len) return ret; \
/* This code is called when aligning a pointer, there are remaining bytes
after doing word compares, or architecture does not have some form
@@ -148,7 +171,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
{
unsigned char *x = (unsigned char *) a;
bitfields_t bw;
- if(len > 0)
+ if (len > 0)
{
bw.v = *(reg_t *)b;
DO_BYTE(x, 0);
@@ -159,7 +182,7 @@ do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
DO_BYTE(x, 4);
DO_BYTE(x, 5);
DO_BYTE(x, 6);
-#endif
+#endif /* __mips64 */
}
return ret;
}
@@ -170,7 +193,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
{
/* Use a set-back so that load/stores have incremented addresses in
order to promote bonding. */
- int off = (BLOCK - words);
+ int off = (BLOCK_SIZE - words);
a -= off;
b -= off;
switch (off)
@@ -182,7 +205,7 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
case 5: a[5] = b[5]; // Fall through
case 6: a[6] = b[6]; // Fall through
case 7: a[7] = b[7]; // Fall through
-#if BLOCK==16
+#if BLOCK_SIZE==16
case 8: a[8] = b[8]; // Fall through
case 9: a[9] = b[9]; // Fall through
case 10: a[10] = b[10]; // Fall through
@@ -191,9 +214,9 @@ do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
case 13: a[13] = b[13]; // Fall through
case 14: a[14] = b[14]; // Fall through
case 15: a[15] = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
}
- return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+ return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
}
#if !HW_UNALIGNED_SUPPORT
@@ -210,7 +233,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
{
/* Use a set-back so that load/stores have incremented addresses in
order to promote bonding. */
- int off = (BLOCK - words);
+ int off = (BLOCK_SIZE - words);
a -= off;
b -= off;
switch (off)
@@ -222,7 +245,7 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
case 5: a[5].uli = b[5]; // Fall through
case 6: a[6].uli = b[6]; // Fall through
case 7: a[7].uli = b[7]; // Fall through
-#if BLOCK==16
+#if BLOCK_SIZE==16
case 8: a[8].uli = b[8]; // Fall through
case 9: a[9].uli = b[9]; // Fall through
case 10: a[10].uli = b[10]; // Fall through
@@ -231,9 +254,9 @@ do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
case 13: a[13].uli = b[13]; // Fall through
case 14: a[14].uli = b[14]; // Fall through
case 15: a[15].uli = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
}
- return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+ return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
}
/* The first pointer is not aligned while second pointer is. */
@@ -242,13 +265,19 @@ unaligned_words (struct ulw *a, const reg_t * b,
unsigned long words, unsigned long bytes, void *ret)
{
unsigned long i, words_by_block, words_by_1;
- words_by_1 = words % BLOCK;
- words_by_block = words / BLOCK;
+ words_by_1 = words % BLOCK_SIZE;
+ words_by_block = words / BLOCK_SIZE;
+
for (; words_by_block > 0; words_by_block--)
{
- if (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+ /* This condition is deliberately conservative. One could theoretically
+ pre-fetch another time around in some cases without crossing the page
+ boundary at the limit, but checking for the right conditions here is
+ too expensive to be worth it. */
+ if (words_by_block > PREF_AHEAD)
for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
- PREFETCH (b + (BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i));
+ PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+ * (PREF_AHEAD + i)));
reg_t y0 = b[0], y1 = b[1], y2 = b[2], y3 = b[3];
reg_t y4 = b[4], y5 = b[5], y6 = b[6], y7 = b[7];
@@ -260,7 +289,7 @@ unaligned_words (struct ulw *a, const reg_t * b,
a[5].uli = y5;
a[6].uli = y6;
a[7].uli = y7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
y0 = b[8], y1 = b[9], y2 = b[10], y3 = b[11];
y4 = b[12], y5 = b[13], y6 = b[14], y7 = b[15];
a[8].uli = y0;
@@ -271,16 +300,16 @@ unaligned_words (struct ulw *a, const reg_t * b,
a[13].uli = y5;
a[14].uli = y6;
a[15].uli = y7;
-#endif
- a += BLOCK;
- b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+ a += BLOCK_SIZE;
+ b += BLOCK_SIZE;
}
/* Mop up any remaining bytes. */
return do_uwords_remaining (a, b, words_by_1, bytes, ret);
}
-#else
+#else /* !UNALIGNED_INSTR_SUPPORT */
/* No HW support or unaligned lw/ld/ualw/uald instructions. */
static void *
@@ -320,13 +349,15 @@ aligned_words (reg_t * a, const reg_t * b,
unsigned long words, unsigned long bytes, void *ret)
{
unsigned long i, words_by_block, words_by_1;
- words_by_1 = words % BLOCK;
- words_by_block = words / BLOCK;
+ words_by_1 = words % BLOCK_SIZE;
+ words_by_block = words / BLOCK_SIZE;
+
for (; words_by_block > 0; words_by_block--)
{
- if(words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+ if (words_by_block > PREF_AHEAD)
for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
- PREFETCH (b + ((BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i)));
+ PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+ * (PREF_AHEAD + i)));
reg_t x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
reg_t x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
@@ -338,7 +369,7 @@ aligned_words (reg_t * a, const reg_t * b,
a[5] = x5;
a[6] = x6;
a[7] = x7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
x0 = b[8], x1 = b[9], x2 = b[10], x3 = b[11];
x4 = b[12], x5 = b[13], x6 = b[14], x7 = b[15];
a[8] = x0;
@@ -349,9 +380,9 @@ aligned_words (reg_t * a, const reg_t * b,
a[13] = x5;
a[14] = x6;
a[15] = x7;
-#endif
- a += BLOCK;
- b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+ a += BLOCK_SIZE;
+ b += BLOCK_SIZE;
}
/* mop up any remaining bytes. */
@@ -363,13 +394,16 @@ memcpy (void *a, const void *b, size_t len) __overloadable
{
unsigned long bytes, words, i;
void *ret = a;
+#if ENABLE_PREFETCH_CHECK
+ limit = (char *)b + len;
+#endif /* ENABLE_PREFETCH_CHECK */
/* shouldn't hit that often. */
if (len <= 8)
return do_bytes (a, b, len, a);
/* Start pre-fetches ahead of time. */
- if (len > CACHE_LINE * (PREF_AHEAD - 1))
- for (i = 1; i < PREF_AHEAD - 1; i++)
+ if (len > CACHE_LINE * PREF_AHEAD)
+ for (i = 1; i < PREF_AHEAD; i++)
PREFETCH ((char *)b + CACHE_LINE * i);
else
for (i = 1; i < len / CACHE_LINE; i++)
@@ -400,12 +434,12 @@ memcpy (void *a, const void *b, size_t len) __overloadable
#if HW_UNALIGNED_SUPPORT
/* treat possible unaligned first pointer as aligned. */
return aligned_words (a, b, words, bytes, ret);
-#else
+#else /* !HW_UNALIGNED_SUPPORT */
if (((unsigned long) a) % sizeof (reg_t) == 0)
return aligned_words (a, b, words, bytes, ret);
/* need to use unaligned instructions on first pointer. */
return unaligned_words (a, b, words, bytes, ret);
-#endif
+#endif /* HW_UNALIGNED_SUPPORT */
}
libc_hidden_builtin_def (memcpy)