[06/11] Fix prefetching beyond copied memory

Message ID 20250123134308.1785777-8-aleksandar.rakic@htecgroup.com (mailing list archive)
State: New
Series: Improve Mips target

Checks

redhat-pt-bot/TryBot-apply_patch: fail (patch failed to apply to master at the time it was sent)

Commit Message

Aleksandar Rakic Jan. 23, 2025, 1:43 p.m. UTC
  GTM18-287/PP118771: memcpy prefetches beyond copied memory.
Fix prefetching in the core loop to avoid reading beyond the memory
region being operated on.  Revert the accidentally changed prefetch
hint back to streaming mode.  Refactor various bits and add
pre-processor checks so that the tuning parameters can be overridden
from the compiler command line.

Cherry-picked 132e0bbbbed01f95ec88b68b5f7f2056f6125531
from https://github.com/MIPS/glibc

Signed-off-by: Faraz Shahbazker <fshahbazker@wavecomp.com>
Signed-off-by: Aleksandar Rakic <aleksandar.rakic@htecgroup.com>
---
 sysdeps/mips/memcpy.c | 188 +++++++++++++++++++++++++-----------------
 1 file changed, 111 insertions(+), 77 deletions(-)
  
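For context, the #ifndef guards added below mean the tuning knobs can
now be set from the compiler command line, e.g. -DLATENCY_CYCLES=100
-DBLOCK_CYCLES=20 (hypothetical values, not from the patch).  A minimal
sketch of the resulting behaviour, mirroring the macros in the patch:

/* Defaults apply only when the build system does not override them.  */
#ifndef LATENCY_CYCLES
# define LATENCY_CYCLES 63	/* observed DRAM fetch latency, cycles */
#endif
#ifndef BLOCK_CYCLES
# define BLOCK_CYCLES 11	/* cycles spent per unrolled block */
#endif
/* Look-ahead distance = ceil (latency / block-cycles); with the
   defaults above that is ceil (63 / 11) = 6 cache lines.  */
#define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES			\
		    + ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))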

Patch

diff --git a/sysdeps/mips/memcpy.c b/sysdeps/mips/memcpy.c
index 8c3aec7b36..798e991f6d 100644
--- a/sysdeps/mips/memcpy.c
+++ b/sysdeps/mips/memcpy.c
@@ -1,37 +1,29 @@ 
-/*
- * Copyright (C) 2024 MIPS Tech, LLC
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from this
- * software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
-*/
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Wave Computing
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
 
 #ifdef  __GNUC__
 
 #undef memcpy
 
 /* Typical observed latency in cycles in fetching from DRAM.  */
-#define LATENCY_CYCLES 63
+#ifndef LATENCY_CYCLES
+ #define LATENCY_CYCLES 63
+#endif
 
 /* Pre-fetch performance is subject to accurate prefetch ahead,
    which in turn depends on both the cache-line size and the amount
@@ -48,30 +40,42 @@ 
  #define LATENCY_CYCLES 150
 #elif defined(_MIPS_TUNE_I6400) || defined(_MIPS_TUNE_I6500)
  #define CACHE_LINE 64
- #define BLOCK_CYCLES 16
+ #define BLOCK_CYCLES 15
 #elif defined(_MIPS_TUNE_P6600)
  #define CACHE_LINE 32
- #define BLOCK_CYCLES 12
+ #define BLOCK_CYCLES 15
 #elif defined(_MIPS_TUNE_INTERAPTIV) || defined(_MIPS_TUNE_INTERAPTIV_MR2)
  #define CACHE_LINE 32
  #define BLOCK_CYCLES 30
 #else
- #define CACHE_LINE 32
- #define BLOCK_CYCLES 11
+ #ifndef CACHE_LINE
+  #define CACHE_LINE 32
+ #endif
+ #ifndef BLOCK_CYCLES
+  #ifdef __nanomips__
+   #define BLOCK_CYCLES 20
+  #else
+   #define BLOCK_CYCLES 11
+  #endif
+ #endif
 #endif
 
 /* Pre-fetch look ahead = ceil (latency / block-cycles)  */
 #define PREF_AHEAD (LATENCY_CYCLES / BLOCK_CYCLES			\
 		    + ((LATENCY_CYCLES % BLOCK_CYCLES) == 0 ? 0 : 1))
 
-/* Unroll-factor, controls how many words at a time in the core loop.  */
-#define BLOCK (CACHE_LINE == 128 ? 16 : 8)
+/* The unroll-factor controls how many words at a time in the core loop.  */
+#ifndef BLOCK_SIZE
+ #define BLOCK_SIZE (CACHE_LINE == 128 ? 16 : 8)
+#elif BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+ #error "BLOCK_SIZE must be 8 or 16"
+#endif
 
 #define __overloadable
 #if !defined(UNALIGNED_INSTR_SUPPORT)
 /* does target have unaligned lw/ld/ualw/uald instructions? */
  #define UNALIGNED_INSTR_SUPPORT 0
-#if (__mips_isa_rev < 6 && !defined(__mips1))
+#if (__mips_isa_rev < 6 && !defined(__mips1)) || defined(__nanomips__)
   #undef UNALIGNED_INSTR_SUPPORT
   #define UNALIGNED_INSTR_SUPPORT 1
  #endif
@@ -79,17 +83,35 @@ 
 #if !defined(HW_UNALIGNED_SUPPORT)
 /* Does target have hardware support for unaligned accesses?  */
  #define HW_UNALIGNED_SUPPORT 0
- #if __mips_isa_rev >= 6
+ #if __mips_isa_rev >= 6 && !defined(__nanomips__)
   #undef HW_UNALIGNED_SUPPORT
   #define HW_UNALIGNED_SUPPORT 1
  #endif
 #endif
-#define ENABLE_PREFETCH     1
+
+#ifndef ENABLE_PREFETCH
+ #define ENABLE_PREFETCH 1
+#endif
+
+#ifndef ENABLE_PREFETCH_CHECK
+ #define ENABLE_PREFETCH_CHECK 0
+#endif
+
 #if ENABLE_PREFETCH
- #define PREFETCH(addr)  __builtin_prefetch (addr, 0, 0)
-#else
+ #if ENABLE_PREFETCH_CHECK
+#include <assert.h>
+static  char *limit;
+#define PREFETCH(addr)				\
+  do {						\
+    assert ((char *)(addr) < limit);		\
+    __builtin_prefetch ((addr), 0, 1);		\
+  } while (0)
+#else /* ENABLE_PREFETCH_CHECK */
+  #define PREFETCH(addr)  __builtin_prefetch (addr, 0, 1)
+ #endif /* ENABLE_PREFETCH_CHECK */
+#else /* ENABLE_PREFETCH */
  #define PREFETCH(addr)
-#endif
+#endif /* ENABLE_PREFETCH */
 
 #include <string.h>
 
@@ -99,17 +121,18 @@  typedef struct
 {
   reg_t B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
 } bits_t;
-#else
+#else /* __mips64 */
 typedef unsigned long reg_t;
 typedef struct
 {
   reg_t B0:8, B1:8, B2:8, B3:8;
 } bits_t;
-#endif
+#endif /* __mips64 */
 
-#define CACHE_LINES_PER_BLOCK ((BLOCK * sizeof (reg_t) > CACHE_LINE) ?	\
-			       (BLOCK * sizeof (reg_t) / CACHE_LINE)	\
-			       : 1)
+#define CACHE_LINES_PER_BLOCK						\
+  ((BLOCK_SIZE * sizeof (reg_t) > CACHE_LINE)				\
+   ? (BLOCK_SIZE * sizeof (reg_t) / CACHE_LINE)				\
+   : 1)
 
 typedef union
 {
@@ -120,7 +143,7 @@  typedef union
 #define DO_BYTE(a, i)   \
   a[i] = bw.b.B##i;     \
   len--;                \
-  if(!len) return ret;  \
+  if (!len) return ret;  \
 
 /* This code is called when aligning a pointer, there are remaining bytes
    after doing word compares, or architecture does not have some form
@@ -148,7 +171,7 @@  do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
 {
   unsigned char *x = (unsigned char *) a;
   bitfields_t bw;
-  if(len > 0)
+  if (len > 0)
     {
       bw.v = *(reg_t *)b;
       DO_BYTE(x, 0);
@@ -159,7 +182,7 @@  do_bytes_remaining (void *a, const void *b, unsigned long len, void *ret)
       DO_BYTE(x, 4);
       DO_BYTE(x, 5);
       DO_BYTE(x, 6);
-#endif
+#endif /* __mips64 */
     }
   return ret;
 }
@@ -170,7 +193,7 @@  do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
 {
   /* Use a set-back so that load/stores have incremented addresses in
      order to promote bonding.  */
-  int off = (BLOCK - words);
+  int off = (BLOCK_SIZE - words);
   a -= off;
   b -= off;
   switch (off)
@@ -182,7 +205,7 @@  do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
       case 5: a[5] = b[5]; // Fall through
       case 6: a[6] = b[6]; // Fall through
       case 7: a[7] = b[7]; // Fall through
-#if BLOCK==16
+#if BLOCK_SIZE==16
       case 8: a[8] = b[8]; // Fall through
       case 9: a[9] = b[9]; // Fall through
       case 10: a[10] = b[10]; // Fall through
@@ -191,9 +214,9 @@  do_words_remaining (reg_t *a, const reg_t *b, unsigned long words,
       case 13: a[13] = b[13]; // Fall through
       case 14: a[14] = b[14]; // Fall through
       case 15: a[15] = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
     }
-  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+  return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
 }
 
 #if !HW_UNALIGNED_SUPPORT
@@ -210,7 +233,7 @@  do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
 {
   /* Use a set-back so that load/stores have incremented addresses in
      order to promote bonding.  */
-  int off = (BLOCK - words);
+  int off = (BLOCK_SIZE - words);
   a -= off;
   b -= off;
   switch (off)
@@ -222,7 +245,7 @@  do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
       case 5: a[5].uli = b[5]; // Fall through
       case 6: a[6].uli = b[6]; // Fall through
       case 7: a[7].uli = b[7]; // Fall through
-#if BLOCK==16
+#if BLOCK_SIZE==16
       case 8: a[8].uli = b[8]; // Fall through
       case 9: a[9].uli = b[9]; // Fall through
       case 10: a[10].uli = b[10]; // Fall through
@@ -231,9 +254,9 @@  do_uwords_remaining (struct ulw *a, const reg_t *b, unsigned long words,
       case 13: a[13].uli = b[13]; // Fall through
       case 14: a[14].uli = b[14]; // Fall through
       case 15: a[15].uli = b[15];
-#endif
+#endif /* BLOCK_SIZE==16 */
     }
-  return do_bytes_remaining (a + BLOCK, b + BLOCK, bytes, ret);
+  return do_bytes_remaining (a + BLOCK_SIZE, b + BLOCK_SIZE, bytes, ret);
 }
 
 /* The first pointer is not aligned while second pointer is.  */
@@ -242,13 +265,19 @@  unaligned_words (struct ulw *a, const reg_t * b,
 		 unsigned long words, unsigned long bytes, void *ret)
 {
   unsigned long i, words_by_block, words_by_1;
-  words_by_1 = words % BLOCK;
-  words_by_block = words / BLOCK;
+  words_by_1 = words % BLOCK_SIZE;
+  words_by_block = words / BLOCK_SIZE;
+
   for (; words_by_block > 0; words_by_block--)
     {
-      if (words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+      /* This condition is deliberately conservative.  One could theoretically
+	 pre-fetch another time around in some cases without crossing the page
+	 boundary at the limit, but checking for the right conditions here is
+	 too expensive to be worth it.  */
+      if (words_by_block > PREF_AHEAD)
 	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
-	  PREFETCH (b + (BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i));
+	  PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+			 * (PREF_AHEAD + i)));
 
       reg_t y0 = b[0], y1 = b[1], y2 = b[2], y3 = b[3];
       reg_t y4 = b[4], y5 = b[5], y6 = b[6], y7 = b[7];
@@ -260,7 +289,7 @@  unaligned_words (struct ulw *a, const reg_t * b,
       a[5].uli = y5;
       a[6].uli = y6;
       a[7].uli = y7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
       y0 = b[8], y1 = b[9], y2 = b[10], y3 = b[11];
       y4 = b[12], y5 = b[13], y6 = b[14], y7 = b[15];
       a[8].uli = y0;
@@ -271,16 +300,16 @@  unaligned_words (struct ulw *a, const reg_t * b,
       a[13].uli = y5;
       a[14].uli = y6;
       a[15].uli = y7;
-#endif
-      a += BLOCK;
-      b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+      a += BLOCK_SIZE;
+      b += BLOCK_SIZE;
   }
 
   /* Mop up any remaining bytes.  */
   return do_uwords_remaining (a, b, words_by_1, bytes, ret);
 }
 
-#else
+#else /* !UNALIGNED_INSTR_SUPPORT */
 
 /* No HW support or unaligned lw/ld/ualw/uald instructions.  */
 static void *
@@ -320,13 +349,15 @@  aligned_words (reg_t * a, const reg_t * b,
 	       unsigned long words, unsigned long bytes, void *ret)
 {
   unsigned long i, words_by_block, words_by_1;
-  words_by_1 = words % BLOCK;
-  words_by_block = words / BLOCK;
+  words_by_1 = words % BLOCK_SIZE;
+  words_by_block = words / BLOCK_SIZE;
+
   for (; words_by_block > 0; words_by_block--)
     {
-      if(words_by_block >= PREF_AHEAD - CACHE_LINES_PER_BLOCK)
+      if (words_by_block > PREF_AHEAD)
 	for (i = 0; i < CACHE_LINES_PER_BLOCK; i++)
-	  PREFETCH (b + ((BLOCK / CACHE_LINES_PER_BLOCK) * (PREF_AHEAD + i)));
+	  PREFETCH (b + ((BLOCK_SIZE / CACHE_LINES_PER_BLOCK)
+			 * (PREF_AHEAD + i)));
 
       reg_t x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
       reg_t x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
@@ -338,7 +369,7 @@  aligned_words (reg_t * a, const reg_t * b,
       a[5] = x5;
       a[6] = x6;
       a[7] = x7;
-#if BLOCK==16
+#if BLOCK_SIZE==16
       x0 = b[8], x1 = b[9], x2 = b[10], x3 = b[11];
       x4 = b[12], x5 = b[13], x6 = b[14], x7 = b[15];
       a[8] = x0;
@@ -349,9 +380,9 @@  aligned_words (reg_t * a, const reg_t * b,
       a[13] = x5;
       a[14] = x6;
       a[15] = x7;
-#endif
-      a += BLOCK;
-      b += BLOCK;
+#endif /* BLOCK_SIZE==16 */
+      a += BLOCK_SIZE;
+      b += BLOCK_SIZE;
     }
 
   /* mop up any remaining bytes.  */
@@ -363,13 +394,16 @@  memcpy (void *a, const void *b, size_t len) __overloadable
 {
   unsigned long bytes, words, i;
   void *ret = a;
+#if ENABLE_PREFETCH_CHECK
+  limit = (char *)b + len;
+#endif /* ENABLE_PREFETCH_CHECK */
   /* shouldn't hit that often.  */
   if (len <= 8)
     return do_bytes (a, b, len, a);
 
   /* Start pre-fetches ahead of time.  */
-  if (len > CACHE_LINE * (PREF_AHEAD - 1))
-    for (i = 1; i < PREF_AHEAD - 1; i++)
+  if (len > CACHE_LINE * PREF_AHEAD)
+    for (i = 1; i < PREF_AHEAD; i++)
       PREFETCH ((char *)b + CACHE_LINE * i);
   else
     for (i = 1; i < len / CACHE_LINE; i++)
@@ -400,12 +434,12 @@  memcpy (void *a, const void *b, size_t len) __overloadable
 #if HW_UNALIGNED_SUPPORT
   /* treat possible unaligned first pointer as aligned.  */
   return aligned_words (a, b, words, bytes, ret);
-#else
+#else /* !HW_UNALIGNED_SUPPORT */
   if (((unsigned long) a) % sizeof (reg_t) == 0)
     return aligned_words (a, b, words, bytes, ret);
   /* need to use unaligned instructions on first pointer.  */
   return unaligned_words (a, b, words, bytes, ret);
-#endif
+#endif /* HW_UNALIGNED_SUPPORT */
 }
 
 libc_hidden_builtin_def (memcpy)
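The debug aid the patch adds under ENABLE_PREFETCH_CHECK can be
exercised in isolation.  Below is a minimal standalone sketch, not part
of the patch, assuming a GCC-compatible compiler; the LINE size, helper
name, and look-ahead of one line are illustrative simplifications of
the real PREF_AHEAD logic.  Every PREFETCH is asserted to stay below
`limit', the end of the source buffer, so a prefetch past the copied
region aborts loudly instead of silently touching the next page.

#include <assert.h>
#include <string.h>

static const char *limit;	/* one past the end of the source region */

#define PREFETCH(addr)						\
  do {								\
    assert ((const char *) (addr) < limit);			\
    __builtin_prefetch ((addr), 0, 1);				\
  } while (0)

static void
copy_with_bounded_prefetch (char *dst, const char *src, size_t len)
{
  enum { LINE = 32 };		/* assumed cache-line size */
  size_t blocks = len / LINE;

  limit = src + len;
  for (; blocks > 0; blocks--)
    {
      /* Only prefetch while a full line remains beyond the one being
	 copied -- the same conservative shape as the fixed
	 `words_by_block > PREF_AHEAD' condition in the patch.  */
      if (blocks > 1)
	PREFETCH (src + LINE);
      memcpy (dst, src, LINE);	/* stand-in for the unrolled core loop */
      dst += LINE;
      src += LINE;
    }
  memcpy (dst, src, len % LINE);	/* tail bytes */
}

int
main (void)
{
  char in[256] = "bounded prefetch demo", out[256];
  copy_with_bounded_prefetch (out, in, sizeof in);
  return memcmp (in, out, sizeof in) != 0;
}

Loosening the `blocks > 1' guard (say, to `blocks > 0') makes the
assertion fire on the final iteration, which is exactly the
beyond-the-buffer prefetch this patch eliminates from the core loop.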