[MIPS] Modify memcpy.S for mips32r6/mips64r6

Message ID 7ec2bf7e-fc1e-428b-ac0a-747f2a3ab3e6@BAMAIL02.ba.imgtec.org
State Rejected

Commit Message

Steve Ellcey Dec. 19, 2014, 11:02 p.m. UTC
Changes to memcpy and memset are the final patches I have for mips32r6
and mips64r6 support.  This patch is the memcpy change.  In addition
to adding mips32r6/mips64r6 support, I fixed some indentation
inconsistencies in the preprocessor statements.

The main change is how memcpy handles the case where the source and
destination cannot both be aligned.  For MIPS architectures prior to
mips32r6 and mips64r6 we aligned the destination and then used the
lwl/lwr (or ldl/ldr) instructions to load unaligned words.  These
instructions do not exist in mips32r6 or mips64r6, so we now do two
aligned word loads followed by an align instruction to combine the
parts of the two loads into one register that can be stored into the
previously aligned destination.
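
Roughly, for the 32-bit little-endian case, the combining step is
equivalent to the C sketch below (illustrative only, not part of the
patch; "off" is the source misalignment, 1 to 3, and the source
pointer has already been rounded down to a word boundary):

#include <stdint.h>

/* Two aligned loads plus a shift/or (what the align instruction does
   in a single step) produce one unaligned source word.  off is 1..3;
   an offset of 0 takes the normal aligned path instead.  */
static uint32_t
merge_unaligned_word (const uint32_t *aligned_src, unsigned int off)
{
  uint32_t lo = aligned_src[0];   /* word holding the first source bytes */
  uint32_t hi = aligned_src[1];   /* next aligned word */
  return (lo >> (8 * off)) | (hi << (8 * (4 - off)));
}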

Tested with the mips32r6/mips64r6 GCC, binutils and qemu simulator.

OK to checkin?

Steve Ellcey
sellcey@imgtec.com


2014-12-19  Steve Ellcey  <sellcey@imgtec.com>

	* sysdeps/mips/memcpy.S: Fix preprocessor indentation.
	(memcpy): Modify for mips32r6/mips64r6 to use align
	instead of lwl/lwr for unaligned memcpy loops.
  

Comments

Ondrej Bilka Dec. 20, 2014, 9:19 a.m. UTC | #1
On Fri, Dec 19, 2014 at 03:02:37PM -0800, Steve Ellcey  wrote:
> Changes to memcpy and memset are the final patches I have for mips32r6
> and mips64r6 support.  This patch is the memcpy change.  In addition
> to adding mips32r6/mips64r6 support I fixed some indentation inconsistencies
> with preprocessor statements.
> 
> The main change is how to do memcpy when the source and destination cannot
> both be aligned.  For MIPS architectures prior to mips32r6 and mips64r6
> we aligned the destination and then used the ldl and ldr instructions to
> load unaligned words.  These instructions are not in mips32r6 or mips64r6
> so now we do two aligned word loads followed by an align instruction to
> combine the parts of the two loads into one register that can be stored
> into the previously aligned destination.
> 
> Tested with the mips32r6/mips64r6 GCC, binutils and qemu simulator.
> 
> OK to checkin?
> 
> Steve Ellcey
> sellcey@imgtec.com
> 

>  
> +#else /* R6_CODE */
> +
> +/* 
> + * Align the destination and hope that the source gets aligned too.  If it
> + * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
> + * align instruction.
> + */
> +	andi	t8,a0,7
> +	lapc	t9,L(atable)
> +	PTR_LSA	t9,t8,t9,2
> +	jrc	t9
> +L(atable):
> +	bc	L(lb0)
> +	bc	L(lb7)
> +	bc	L(lb6)

Same comment as before: a jump table tends to be slow.  Also, is that
faster than storing these bytes unconditionally [1], or would that cause
trouble on a processor that does not expect overlapping stores?

[1]

x[0] = y[0];
x[1] = y[1];
x[2] = y[2];
x[3] = y[3];
x[4] = y[4];
x[5] = y[5];
x[6] = y[6];
x[7] = y[7];
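
To make [1] concrete, a minimal self-contained sketch of the
branch-free alternative (hypothetical helper name; it assumes this
point is only reached when the total length is at least 2*NSIZE,
which the size check at the top of memcpy appears to guarantee):

/* Always copy the first 8 bytes instead of branching on the
   destination misalignment.  The later aligned word stores would then
   overlap the tail of these 8 bytes and rewrite them with the same
   values, which is the overlapping-store behaviour asked about.  */
static void
copy_first_8 (unsigned char *dst, const unsigned char *src)
{
  int i;
  for (i = 0; i < 8; i++)
    dst[i] = src[i];
}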
  
Joseph Myers Dec. 22, 2014, 5:59 p.m. UTC | #2
On Fri, 19 Dec 2014, Steve Ellcey  wrote:

> 	* sysdeps/mips/memcpy.S: Fix preprocessor indentation.

Please separate the formatting fixes from the substantive changes.  The 
formatting fixes - a patch that shows no changes from "git diff -w" - can 
go in as obvious.  The r6 changes should then be resubmitted.
  

Patch

diff --git a/sysdeps/mips/memcpy.S b/sysdeps/mips/memcpy.S
index 2420f93..2b7e5e8 100644
--- a/sysdeps/mips/memcpy.S
+++ b/sysdeps/mips/memcpy.S
@@ -16,69 +16,86 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #ifdef ANDROID_CHANGES
-#include "machine/asm.h"
-#include "machine/regdef.h"
-#define USE_MEMMOVE_FOR_OVERLAP
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define USE_MEMMOVE_FOR_OVERLAP
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #elif _LIBC
-#include <sysdep.h>
-#include <regdef.h>
-#include <sys/asm.h>
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #elif _COMPILING_NEWLIB
-#include "machine/asm.h"
-#include "machine/regdef.h"
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #else
-#include <regdef.h>
-#include <sys/asm.h>
+# include <regdef.h>
+# include <sys/asm.h>
 #endif
 
-#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
-    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
-#ifndef DISABLE_PREFETCH
-#define USE_PREFETCH
-#endif
-#endif
+/* Check to see if the MIPS architecture we are compiling for supports
+ * prefetching.
+ */
 
-#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
-#ifndef DISABLE_DOUBLE
-#define USE_DOUBLE
-#endif
+#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
+# ifndef DISABLE_PREFETCH
+#  define USE_PREFETCH
+# endif
 #endif
 
+#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32)
+# ifndef DISABLE_DOUBLE
+#  define USE_DOUBLE
+# endif
+#endif
 
+#if __mips_isa_rev > 5
+# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+#  undef PREFETCH_STORE_HINT
+#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+# endif
+# define R6_CODE
+#endif
 
 /* Some asm.h files do not have the L macro definition.  */
 #ifndef L
-#if _MIPS_SIM == _ABIO32
-# define L(label) $L ## label
-#else
-# define L(label) .L ## label
-#endif
+# if _MIPS_SIM == _ABIO32
+#  define L(label) $L ## label
+# else
+#  define L(label) .L ## label
+# endif
 #endif
 
 /* Some asm.h files do not have the PTR_ADDIU macro definition.  */
 #ifndef PTR_ADDIU
-#ifdef USE_DOUBLE
-#define PTR_ADDIU	daddiu
-#else
-#define PTR_ADDIU	addiu
-#endif
+# if _MIPS_SIM == _ABI64
+#  define PTR_ADDIU	daddiu
+# else
+#  define PTR_ADDIU	addiu
+# endif
 #endif
 
 /* Some asm.h files do not have the PTR_SRA macro definition.  */
 #ifndef PTR_SRA
-#ifdef USE_DOUBLE
-#define PTR_SRA		dsra
-#else
-#define PTR_SRA		sra
-#endif
+# if _MIPS_SIM == _ABI64
+#  define PTR_SRA	dsra
+# else
+#  define PTR_SRA	sra
+# endif
 #endif
 
+/* New R6 instructions that may not be in asm.h.  */
+#ifndef PTR_LSA
+# if _MIPS_SIM == _ABI64
+#  define PTR_LSA	dlsa
+# else
+#  define PTR_LSA	lsa
+# endif
+#endif
 
 /*
  * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
@@ -183,7 +200,7 @@ 
 
 /* Allow the routine to be named something else if desired.  */
 #ifndef MEMCPY_NAME
-#define MEMCPY_NAME memcpy
+# define MEMCPY_NAME memcpy
 #endif
 
 /* We use these 32/64 bit registers as temporaries to do the copying.  */
@@ -191,16 +208,16 @@ 
 #define REG1 t1
 #define REG2 t2
 #define REG3 t3
-#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
-#  define REG4 t4
-#  define REG5 t5
-#  define REG6 t6
-#  define REG7 t7
+#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
 #else
-#  define REG4 ta0
-#  define REG5 ta1
-#  define REG6 ta2
-#  define REG7 ta3
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
 #endif
 
 /* We load/store 64 bits at a time when USE_DOUBLE is true.
@@ -208,44 +225,46 @@ 
  * conflicts with system header files.  */
 
 #ifdef USE_DOUBLE
-#  define C_ST	sd
-#  define C_LD	ld
-#if __MIPSEB
+# define C_ST	sd
+# define C_LD	ld
+# if __MIPSEB
 #  define C_LDHI	ldl	/* high part is left in big-endian	*/
 #  define C_STHI	sdl	/* high part is left in big-endian	*/
 #  define C_LDLO	ldr	/* low part is right in big-endian	*/
 #  define C_STLO	sdr	/* low part is right in big-endian	*/
-#else
+# else
 #  define C_LDHI	ldr	/* high part is right in little-endian	*/
 #  define C_STHI	sdr	/* high part is right in little-endian	*/
 #  define C_LDLO	ldl	/* low part is left in little-endian	*/
 #  define C_STLO	sdl	/* low part is left in little-endian	*/
-#endif
+# endif
+# define C_ALIGN	dalign	/* r6 align instruction			*/
 #else
-#  define C_ST	sw
-#  define C_LD	lw
-#if __MIPSEB
+# define C_ST	sw
+# define C_LD	lw
+# if __MIPSEB
 #  define C_LDHI	lwl	/* high part is left in big-endian	*/
 #  define C_STHI	swl	/* high part is left in big-endian	*/
 #  define C_LDLO	lwr	/* low part is right in big-endian	*/
 #  define C_STLO	swr	/* low part is right in big-endian	*/
-#else
+# else
 #  define C_LDHI	lwr	/* high part is right in little-endian	*/
 #  define C_STHI	swr	/* high part is right in little-endian	*/
 #  define C_LDLO	lwl	/* low part is left in little-endian	*/
 #  define C_STLO	swl	/* low part is left in little-endian	*/
-#endif
+# endif
+# define C_ALIGN	align	/* r6 align instruction			*/
 #endif
 
 /* Bookkeeping values for 32 vs. 64 bit mode.  */
 #ifdef USE_DOUBLE
-#  define NSIZE 8
-#  define NSIZEMASK 0x3f
-#  define NSIZEDMASK 0x7f
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
 #else
-#  define NSIZE 4
-#  define NSIZEMASK 0x1f
-#  define NSIZEDMASK 0x3f
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
 #endif
 #define UNIT(unit) ((unit)*NSIZE)
 #define UNITM1(unit) (((unit)*NSIZE)-1)
@@ -274,6 +293,7 @@  LEAF(MEMCPY_NAME)
 	 nop
 L(memcpy):
 #endif
+
 /*
  * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
  * size, copy dst pointer to v0 for the return value.
@@ -285,6 +305,9 @@  L(memcpy):
 #else
 	move	v0,a0
 #endif
+
+#if !defined (R6_CODE)
+
 /*
  * If src and dst have different alignments, go to L(unaligned), if they
  * have the same alignment (but are not actually aligned) do a partial
@@ -305,6 +328,74 @@  L(memcpy):
 	C_STHI	t8,0(a0)
 	PTR_ADDU a0,a0,a3
 
+#else /* R6_CODE */
+
+/* 
+ * Align the destination and hope that the source gets aligned too.  If it
+ * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
+ * align instruction.
+ */
+	andi	t8,a0,7
+	lapc	t9,L(atable)
+	PTR_LSA	t9,t8,t9,2
+	jrc	t9
+L(atable):
+	bc	L(lb0)
+	bc	L(lb7)
+	bc	L(lb6)
+	bc	L(lb5)
+	bc	L(lb4)
+	bc	L(lb3)
+	bc	L(lb2)
+	bc	L(lb1)
+L(lb7):
+	lb	a3, 6(a1)
+	sb	a3, 6(a0)
+L(lb6):
+	lb	a3, 5(a1)
+	sb	a3, 5(a0)
+L(lb5):
+	lb	a3, 4(a1)
+	sb	a3, 4(a0)
+L(lb4):
+	lb	a3, 3(a1)
+	sb	a3, 3(a0)
+L(lb3):
+	lb	a3, 2(a1)
+	sb	a3, 2(a0)
+L(lb2):
+	lb	a3, 1(a1)
+	sb	a3, 1(a0)
+L(lb1):
+	lb	a3, 0(a1)
+	sb	a3, 0(a0)
+
+	li	t9,8
+	subu	t8,t9,t8
+	PTR_SUBU a2,a2,t8
+	PTR_ADDU a0,a0,t8
+	PTR_ADDU a1,a1,t8
+L(lb0):
+
+	andi	t8,a1,(NSIZE-1)
+	lapc	t9,L(jtable)
+	PTR_LSA	t9,t8,t9,2
+	jrc	t9
+L(jtable):
+        bc      L(aligned)
+        bc      L(r6_unaligned1)
+        bc      L(r6_unaligned2)
+        bc      L(r6_unaligned3)
+# ifdef USE_DOUBLE
+        bc      L(r6_unaligned4)
+        bc      L(r6_unaligned5)
+        bc      L(r6_unaligned6)
+        bc      L(r6_unaligned7)
+# endif
+#endif /* R6_CODE */
+
+L(aligned):
+
 /*
  * Now dst/src are both aligned to (word or double word) aligned addresses
  * Set a2 to count how many bytes we have to copy after all the 64/128 byte
@@ -313,7 +404,6 @@  L(memcpy):
  * equals a3.
  */
 
-L(aligned):
 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
 	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
@@ -363,13 +453,17 @@  L(loop16w):
 	bgtz	v1,L(skip_pref)
 #endif
 	C_LD	t1,UNIT(1)(a1)
+#if defined(R6_CODE)
+	PREFETCH_FOR_STORE (2, a0)
+#else
 	PREFETCH_FOR_STORE (4, a0)
 	PREFETCH_FOR_STORE (5, a0)
+#endif
 #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
-#ifdef USE_DOUBLE
+# ifdef USE_DOUBLE
 	PTR_ADDIU v0,v0,32
-#endif
+# endif
 #endif
 L(skip_pref):
 	C_LD	REG2,UNIT(2)(a1)
@@ -378,7 +472,11 @@  L(skip_pref):
 	C_LD	REG5,UNIT(5)(a1)
 	C_LD	REG6,UNIT(6)(a1)
 	C_LD	REG7,UNIT(7)(a1)
-        PREFETCH_FOR_LOAD (4, a1)
+#if defined(R6_CODE)
+	PREFETCH_FOR_LOAD (3, a1)
+#else
+	PREFETCH_FOR_LOAD (4, a1)
+#endif
 
 	C_ST	t0,UNIT(0)(a0)
 	C_ST	t1,UNIT(1)(a0)
@@ -397,7 +495,9 @@  L(skip_pref):
 	C_LD	REG5,UNIT(13)(a1)
 	C_LD	REG6,UNIT(14)(a1)
 	C_LD	REG7,UNIT(15)(a1)
+#if !defined(R6_CODE)
         PREFETCH_FOR_LOAD (5, a1)
+#endif
 	C_ST	t0,UNIT(8)(a0)
 	C_ST	t1,UNIT(9)(a0)
 	C_ST	REG2,UNIT(10)(a0)
@@ -476,6 +576,8 @@  L(lastbloop):
 L(leave):
 	j	ra
 	nop
+
+#if !defined (R6_CODE)
 /*
  * UNALIGNED case, got here with a3 = "negu a0"
  * This code is nearly identical to the aligned code above
@@ -667,6 +769,59 @@  L(ua_smallCopy_loop):
 	j	ra
 	nop
 
+#else /* R6_CODE */
+
+# if __MIPSEB
+#  define SWAP_REGS(X,Y) X, Y
+#  define ALIGN_OFFSET(N) (N)
+# else
+#  define SWAP_REGS(X,Y) Y, X
+#  define ALIGN_OFFSET(N) (NSIZE-N)
+# endif
+# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
+	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
+	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word	   */ \
+	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in   */ \
+				/* (d)word chunks.			   */ \
+	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte   */ \
+				/* after word loop is finished.		   */ \
+	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.	   */ \
+	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.	   */ \
+	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop.   */ \
+	C_LD	t0, UNIT(0)(REG2);  /* Load first part of source.	   */ \
+L(r6_ua_wordcopy##BYTEOFFSET):						      \
+	C_LD	t1, UNIT(1)(REG2);  /* Load second part of source.	   */ \
+	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);	      \
+	PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.	   */ \
+	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
+	move	t0, t1;		/* Move second part of source to first.	   */ \
+	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);			      \
+	C_ST	REG3, UNIT(-1)(a0);					      \
+	j	L(lastb);						      \
+	nop
+
+	/* We are generating R6 code, the destination is 4 byte aligned and
+	   the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the
+           alignment of the source.  */
+
+L(r6_unaligned1):
+	R6_UNALIGNED_WORD_COPY(1)
+L(r6_unaligned2):
+	R6_UNALIGNED_WORD_COPY(2)
+L(r6_unaligned3):
+	R6_UNALIGNED_WORD_COPY(3)
+# ifdef USE_DOUBLE
+L(r6_unaligned4):
+	R6_UNALIGNED_WORD_COPY(4)
+L(r6_unaligned5):
+	R6_UNALIGNED_WORD_COPY(5)
+L(r6_unaligned6):
+	R6_UNALIGNED_WORD_COPY(6)
+L(r6_unaligned7):
+	R6_UNALIGNED_WORD_COPY(7)
+# endif
+#endif /* R6_CODE */
+
 	.set	at
 	.set	reorder
 END(MEMCPY_NAME)