From patchwork Fri Dec 19 23:02:37 2014
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Steve Ellcey <sellcey@imgtec.com>
X-Patchwork-Id: 4382
From: "Steve Ellcey" <sellcey@imgtec.com>
Date: Fri, 19 Dec 2014 15:02:37 -0800
To: libc-alpha@sourceware.org
Subject: [Patch, MIPS] Modify memcpy.S for mips32r6/mips64r6
User-Agent: Heirloom mailx 12.5 6/20/10
Message-ID: <7ec2bf7e-fc1e-428b-ac0a-747f2a3ab3e6@BAMAIL02.ba.imgtec.org>

Changes to memcpy and memset are the final patches I have for mips32r6
and mips64r6 support.  This patch is the memcpy change.

In addition to adding mips32r6/mips64r6 support, I fixed some indentation
inconsistencies in the preprocessor statements.  The main change is in how
memcpy is done when the source and destination cannot both be aligned.
For MIPS architectures prior to mips32r6 and mips64r6 we aligned the
destination and then used the ldl and ldr instructions to load unaligned
words.  These instructions are not present in mips32r6 or mips64r6, so now
we do two aligned word loads followed by an align instruction that combines
parts of the two loads into one register, which can then be stored to the
previously aligned destination.  (A rough C sketch of this scheme is
included after the patch for illustration.)

Tested with the mips32r6/mips64r6 GCC, binutils and qemu simulator.

OK to checkin?

Steve Ellcey
sellcey@imgtec.com

2014-12-19  Steve Ellcey  <sellcey@imgtec.com>

	* sysdeps/mips/memcpy.S: Fix preprocessor indentation.
	(memcpy): Modify for mips32r6/mips64r6 to use align instead
	of lwl/lwr for unaligned memcpy loops.

diff --git a/sysdeps/mips/memcpy.S b/sysdeps/mips/memcpy.S
index 2420f93..2b7e5e8 100644
--- a/sysdeps/mips/memcpy.S
+++ b/sysdeps/mips/memcpy.S
@@ -16,69 +16,86 @@
    <http://www.gnu.org/licenses/>.  */
 
 #ifdef ANDROID_CHANGES
-#include "machine/asm.h"
-#include "machine/regdef.h"
-#define USE_MEMMOVE_FOR_OVERLAP
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define USE_MEMMOVE_FOR_OVERLAP
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #elif _LIBC
-#include <sysdep.h>
-#include <regdef.h>
-#include <sys/asm.h>
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #elif _COMPILING_NEWLIB
-#include "machine/asm.h"
-#include "machine/regdef.h"
-#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
-#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+# include "machine/asm.h"
+# include "machine/regdef.h"
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
 #else
-#include <regdef.h>
-#include <sys/asm.h>
+# include <regdef.h>
+# include <sys/asm.h>
 #endif
 
-#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
-    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
-#ifndef DISABLE_PREFETCH
-#define USE_PREFETCH
-#endif
-#endif
+/* Check to see if the MIPS architecture we are compiling for supports
+ * prefetching.
+ */
 
-#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
-#ifndef DISABLE_DOUBLE
-#define USE_DOUBLE
-#endif
+#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
+# ifndef DISABLE_PREFETCH
+#  define USE_PREFETCH
+# endif
 #endif
 
+#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32)
+# ifndef DISABLE_DOUBLE
+#  define USE_DOUBLE
+# endif
+#endif
+
+#if __mips_isa_rev > 5
+# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+#  undef PREFETCH_STORE_HINT
+#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+# endif
+# define R6_CODE
+#endif
 
 /* Some asm.h files do not have the L macro definition.  */
 #ifndef L
-#if _MIPS_SIM == _ABIO32
-# define L(label) $L ## label
-#else
-# define L(label) .L ## label
-#endif
+# if _MIPS_SIM == _ABIO32
+#  define L(label) $L ## label
+# else
+#  define L(label) .L ## label
+# endif
 #endif
 
 /* Some asm.h files do not have the PTR_ADDIU macro definition.  */
 #ifndef PTR_ADDIU
-#ifdef USE_DOUBLE
-#define PTR_ADDIU daddiu
-#else
-#define PTR_ADDIU addiu
-#endif
+# if _MIPS_SIM == _ABI64
+#  define PTR_ADDIU daddiu
+# else
+#  define PTR_ADDIU addiu
+# endif
 #endif
 
 /* Some asm.h files do not have the PTR_SRA macro definition.  */
 #ifndef PTR_SRA
-#ifdef USE_DOUBLE
-#define PTR_SRA dsra
-#else
-#define PTR_SRA sra
-#endif
+# if _MIPS_SIM == _ABI64
+#  define PTR_SRA dsra
+# else
+#  define PTR_SRA sra
+# endif
 #endif
 
+/* New R6 instructions that may not be in asm.h.  */
+#ifndef PTR_LSA
+# if _MIPS_SIM == _ABI64
+#  define PTR_LSA dlsa
+# else
+#  define PTR_LSA lsa
+# endif
+#endif
 
 /*
  * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
@@ -183,7 +200,7 @@
 /* Allow the routine to be named something else if desired.  */
 #ifndef MEMCPY_NAME
-#define MEMCPY_NAME memcpy
+# define MEMCPY_NAME memcpy
 #endif
 
 /* We use these 32/64 bit registers as temporaries to do the copying.
  */
@@ -191,16 +208,16 @@
 #define REG1 t1
 #define REG2 t2
 #define REG3 t3
-#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
-# define REG4 t4
-# define REG5 t5
-# define REG6 t6
-# define REG7 t7
+#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
 #else
-# define REG4 ta0
-# define REG5 ta1
-# define REG6 ta2
-# define REG7 ta3
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
 #endif
 
 /* We load/store 64 bits at a time when USE_DOUBLE is true.
@@ -208,44 +225,46 @@
  * conflicts with system header files.  */
 
 #ifdef USE_DOUBLE
-# define C_ST	sd
-# define C_LD	ld
-#if __MIPSEB
+# define C_ST	sd
+# define C_LD	ld
+# if __MIPSEB
 #  define C_LDHI	ldl	/* high part is left in big-endian	*/
 #  define C_STHI	sdl	/* high part is left in big-endian	*/
 #  define C_LDLO	ldr	/* low part is right in big-endian	*/
 #  define C_STLO	sdr	/* low part is right in big-endian	*/
-#else
+# else
 #  define C_LDHI	ldr	/* high part is right in little-endian	*/
 #  define C_STHI	sdr	/* high part is right in little-endian	*/
 #  define C_LDLO	ldl	/* low part is left in little-endian	*/
 #  define C_STLO	sdl	/* low part is left in little-endian	*/
-#endif
+# endif
+# define C_ALIGN	dalign	/* r6 align instruction */
 #else
-# define C_ST	sw
-# define C_LD	lw
-#if __MIPSEB
+# define C_ST	sw
+# define C_LD	lw
+# if __MIPSEB
 #  define C_LDHI	lwl	/* high part is left in big-endian	*/
 #  define C_STHI	swl	/* high part is left in big-endian	*/
 #  define C_LDLO	lwr	/* low part is right in big-endian	*/
 #  define C_STLO	swr	/* low part is right in big-endian	*/
-#else
+# else
 #  define C_LDHI	lwr	/* high part is right in little-endian	*/
 #  define C_STHI	swr	/* high part is right in little-endian	*/
 #  define C_LDLO	lwl	/* low part is left in little-endian	*/
 #  define C_STLO	swl	/* low part is left in little-endian	*/
-#endif
+# endif
+# define C_ALIGN	align	/* r6 align instruction */
 #endif
 
 /* Bookkeeping values for 32 vs. 64 bit mode.  */
 #ifdef USE_DOUBLE
-# define NSIZE 8
-# define NSIZEMASK 0x3f
-# define NSIZEDMASK 0x7f
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
 #else
-# define NSIZE 4
-# define NSIZEMASK 0x1f
-# define NSIZEDMASK 0x3f
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
 #endif
 #define UNIT(unit) ((unit)*NSIZE)
 #define UNITM1(unit) (((unit)*NSIZE)-1)
@@ -274,6 +293,7 @@ LEAF(MEMCPY_NAME)
 	nop
 L(memcpy):
 #endif
+
 /*
  * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
  * size, copy dst pointer to v0 for the return value.
@@ -285,6 +305,9 @@ L(memcpy):
 #else
 	move	v0,a0
 #endif
+
+#if !defined (R6_CODE)
+
 /*
  * If src and dst have different alignments, go to L(unaligned), if they
  * have the same alignment (but are not actually aligned) do a partial
@@ -305,6 +328,74 @@ L(memcpy):
 	C_STHI	t8,0(a0)
 	PTR_ADDU a0,a0,a3
 
+#else /* R6_CODE */
+
+/*
+ * Align the destination and hope that the source gets aligned too.  If it
+ * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
+ * align instruction.
+ */
+	andi	t8,a0,7
+	lapc	t9,L(atable)
+	PTR_LSA	t9,t8,t9,2
+	jrc	t9
+L(atable):
+	bc	L(lb0)
+	bc	L(lb7)
+	bc	L(lb6)
+	bc	L(lb5)
+	bc	L(lb4)
+	bc	L(lb3)
+	bc	L(lb2)
+	bc	L(lb1)
+L(lb7):
+	lb	a3, 6(a1)
+	sb	a3, 6(a0)
+L(lb6):
+	lb	a3, 5(a1)
+	sb	a3, 5(a0)
+L(lb5):
+	lb	a3, 4(a1)
+	sb	a3, 4(a0)
+L(lb4):
+	lb	a3, 3(a1)
+	sb	a3, 3(a0)
+L(lb3):
+	lb	a3, 2(a1)
+	sb	a3, 2(a0)
+L(lb2):
+	lb	a3, 1(a1)
+	sb	a3, 1(a0)
+L(lb1):
+	lb	a3, 0(a1)
+	sb	a3, 0(a0)
+
+	li	t9,8
+	subu	t8,t9,t8
+	PTR_SUBU a2,a2,t8
+	PTR_ADDU a0,a0,t8
+	PTR_ADDU a1,a1,t8
+L(lb0):
+
+	andi	t8,a1,(NSIZE-1)
+	lapc	t9,L(jtable)
+	PTR_LSA	t9,t8,t9,2
+	jrc	t9
+L(jtable):
+	bc	L(aligned)
+	bc	L(r6_unaligned1)
+	bc	L(r6_unaligned2)
+	bc	L(r6_unaligned3)
+# ifdef USE_DOUBLE
+	bc	L(r6_unaligned4)
+	bc	L(r6_unaligned5)
+	bc	L(r6_unaligned6)
+	bc	L(r6_unaligned7)
+# endif
+#endif /* R6_CODE */
+
+L(aligned):
+
 /*
  * Now dst/src are both aligned to (word or double word) aligned addresses
  * Set a2 to count how many bytes we have to copy after all the 64/128 byte
@@ -313,7 +404,6 @@ L(memcpy):
  * equals a3.
  */
 
-L(aligned):
 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
 	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
@@ -363,13 +453,17 @@ L(loop16w):
 	bgtz	v1,L(skip_pref)
 #endif
 	C_LD	t1,UNIT(1)(a1)
+#if defined(R6_CODE)
+	PREFETCH_FOR_STORE (2, a0)
+#else
 	PREFETCH_FOR_STORE (4, a0)
 	PREFETCH_FOR_STORE (5, a0)
+#endif
 #if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
-#ifdef USE_DOUBLE
+# ifdef USE_DOUBLE
 	PTR_ADDIU v0,v0,32
-#endif
+# endif
 #endif
 L(skip_pref):
 	C_LD	REG2,UNIT(2)(a1)
@@ -378,7 +472,11 @@ L(skip_pref):
 	C_LD	REG3,UNIT(3)(a1)
 	C_LD	REG4,UNIT(4)(a1)
 	C_LD	REG5,UNIT(5)(a1)
 	C_LD	REG6,UNIT(6)(a1)
 	C_LD	REG7,UNIT(7)(a1)
-	PREFETCH_FOR_LOAD (4, a1)
+#if defined(R6_CODE)
+	PREFETCH_FOR_LOAD (3, a1)
+#else
+	PREFETCH_FOR_LOAD (4, a1)
+#endif
 	C_ST	t0,UNIT(0)(a0)
 	C_ST	t1,UNIT(1)(a0)
@@ -397,7 +495,9 @@ L(skip_pref):
 	C_LD	REG5,UNIT(13)(a1)
 	C_LD	REG6,UNIT(14)(a1)
 	C_LD	REG7,UNIT(15)(a1)
+#if !defined(R6_CODE)
 	PREFETCH_FOR_LOAD (5, a1)
+#endif
 	C_ST	t0,UNIT(8)(a0)
 	C_ST	t1,UNIT(9)(a0)
 	C_ST	REG2,UNIT(10)(a0)
@@ -476,6 +576,8 @@ L(lastbloop):
 L(leave):
 	j	ra
 	nop
+
+#if !defined (R6_CODE)
 /*
  * UNALIGNED case, got here with a3 = "negu a0"
  * This code is nearly identical to the aligned code above
@@ -667,6 +769,59 @@ L(ua_smallCopy_loop):
 	j	ra
 	nop
 
+#else /* R6_CODE */
+
+# if __MIPSEB
+#  define SWAP_REGS(X,Y) X, Y
+#  define ALIGN_OFFSET(N) (N)
+# else
+#  define SWAP_REGS(X,Y) Y, X
+#  define ALIGN_OFFSET(N) (NSIZE-N)
+# endif
+# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
+	andi	REG7, a2, (NSIZE-1);	/* REG7 is # of bytes to copy byte by byte. */ \
+	beq	REG7, a2, L(lastb);	/* Check for bytes to copy by word.	*/ \
+	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in */ \
+				/* (d)word chunks.			*/ \
+	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte */ \
+				/* after word loop is finished.		*/ \
+	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.	*/ \
+	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.	*/ \
+	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop. */ \
+	C_LD	t0, UNIT(0)(REG2);	/* Load first part of source.	*/ \
+L(r6_ua_wordcopy##BYTEOFFSET): \
+	C_LD	t1, UNIT(1)(REG2);	/* Load second part of source.	*/ \
+	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \
+	PTR_ADDIU a0, a0, UNIT(1);	/* Increment destination pointer.	*/ \
+	PTR_ADDIU REG2, REG2, UNIT(1);	/* Increment aligned source pointer. */ \
+	move	t0, t1;		/* Move second part of source to first.	*/ \
+	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \
+	C_ST	REG3, UNIT(-1)(a0); \
+	j	L(lastb); \
+	nop
+
+	/* We are generating R6 code, the destination is 4 byte aligned and
+	   the source is not 4 byte aligned.  t8 is 1, 2, or 3 depending on the
+	   alignment of the source.  */
+
+L(r6_unaligned1):
+	R6_UNALIGNED_WORD_COPY(1)
+L(r6_unaligned2):
+	R6_UNALIGNED_WORD_COPY(2)
+L(r6_unaligned3):
+	R6_UNALIGNED_WORD_COPY(3)
+# ifdef USE_DOUBLE
+L(r6_unaligned4):
+	R6_UNALIGNED_WORD_COPY(4)
+L(r6_unaligned5):
+	R6_UNALIGNED_WORD_COPY(5)
+L(r6_unaligned6):
+	R6_UNALIGNED_WORD_COPY(6)
+L(r6_unaligned7):
+	R6_UNALIGNED_WORD_COPY(7)
+# endif
+#endif /* R6_CODE */
+
 	.set	at
 	.set	reorder
 END(MEMCPY_NAME)
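
For illustration, here is a rough C sketch of the copy scheme described at
the top of this mail, for the case where the destination has been aligned
but the source has not.  This is not part of the patch; the function and
variable names are made up, and it only shows the little-endian 32-bit
case.  The two aligned loads plus the combine step correspond to the
C_LD/C_LD/C_ALIGN sequence in R6_UNALIGNED_WORD_COPY, which replaces the
old lwl/lwr (ldl/ldr) pair.

#include <stddef.h>
#include <stdint.h>

/* Combine two aligned words into the unaligned word that starts "off"
   bytes into the first one.  This is the effect the patch gets from
   C_ALIGN (align/dalign) with SWAP_REGS/ALIGN_OFFSET on a little-endian
   target.  off must be 1, 2 or 3 here.  */
static uint32_t
combine_words (uint32_t first, uint32_t second, unsigned off)
{
  return (first >> (8 * off)) | (second << (8 * (4 - off)));
}

/* Copy n bytes to a word-aligned dst from a source that is NOT word
   aligned (so off below is 1, 2 or 3, as on the L(r6_unaligned*) paths),
   using only aligned word loads the way R6_UNALIGNED_WORD_COPY does.
   Like the assembly, it reads the whole aligned words that contain the
   first and last source bytes.  */
static void
copy_from_unaligned_src (unsigned char *dst, const unsigned char *src,
			 size_t n)
{
  unsigned off = (uintptr_t) src & 3;		/* plays the role of t8 */
  const uint32_t *asrc = (const uint32_t *) (src - off);  /* aligned src */
  uint32_t *adst = (uint32_t *) dst;
  uint32_t t0 = *asrc++;			/* first aligned load */
  size_t copied = 0;

  while (n - copied >= 4)
    {
      uint32_t t1 = *asrc++;			/* second aligned load */
      *adst++ = combine_words (t0, t1, off);	/* C_ALIGN equivalent */
      t0 = t1;			/* reuse the high word next iteration */
      copied += 4;
    }
  /* Remaining bytes go one at a time, like the L(lastb) tail loop.  */
  for (; copied < n; copied++)
    dst[copied] = src[copied];
}

Note how the second load of one iteration is reused as the first load of
the next (the "move t0, t1" in the macro), so the steady-state loop still
does one load and one store per word even though the source is unaligned.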
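
The new destination-alignment prologue can be read the same way.  The
lapc/PTR_LSA/jrc sequence indexes a table of compact branches (L(atable))
with dst & 7 and then falls through the L(lb7)..L(lb1) byte copies.
Again purely as a hypothetical sketch (names are made up, not part of the
patch), a C switch with fall-through expresses the same control flow:

#include <stddef.h>
#include <stdint.h>

/* Copy the 0-7 bytes needed to bring dst up to an 8-byte boundary and
   return how many bytes were consumed.  The switch plays the part of the
   jrc dispatch through L(atable); the fall-through cases are the
   L(lb7)..L(lb1) byte copies.  */
static size_t
align_dst_head (unsigned char *dst, const unsigned char *src)
{
  unsigned rem = (uintptr_t) dst & 7;	/* t8 = a0 & 7 */

  switch (rem)
    {
    case 0:			/* L(lb0): already aligned, copy nothing */
      return 0;
    case 1: dst[6] = src[6];	/* L(lb7): 7 bytes needed, fall through */
    case 2: dst[5] = src[5];	/* L(lb6) */
    case 3: dst[4] = src[4];	/* L(lb5) */
    case 4: dst[3] = src[3];	/* L(lb4) */
    case 5: dst[2] = src[2];	/* L(lb3) */
    case 6: dst[1] = src[1];	/* L(lb2) */
    case 7: dst[0] = src[0];	/* L(lb1) */
    }
  return 8 - rem;		/* t9 - t8: bytes copied */
}

The caller then advances dst, src and the remaining count by the returned
amount, which is what the li t9,8 / subu t8,t9,t8 / PTR_ADDU block does
before falling into L(lb0).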