[V2,3/3] sparc: M7 optimized memcpy/mempcpy/memmove/memset/bzero.

Message ID 1455553975-17516-4-git-send-email-jose.marchesi@oracle.com
State New, archived
Headers

Commit Message

Jose E. Marchesi Feb. 15, 2016, 4:32 p.m. UTC
  Tested on sparcv9-*-* and sparc64-*-* targets in both multiarch and
non-multiarch configurations.
---
 ChangeLog                                          |   20 +
 sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile   |    3 +-
 .../sparcv9/multiarch/memcpy-memmove-niagara7.S    |    2 +
 sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S  |    2 +
 .../sparc32/sparcv9/multiarch/memset-niagara7.S    |    2 +
 .../sparc/sparc32/sparcv9/multiarch/rtld-memmove.c |    1 +
 sysdeps/sparc/sparc64/multiarch/Makefile           |    3 +-
 sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c  |   13 +
 .../sparc64/multiarch/memcpy-memmove-niagara7.S    | 1084 ++++++++++++++++++++
 sysdeps/sparc/sparc64/multiarch/memcpy.S           |   28 +-
 sysdeps/sparc/sparc64/multiarch/memmove.S          |   72 ++
 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S  |  339 ++++++
 sysdeps/sparc/sparc64/multiarch/memset.S           |   28 +-
 sysdeps/sparc/sparc64/multiarch/rtld-memmove.c     |    1 +
 14 files changed, 1592 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memcpy-memmove-niagara7.S
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
 create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/rtld-memmove.c
 create mode 100644 sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S
 create mode 100644 sysdeps/sparc/sparc64/multiarch/memmove.S
 create mode 100644 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
 create mode 100644 sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
  

Comments

Adhemerval Zanella Feb. 15, 2016, 5:54 p.m. UTC | #1
On 15-02-2016 14:32, Jose E. Marchesi wrote:
> diff --git a/sysdeps/sparc/sparc64/multiarch/memmove.S b/sysdeps/sparc/sparc64/multiarch/memmove.S
> new file mode 100644
> index 0000000..ca4eca1
> --- /dev/null
> +++ b/sysdeps/sparc/sparc64/multiarch/memmove.S

For new implementations I think we should avoid writing the ifunc selector
in assembly and use the libc_ifunc macro instead. See the x86_64 and
powerpc ports for examples.
  
David Miller March 20, 2016, 9:06 p.m. UTC | #2
From: "Jose E. Marchesi" <jose.marchesi@oracle.com>
Date: Mon, 15 Feb 2016 08:32:55 -0800

> +.Lwr_loop_rest:
> +	EX_ST(STORE_ASI(%o1,%o5+8+8))
> +        sub     %o4, 64, %o4
> +        EX_ST(STORE_ASI(%o1,%o5+16+8))

Can you please indent all of this assembler code correctly?

Use tabs for most instructions, and an extra space for instructions
that sit in a branch/call/jmpl delay slot.

Thank you.
  
Jose E. Marchesi March 21, 2016, 12:28 a.m. UTC | #3
From: "Jose E. Marchesi" <jose.marchesi@oracle.com>
    Date: Mon, 15 Feb 2016 08:32:55 -0800
    
    > +.Lwr_loop_rest:
    > +	EX_ST(STORE_ASI(%o1,%o5+8+8))
    > +        sub     %o4, 64, %o4
    > +        EX_ST(STORE_ASI(%o1,%o5+16+8))
    
    Can you please indent all of this assembler code correctly?
    
    Use tabs for most instructions, and an extra space for instructions
    that sit in a branch/call/jmpl delay slot.
    
Sure thing :)
  

Patch

diff --git a/ChangeLog b/ChangeLog
index 41cb8ba..6003f58 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@ 
 2016-02-15  Jose E. Marchesi  <jose.marchesi@oracle.com>
 
+	* sysdeps/sparc/sparc64/multiarch/memset-niagara7.S: New file.
+	* sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S:
+	Likewise.
+	* sysdeps/sparc/sparc64/multiarch/memmove.S: Likewise.
+	* sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S: Likewise.
+	* sysdeps/sparc/sparc64/multiarch/memcpy.S: Use __memcpy_niagara7
+	and __mempcpy_niagara7 if the ADP hw capability is present.
+	* sysdeps/sparc/sparc64/multiarch/memset.S: Likewise for memset
+	and bzero.
+	* sysdeps/sparc/sparc64/multiarch/Makefile (sysdep_routines):
+	Added memcpy-memmove-niagara7 and memset-niagara7.
+	* sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
+	(sysdep_routines): Likewise.
+	* sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S: New file.
+	* sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Added the memmove, memcpy, mempcpy,
+	memset and bzero M7 implementations.
+
+2016-02-15  Jose E. Marchesi  <jose.marchesi@oracle.com>
+
 	* sysdeps/sparc/sparc32/sparcv9/memmove.S: New file.
 	* sysdeps/sparc/sparc32/sparcv9/rtld-memmove.c: Likewise.
 	* sysdeps/sparc/sparc64/memmove.S: Likewise.
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
index 4ad7aff..0ad3e5b 100644
--- a/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile
@@ -8,5 +8,6 @@  endif
 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \
-		   memset-niagara1 memcpy-niagara4 memset-niagara4
+		   memset-niagara1 memcpy-niagara4 memset-niagara4 \
+		   memset-niagara7 memcpy-memmove-niagara7
 endif
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/memcpy-memmove-niagara7.S b/sysdeps/sparc/sparc32/sparcv9/multiarch/memcpy-memmove-niagara7.S
new file mode 100644
index 0000000..16da150
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/memcpy-memmove-niagara7.S
@@ -0,0 +1,2 @@ 
+#define XCC icc
+#include <sparc64/multiarch/memcpy-memmove-niagara7.S>
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S b/sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S
new file mode 100644
index 0000000..08597ba
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S
@@ -0,0 +1,2 @@ 
+#define XCC icc
+#include <sparc64/multiarch/memmove.S>
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
new file mode 100644
index 0000000..de91aa4
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
@@ -0,0 +1,2 @@ 
+#define XCC icc
+#include <sparc64/multiarch/memset-niagara7.S>
diff --git a/sysdeps/sparc/sparc32/sparcv9/multiarch/rtld-memmove.c b/sysdeps/sparc/sparc32/sparcv9/multiarch/rtld-memmove.c
new file mode 100644
index 0000000..2c88061
--- /dev/null
+++ b/sysdeps/sparc/sparc32/sparcv9/multiarch/rtld-memmove.c
@@ -0,0 +1 @@ 
+#include <sparc64/multiarch/rtld-memmove.c>
diff --git a/sysdeps/sparc/sparc64/multiarch/Makefile b/sysdeps/sparc/sparc64/multiarch/Makefile
index 55b757f..271c57a 100644
--- a/sysdeps/sparc/sparc64/multiarch/Makefile
+++ b/sysdeps/sparc/sparc64/multiarch/Makefile
@@ -8,7 +8,8 @@  endif
 
 ifeq ($(subdir),string)
 sysdep_routines += memcpy-ultra3 memcpy-niagara1 memcpy-niagara2 \
-		   memset-niagara1 memcpy-niagara4 memset-niagara4
+		   memset-niagara1 memcpy-niagara4 memset-niagara4 \
+		   memset-niagara7 memcpy-memmove-niagara7
 endif
 
 ifeq ($(subdir),stdlib)
diff --git a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
index e52eeb0..8ee9b6d 100644
--- a/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c
@@ -36,6 +36,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   hwcap = GLRO(dl_hwcap);
 
   IFUNC_IMPL (i, name, memcpy,
+              IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_ADP,
+                              __memcpy_niagara7)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_CRYPTO,
 			      __memcpy_niagara4)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & HWCAP_SPARC_N2,
@@ -47,6 +49,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ultra1));
 
   IFUNC_IMPL (i, name, mempcpy,
+              IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_ADP,
+                              __mempcpy_niagara7)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_CRYPTO,
 			      __mempcpy_niagara4)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, hwcap & HWCAP_SPARC_N2,
@@ -58,6 +62,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ultra1));
 
   IFUNC_IMPL (i, name, bzero,
+              IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_ADP,
+                              __bzero_niagara7)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_CRYPTO,
 			      __bzero_niagara4)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & HWCAP_SPARC_BLKINIT,
@@ -65,11 +71,18 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ultra1));
 
   IFUNC_IMPL (i, name, memset,
+              IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_ADP,
+                              __memset_niagara7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_CRYPTO,
 			      __memset_niagara4)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & HWCAP_SPARC_BLKINIT,
 			      __memset_niagara1)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ultra1));
 
+  IFUNC_IMPL (i, name, memmove,
+              IFUNC_IMPL_ADD (array, i, memmove, hwcap & HWCAP_SPARC_ADP,
+                              __memmove_niagara7)
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ultra1));
+
   return i;
 }
diff --git a/sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S b/sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S
new file mode 100644
index 0000000..07559a1
--- /dev/null
+++ b/sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S
@@ -0,0 +1,1084 @@ 
+/* Copy SIZE bytes from SRC to DEST.  For SUN4V M7.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef XCC	/* the sparc32 wrapper predefines XCC as icc before including this file  */
+# define XCC    xcc
+      	.register	%g2,#scratch
+	.register	%g3,#scratch
+	.register	%g6,#scratch
+#endif
+
+#define GLOBAL_SPARE	%g5	/* temporary for shift counts and merged words  */
+#define FPRS_FEF	0x04
+
+/*
+ * ASI_STBI_P marks the cache line as "least recently used"
+ * which means if many threads are active, it has a high chance
+ * of being pushed out of the cache between the first initializing
+ * store and the final stores.
+ * Thus, in this algorithm we use ASI_STBIMRU_P which marks the
+ * cache line as "most recently used" for all but the last cache
+ * line.
+ */
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define ASI_ST_BLK_INIT_MRU_P 0xf2
+	
+#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
+#define ASI_STBIMRU_P   ASI_ST_BLK_INIT_MRU_P
+
+#define	BLOCK_SIZE	64        
+#define	SHORTCOPY	3
+#define	SHORTCHECK	14
+#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
+				/* must be at least 64 */
+#define	SMALL_MAX	128
+#define	MED_UMAX	4096	/* max copy for medium un-aligned case */
+#define	MED_WMAX	4096	/* max copy for medium word-aligned case */
+#define	MED_MAX		4096	/* max copy for medium longword-aligned case */
+#define ST_CHUNK	20	/* ST_CHUNK - block of values for BIS Store */
+/* On T4, prefetch 20 is a strong read prefetch into the L1 and L2 data caches;
+ * it can delay the instruction pipeline if the data is still in memory.
+ * Prefetch 21 is a strong read prefetch into the L2 data cache only.  */
+#define ALIGN_PRE	20	/* distance for aligned prefetch loop */
+        
+#define EX_ST(x)	x	/* expands to its argument unchanged  */
+#define EX_RETVAL(x)	x	/* expands to its argument unchanged  */
+#define STORE_ASI(src,addr)	stxa src, [addr] %asi	/* store via the ASI held in %asi  */
+#define STORE_INIT(src,addr)     stxa src, [addr] ASI_STBI_P	/* store via ASI_STBI_P  */
+
+#if IS_IN (libc)
+
+	.text
+
+ENTRY(__memmove_niagara7)
+	/* In: %o0=dst, %o1=src, %o2=len (unsigned).  Returns dst in %o0.  */
+	cmp	%o1, %o0	/* if src >= dst a forward copy is safe  */
+	bgeu,pn	%XCC, .Lforcpy	/* else use backward if ...  */
+	 sub	%o0, %o1, %o4	/* %o4 = dst - src  */
+	cmp	%o2, %o4	/* compare size and difference of addresses  */
+	bleu,pn	%XCC, .Lforcpy	/* len <= dst - src: no overlap, go forward  */
+	 add	%o1, %o2, %o5	/* get to end of source space  */
+
+/* an overlapped copy that must be done "backwards"  */
+.Lchksize:
+	cmp	%o2, 8			/* less than 8 byte do byte copy  */
+	blu,pn	%XCC, 2f		/* else continue  */
+
+/* Size is at least 8 from here; the add below is also the blu delay slot  */
+.Ldbalign:
+	 add	%o0, %o2, %g1		/* get to end of dest space  */
+	andcc	%g1, 7, %o3		/* %o3 has bytes till dst 8 bytes aligned  */
+	bz,a,pn	%XCC, .Ldbbck		/* if dst is not 8 byte aligned: align it  */
+	 andn	%o2, 7, %o3		/* %o3 count is multiple of 8 bytes size  */
+	sub	%o2, %o3, %o2		/* update o2 with new count  */
+
+1:	dec	%o5			/* decrement source  */
+	ldub	[%o5], %g1		/* load one byte  */
+	deccc	%o3			/* decrement count  */
+	bgu,pt	%XCC, 1b		/* if not done keep copying  */
+	 stb	%g1, [%o5+%o4]		/* store one byte into dest  */
+	andncc	%o2, 7, %o3		/* %o3 count is multiple of 8 bytes size  */
+	bz,pn	%XCC, 2f		/* if size < 8, move to byte copy  */
+
+/* Now Destination is 8 byte aligned  */
+.Ldbbck:
+	 andcc	%o5, 7, %o0		/* %o0 has src offset  */
+	bz,a,pn	%XCC, .Ldbcopybc	/* if src is aligned to fast mem move  */
+	 sub	%o2, %o3, %o2		/* Residue bytes in %o2  */
+
+.Lcpy_dbwdbc:				/* alignment of src is needed  */
+	sub	%o2, 8, %o2		/* set size one loop ahead  */
+	sll	%o0, 3, %g1		/* %g1 is left shift  */
+	mov	64, GLOBAL_SPARE	/* init GLOBAL_SPARE to be 64  */
+	sub	GLOBAL_SPARE, %g1, GLOBAL_SPARE
+					/* GLOBAL_SPARE right shift = (64 - left shift)  */
+	sub	%o5, %o0, %o5		/* align the src at 8 bytes.  */
+	add	%o4, %o0, %o4		/* increase difference between src & dst  */
+	ldx	[%o5], %o1		/* load first 8 bytes  */
+	srlx	%o1, GLOBAL_SPARE, %o1
+1:	sub	%o5, 8, %o5		/* subtract 8 from src  */
+	ldx	[%o5], %o0		/* load 8 byte  */
+	sllx	%o0, %g1, %o3		/* shift loaded 8 bytes left into tmp reg  */
+	or	%o1, %o3, %o3		/* align data  */
+	stx	%o3, [%o5+%o4]		/* store 8 byte  */
+	subcc	%o2, 8, %o2		/* subtract 8 byte from size  */
+	bg,pt	%XCC, 1b		/* if size > 0 continue  */
+	 srlx	%o0, GLOBAL_SPARE, %o1	/* move extra byte for the next use  */
+
+	srl	%g1, 3, %o0		/* restore %o0 value for alignment  */
+	add	%o5, %o0, %o5		/* restore src alignment  */
+	sub	%o4, %o0, %o4		/* restore difference between src & dest  */
+
+	ba	2f			/* branch to the trailing byte copy  */
+	 add	%o2, 8, %o2		/* restore size value  */
+
+.Ldbcopybc:				/* alignment of src is not needed  */
+1:	sub	%o5, 8, %o5		/* subtract from src  */
+	ldx	[%o5], %g1		/* load 8 bytes  */
+	subcc	%o3, 8, %o3		/* subtract from size  */
+	bgu,pt	%XCC, 1b		/* if size is bigger 0 continue  */
+	 stx	%g1, [%o5+%o4]		/* store 8 bytes to destination  */
+
+	ba	2f
+	 nop
+
+.Lbcbyte:
+1:	ldub	[%o5], %g1		/* load one byte  */
+	stb	%g1, [%o5+%o4]		/* store one byte  */
+2:	deccc	%o2			/* decrement size  */
+	bgeu,a,pt %XCC, 1b		/* if size is >= 0 continue  */
+	 dec	%o5			/* decrement from address  */
+
+.Lexitbc:				/* exit from backward copy  */
+	retl
+	 add	%o5, %o4, %o0		/* restore dest addr  */
+
+
+/* Check to see if memmove is large aligned copy
+ * If so, use special version of copy that avoids
+ * use of block store init.  */
+.Lforcpy:
+	cmp	%o2, SMALL_MAX		/* check for not small case  */
+	blu,pn	%XCC, .Lmv_short	/* merge with memcpy (unsigned: len is a byte count)  */
+	 mov	%o0, %g1		/* save %o0  */
+	neg	%o0, %o5
+	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
+	brz,pt	%o5, .Lmv_dst_aligned_on_8
+
+/* %o5 has the bytes to be written in partial store.  */
+	 sub	%o2, %o5, %o2
+	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+7:					/* dst aligning loop  */
+	ldub	[%o1+%o0], %o4		/* load one byte  */
+	subcc	%o5, 1, %o5
+	stb	%o4, [%o0]
+	bgu,pt	%XCC, 7b
+	 add	%o0, 1, %o0		/* advance dst  */
+	add	%o1, %o0, %o1		/* restore %o1  */
+.Lmv_dst_aligned_on_8:
+	andcc	%o1, 7, %o5
+	brnz,pn	%o5, .Lsrc_dst_unaligned_on_8
+	 prefetch [%o1 + (1 * BLOCK_SIZE)], 20
+
+.Lmv_src_dst_aligned_on_8:
+/* check if we are copying more than MED_MAX bytes  */
+	set	MED_MAX, %o3
+	cmp	%o2, %o3		/* limit to store buffer size  */
+	bleu,pt	%XCC, .Lmedlong
+	 prefetch [%o1 + (2 * BLOCK_SIZE)], 20
+
+/* The mv_align loop below mimics the memcpy code for large aligned copies,
+ * but does not use the ASI_STBI_P (block initializing store) performance
+ * optimization.  This is used when memcpy is incorrectly invoked with
+ * overlapping buffers.  */
+
+.Lmv_large_align8_copy:			/* Src and dst share 8 byte alignment  */
+/* align dst to 64 byte boundary  */
+	andcc	%o0, 0x3f, %o3		/* %o3 == 0 means dst is 64 byte aligned  */
+	brz,pn	%o3, .Lmv_aligned_on_64
+	 sub	%o3, 64, %o3		/* %o3 has negative bytes to move  */
+	add	%o2, %o3, %o2		/* adjust remaining count  */
+.Lmv_align_to_64:
+	ldx	[%o1], %o4
+	add	%o1, 8, %o1		/* increment src ptr  */
+	addcc	%o3, 8, %o3
+	stx	%o4, [%o0]
+	brnz,pt	%o3, .Lmv_align_to_64
+	 add	%o0, 8, %o0		/* increment dst ptr  */
+
+.Lmv_aligned_on_64:
+	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
+	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
+	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
+	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
+	prefetch [%o1 + (5 * BLOCK_SIZE)], 20
+.Lmv_align_loop:
+	ldx	[%o1],%o4
+	stx	%o4,[%o0]
+	prefetch [%o0 + (10 * BLOCK_SIZE)], 22
+	prefetch [%o0 + (4 * BLOCK_SIZE)+32], 22
+	subcc	%o5, 64, %o5
+	ldx	[%o1+8],%o4
+	stx	%o4,[%o0+8]
+	ldx	[%o1+16],%o4
+	stx	%o4,[%o0+16]
+	ldx	[%o1+24],%o4
+	stx	%o4,[%o0+24]
+	ldx	[%o1+32],%o4
+	stx	%o4,[%o0+32]
+	ldx	[%o1+40],%o4
+	stx	%o4,[%o0+40]
+	ldx	[%o1+48],%o4
+	add	%o1, 64, %o1
+	stx	%o4,[%o0+48]
+	add	%o0, 64, %o0
+	ldx	[%o1-8],%o4
+	bgu,pt	%XCC, .Lmv_align_loop	/* unsigned: %o5 is a byte count  */
+	 stx	%o4,[%o0-8]
+
+	ba	.Lmedlong
+	 nop
+END(__memmove_niagara7)
+        
+ENTRY(__mempcpy_niagara7)
+        /* %o0=dst, %o1=src, %o2=len; the shared exits return %g1 (dst + len)  */
+        ba,pt           %icc, 101f	/* join memcpy past its "mov %o0, %g1"  */
+         add            %o0, %o2, %g1   /* delay slot: %g1 = dst + len  */
+END(__mempcpy_niagara7)
+
+        .align          32
+ENTRY(__memcpy_niagara7)
+100:    /* %o0=dst, %o1=src, %o2=len */
+        mov             %o0, %g1        /* save %o0  */
+101:
+#ifndef __arch64__
+        srl             %o2, 0, %o2
+#endif
+	cmp	%o2, SMALL_MAX		/* check for not small case  */
+	bgeu,pn	%XCC, .Lmedium		/* go to larger cases  */
+.Lmv_short:
+	cmp	%o2, SHORTCOPY		/* check for really short case  */
+	ble,pn	%XCC, .Lsmallfin
+	or	%o0, %o1, %o4		/* prepare alignment check  */
+	andcc	%o4, 0x3, %o5		/* test for alignment  */
+	bnz,pn	%XCC, .Lsmallunalign	/* branch to word aligned case  */
+	nop
+	subcc	%o2, 7, %o2		/* adjust count  */
+	ble,pn	%XCC, .Lsmallwordx
+	andcc	%o4, 0x7, %o5		/* test for long alignment  */
+/* 8 or more bytes, src and dest start on word boundary
+ * %o4 contains or %o0, %o1  */
+.Lsmalllong:
+	bnz,pn	%XCC, .Lsmallwords	/* branch to word aligned case  */
+	 cmp	%o2, SHORT_LONG-7
+	bge,a	%XCC, .Lmedl64		/* if we branch  */
+	sub	%o2,56,%o2		/* adjust %o2 to -63 off count  */
+	
+/* slightly unroll the small_long_loop to improve very short copies  */
+	cmp   %o2, 32-7
+	blt,a,pn  %XCC, .Lsmall_long_l
+ 	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+
+	ldx	[%o1], %g2
+	ldx	[%o1+8], %g3
+	ldx	[%o1+16], %o3
+
+	subcc	%o2, 24, %o2
+        sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+
+	stx	%g2, [%o0]		/* write word  */
+	stx	%g3, [%o0+8]		/* write word  */
+	stx	%o3, [%o0+16]		/* write word  */
+	
+        add	%o0, 24, %o0
+
+/* end loop unroll  */
+
+.Lsmall_long_l:
+	ldx	[%o1+%o0], %o3
+	subcc	%o2, 8, %o2
+	add	%o0, 8, %o0
+	bgu,pn	%XCC, .Lsmall_long_l	/* loop until done  */
+	stx	%o3, [%o0-8]		/* write word  */
+	addcc	%o2, 7, %o2		/* restore %o2 to correct count  */
+	bnz,pn	%XCC, .Lsmall_long_x	/* check for completion  */
+	add	%o1, %o0, %o1		/* restore %o1  */
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+.Lsmall_long_x:
+	cmp	%o2, 4			/* check for 4 or more bytes left  */
+	blt,pn	%XCC, .Lsmallleft3	/* if not, go to finish up  */
+	nop
+	lduw	[%o1], %o3
+	add	%o1, 4, %o1
+	subcc	%o2, 4, %o2
+	stw	%o3, [%o0]
+	bnz,pn	%XCC, .Lsmallleft3
+	add	%o0, 4, %o0
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+	.align 32
+/* src and dest start on word boundary; 7 or fewer bytes  */
+.Lsmallwordx:
+	lduw	[%o1], %o3		/* read word  */
+	addcc	%o2, 3, %o2		/* restore count  */
+	bz,pt	%XCC, .Lsmallexit
+	stw	%o3, [%o0]		/* write word  */
+	deccc	%o2			/* reduce count for cc test  */
+	ldub	[%o1+4], %o3		/* load one byte  */
+	bz,pt	%XCC, .Lsmallexit
+	stb	%o3, [%o0+4]		/* store one byte  */
+	ldub	[%o1+5], %o3		/* load second byte  */
+	deccc	%o2
+	bz,pt	%XCC, .Lsmallexit
+	stb	%o3, [%o0+5]		/* store second byte  */
+	ldub	[%o1+6], %o3		/* load third byte  */
+	stb	%o3, [%o0+6]		/* store third byte  */
+.Lsmallexit:
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+
+	.align 32
+.Lsmallunalign:
+	cmp	%o2, SHORTCHECK
+	ble,pn	%XCC, .Lsmallrest
+	andcc	%o1, 0x3, %o5		/* is src word aligned  */
+	bz,pn	%XCC, .Laldst
+	cmp	%o5, 2			/* is src half-word aligned  */
+	be,pt	%XCC, .Ls2algn
+	cmp	%o5, 3			/* src is byte aligned  */
+.Ls1algn:
+        ldub	[%o1], %o3		/* move 1 or 3 bytes to align it  */
+	inc	1, %o1
+	stb	%o3, [%o0]		/* move a byte to align src  */
+	inc	1, %o0
+	bne,pt	%XCC, .Ls2algn
+	dec	%o2
+	b	.Lald			/* now go align dest  */
+	andcc	%o0, 0x3, %o5
+
+.Ls2algn:
+        lduh	[%o1], %o3		/* know src is 2 byte aligned  */
+	inc	2, %o1
+	srl	%o3, 8, %o4
+	stb	%o4, [%o0]		/* have to do bytes,  */
+	stb	%o3, [%o0 + 1]		/* do not know dst alignment  */
+	inc	2, %o0
+	dec	2, %o2
+
+.Laldst:
+        andcc	%o0, 0x3, %o5		/* align the destination address  */
+.Lald:	bz,pn	%XCC, .Lw4cp
+	cmp	%o5, 2
+	be,pn	%XCC, .Lw2cp
+	cmp	%o5, 3
+.Lw3cp:	lduw	[%o1], %o4
+	inc	4, %o1
+	srl	%o4, 24, %o5
+	stb	%o5, [%o0]
+	bne,pt	%XCC, .Lw1cp
+	inc	%o0
+	dec	1, %o2
+	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
+	dec	4, %o3			/* avoid reading beyond tail of src  */
+	sub	%o1, %o0, %o1		/*  %o1 gets the difference  */
+
+1:	sll	%o4, 8, GLOBAL_SPARE		/* save residual bytes  */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 24, %o5		/* merge with residual  */
+	or	%o5, GLOBAL_SPARE, GLOBAL_SPARE
+	st	GLOBAL_SPARE, [%o0]
+	bnz,pt	%XCC, 1b
+	inc	4, %o0
+	sub	%o1, 3, %o1		/* used one byte of last word read  */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw1cp:	srl	%o4, 8, %o5
+	sth	%o5, [%o0]
+	inc	2, %o0
+	dec	3, %o2
+	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
+	dec	4, %o3			/* avoid reading beyond tail of src  */
+	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+
+2:	sll	%o4, 24, GLOBAL_SPARE		/* save residual bytes  */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 8, %o5		/* merge with residual  */
+	or	%o5, GLOBAL_SPARE, GLOBAL_SPARE
+	st	GLOBAL_SPARE, [%o0]
+	bnz,pt	%XCC, 2b
+	inc	4, %o0
+	sub	%o1, 1, %o1		/* used three bytes of last word read  */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw2cp:	lduw	[%o1], %o4
+	inc	4, %o1
+	srl	%o4, 16, %o5
+	sth	%o5, [%o0]
+	inc	2, %o0
+	dec	2, %o2
+	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
+	dec	4, %o3			/* avoid reading beyond tail of src  */
+	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+
+3:	sll	%o4, 16, GLOBAL_SPARE		/* save residual bytes  */
+	lduw	[%o1+%o0], %o4
+	deccc	4, %o3
+	srl	%o4, 16, %o5		/* merge with residual  */
+	or	%o5, GLOBAL_SPARE, GLOBAL_SPARE
+	st	GLOBAL_SPARE, [%o0]
+	bnz,pt	%XCC, 3b
+	inc	4, %o0
+	sub	%o1, 2, %o1		/* used two bytes of last word read  */
+	and	%o2, 3, %o2
+	b	7f
+	inc	4, %o2
+
+.Lw4cp:	andn	%o2, 3, %o3		/* %o3 is aligned word count  */
+	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+
+1:	lduw	[%o1+%o0], %o4		/* read from address  */
+	deccc	4, %o3			/* decrement count  */
+	st	%o4, [%o0]		/* write at destination address  */
+	bgu,pt	%XCC, 1b
+	inc	4, %o0			/* increment to address  */
+	and	%o2, 3, %o2		/* number of leftover bytes, if any  */
+
+	/* simple finish up byte copy, works with any alignment  */
+7:
+	add	%o1, %o0, %o1		/* restore %o1  */
+.Lsmallrest:
+	tst	%o2
+	bz,pt	%XCC, .Lsmallx
+	cmp	%o2, 4
+	blt,pn	%XCC, .Lsmallleft3
+	nop
+	sub	%o2, 3, %o2
+.Lsmallnotalign4:
+	ldub	[%o1], %o3		/* read byte  */
+	subcc	%o2, 4, %o2		/* reduce count by 4  */
+	stb	%o3, [%o0]		/* write byte  */
+	ldub	[%o1+1], %o3		/* repeat for total of 4 bytes  */
+	add	%o1, 4, %o1		/* advance SRC by 4  */
+	stb	%o3, [%o0+1]
+	ldub	[%o1-2], %o3
+	add	%o0, 4, %o0		/* advance DST by 4  */
+	stb	%o3, [%o0-2]
+	ldub	[%o1-1], %o3
+	bgu,pt	%XCC, .Lsmallnotalign4	/* loop til 3 or fewer bytes remain  */
+	stb	%o3, [%o0-1]
+	addcc	%o2, 3, %o2		/* restore count  */
+	bz,pt	%XCC, .Lsmallx
+.Lsmallleft3:				/* 1, 2, or 3 bytes remain  */
+	subcc	%o2, 1, %o2
+	ldub	[%o1], %o3		/* load one byte  */
+	bz,pt	%XCC, .Lsmallx
+	stb	%o3, [%o0]		/* store one byte  */
+	ldub	[%o1+1], %o3		/* load second byte  */
+	subcc	%o2, 1, %o2
+	bz,pt	%XCC, .Lsmallx
+	stb	%o3, [%o0+1]		/* store second byte  */
+	ldub	[%o1+2], %o3		/* load third byte  */
+	stb	%o3, [%o0+2]		/* store third byte  */
+.Lsmallx:
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+.Lsmallfin:
+	tst	%o2
+	bnz,pn	%XCC, .Lsmallleft3
+	nop
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+	.align 16
+.Lsmallwords:
+	lduw	[%o1], %o3		/* read word  */
+	subcc	%o2, 8, %o2		/* update count  */
+	stw	%o3, [%o0]		/* write word  */
+	add	%o1, 8, %o1		/* update SRC  */
+	lduw	[%o1-4], %o3		/* read word  */
+	add	%o0, 8, %o0		/* update DST  */
+	bgu,pt	%XCC, .Lsmallwords	/* loop until done  */
+	stw	%o3, [%o0-4]		/* write word  */
+	addcc	%o2, 7, %o2		/* restore count  */
+	bz,pt	%XCC, .Lsmallexit	/* check for completion  */
+	cmp	%o2, 4			/* check for 4 or more bytes left  */
+	blt	%XCC, .Lsmallleft3	/* if not, go to finish up  */
+	nop
+	lduw	[%o1], %o3
+	add	%o1, 4, %o1
+	subcc	%o2, 4, %o2
+	add	%o0, 4, %o0
+	bnz,pn	%XCC, .Lsmallleft3
+	stw	%o3, [%o0-4]
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+	.align 16
+.Lmedium:
+	neg	%o0, %o5
+	andcc	%o5, 7, %o5		/* bytes till DST 8 byte aligned  */
+	brz,pt	%o5, .Ldst_aligned_on_8
+
+	/* %o5 has the bytes to be written in partial store.  */
+	sub	%o2, %o5, %o2
+	sub	%o1, %o0, %o1		/* %o1 gets the difference  */
+7:					/* dst aligning loop  */
+	ldub	[%o1+%o0], %o4		/* load one byte  */
+	subcc	%o5, 1, %o5
+	stb	%o4, [%o0]
+	bgu,pt	%XCC, 7b
+	add	%o0, 1, %o0		/* advance dst  */
+	add	%o1, %o0, %o1		/* restore %o1  */
+.Ldst_aligned_on_8:
+	andcc	%o1, 7, %o5
+	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
+	nop
+
+.Lsrc_dst_aligned_on_8:
+	/* check if we are copying MED_MAX or more bytes  */
+        set MED_MAX, %o3
+        cmp %o2, %o3 			/* limit to store buffer size  */
+	bgu,pn	%XCC, .Llarge_align8_copy
+	 nop
+
+/*
+ * Special case for handling when src and dest are both long word aligned
+ * and total data to move is less than MED_MAX bytes
+ */
+.Lmedlong:
+	subcc	%o2, 63, %o2		/* adjust length to allow cc test  */
+	ble,pn	%XCC, .Lmedl63		/* skip big loop if less than 64 bytes  */
+	 nop
+.Lmedl64:
+	ldx	[%o1], %o4		/* load  */
+	subcc	%o2, 64, %o2		/* decrement length count  */
+	stx	%o4, [%o0]		/* and store  */
+	ldx	[%o1+8], %o3		/* a block of 64 bytes  */
+	stx	%o3, [%o0+8]
+	ldx	[%o1+16], %o4
+	stx	%o4, [%o0+16]
+	ldx	[%o1+24], %o3
+	stx	%o3, [%o0+24]
+	ldx	[%o1+32], %o4		/* load  */
+	stx	%o4, [%o0+32]		/* and store  */
+	ldx	[%o1+40], %o3		/* a block of 64 bytes  */
+	add	%o1, 64, %o1		/* increase src ptr by 64  */
+	stx	%o3, [%o0+40]
+	ldx	[%o1-16], %o4
+	add	%o0, 64, %o0		/* increase dst ptr by 64  */
+	stx	%o4, [%o0-16]
+	ldx	[%o1-8], %o3
+	bgu,pt	%XCC, .Lmedl64		/* repeat if at least 64 bytes left  */
+	stx	%o3, [%o0-8]
+.Lmedl63:
+	addcc	%o2, 32, %o2		/* adjust remaining count  */
+	ble,pt	%XCC, .Lmedl31		/* to skip if 31 or fewer bytes left  */
+	nop
+	ldx	[%o1], %o4		/* load  */
+	sub	%o2, 32, %o2		/* decrement length count  */
+	stx	%o4, [%o0]		/* and store  */
+	ldx	[%o1+8], %o3		/* a block of 32 bytes  */
+	add	%o1, 32, %o1		/* increase src ptr by 32  */
+	stx	%o3, [%o0+8]
+	ldx	[%o1-16], %o4
+	add	%o0, 32, %o0		/* increase dst ptr by 32  */
+	stx	%o4, [%o0-16]
+	ldx	[%o1-8], %o3
+	stx	%o3, [%o0-8]
+.Lmedl31:
+	addcc	%o2, 16, %o2		/* adjust remaining count  */
+	ble,pt	%XCC, .Lmedl15		/* skip if 15 or fewer bytes left  */
+	nop
+	ldx	[%o1], %o4		/* load and store 16 bytes  */
+	add	%o1, 16, %o1		/* increase src ptr by 16  */
+	stx	%o4, [%o0]
+	sub	%o2, 16, %o2		/* decrease count by 16  */
+	ldx	[%o1-8], %o3
+	add	%o0, 16, %o0		/* increase dst ptr by 16  */
+	stx	%o3, [%o0-8]
+.Lmedl15:
+	addcc	%o2, 15, %o2		/* restore count  */
+	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
+	cmp	%o2, 8
+	blt,pt	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
+	tst	%o2
+	ldx	[%o1], %o4		/* load 8 bytes  */
+	add	%o1, 8, %o1		/* increase src ptr by 8  */
+	add	%o0, 8, %o0		/* increase dst ptr by 8  */
+	subcc	%o2, 8, %o2		/* decrease count by 8  */
+	bnz,pn	%XCC, .Lmedw7
+	stx	%o4, [%o0-8]		/* and store 8 bytes  */
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+	.align 16
+.Lsrc_dst_unaligned_on_8:
+	/* DST is 8-byte aligned, src is not  */
+2:
+	andcc	%o1, 0x3, %o5		/* test word alignment  */
+	bnz,pt	%XCC, .Lunalignsetup	/* branch to skip if not word aligned  */
+	nop
+
+/*
+ * Handle all cases where src and dest are aligned on word
+ * boundaries. Use unrolled loops for better performance.
+ * This option wins over standard large data move when
+ * source and destination is in cache for medium
+ * to short data moves.
+ */
+        set MED_WMAX, %o3
+        cmp %o2, %o3 			/* limit to store buffer size  */
+	bge,pt	%XCC, .Lunalignrejoin	/* otherwise rejoin main loop  */
+	nop
+
+	subcc	%o2, 31, %o2		/* adjust length to allow cc test  */
+					/* for end of loop  */
+	ble,pt	%XCC, .Lmedw31		/* skip big loop if less than 16  */
+.Lmedw32:
+	ld	[%o1], %o4		/* move a block of 32 bytes  */
+	sllx	%o4, 32, %o5
+	ld	[%o1+4], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0]
+	subcc	%o2, 32, %o2		/* decrement length count  */
+	ld	[%o1+8], %o4
+	sllx	%o4, 32, %o5
+	ld	[%o1+12], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0+8]
+	add	%o1, 32, %o1		/* increase src ptr by 32  */
+	ld	[%o1-16], %o4
+	sllx	%o4, 32, %o5
+	ld	[%o1-12], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0+16]
+	add	%o0, 32, %o0		/* increase dst ptr by 32  */
+	ld	[%o1-8], %o4
+	sllx	%o4, 32, %o5
+	ld	[%o1-4], %o4
+	or	%o4, %o5, %o5
+	bgu,pt	%XCC, .Lmedw32		/* repeat if at least 32 bytes left  */
+	stx	%o5, [%o0-8]
+.Lmedw31:
+	addcc	%o2, 31, %o2		/* restore count  */
+
+	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
+	nop
+	cmp	%o2, 16
+	blt,pt	%XCC, .Lmedw15
+	nop
+	ld	[%o1], %o4		/* move a block of 16 bytes  */
+	sllx	%o4, 32, %o5
+	subcc	%o2, 16, %o2		/* decrement length count  */
+	ld	[%o1+4], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0]
+	add	%o1, 16, %o1		/* increase src ptr by 16  */
+	ld	[%o1-8], %o4
+	add	%o0, 16, %o0		/* increase dst ptr by 16  */
+	sllx	%o4, 32, %o5
+	ld	[%o1-4], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0-8]
+.Lmedw15:
+	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
+	cmp	%o2, 8
+	blt,pn	%XCC, .Lmedw7		/* skip if 7 or fewer bytes left  */
+	tst	%o2
+	ld	[%o1], %o4		/* load 4 bytes  */
+	subcc	%o2, 8, %o2		/* decrease count by 8  */
+	stw	%o4, [%o0]		/* and store 4 bytes  */
+	add	%o1, 8, %o1		/* increase src ptr by 8  */
+	ld	[%o1-4], %o3		/* load 4 bytes  */
+	add	%o0, 8, %o0		/* increase dst ptr by 8  */
+	stw	%o3, [%o0-4]		/* and store 4 bytes  */
+	bz,pt	%XCC, .Lsmallexit	/* exit if finished  */
+.Lmedw7:					/* count is ge 1, less than 8  */
+	cmp	%o2, 4			/* check for 4 bytes left  */
+	blt,pn	%XCC, .Lsmallleft3	/* skip if 3 or fewer bytes left  */
+	nop
+	ld	[%o1], %o4		/* load 4 bytes  */
+	add	%o1, 4, %o1		/* increase src ptr by 4  */
+	add	%o0, 4, %o0		/* increase dst ptr by 4  */
+	subcc	%o2, 4, %o2		/* decrease count by 4  */
+	bnz	.Lsmallleft3
+	stw	%o4, [%o0-4]		/* and store 4 bytes  */
+	retl
+	mov	EX_RETVAL(%g1), %o0	/* restore %o0  */
+
+	.align 16
+.Llarge_align8_copy:			/* Src and dst share 8 byte alignment  */
+	/* align dst to 64 byte boundary  */
+	andcc	%o0, 0x3f, %o3		/* %o3 == 0 means dst is 64 byte aligned  */
+	brz,pn	%o3, .Laligned_to_64
+	andcc	%o0, 8, %o3		/* odd long words to move?  */
+	brz,pt	%o3, .Laligned_to_16
+	nop
+	ldx	[%o1], %o4
+	sub	%o2, 8, %o2
+	add	%o1, 8, %o1		/* increment src ptr  */
+	add	%o0, 8, %o0		/* increment dst ptr  */
+	stx	%o4, [%o0-8]
+.Laligned_to_16:
+	andcc	%o0, 16, %o3		/* pair of long words to move?  */
+	brz,pt	%o3, .Laligned_to_32
+	nop
+	ldx	[%o1], %o4
+	sub	%o2, 16, %o2
+	stx	%o4, [%o0]
+	add	%o1, 16, %o1		/* increment src ptr  */
+	ldx	[%o1-8], %o4
+	add	%o0, 16, %o0		/* increment dst ptr  */
+	stx	%o4, [%o0-8]
+.Laligned_to_32:
+	andcc	%o0, 32, %o3		/* four long words to move?  */
+	brz,pt	%o3, .Laligned_to_64
+	nop
+	ldx	[%o1], %o4
+	sub	%o2, 32, %o2
+	stx	%o4, [%o0]
+	ldx	[%o1+8], %o4
+	stx	%o4, [%o0+8]
+	ldx	[%o1+16], %o4
+	stx	%o4, [%o0+16]
+	add	%o1, 32, %o1		/* increment src ptr  */
+	ldx	[%o1-8], %o4
+	add	%o0, 32, %o0		/* increment dst ptr  */
+	stx	%o4, [%o0-8]
+.Laligned_to_64:
+/*	Following test is included to avoid issues where existing executables
+ *	incorrectly call memcpy with overlapping src and dest instead of memmove
+ *
+ *	if ( (src ge dst) and (dst+len > src)) go to overlap case
+ *	if ( (src lt dst) and (src+len > dst)) go to overlap case
+ */
+	cmp	%o1,%o0
+	bge,pt	%XCC, 1f
+	nop
+/*				src+len > dst?  */
+	add	%o1, %o2, %o4
+	cmp	%o4, %o0
+	bgt,pt	%XCC, .Lmv_aligned_on_64
+	nop
+	ba	2f
+	nop
+1:
+/*				dst+len > src?  */
+	add	%o0, %o2, %o4
+	cmp	%o4, %o1
+	bgt,pt	%XCC, .Lmv_aligned_on_64
+	nop
+2:
+/*	handle non-overlapped copies
+ *
+ *	Using block init store (BIS) instructions to avoid fetching cache
+ *	lines from memory. Use ST_CHUNK stores to first element of each cache 
+ *	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
+ *	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
+ *	Initial stores using MRU version of BIS to keep cache line in
+ *	cache until we are ready to store final element of cache line.
+ *	Then store last element using the LRU version of BIS.
+ */
+ 	rd	%asi, GLOBAL_SPARE
+ 	wr	%g0, ASI_STBIMRU_P, %asi
+ 
+ 	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
+ 	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
+ 
+/*	We use ASI_STBIMRU_P for the first seven stores to each cache line
+ *	followed by ASI_STBI_P (mark as LRU) for the last store. That
+ *	mixed approach reduces the chances the cache line is removed
+ *	before we finish setting it, while minimizing the effects on
+ *	other cached values during a large memcpy
+ *
+ *	ST_CHUNK batches up initial BIS operations for several cache lines
+ *	to allow multiple requests to not be blocked by overflowing the
+ *	store miss buffer. Then the matching stores for all those
+ *	BIS operations are executed.
+ */
+
+	sub	%o0, 8, %o0		/* adjust %o0 for ASI alignment  */
+.Lalign_loop:
+	cmp	%o5, ST_CHUNK*64
+	blu,pt	%XCC, .Lalign_loop_fin
+	mov	ST_CHUNK,%o3
+.Lalign_loop_start:
+	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
+	subcc	%o3, 1, %o3
+	ldx	[%o1], %o4
+	add	%o1, 64, %o1
+	add	%o0, 64, %o0
+	bgu	%XCC, .Lalign_loop_start
+	EX_ST(STORE_ASI(%o4, %o0-56))
+
+	mov	ST_CHUNK,%o3
+	sllx	%o3, 6, %o4		/* ST_CHUNK*64  */
+	sub	%o1, %o4, %o1		/* reset %o1  */
+	sub	%o0, %o4, %o0		/* reset %o0  */
+
+.Lalign_loop_rest:
+	ldx	[%o1+8],%o4
+        EX_ST(STORE_ASI(%o4,%o0+8+8))
+	ldx	[%o1+16],%o4
+        EX_ST(STORE_ASI(%o4,%o0+8+16))
+	subcc	%o3, 1, %o3
+	ldx	[%o1+24],%o4
+        EX_ST(STORE_ASI(%o4,%o0+8+24))
+	ldx	[%o1+32],%o4
+        EX_ST(STORE_ASI(%o4,%o0+8+32))
+	ldx	[%o1+40],%o4
+        EX_ST(STORE_ASI(%o4,%o0+8+40))
+	ldx	[%o1+48],%o4
+	add	%o1, 64, %o1
+        EX_ST(STORE_ASI(%o4,%o0+8+48))
+	add	%o0, 64, %o0
+	ldx	[%o1-8],%o4
+	sub	%o5, 64, %o5
+	bgu	%XCC, .Lalign_loop_rest
+	EX_ST(STORE_INIT(%o4,%o0))      /* mark cache line as LRU  */
+
+	cmp	%o5, ST_CHUNK*64
+	bgu,pt	%XCC, .Lalign_loop_start
+	mov	ST_CHUNK,%o3
+
+	cmp	%o5, 0
+	beq	.Lalign_done
+	nop
+.Lalign_loop_fin:
+	ldx	[%o1], %o4
+	stx	%o4, [%o0+8]
+	ldx	[%o1+8],%o4
+	stx	%o4,[%o0+8+8]
+	ldx	[%o1+16],%o4
+	stx	%o4,[%o0+8+16]
+	subcc	%o5, 64, %o5
+	ldx	[%o1+24],%o4
+	stx	%o4,[%o0+8+24]
+	ldx	[%o1+32],%o4
+	stx	%o4,[%o0+8+32]
+	ldx	[%o1+40],%o4
+	stx	%o4,[%o0+8+40]
+	ldx	[%o1+48],%o4
+	add	%o1, 64, %o1
+	stx	%o4,[%o0+8+48]
+	add	%o0, 64, %o0
+	ldx	[%o1-8],%o4
+	bgu	%XCC, .Lalign_loop_fin
+	stx	%o4,[%o0]
+
+.Lalign_done:
+	add	%o0, 8, %o0		/* restore %o0 from ASI alignment  */
+	membar	#StoreStore
+	sub	%o2, 63, %o2		/* adjust length to allow cc test  */
+	ba	.Lmedl63		/* in medl63  */
+	wr	%g0, GLOBAL_SPARE, %asi
+
+	.align 16
+	/* Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX  */
+.Lunalignsetup:
+.Lunalignrejoin:
+	rd	%fprs, GLOBAL_SPARE		/* check for unused fp  */
+	/* if fprs.fef == 0, set it.
+	 * Setting it when already set costs more than checking */
+	andcc	GLOBAL_SPARE, FPRS_FEF, GLOBAL_SPARE	/* test FEF, fprs.du = fprs.dl = 0  */
+	bz,a	%XCC, 1f
+	wr	%g0, FPRS_FEF, %fprs	/* fprs.fef = 1  */
+1:
+        set MED_UMAX, %o3
+        cmp %o2, %o3 			/* check for medium unaligned limit  */
+	bge,pt	%XCC, .Lunalign_large
+	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
+	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
+	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
+	cmp	%o2, 8			/* Insure we do not load beyond  */
+	bgt	.Lunalign_adjust	/* end of source buffer  */
+	andn	%o1, 0x7, %o4		/* %o4 has long word aligned src address  */
+	add	%o2, 64, %o2		/* adjust to leave loop  */
+	sub	%o5, 64, %o5		/* early if necessary  */
+.Lunalign_adjust:
+	alignaddr %o1, %g0, %g0		/* generate %gsr  */
+	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
+	ldd	[%o4], %f0
+.Lunalign_loop:
+	ldd	[%o4+8], %f2
+	faligndata %f0, %f2, %f16
+	ldd	[%o4+16], %f4
+	subcc	%o5, BLOCK_SIZE, %o5
+	std	%f16, [%o0]
+	faligndata %f2, %f4, %f18
+	ldd	[%o4+24], %f6
+	std	%f18, [%o0+8]
+	faligndata %f4, %f6, %f20
+	ldd	[%o4+32], %f8
+	std	%f20, [%o0+16]
+	faligndata %f6, %f8, %f22
+	ldd	[%o4+40], %f10
+	std	%f22, [%o0+24]
+	faligndata %f8, %f10, %f24
+	ldd	[%o4+48], %f12
+	std	%f24, [%o0+32]
+	faligndata %f10, %f12, %f26
+	ldd	[%o4+56], %f14
+	add	%o4, BLOCK_SIZE, %o4
+	std	%f26, [%o0+40]
+	faligndata %f12, %f14, %f28
+	ldd	[%o4], %f0
+	std	%f28, [%o0+48]
+	faligndata %f14, %f0, %f30
+	std	%f30, [%o0+56]
+	add	%o0, BLOCK_SIZE, %o0
+	bgu,pt	%XCC, .Lunalign_loop
+	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
+	ba	.Lunalign_done
+	nop
+
+.Lunalign_large:
+	andcc	%o0, 0x3f, %o3		/* is dst 64-byte block aligned?  */
+	bz	%XCC, .Lunalignsrc
+	sub	%o3, 64, %o3		/* %o3 will be multiple of 8  */
+	neg	%o3			/* bytes until dest is 64 byte aligned  */
+	sub	%o2, %o3, %o2		/* update cnt with bytes to be moved  */
+	/* Move bytes according to source alignment  */
+	andcc	%o1, 0x1, %o5
+	bnz	%XCC, .Lunalignbyte	/* check for byte alignment  */
+	nop
+	andcc	%o1, 2, %o5		/* check for half word alignment  */
+	bnz	%XCC, .Lunalignhalf
+	nop
+	/* Src is word aligned  */
+.Lunalignword:
+	ld	[%o1], %o4		/* load 4 bytes  */
+	add	%o1, 8, %o1		/* increase src ptr by 8  */
+	stw	%o4, [%o0]		/* and store 4 bytes  */
+	subcc	%o3, 8, %o3		/* decrease count by 8  */
+	ld	[%o1-4], %o4		/* load 4 bytes  */
+	add	%o0, 8, %o0		/* increase dst ptr by 8  */
+	bnz	%XCC, .Lunalignword
+	stw	%o4, [%o0-4]		/* and store 4 bytes  */
+	ba	.Lunalignsrc
+	nop
+
+	/* Src is half-word aligned  */
+.Lunalignhalf:
+	lduh	[%o1], %o4		/* load 2 bytes  */
+	sllx	%o4, 32, %o5		/* shift left  */
+	lduw	[%o1+2], %o4
+	or	%o4, %o5, %o5
+	sllx	%o5, 16, %o5
+	lduh	[%o1+6], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0]
+	add	%o1, 8, %o1
+	subcc	%o3, 8, %o3
+	bnz	%XCC, .Lunalignhalf
+	add	%o0, 8, %o0
+	ba	.Lunalignsrc
+	nop
+
+	/* Src is Byte aligned  */
+.Lunalignbyte:
+	sub	%o0, %o1, %o0		/* share pointer advance  */
+.Lunalignbyte_loop:
+	ldub	[%o1], %o4
+	sllx	%o4, 56, %o5
+	lduh	[%o1+1], %o4
+	sllx	%o4, 40, %o4
+	or	%o4, %o5, %o5
+	lduh	[%o1+3], %o4
+	sllx	%o4, 24, %o4
+	or	%o4, %o5, %o5
+	lduh	[%o1+5], %o4
+	sllx	%o4,  8, %o4
+	or	%o4, %o5, %o5
+	ldub	[%o1+7], %o4
+	or	%o4, %o5, %o5
+	stx	%o5, [%o0+%o1]
+	subcc	%o3, 8, %o3
+	bnz	%XCC, .Lunalignbyte_loop
+	add	%o1, 8, %o1
+	add	%o0,%o1, %o0 		/* restore pointer  */
+
+	/* Destination is now block (64 byte aligned)  */
+.Lunalignsrc:
+	andn	%o2, 0x3f, %o5		/* %o5 is multiple of block size  */
+	and	%o2, 0x3f, %o2		/* residue bytes in %o2  */
+	add	%o2, 64, %o2		/* Insure we do not load beyond  */
+	sub	%o5, 64, %o5		/* end of source buffer  */
+
+	andn	%o1, 0x7, %o4		/* %o4 has long word aligned src address  */
+	alignaddr %o1, %g0, %g0		/* generate %gsr  */
+	add	%o1, %o5, %o1		/* advance %o1 to after blocks  */
+
+	ldd	[%o4], %f14
+	add	%o4, 8, %o4
+.Lunalign_sloop:
+	ldd	[%o4], %f16
+	faligndata %f14, %f16, %f0
+	ldd	[%o4+8], %f18
+	faligndata %f16, %f18, %f2
+	ldd	[%o4+16], %f20
+	faligndata %f18, %f20, %f4
+	std	%f0, [%o0]
+	subcc	%o5, 64, %o5
+	ldd	[%o4+24], %f22
+	faligndata %f20, %f22, %f6
+	std	%f2, [%o0+8]
+	ldd	[%o4+32], %f24
+	faligndata %f22, %f24, %f8
+	std	%f4, [%o0+16]
+	ldd	[%o4+40], %f26
+	faligndata %f24, %f26, %f10
+	std	%f6, [%o0+24]
+	ldd	[%o4+48], %f28
+	faligndata %f26, %f28, %f12
+	std	%f8, [%o0+32]
+	add	%o4, 64, %o4
+	ldd	[%o4-8], %f30
+	faligndata %f28, %f30, %f14
+	std	%f10, [%o0+40]
+	std	%f12, [%o0+48]
+	add	%o0, 64, %o0
+	std	%f14, [%o0-8]
+	fsrc2	%f30, %f14
+	bgu,pt	%XCC, .Lunalign_sloop
+	prefetch [%o4 + (6 * BLOCK_SIZE)], 20
+
+.Lunalign_done:
+	/* Handle trailing bytes, 64 to 127
+	 * Dest long word aligned, Src not long word aligned  */
+	cmp	%o2, 15
+	bleu	%XCC, .Lunalign_short
+
+	andn	%o2, 0x7, %o5		/* %o5 is multiple of 8  */
+	and	%o2, 0x7, %o2		/* residue bytes in %o2  */
+	add	%o2, 8, %o2
+	sub	%o5, 8, %o5		/* insure we do not load past end of src  */
+	andn	%o1, 0x7, %o4		/* %o4 has long word aligned src address  */
+	add	%o1, %o5, %o1		/* advance %o1 to after multiple of 8  */
+	ldd	[%o4], %f0		/* fetch partial word  */
+.Lunalign_by8:
+	ldd	[%o4+8], %f2
+	add	%o4, 8, %o4
+	faligndata %f0, %f2, %f16
+	subcc	%o5, 8, %o5
+	std	%f16, [%o0]
+	fsrc2	%f2, %f0
+	bgu,pt	%XCC, .Lunalign_by8
+	add	%o0, 8, %o0
+
+.Lunalign_short:
+	brnz	GLOBAL_SPARE, .Lsmallrest
+	nop
+	ba	.Lsmallrest
+	wr	GLOBAL_SPARE, %g0, %fprs
+END(__memcpy_niagara7)
+
+#endif
diff --git a/sysdeps/sparc/sparc64/multiarch/memcpy.S b/sysdeps/sparc/sparc64/multiarch/memcpy.S
index 328f621..adf2ca3 100644
--- a/sysdeps/sparc/sparc64/multiarch/memcpy.S
+++ b/sysdeps/sparc/sparc64/multiarch/memcpy.S
@@ -27,7 +27,19 @@  ENTRY(memcpy)
 # ifdef SHARED
 	SETUP_PIC_REG_LEAF(o3, o5)
 # endif
-	set	HWCAP_SPARC_CRYPTO, %o1
+	set	HWCAP_SPARC_ADP, %o1
+	andcc	%o0, %o1, %g0
+	be	1f
+	 nop
+# ifdef SHARED
+	sethi	%gdop_hix22(__memcpy_niagara7), %o1
+	xor	%o1, %gdop_lox10(__memcpy_niagara7), %o1
+# else
+	set	__memcpy_niagara7, %o1
+# endif
+	ba	10f
+	 nop
+1:      set	HWCAP_SPARC_CRYPTO, %o1
 	andcc	%o0, %o1, %g0
 	be	1f
 	 andcc	%o0, HWCAP_SPARC_N2, %g0
@@ -89,7 +101,19 @@  ENTRY(__mempcpy)
 # ifdef SHARED
 	SETUP_PIC_REG_LEAF(o3, o5)
 # endif
-	set	HWCAP_SPARC_CRYPTO, %o1
+        set     HWCAP_SPARC_ADP, %o1
+        andcc   %o0, %o1, %g0
+        be      1f
+         nop
+# ifdef SHARED
+	sethi	%gdop_hix22(__mempcpy_niagara7), %o1
+	xor	%o1, %gdop_lox10(__mempcpy_niagara7), %o1
+# else
+	set	__mempcpy_niagara7, %o1
+# endif
+        ba      10f
+         nop
+1:      set	HWCAP_SPARC_CRYPTO, %o1
 	andcc	%o0, %o1, %g0
 	be	1f
 	 andcc	%o0, HWCAP_SPARC_N2, %g0
diff --git a/sysdeps/sparc/sparc64/multiarch/memmove.S b/sysdeps/sparc/sparc64/multiarch/memmove.S
new file mode 100644
index 0000000..ca4eca1
--- /dev/null
+++ b/sysdeps/sparc/sparc64/multiarch/memmove.S
@@ -0,0 +1,72 @@ 
+/* Multiple versions of memmove and bcopy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(memmove)
+	.type	memmove, @gnu_indirect_function	/* symbol is a resolver, not the copy routine  */
+# ifdef SHARED
+	SETUP_PIC_REG_LEAF(o3, o5)	/* presumably leaves the PIC base in %o3 (added at 10:) -- confirm  */
+# endif
+	set	HWCAP_SPARC_ADP, %o1	/* %o1 = ADP capability mask  */
+	btst	%o1, %o0		/* %o0: presumably the hwcap word passed to the resolver  */
+	bz	1f			/* no ADP bit: take the generic routine  */
+	 nop
+# ifdef SHARED
+	sethi	%gdop_hix22(__memmove_niagara7), %o1
+	xor	%o1, %gdop_lox10(__memmove_niagara7), %o1
+# else
+	set	__memmove_niagara7, %o1	/* %o1 = M7 implementation  */
+# endif
+	ba	10f
+	 nop
+1:					/* fallback when the ADP bit is clear  */
+# ifdef SHARED
+	sethi	%gdop_hix22(__memmove_ultra1), %o1
+	xor	%o1, %gdop_lox10(__memmove_ultra1), %o1
+# else
+	set	__memmove_ultra1, %o1
+# endif
+10:					/* %o1 = selected routine (still needs %o3 added if SHARED)  */
+# ifdef SHARED
+	add	%o3, %o1, %o1
+# endif
+	retl
+	 mov	%o1, %o0		/* delay slot: hand the chosen address back in %o0  */
+END(memmove)
+
+libc_hidden_builtin_def (memmove)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)	/* expands to nothing from here on  */
+#undef weak_alias
+#define weak_alias(x, y)		/* likewise expands to nothing  */
+#undef libc_hidden_def
+#define libc_hidden_def(name)		/* likewise expands to nothing  */
+
+#define memmove __memmove_ultra1	/* presumably renames the entry point of ../memmove.S, included below  */
+#define __memmove __memmove_ultra1
+        
+#endif
+
+#include "../memmove.S"
+        
diff --git a/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
new file mode 100644
index 0000000..3c9f035
--- /dev/null
+++ b/sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
@@ -0,0 +1,339 @@ 
+/* Set a block of memory to some byte value.  For SUN4V M7.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef XCC
+# define XCC    xcc		/* branches written with %XCC use the 64-bit condition codes  */
+        .register	%g2, #scratch	/* declare %g2 to the assembler as a scratch register  */
+	.register	%g3, #scratch	/* likewise %g3, which the zero-fill loop uses as an offset  */
+#endif
+
+/* The algorithm is as follows :
+ *
+ *	For small stores of 7 or fewer bytes, individual bytes are stored.
+ *
+ *	For stores of less than 32 bytes, align the address on a 4 byte boundary.
+ *	Then store as many 4-byte chunks as fit, followed by trailing bytes.
+ *
+ *	For sizes of 32 bytes or more, align the address on an 8 byte boundary.
+ *	if (count >= 64) {
+ *      	store 8-bytes chunks to align the address on 64 byte boundary
+ *		if (value to be set is zero && count >= MIN_ZERO) {
+ *              	Using BIS stores, set the first long word of each 64-byte
+ *              	cache line to zero which will also clear the other
+ *             		seven long words of the cache line.
+ *       	}
+ *       	else if (count >= MIN_LOOP) {
+ *       		Using BIS stores, set the first long word of each of
+ *              	ST_CHUNK cache lines (64 bytes each) before the main loop
+ *              	is entered.
+ *              	In the main loop, continue pre-setting the first long
+ *              	word of each cache line ST_CHUNK lines in advance while
+ *              	setting the other seven long words (56 bytes) of each cache
+ *              	line until fewer than ST_CHUNK*64 bytes remain. Then set
+ *              	the remaining seven long words of each cache line that has
+ *              	already had its first long word set.
+ *       	}
+ *       	store remaining data in 64-byte chunks until less than
+ *       	64 bytes remain.
+ *       }
+ *       Store as many 8-byte chunks, followed by trailing bytes.
+ *	
+ *
+ * BIS = Block Init Store
+ *   Doing the advance store of the first element of the cache line
+ *   initiates the displacement of a cache line while only using a single
+ *   instruction in the pipeline. That avoids various pipeline delays,
+ *   such as filling the miss buffer. The performance effect is
+ *   similar to prefetching for normal stores.
+ *   The special case for zero fills runs faster and uses fewer instruction
+ *   cycles than the normal memset loop.
+ *
+ * We only use BIS for memsets of at least MIN_LOOP bytes (MIN_ZERO for zero
+ * fills) because a sequence of BIS stores must be followed by a membar
+ * #StoreStore. That benefit must be balanced against the cost of the membar.
+ */
+
+#define GLOBAL_SPARE	%g5
+	
+/*
+ * ASI_STBI_P marks the cache line as "least recently used"
+ * which means if many threads are active, it has a high chance
+ * of being pushed out of the cache between the first initializing
+ * store and the final stores.
+ * Thus, we use ASI_STBIMRU_P which marks the cache line as
+ * "most recently used" for all but the last store to the cache line.
+ */
+
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2	/* block-init store ASI; line marked least recently used  */
+#define ASI_ST_BLK_INIT_MRU_P 0xf2	/* block-init store ASI; line marked most recently used  */
+	
+#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P	/* used for the last store to a cache line  */
+#define ASI_STBIMRU_P   ASI_ST_BLK_INIT_MRU_P	/* loaded into %asi for all earlier stores  */
+
+#define ST_CHUNK        24   /* cache lines pre-touched per batch; multiple of 4 due to loop unrolling */
+#define MIN_LOOP        32830	/* block bytes below this skip the BIS loop (non-zero fill)  */
+#define MIN_ZERO        256	/* block bytes below this skip the BIS zero-fill loop  */
+
+#define EX_ST(x)	x	/* identity wrapper: emits the store unchanged  */
+#define EX_RETVAL(x)	x	/* identity wrapper around a return-value operand  */
+#define STORE_ASI(src,addr)	stxa src, [addr] %asi	/* store through the ASI currently in %asi  */
+#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P	/* store with the explicit LRU block-init ASI  */
+
+#if IS_IN (libc)
+
+	.text
+	.align		32
+
+ENTRY(__bzero_niagara7)
+	/* bzero (dst, size): %o0 = dst, %o1 = size.  */
+	mov	%o1, %o2		/* size moves to the count register memset reads  */
+	clr	%o1			/* fill value is zero  */
+	/* No return here: fall through into the memset code that follows.  */
+END(__bzero_niagara7)
+
+ENTRY(__memset_niagara7)
+        /* memset (dst, c, size): %o0 = dst, %o1 = c, %o2 = size; %o0 is never written.  */
+        mov     %o0, %o5                /* %o5 = running dst pointer ("sp1")  */
+        cmp     %o2, 7                  /* 7 or fewer bytes: just write bytes  */
+        bleu,pn %XCC, .Lwrchar
+        and     %o1, 0xff, %o1          /* delay slot: o1 is (char)c  */
+
+        sll     %o1, 8, %o3
+        or      %o1, %o3, %o1           /* now o1 has 2 bytes of c  */
+        sll     %o1, 16, %o3
+        cmp     %o2, 32                 /* fewer than 32 bytes: use 4-byte stores  */
+        blu,pn  %XCC, .Lwdalign
+        or      %o1, %o3, %o1           /* delay slot: now o1 has 4 bytes of c  */
+
+        sllx    %o1, 32, %o3
+        or      %o1, %o3, %o1           /* now o1 has 8 bytes of c  */
+
+.Ldbalign:
+        andcc   %o5, 7, %o3             /* is sp1 aligned on a 8 byte bound?  */
+        bz,pt   %XCC, .Lblkalign         /* already long word aligned  */
+        sub     %o3, 8, %o3             /* delay slot: -(bytes till long word aligned)  */
+
+        add     %o2, %o3, %o2           /* update o2 with new count  */
+        /* Set -(%o3) bytes till sp1 long word aligned  */
+1:      stb     %o1, [%o5]              /* there is at least 1 byte to set  */
+        inccc   %o3                     /* byte setting loop; %o3 counts up to 0  */
+        bl,pt   %XCC, 1b
+        inc     %o5                     /* delay slot: advance dst by one byte  */
+
+        /* Now sp1 is long word aligned (sp1 is found in %o5) */
+.Lblkalign:
+        cmp     %o2, 64                 /* check if there are 64 bytes to set  */
+        blu,pn  %XCC, .Lwrshort
+        mov     %o2, %o3                /* delay slot: .Lwrshort reads its count from %o3  */
+
+        andcc   %o5, 63, %o3            /* is sp1 block aligned?  */
+        bz,pt   %XCC, .Lblkwr           /* now block aligned  */
+        sub     %o3, 64, %o3            /* delay slot: o3 is -(bytes till block aligned)  */
+        add     %o2, %o3, %o2           /* o2 is the remainder  */
+
+        /* Store -(%o3) bytes till dst is block (64 byte) aligned.  */
+        /* Use long word stores.  */
+        /* Recall that dst is already long word aligned  */
+1:
+        addcc   %o3, 8, %o3             /* %o3 counts up to 0 in steps of 8  */
+        stx     %o1, [%o5]
+        bl,pt   %XCC, 1b
+        add     %o5, 8, %o5             /* delay slot: advance dst by one long word  */
+
+        /* Now sp1 is block aligned  */
+.Lblkwr:
+        andn    %o2, 63, %o4            /* calculate size of blocks in bytes  */
+        brz,pn  %o1, .Lwrzero           /* special case if c == 0  */
+        and     %o2, 63, %o3            /* delay slot: %o3 = bytes left after blk stores.  */
+
+        set     MIN_LOOP, %g1
+        cmp     %o4, %g1                /* check there are enough bytes to set  */
+        blu,pn  %XCC, .Lshort_set       /* to justify cost of membar   */
+                                        /* must be > pre-cleared lines  */
+        nop
+
+        /* initial cache-clearing stores  */
+        /* get store pipeline moving  */
+	rd	%asi, GLOBAL_SPARE	/* save the incoming %asi; restored at .Lasi_done  */
+        wr     %g0, ASI_STBIMRU_P, %asi /* STORE_ASI now uses the MRU block-init ASI  */
+
+/*       Primary memset loop for large memsets  */
+.Lwr_loop:
+        sub     %o5, 8, %o5     /* adjust %o5 for ASI store alignment  */
+        mov     ST_CHUNK, %g1   /* %g1 = cache lines left to pre-touch  */
+.Lwr_loop_start:
+	EX_ST(STORE_ASI(%o1,%o5+8))		/* first long word of line 0  */
+        subcc   %g1, 4, %g1                     /* four lines handled per pass  */
+        EX_ST(STORE_ASI(%o1,%o5+8+64))          /* first long word of line 1  */
+        add     %o5, 256, %o5
+        EX_ST(STORE_ASI(%o1,%o5+8-128))         /* first long word of line 2  */
+        bgu     %XCC, .Lwr_loop_start
+        EX_ST(STORE_ASI(%o1,%o5+8-64))          /* delay slot: first long word of line 3  */
+
+        sub     %o5, ST_CHUNK*64, %o5           /* reset %o5  */
+        mov     ST_CHUNK, %g1                   /* %g1 = cache lines left to complete  */
+
+.Lwr_loop_rest:
+	EX_ST(STORE_ASI(%o1,%o5+8+8))		/* long word 1 of the line  */
+        sub     %o4, 64, %o4                    /* one more block accounted for  */
+        EX_ST(STORE_ASI(%o1,%o5+16+8))          /* long word 2  */
+        subcc   %g1, 1, %g1
+        EX_ST(STORE_ASI(%o1,%o5+24+8))          /* long word 3  */
+        EX_ST(STORE_ASI(%o1,%o5+32+8))          /* long word 4  */
+        EX_ST(STORE_ASI(%o1,%o5+40+8))          /* long word 5  */
+        add     %o5, 64, %o5
+        EX_ST(STORE_ASI(%o1,%o5-8))             /* long word 6 (pointer already advanced)  */
+        bgu     %XCC, .Lwr_loop_rest
+        EX_ST(STORE_INIT(%o1,%o5))              /* delay slot: long word 7, using ASI_STBI_P  */
+
+        /* If at least ST_CHUNK*64 bytes remain to set, continue  */
+        /* setting the first long word of each cache line in advance  */
+        /* to keep the store pipeline moving.  */
+
+        cmp     %o4, ST_CHUNK*64
+        bge,pt  %XCC, .Lwr_loop_start
+        mov     ST_CHUNK, %g1           /* delay slot: reload the line counter  */
+        
+        brz,a,pn %o4, .Lasi_done
+        add     %o5, 8, %o5             /* restore %o5 offset  */
+
+.Lwr_loop_small:                        /* fewer than ST_CHUNK lines left: whole lines, no pre-touch  */
+	EX_ST(STORE_ASI(%o1,%o5+0+8))
+        EX_ST(STORE_ASI(%o1,%o5+8+8))
+        EX_ST(STORE_ASI(%o1,%o5+16+8))
+        EX_ST(STORE_ASI(%o1,%o5+24+8))
+        EX_ST(STORE_ASI(%o1,%o5+32+8))
+        subcc   %o4, 64, %o4
+        EX_ST(STORE_ASI(%o1,%o5+40+8))
+        add     %o5, 64, %o5
+        EX_ST(STORE_ASI(%o1,%o5-8))
+        bgu,pt  %XCC, .Lwr_loop_small
+        EX_ST(STORE_INIT(%o1,%o5))      /* delay slot: last long word, using ASI_STBI_P  */
+
+        ba      .Lasi_done
+        add     %o5, 8, %o5             /* restore %o5 offset  */
+
+/*       Special case loop for zero fill memsets  */
+/*       For each 64 byte cache line, single STBI to first element  */
+/*       clears line  */
+.Lwrzero:
+        cmp     %o4, MIN_ZERO           /* check if enough bytes to set  */
+                                        /* to pay %asi + membar cost  */
+        blu     %XCC, .Lshort_set
+        nop
+        sub     %o4, 256, %o4           /* bias count so the loop test sees whole 256-byte groups  */
+        
+.Lwrzero_loop:
+        
+        mov     64, %g3                 /* %g3 = offset of the second line  */
+        EX_ST(STORE_INIT(%o1,%o5))      /* line 0  */
+        subcc   %o4, 256, %o4
+        EX_ST(STORE_INIT(%o1,%o5+%g3))  /* line 1  */
+        add     %o5, 256, %o5
+        sub     %g3, 192, %g3           /* %g3 = -128 relative to the advanced pointer  */
+        EX_ST(STORE_INIT(%o1,%o5+%g3))  /* line 2  */
+        add %g3, 64, %g3                /* %g3 = -64  */
+        bge,pt  %XCC, .Lwrzero_loop
+        EX_ST(STORE_INIT(%o1,%o5+%g3))  /* delay slot: line 3  */
+        add     %o4, 256, %o4           /* undo the bias: %o4 = block bytes still to clear  */
+        
+        brz,pn  %o4, .Lbsi_done
+        nop
+.Lwrzero_small:                         /* clear the remaining lines one at a time  */
+	EX_ST(STORE_INIT(%o1,%o5))
+        subcc   %o4, 64, %o4
+        bgu,pt  %XCC, .Lwrzero_small
+        add     %o5, 64, %o5
+	ba,a	.Lbsi_done		/* %asi was not changed on this path, so skip its restore  */
+
+.Lasi_done:
+	wr	%g0, GLOBAL_SPARE, %asi	/* put back the %asi value saved before the loop  */
+.Lbsi_done:
+        membar  #StoreStore             /* required by use of Block Store Init  */
+
+.Lshort_set:
+        cmp     %o4, 64                 /* check if 64 bytes to set  */
+        blu     %XCC, 5f
+        nop
+4:                                      /* set final blocks of 64 bytes  */
+        stx     %o1, [%o5]
+        stx     %o1, [%o5+8]
+        stx     %o1, [%o5+16]
+        stx     %o1, [%o5+24]
+        subcc   %o4, 64, %o4
+        stx     %o1, [%o5+32]
+        stx     %o1, [%o5+40]
+        add     %o5, 64, %o5
+        stx     %o1, [%o5-16]
+        bgu,pt  %XCC, 4b
+        stx     %o1, [%o5-8]
+
+5:
+        /* Set the remaining long words; %o3 = bytes still to set  */
+.Lwrshort:
+        subcc   %o3, 8, %o3             /* Can we store any long words?  */
+        blu,pn  %XCC, .Lwrchars
+        and     %o2, 7, %o2             /* calc bytes left after long words  */
+6:
+        subcc   %o3, 8, %o3
+        stx     %o1, [%o5]              /* store the long words  */
+        bgeu,pt %XCC, 6b
+        add     %o5, 8, %o5
+
+.Lwrchars:                               /* check for extra chars  */
+        brnz    %o2, .Lwrfin
+        nop
+        retl
+        nop
+
+.Lwdalign:                       /* reached only with 8..31 bytes to set  */
+        andcc   %o5, 3, %o3             /* is sp1 aligned on a word boundary  */
+        bz,pn   %XCC, .Lwrword
+        andn    %o2, 3, %o3             /* create word sized count in %o3  */
+
+        dec     %o2                     /* decrement count  */
+        stb     %o1, [%o5]              /* set a byte  */
+        b       .Lwdalign
+        inc     %o5                     /* next byte  */
+
+.Lwrword:
+        subcc   %o3, 4, %o3
+        st      %o1, [%o5]              /* 4-byte writing loop  */
+        bnz,pt  %XCC, .Lwrword
+        add     %o5, 4, %o5
+
+        and     %o2, 3, %o2             /* leftover count, if any  */
+
+.Lwrchar:
+        /* Set the remaining bytes, if any  */
+        brz     %o2, .Lexit
+        nop
+.Lwrfin:
+        deccc   %o2
+        stb     %o1, [%o5]
+        bgu,pt  %XCC, .Lwrfin
+        inc     %o5
+.Lexit:
+        retl                            /* %o0 was preserved  */
+        nop
+END(__memset_niagara7)
+
+
+#endif
diff --git a/sysdeps/sparc/sparc64/multiarch/memset.S b/sysdeps/sparc/sparc64/multiarch/memset.S
index bd0e160..0b4b2df 100644
--- a/sysdeps/sparc/sparc64/multiarch/memset.S
+++ b/sysdeps/sparc/sparc64/multiarch/memset.S
@@ -27,7 +27,19 @@  ENTRY(memset)
 # ifdef SHARED
 	SETUP_PIC_REG_LEAF(o3, o5)
 # endif
-	set	HWCAP_SPARC_CRYPTO, %o1
+        set     HWCAP_SPARC_ADP, %o1
+        andcc   %o0, %o1, %g0
+        be      1f
+         nop
+# ifdef SHARED
+	sethi	%gdop_hix22(__memset_niagara7), %o1
+	xor	%o1, %gdop_lox10(__memset_niagara7), %o1
+# else
+	set	__memset_niagara7, %o1
+# endif
+	ba	10f
+	 nop
+1:      set	HWCAP_SPARC_CRYPTO, %o1
 	andcc	%o0, %o1, %g0
 	be	1f
 	 andcc	%o0, HWCAP_SPARC_BLKINIT, %g0
@@ -69,7 +81,19 @@  ENTRY(__bzero)
 # ifdef SHARED
 	SETUP_PIC_REG_LEAF(o3, o5)
 # endif
-	set	HWCAP_SPARC_CRYPTO, %o1
+        set     HWCAP_SPARC_ADP, %o1
+        andcc   %o0, %o1, %g0
+        be      1f
+         nop
+# ifdef SHARED
+	sethi	%gdop_hix22(__bzero_niagara7), %o1
+	xor	%o1, %gdop_lox10(__bzero_niagara7), %o1
+# else
+	set	__bzero_niagara7, %o1
+# endif
+	ba	10f
+	 nop
+1:      set	HWCAP_SPARC_CRYPTO, %o1
 	andcc	%o0, %o1, %g0
 	be	1f
 	 andcc	%o0, HWCAP_SPARC_BLKINIT, %g0
diff --git a/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c b/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
new file mode 100644
index 0000000..66fe118
--- /dev/null
+++ b/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
@@ -0,0 +1 @@ 
+#include <../rtld-memmove.c>