Power7 optimization for strncpy and stpncpy.

Message ID 1395989036-19189-1-git-send-email-vidya@linux.vnet.ibm.com
State Committed
Delegated to: Adhemerval Zanella Netto
Headers

Commit Message

R Vidya March 28, 2014, 6:43 a.m. UTC
  From: Vidya Ranganathan <vidya@linux.vnet.ibm.com>

The optimization is achieved by following techniques:
   > data alignment [gain from aligned memory access on read/write]
   > prefetch data [gain from cache misses by anticipating load]
   > POWER7 gains performance with loop unrolling/unwinding
      [gain by reduction of branch penalty].

ChangeLog:
2014-03-27  Vidya Ranganathan  <vidya@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
	multiarch strncpy for PPC64.
	* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
	(__libc_ifunc_impl_list): Likewise.
	* sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strncpy, stpncpy
	multiarch optimizations.
	* sysdeps/powerpc/powerpc64/power7/stpncpy.S: New file: Optimization.
	* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c: New file:
	multiarch stpncpy for PPC64.
	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c: New file
	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S: New file

Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |  16 +
 .../powerpc/powerpc64/multiarch/stpncpy-power7.S   |  42 ++
 .../powerpc/powerpc64/multiarch/stpncpy-ppc64.c    |  26 ++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c      |  33 ++
 .../powerpc/powerpc64/multiarch/strncpy-power7.S   |  40 ++
 .../powerpc/powerpc64/multiarch/strncpy-ppc64.c    |  33 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c      |  35 ++
 sysdeps/powerpc/powerpc64/power7/stpncpy.S         |  24 +
 sysdeps/powerpc/powerpc64/power7/strncpy.S         | 483 +++++++++++++++++++++
 10 files changed, 734 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy.c
 create mode 100644 sysdeps/powerpc/powerpc64/power7/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power7/strncpy.S
  

Comments

Adhemerval Zanella Netto April 7, 2014, 12:11 p.m. UTC | #1
Hi Vidya,

Patch looks good in general, just some comments below:


On 28-03-2014 03:43, vidya@linux.vnet.ibm.com wrote:
> From: Vidya Ranganathan <vidya@linux.vnet.ibm.com>
>
> The optimization is achieved by following techniques:
>    > data alignment [gain from aligned memory access on read/write]
>    > prefetch data [gain from cache misses by anticipating load]
>    > POWER7 gains performance with loop unrolling/unwinding
>       [gain by reduction of branch penalty].
>
> ChangeLog:
> 2014-03-27  Vidya Ranganathan  <vidya@linux.vnet.ibm.com>
>
> 	* sysdeps/powerpc/powerpc64/power7/strncpy.S: New file: Optimization.
> 	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c: New file:
> 	multiarch strncpy for PPC64.
> 	* sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: New file
> 	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S: New file
> 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c:
> 	(__libc_ifunc_impl_list): Likewise.

I think a better wording would be to move it below 'sysdeps/powerpc/powerpc64/multiarch/Makefile'.

> diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
> new file mode 100644
> index 0000000..729401a
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
> @@ -0,0 +1,483 @@
> +/* Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Implements the functions
> +
> +   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
> +
> +   AND
> +
> +   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
> +
> +   The algorithm is as follows:
> +   > if src and dest are 8 byte aligned, perform double word copy
> +     else
> +   > if src and dest are 4 byte aligned, perform word copy
> +     else
> +   > copy byte by byte on unaligned addresses.
> +
> +   The aligned comparison are made using cmpb instructions.  */
> +
> +/* The focus on optimization for performance improvements are as follows:
> +   1. data alignment [gain from aligned memory access on read/write]
> +   2. prefetch data [gain from cache misses by anticipating load]
> +   3. POWER7 gains performance with loop unrolling/unwinding
> +      [gain by reduction of branch penalty].  */
> +
> +#ifdef USE_AS_STPNCPY
> +	#ifndef STPNCPY
> +		# ifdef weak_alias
> +			#  define STPNCPY       __stpncpy
> +			weak_alias (__stpncpy, stpncpy)
> +		# else
> +			#  define STPNCPY       stpncpy
> +	# endif
> +	#endif
> +	# define FUNC_NAME __stpncpy
> +#else
> +	#undef strncpy
> +
> +	#ifndef STRNCPY
> +		#define STRNCPY strncpy
> +	#endif
> +	# define FUNC_NAME strncpy
> +#endif

The ifdefs are not aligned to the GLIBC standard, and they could be simplified to just:

#ifdef USE_AS_STPNCPY
# define FUNC_NAME __stpncpy
#else
# define FUNC_NAME strncpy
#endif

And then remove the lines:

#else
libc_hidden_def (__stpncpy)

>
> +END(FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#else
> +libc_hidden_def (__stpncpy)
> +#endif

See comment above
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 8d367aa..35020a7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -16,7 +16,8 @@  sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
-		   strpbrk-power7 strpbrk-ppc64
+		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
+		   stpncpy-power7 stpncpy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 91fabb0..d8578fb 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -278,5 +278,21 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strpbrk, 1,
 			     __strpbrk_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
+  IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap & PPC_FEATURE_HAS_VSX,
+			      __strncpy_power7)
+	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
+			     __strncpy_ppc))
+
+  /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
+  IFUNC_IMPL (i, name, stpncpy,
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap & PPC_FEATURE_HAS_VSX,
+			      __stpncpy_power7)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+			     __stpncpy_ppc))
+
   return i;
 }
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
new file mode 100644
index 0000000..92c4236
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power7.S
@@ -0,0 +1,42 @@ 
+/* Optimized stpncpy implementation for POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY		/* tested by the shared strncpy.S source  */
+
+#undef EALIGN			/* redefined below to rename the entry point  */
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpncpy_power7)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpncpy_power7):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpncpy_power7)
+
+#undef END			/* redefined below; 'name' is ignored here too  */
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpncpy_power7)					\
+  END_2(__stpncpy_power7)
+
+#undef libc_hidden_builtin_def	/* redefined below to expand to nothing  */
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power7/stpncpy.S>	/* NOTE(review): the included sources also invoke weak_alias and libc_hidden_def on __stpncpy, which are not overridden here -- confirm  */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
new file mode 100644
index 0000000..74f47a7
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-ppc64.c
@@ -0,0 +1,26 @@ 
+/* Default stpncpy implementation for PowerPC64.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_ppc	/* name for the generic C version  */
+#ifdef SHARED
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__stpncpy_ppc, __GI___stpncpy, __stpncpy_ppc);
+#endif				/* SHARED  */
+
+#include <string/stpncpy.c>	/* generic C source; presumably honors STPNCPY  */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
new file mode 100644
index 0000000..dbf8521
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -0,0 +1,33 @@ 
+/* Multiple versions of stpncpy. PowerPC64 version.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NOT_IN_libc		/* compiled only when NOT_IN_libc is undefined  */
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;	/* generic C version  */
+extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;	/* POWER7 assembly version  */
+
+libc_ifunc (__stpncpy,
+            (hwcap & PPC_FEATURE_HAS_VSX)	/* VSX bit set in hwcap?  */
+            ? __stpncpy_power7			/* yes: POWER7 version  */
+            : __stpncpy_ppc);			/* no: generic version  */
+
+weak_alias (__stpncpy, stpncpy)	/* stpncpy is a weak alias of __stpncpy  */
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
new file mode 100644
index 0000000..052998c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power7.S
@@ -0,0 +1,40 @@ 
+/* Optimized strncpy implementation for POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN			/* redefined below to rename the entry point  */
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strncpy_power7)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncpy_power7):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncpy_power7)
+
+#undef END			/* redefined below; 'name' is ignored here too  */
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncpy_power7)					\
+  END_2(__strncpy_power7)
+
+#undef libc_hidden_builtin_def	/* redefined below to expand to nothing  */
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>	/* shared source, emitted as __strncpy_power7  */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
new file mode 100644
index 0000000..e3111d2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c
@@ -0,0 +1,33 @@ 
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRNCPY __strncpy_ppc	/* name for the generic C version  */
+#undef weak_alias		/* redefined so every alias targets __strncpy_ppc  */
+#define weak_alias(name, aliasname) \
+  extern __typeof (__strncpy_ppc) aliasname \
+    __attribute__ ((weak, alias ("__strncpy_ppc")));
+#if !defined(NOT_IN_libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1(__strncpy_ppc, __GI_strncpy, __strncpy_ppc);
+#endif				/* !NOT_IN_libc && SHARED  */
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;	/* declared before the definition is included  */
+
+#include <string/strncpy.c>	/* generic C source; presumably honors STRNCPY  */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
new file mode 100644
index 0000000..0766fa8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -0,0 +1,35 @@ 
+/* Multiple versions of strncpy.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The dispatch below is compiled only when NOT_IN_libc is undefined. */
+#ifndef NOT_IN_libc
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
+extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+
+/* Select __strncpy_power7 when hwcap has PPC_FEATURE_HAS_VSX set,
+ otherwise fall back to __strncpy_ppc. */
+libc_ifunc (strncpy,
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
+ : __strncpy_ppc);
+
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000..a539093
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY		/* makes strncpy.S build __stpncpy  */
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)		/* NOTE(review): strncpy.S also emits this alias -- confirm  */
+libc_hidden_def (__stpncpy)		/* NOTE(review): strncpy.S also emits this -- confirm  */
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000..729401a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,483 @@ 
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   AND
+
+   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   The algorithm is as follows:
+   > if src and dest are 8 byte aligned, perform double word copy
+     else
+   > if src and dest are 4 byte aligned, perform word copy
+     else
+   > copy byte by byte on unaligned addresses.
+
+   The aligned comparisons are made using cmpb instructions.  */
+
+/* The optimizations for performance focus on the following:
+   1. data alignment [gain from aligned memory access on read/write]
+   2. prefetch data [gain from cache misses by anticipating load]
+   3. POWER7 gains performance with loop unrolling/unwinding
+      [gain by reduction of branch penalty].  */
+
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#  ifdef weak_alias
+#   define STPNCPY       __stpncpy
+weak_alias (__stpncpy, stpncpy)
+#  else
+#   define STPNCPY       stpncpy
+#  endif
+# endif
+# define FUNC_NAME __stpncpy	/* symbol emitted for the stpncpy build  */
+#else
+# undef strncpy
+
+# ifndef STRNCPY
+#  define STRNCPY strncpy
+# endif
+# define FUNC_NAME strncpy	/* symbol emitted for the strncpy build  */
+#endif
+
+#define		FRAMESIZE	(FRAME_MIN_SIZE+32)	/* size of the stack frame created below  */
+
+	.machine  power7
+EALIGN(FUNC_NAME, 4, 0)
+	CALL_MCOUNT 3
+
+	dcbt 0, r3		/* prefetch hint for the cache line at dst  */
+	dcbt 0, r4		/* prefetch hint for the cache line at src  */
+
+	mflr r0			/* r0 = LR, stored below and reloaded on exit  */
+	or r9, r3, r4		/* r9 = dst | src, to test both alignments at once  */
+	rldicl. r10, r9, 0, 61	/* r10 = r9 & 7; zero if both are 8-byte aligned  */
+
+	std r31, -8(r1)		/* save caller's r31  */
+	std r30, -16(r1)	/* save caller's r30  */
+	std r15, -24(r1)	/* save caller's r15  */
+	std r0, 16(r1)		/* store the link register  */
+	stdu r1, -FRAMESIZE(r1)	/* create the stack frame  */
+
+	mr r15, r3		/* r15 = original dst, the strncpy return value  */
+	mr r31, r3		/* r31 = running dst pointer  */
+	beq cr0,L(dwordAligned)	/* uses cr0 from the first rldicl. above  */
+	rldicl. r10, r9, 0, 62	/* r10 = r9 & 3; zero if both are 4-byte aligned  */
+	bne cr0,L(byte_by_byte)	/* not word aligned: copy byte by byte  */
+
+
+	srdi r3, r5, 2		/* r3 = n / 4, the number of whole words  */
+	cmpldi cr7, r3, 3	/* fewer than 4 words: skip the unrolled loop  */
+	ble cr7,L(update2)
+
+	lwz r9, 0(r4)		/* load word 0 from src  */
+	cmpb r10, r9, r10	/* r10 is 0 here, so this flags NUL bytes in r9  */
+	cmpdi cr7, r10, 0	/* nonzero result means a NUL byte was found  */
+	bne cr7,L(update5)
+	stw r9, 0(r31)		/* no NUL: store word 0 into dst  */
+
+	lwz r9, 4(r4)		/* load word 1 from src  */
+	cmpb r10, r9, r10	/* r10 is still 0; flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0	/* nonzero result means a NUL byte was found  */
+	bne cr7,L(HopBy4)
+	addi r10, r3, -4
+	mr r8, r31		/* r8 = dst base used for offsets 12 and 16  */
+	srdi r10, r10, 2
+	mr r7, r4		/* r7 = src base used for offsets 16 and 20  */
+	addi r10, r10, 1	/* r10 = (words - 4) / 4 + 1  */
+	li r12, 0		/* r12 = 0, NUL pattern for cmpb at wordCopy  */
+	mtctr r10		/* CTR = unrolled iteration count  */
+	b L(wordCopy)		/* word 1 in r9 is stored at wordCopy  */
+	.p2align 4
+L(wordUnroll):
+	stw r10, 8(r31)		/* store word 2, loaded at wordCopy  */
+
+	lwz r10, 12(r4)		/* load word 3  */
+	cmpb r9, r10, r9	/* r9 is 0 on entry; flag NUL bytes in r10  */
+	cmpdi cr7, r9, 0
+	bne cr7,L(HopBy12)	/* NUL in word 3  */
+	stw r10, 12(r8)		/* store word 3  */
+
+	addi r31, r31, 16	/* advance dst past the four words  */
+	addi r4, r4, 16		/* advance src past the four words  */
+	bdz L(leftWords)	/* CTR exhausted: leave the unrolled loop  */
+
+	lwz r6, 16(r7)		/* load word 0 of the next group via old base r7  */
+	cmpb r10, r6, r9	/* r9 is still 0; flag NUL bytes in r6  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(update3)	/* NUL in that word  */
+	stw r6, 16(r8)		/* store it via old dst base r8  */
+
+	lwz r9, 20(r7)		/* load word 1 of the next group  */
+	cmpb r10, r9, r10	/* r10 is 0 here; flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(HopBy20)	/* NUL in that word  */
+
+	mr r7, r4		/* r7 = new src base  */
+	mr r8, r31		/* r8 = new dst base  */
+	mr r3, r11		/* r3 = word count computed at wordCopy  */
+	mr r5, r0		/* r5 = byte count computed at wordCopy  */
+
+L(wordCopy):
+	stw r9, 4(r31)		/* store word 1, already checked for NUL  */
+	addi r0, r5, -16	/* r0 = n after a full 16-byte group  */
+	addi r11, r3, -4	/* r11 = word count after a full group  */
+	lwz r10, 8(r4)		/* load word 2  */
+	cmpb r9, r10, r12	/* r12 is 0; flag NUL bytes in r10  */
+	cmpdi cr7, r9, 0
+	beq cr7,L(wordUnroll)	/* no NUL: continue the unrolled group  */
+	addi r31, r31, 8	/* NUL in word 2: only two words were stored  */
+	addi r4, r4, 8
+	addi r5, r5, -8
+	addi r11, r3, -2	/* then fall through to wordUnrollOFF  */
+
+L(wordUnrollOFF):
+	lwz r9, 0(r4)		/* load the next word  */
+	li r10, 0		/* r10 = 0, NUL pattern for cmpb  */
+	cmpb r10, r9, r10	/* flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(byte_by_byte)	/* NUL present: finish byte by byte  */
+	mtctr r11		/* CTR = words left  */
+	li r8, 0		/* r8 = 0, NUL pattern for the loop below  */
+	b L(copyWord)
+
+	.p2align 4
+L(loadWordandCompare):
+	lwz r9, 0(r4)		/* load the next word  */
+	cmpb r10, r9, r8	/* flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(byte_by_byte)	/* NUL present: finish byte by byte  */
+
+L(copyWord):
+	addi r31, r31, 4	/* advance dst  */
+	stw r9,-4(r31)		/* store the word at the old dst  */
+	addi r4, r4, 4		/* advance src  */
+	addi r5, r5, -4		/* four fewer bytes remain  */
+	bdnz L(loadWordandCompare)	/* when CTR hits 0, fall through to byte_by_byte  */
+	.p2align 4
+L(byte_by_byte):
+	cmpldi cr7, r5, 3	/* at most 3 bytes left?  */
+	subf r30, r5, r31	/* r30 = r31 - r5 (not read again in this file)  */
+	ble cr7,L(verifyByte)	/* yes: go straight to the 1-byte loop  */
+	srdi r10, r5, 2		/* r10 = r5 / 4  */
+	mr r9, r31		/* r9 = working dst pointer  */
+	mtctr r10		/* CTR = number of 4-byte groups  */
+	b L(firstByteUnroll)
+
+	.p2align 4
+L(bytes_unroll):
+	lbz r10, 1(r4)		/* load byte 1 of the group  */
+	cmpdi cr7, r10, 0	/* is it NUL?  */
+	stb r10, 1(r9)		/* store it either way  */
+	beq cr7,L(updtDestComputeN2ndByte)
+
+	addi r4, r4, 4		/* advance src by the whole group  */
+
+	lbz r10, -2(r4)		/* load byte 2 of the group  */
+	cmpdi cr7, r10, 0
+	stb r10, 2(r9)
+	beq cr7,L(updtDestComputeN3rdByte)
+
+	lbz r10, -1(r4)		/* load byte 3 of the group  */
+	addi r9, r9, 4		/* advance dst by the whole group  */
+	cmpdi cr7, r10, 0
+	stb r10, -1(r9)
+	beq cr7, L(updtDestComputeNByte)
+
+	bdz L(updateToContinue)	/* all groups done: handle the r5 & 3 tail  */
+
+L(firstByteUnroll):
+	lbz r10, 0(r4)		/* load byte 0 of the group  */
+	cmpdi cr7, r10, 0
+	stb r10, 0(r9)
+	bne cr7, L(bytes_unroll)	/* not NUL: continue with bytes 1..3  */
+	addi r9, r9, 1		/* NUL stored: r9 = one past it  */
+
+L(updtDestComputeNByte):
+	subf r10, r9, r31	/* r10 = r31 - r9 = -(bytes stored here)  */
+	mr r31, r9		/* r31 = one past the stored NUL  */
+	add r10, r10, r5	/* r10 = bytes still to zero-fill  */
+
+L(zeroFill):
+	cmpdi cr7, r10, 0	/* anything left to pad?  */
+	beq cr7,L(hop2Return)
+	mr r3, r31		/* memset argument 1: start of the padding  */
+	li r4, 0		/* memset argument 2: fill byte 0  */
+	mr r5, r10		/* memset argument 3: number of bytes to pad  */
+	bl memset		/* fill with zeroes  */
+	nop			/* slot after bl; presumably for the linker's TOC restore  */
+
+L(hop2Return):
+	addi r1, r1, FRAMESIZE	/* restore stack pointer  */
+
+/* The return value depends on whether this is built as strncpy or
+   stpncpy; the USE_AS_STPNCPY macro selects which value is copied
+   to r3.  */
+
+#ifdef USE_AS_STPNCPY
+	addi r3, r31, -1	/* return r31 - 1; r31 is one past the copied NUL  */
+#else
+	mr r3, r15		/* return the original dst  */
+#endif
+	ld r0, 16(r1)		/* read the saved link register  */
+	ld r15, -24(r1)		/* restore caller's r15  */
+	ld r30, -16(r1)		/* restore caller's r30  */
+	ld r31, -8(r1)		/* restore caller's r31  */
+	mtlr r0			/* restore link register  */
+	blr			/* return to the caller  */
+
+	.p2align 4
+L(updateToContinue):
+	mr r31, r9		/* r31 = dst after the 4-byte groups  */
+
+	.p2align 4
+L(verifyByte):
+	rldicl. r10, r5, 0, 62	/* r10 = r5 & 3, the tail length  */
+	addi r4, r4, -1		/* pre-decrement src for lbzu below  */
+	beq cr0,L(done)		/* no tail: return without padding  */
+	mtctr r10		/* CTR = tail length  */
+	b L(oneBYone)
+
+	.p2align 4
+L(proceed):
+	bdz L(done)		/* tail copied without meeting a NUL  */
+L(oneBYone):
+	lbzu r9, 1(r4)		/* load the next byte, updating r4  */
+	addi r31, r31, 1	/* advance dst  */
+	addi r10, r10, -1	/* r10 = tail bytes left after this one  */
+	cmpdi cr7, r9, 0	/* is it NUL?  */
+	stb r9,-1(r31)		/* store it either way  */
+	bne cr7,L(proceed)
+	b L(zeroFill)		/* NUL copied: pad the remaining r10 bytes  */
+
+	.p2align 4
+L(dwordAligned):
+	srdi r3, r5, 3		/* r3 = n / 8, the number of whole doublewords  */
+	cmpldi cr7, r3, 3	/* fewer than 4 dwords: skip the unrolled loop  */
+	ble cr7,L(update0)
+
+	ld r9, 0(r4)		/* load doubleword 0 from src  */
+	cmpb r10, r9, r10	/* r10 is 0 here, so this flags NUL bytes in r9  */
+	cmpdi cr7, r10, 0	/* nonzero result means a NUL byte was found  */
+	bne cr7,L(update4)
+
+	std r9, 0(r31)		/* no NUL: copy doubleword at offset=0  */
+	ld r9, 8(r4)		/* load next doubleword from offset=8  */
+	cmpb r10, r9, r10	/* r10 is still 0; flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0	/* nonzero result means a NUL byte was found  */
+	bne cr7, L(HopBy8)
+
+	addi r10, r3, -4
+	mr r8, r31		/* r8 = dst base used for offsets 24 and 32  */
+	srdi r10, r10, 2
+	mr r7, r4		/* r7 = src base used for offsets 32 and 40  */
+	addi r10, r10, 1	/* r10 = (dwords - 4) / 4 + 1  */
+	li r12, 0		/* r12 = 0, NUL pattern for cmpb at dwordCopy  */
+	mtctr r10		/* CTR = unrolled iteration count  */
+	b L(dwordCopy)		/* dword 1 in r9 is stored at dwordCopy  */
+	.p2align 4
+
+L(dWordUnroll):
+	std r10, 16(r31)	/* store dword 2, loaded at dwordCopy  */
+	ld r10, 24(r4)		/* load dword 3  */
+	cmpb r9, r10, r9	/* r9 is 0 on entry; flag NUL bytes in r10  */
+	cmpdi cr7, r9, 0
+	bne cr7,L(HopBy24)	/* NUL in dword 3  */
+
+	std r10, 24(r8)		/* copy dword at offset=24  */
+	addi r31, r31, 32	/* advance dst past the four dwords  */
+	addi r4, r4, 32		/* advance src past the four dwords  */
+	bdz  L(leftDwords)	/* CTR exhausted: leave the unrolled loop  */
+
+	ld r6, 32(r7)		/* load dword 0 of the next group via old base r7  */
+	cmpb r10, r6, r9	/* r9 is still 0; flag NUL bytes in r6  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(update1)	/* NUL in that dword  */
+
+	std r6, 32(r8)		/* store it via old dst base r8  */
+	ld r9, 40(r7)		/* load dword 1 of the next group  */
+	cmpb r10, r9, r10	/* r10 is 0 here; flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(HopBy40)	/* NUL in that dword  */
+
+	mr r7, r4		/* r7 = new src base  */
+	mr r8, r31		/* r8 = new dst base  */
+	mr r3, r11		/* r3 = dword count computed at dwordCopy  */
+	mr r5, r0		/* r5 = byte count computed at dwordCopy  */
+
+L(dwordCopy):			/* second half of the unrolled dword loop  */
+	std r9, 8(r31)		/* copy dword at offset=8, already checked for NUL  */
+	addi r0, r5, -32	/* r0 = n after a full 32-byte group  */
+	addi r11, r3, -4	/* r11 = dword count after a full group  */
+	ld r10, 16(r4)		/* load dword 2  */
+	cmpb r9, r10, r12	/* r12 is 0; flag NUL bytes in r10  */
+	cmpdi cr7, r9, 0
+	beq cr7,L(dWordUnroll)	/* no NUL: continue the unrolled group  */
+	addi r31, r31, 16	/* NUL in dword 2: only two dwords were stored  */
+	addi r4, r4, 16
+	addi r5, r5, -16
+	addi r11, r3, -2	/* then fall through to dWordUnrollOFF  */
+
+L(dWordUnrollOFF):
+	ld r9, 0(r4)		/* load the next doubleword  */
+	li r10, 0		/* r10 = 0, NUL pattern for cmpb  */
+	cmpb r10, r9, r10	/* flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(byte_by_byte)	/* NUL present: finish byte by byte  */
+	mtctr r11		/* CTR = doublewords left  */
+	li r8, 0		/* r8 = 0, NUL pattern for the loop below  */
+	b L(CopyDword)
+
+	.p2align 4
+L(loadDWordandCompare):
+	ld r9, 0(r4)		/* load the next doubleword  */
+	cmpb r10, r9, r8	/* flag NUL bytes in r9  */
+	cmpdi cr7, r10, 0
+	bne cr7,L(byte_by_byte)	/* NUL present: finish byte by byte  */
+
+L(CopyDword):
+	addi r31, r31, 8	/* advance dst  */
+	std r9, -8(r31)		/* store the doubleword at the old dst  */
+	addi r4, r4, 8		/* advance src  */
+	addi r5, r5, -8		/* eight fewer bytes remain  */
+	bdnz L(loadDWordandCompare)
+	b L(byte_by_byte)	/* CTR exhausted: copy the remaining bytes  */
+
+	.p2align 4
+L(done):
+	addi r1, r1, FRAMESIZE	/* restore stack pointer  */
+#ifdef USE_AS_STPNCPY
+	mr r3, r31		/* return the current dst pointer  */
+#else
+	mr r3, r15		/* return the original dst  */
+#endif
+	ld r0, 16(r1)		/* read the saved link register  */
+	ld r15, -24(r1)		/* restore caller's r15  */
+	ld r30,-16(r1)		/* restore caller's r30  */
+	ld r31,-8(r1)		/* restore caller's r31  */
+	mtlr r0			/* restore link register  */
+	blr
+
+L(update0):
+	mr r11, r3		/* r11 = dword count (unrolled loop skipped)  */
+	mr r0, r5		/* r0 = n, copied back to r5 below  */
+
+	.p2align 4
+L(leftDwords):
+	cmpdi cr7, r11, 0	/* any whole doublewords left?  */
+	mr r5, r0		/* r5 = bytes left  */
+	bne cr7,L(dWordUnrollOFF)
+	b L(byte_by_byte)
+
+	.p2align 4
+L(updtDestComputeN2ndByte):
+	addi r9, r9, 2		/* r9 = one past the NUL stored at offset 1  */
+	subf r10, r9, r31	/* r10 = r31 - r9 = -(bytes stored)  */
+	mr r31, r9		/* r31 = one past the stored NUL  */
+	add r10, r10, r5	/* r10 = bytes still to zero-fill  */
+	b L(zeroFill)
+
+	.p2align 4
+L(updtDestComputeN3rdByte):
+	addi r9, r9, 3		/* r9 = one past the NUL stored at offset 2  */
+	subf r10, r9, r31	/* r10 = r31 - r9 = -(bytes stored)  */
+	mr r31, r9		/* r31 = one past the stored NUL  */
+	add r10, r10, r5	/* r10 = bytes still to zero-fill  */
+	b L(zeroFill)
+
+	.p2align 4
+L(HopBy24):
+	addi r31, r31, 24	/* increment dst by 24  */
+	addi r4, r4, 24		/* increment src by 24  */
+	addi r5, r5, -24	/* decrement length 'n' by 24  */
+	addi r11, r3, -3	/* r11 = dword count minus the three stored  */
+	b L(dWordUnrollOFF)
+
+	.p2align 4
+L(update1):
+	mr r5, r0		/* r5 = byte count computed at dwordCopy  */
+	b L(dWordUnrollOFF)
+
+	.p2align 4
+L(HopBy40):
+	addi r31, r8, 40	/* dst = old base r8 + 40  */
+	addi r4, r7, 40		/* src = old base r7 + 40  */
+	addi r5, r5, -40	/* decrement length 'n' by 40  */
+	addi r11, r3, -5	/* r11 = dword count minus the five stored  */
+	b L(dWordUnrollOFF)
+
+L(update2):
+	mr r11, r3		/* r11 = word count (unrolled loop skipped)  */
+	mr r0, r5		/* r0 = n, copied back to r5 below  */
+
+L(leftWords):
+	cmpdi cr7, r11, 0	/* any whole words left?  */
+	mr r5, r0		/* r5 = bytes left  */
+	bne cr7,L(wordUnrollOFF)
+	b L(byte_by_byte)
+
+L(HopBy12):			/* NUL found in word 3 of an unrolled group  */
+	addi r31, r31, 12	/* increment dst by 12  */
+	addi r4, r4, 12		/* increment src by 12  */
+	addi r5, r5, -12	/* decrement length 'n' by 12  */
+	addi r11, r3, -3	/* r11 = word count minus the three stored  */
+	b L(wordUnrollOFF)
+
+L(update3):
+	mr r5, r0		/* r5 = byte count computed at wordCopy  */
+	b L(wordUnrollOFF)
+
+L(HopBy20):
+	addi r31, r8, 20	/* dst = old base r8 + 20  */
+	addi r4, r7, 20		/* src = old base r7 + 20  */
+	addi r5, r5, -20	/* decrement length 'n' by 20  */
+	addi r11, r3, -5	/* r11 = word count minus the five stored  */
+	b L(wordUnrollOFF)
+
+L(update4):
+	mr r11, r3		/* r11 = dword count; nothing stored yet  */
+	b L(dWordUnrollOFF)
+
+L(HopBy8):
+	addi r31, r31, 8	/* increment dst by 8  */
+	addi r4, r4, 8		/* increment src by 8  */
+	addi r5, r5, -8		/* decrement length 'n' by 8  */
+	addi r11, r3, -1	/* r11 = dword count minus the one stored  */
+	b L(dWordUnrollOFF)
+
+L(update5):
+	mr r11, r3		/* r11 = word count; nothing stored yet  */
+	b L(wordUnrollOFF)
+
+L(HopBy4):			/* NUL found in word 1, before the unrolled loop  */
+	addi r31, r31, 4	/* increment dst by 4  */
+	addi r4, r4, 4		/* increment src by 4  */
+	addi r5, r5, -4		/* decrement length 'n' by 4  */
+	addi r11, r3, -1	/* r11 = word count minus the one stored  */
+	b L(wordUnrollOFF)
+
+END(FUNC_NAME)			/* closes the function opened by EALIGN  */
+#ifndef USE_AS_STPNCPY		/* strncpy build  */
+libc_hidden_builtin_def (strncpy)
+#else				/* stpncpy build  */
+libc_hidden_def (__stpncpy)
+#endif