powerpc: P9 vector load instruction change in memcpy and memmove

Message ID 20171019152531.12064-1-tuliom@linux.vnet.ibm.com
State Superseded

Commit Message

Tulio Magno Quites Machado Filho Oct. 19, 2017, 3:25 p.m. UTC
  From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>

POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel.  To handle this in memcpy
and memmove, lvx/stvx is used for aligned addresses instead
of lxvd2x/stxvd2x.  The remaining part of the optimization is the
same as the existing POWER7 code.

Reference: https://patchwork.ozlabs.org/patch/814059/
Tested on powerpc64le.
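
As a rough illustration of the substitution (a hypothetical sketch, not part
of the patch), the vec_ld/vec_st intrinsics compile to the lvx/stvx pair used
here for aligned addresses; Altivec support (-maltivec) is assumed:

#include <altivec.h>

/* Hypothetical helper: copy one quadword between 16-byte aligned buffers.
   vec_ld/vec_st generate lvx/stvx, which, per the description above, avoid
   the cache-inhibited load trap that the lxvd2x/stxvd2x forms can hit on
   POWER9 DD2.1 and earlier.  */
static inline void
copy16_aligned (unsigned char *dst, const unsigned char *src)
{
  vector unsigned char v = vec_ld (0, src);
  vec_st (v, 0, dst);
}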

2017-10-19  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile
	(sysdep_routines): Add memcpy_power9 and memmove_power9.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(memcpy): Add __memcpy_power9 to list of memcpy functions.
	(memmove): Add __memmove_power9 to list of memmove functions.
	(bcopy): Add __bcopy_power9 to list of bcopy functions.
	* sysdeps/powerpc/powerpc64/multiarch/memcpy.c
	(memcpy): Add __memcpy_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/power9/memcpy.S: New File.
	* sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S: Likewise.
 	* sysdeps/powerpc/powerpc64/multiarch/bcopy.c
	(bcopy): Add __bcopy_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
	Change bcopy as __bcopy.
	* sysdeps/powerpc/powerpc64/multiarch/memmove.c
	(memmove): Add __memmove_power9 to ifunc list.
	* sysdeps/powerpc/powerpc64/power7/memmove.S:
	Alias bcopy only if not defined before.
	* sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S:
	New file.
	* sysdeps/powerpc/powerpc64/power9/memmove.S: Likewise.
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   7 +-
 sysdeps/powerpc/powerpc64/multiarch/bcopy.c        |   6 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
 .../powerpc/powerpc64/multiarch/memcpy-power9.S    |  26 +
 sysdeps/powerpc/powerpc64/multiarch/memcpy.c       |   3 +
 .../powerpc/powerpc64/multiarch/memmove-power7.S   |   4 +-
 .../powerpc/powerpc64/multiarch/memmove-power9.S   |  29 +
 sysdeps/powerpc/powerpc64/multiarch/memmove.c      |   5 +-
 sysdeps/powerpc/powerpc64/power7/memmove.S         |   2 +
 sysdeps/powerpc/powerpc64/power9/memcpy.S          | 429 +++++++++++
 sysdeps/powerpc/powerpc64/power9/memmove.S         | 837 +++++++++++++++++++++
 11 files changed, 1347 insertions(+), 7 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
 create mode 100644 sysdeps/powerpc/powerpc64/power9/memcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power9/memmove.S
  

Comments

Florian Weimer Oct. 19, 2017, 3:39 p.m. UTC | #1
On 10/19/2017 05:25 PM, Tulio Magno Quites Machado Filho wrote:
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> index 05d46e2..4a4ee6e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> @@ -22,8 +22,12 @@
>   extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
>   /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
>   extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
> +/* __bcopy_power9 symbol is implemented at memmove-power9.S.  */
> +extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
>   
>   libc_ifunc (bcopy,
> -            (hwcap & PPC_FEATURE_HAS_VSX)
> +	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +	    ? __bcopy_power9
> +	    : (hwcap & PPC_FEATURE_HAS_VSX)
>               ? __bcopy_power7
>               : __bcopy_ppc);
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 6a88536..9040bbc 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   #ifdef SHARED
>     /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
>     IFUNC_IMPL (i, name, memcpy,
> +	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __memcpy_power9)
>   	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
>   			      __memcpy_power7)
>   	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
> @@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   
>     /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
>     IFUNC_IMPL (i, name, memmove,
> +	      IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __memmove_power9)
>   	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
>   			      __memmove_power7)
>   	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
> @@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   
>     /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
>     IFUNC_IMPL (i, name, bcopy,
> +	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __bcopy_power9)
>   	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
>   			      __bcopy_power7)
>   	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))

I'm concerned that this needs *another* change to recognize post-DD2.1
POWER9 hardware which has the fix (or perhaps OpenPOWER implementations
which never had the bug).

Is there any other way you could select the workaround that is more 
specific?  Or is the performance hit from avoiding the affected vector 
instructions not so severe that it would be an immediate concern for 
future silicon?

Thanks,
Florian
  
Adhemerval Zanella Oct. 19, 2017, 3:52 p.m. UTC | #2
On 19/10/2017 13:25, Tulio Magno Quites Machado Filho wrote:
> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
> 
> POWER9 DD2.1 and earlier have an issue where some cache-inhibited
> vector loads trap to the kernel.  To handle this in memcpy
> and memmove, lvx/stvx is used for aligned addresses instead
> of lxvd2x/stxvd2x.  The remaining part of the optimization is the
> same as the existing POWER7 code.
> 
> Reference: https://patchwork.ozlabs.org/patch/814059/
> Tested on powerpc64le.

According to "POWER8 Processor User’s Manual for the Single-Chip Module"
(it is buried on a sign wall at [1]), both lxv2dx/lvx and stxvd2x/stvx
uses the same pipeline, have the same latency and same throughput.  The
only difference is lxv2dx/stxv2x have microcode handling for unaligned
case and for 4k crossing or 32-byte cross L1 miss (which should not
occur in the with aligned address).

Why not change POWER7 implementation instead of dropping another one
which is exactly the same for POWER9?

[1] https://www-355.ibm.com/systems/power/openpower/tgcmDocumentRepository.xhtml?aliasId=POWER8
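
For illustration only (hypothetical helpers, assuming GCC's altivec.h with
-mvsx): for a 16-byte aligned pointer both load forms fetch the same
quadword; the VSX form merely adds microcoded handling of unaligned
addresses.

#include <altivec.h>

/* Both helpers load the same 16 bytes when p is quadword aligned.  */
static inline vector unsigned char
load16_vsx (const unsigned char *p)
{
  return vec_vsx_ld (0, p);	/* lxvd2x class: any alignment accepted.  */
}

static inline vector unsigned char
load16_vmx (const unsigned char *p)
{
  return vec_ld (0, p);		/* lvx: low 4 bits of the address ignored.  */
}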



> 
> 2017-10-19  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/multiarch/Makefile
> 	(sysdep_routines): Add memcpy_power9 and memmove_power9.
> 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> 	(memcpy): Add __memcpy_power9 to list of memcpy functions.
> 	(memmove): Add __memmove_power9 to list of memmove functions.
> 	(bcopy): Add __bcopy_power9 to list of bcopy functions.
> 	* sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> 	(memcpy): Add __memcpy_power9 to ifunc list.
> 	* sysdeps/powerpc/powerpc64/power9/memcpy.S: New File.
> 	* sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S: Likewise.
>  	* sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> 	(bcopy): Add __bcopy_power9 to ifunc list.
> 	* sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> 	Change bcopy as __bcopy.
> 	* sysdeps/powerpc/powerpc64/multiarch/memmove.c
> 	(memmove): Add __memmove_power9 to ifunc list.
> 	* sysdeps/powerpc/powerpc64/power7/memmove.S:
> 	Alias bcopy only if not defined before.
> 	* sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S:
> 	New file.
> 	* sysdeps/powerpc/powerpc64/power9/memmove.S: Likewise.
> ---
>  sysdeps/powerpc/powerpc64/multiarch/Makefile       |   7 +-
>  sysdeps/powerpc/powerpc64/multiarch/bcopy.c        |   6 +-
>  .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   6 +
>  .../powerpc/powerpc64/multiarch/memcpy-power9.S    |  26 +
>  sysdeps/powerpc/powerpc64/multiarch/memcpy.c       |   3 +
>  .../powerpc/powerpc64/multiarch/memmove-power7.S   |   4 +-
>  .../powerpc/powerpc64/multiarch/memmove-power9.S   |  29 +
>  sysdeps/powerpc/powerpc64/multiarch/memmove.c      |   5 +-
>  sysdeps/powerpc/powerpc64/power7/memmove.S         |   2 +
>  sysdeps/powerpc/powerpc64/power9/memcpy.S          | 429 +++++++++++
>  sysdeps/powerpc/powerpc64/power9/memmove.S         | 837 +++++++++++++++++++++
>  11 files changed, 1347 insertions(+), 7 deletions(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
>  create mode 100644 sysdeps/powerpc/powerpc64/power9/memcpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/power9/memmove.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index dea49ac..82728fa 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -1,6 +1,6 @@
>  ifeq ($(subdir),string)
> -sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
> -		   memcpy-power4 memcpy-ppc64 \
> +sysdep_routines += memcpy-power9 memcpy-power7 memcpy-a2 memcpy-power6 \
> +		   memcpy-cell memcpy-power4 memcpy-ppc64 \
>  		   memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
>  		   memset-power7 memset-power6 memset-power4 \
>  		   memset-ppc64 memset-power8 \
> @@ -24,7 +24,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
>  		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
>  		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
>  		   strcat-power8 strcat-power7 strcat-ppc64 \
> -		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
> +		   memmove-power9 memmove-power7 memmove-ppc64 \
> +		   wordcopy-ppc64 bcopy-ppc64 \
>  		   strncpy-power8 strstr-power7 strstr-ppc64 \
>  		   strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
>  		   strlen-power8 strcasestr-power8 strcasestr-ppc64 \
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> index 05d46e2..4a4ee6e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
> @@ -22,8 +22,12 @@
>  extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
>  /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
>  extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
> +/* __bcopy_power9 symbol is implemented at memmove-power9.S.  */
> +extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
>  
>  libc_ifunc (bcopy,
> -            (hwcap & PPC_FEATURE_HAS_VSX)
> +	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +	    ? __bcopy_power9
> +	    : (hwcap & PPC_FEATURE_HAS_VSX)
>              ? __bcopy_power7
>              : __bcopy_ppc);
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index 6a88536..9040bbc 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  #ifdef SHARED
>    /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
>    IFUNC_IMPL (i, name, memcpy,
> +	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __memcpy_power9)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
>  			      __memcpy_power7)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
> @@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
>    IFUNC_IMPL (i, name, memmove,
> +	      IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __memmove_power9)
>  	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
>  			      __memmove_power7)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
> @@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
>    IFUNC_IMPL (i, name, bcopy,
> +	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __bcopy_power9)
>  	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
>  			      __bcopy_power7)
>  	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
> new file mode 100644
> index 0000000..fbd0788
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized memcpy implementation for PowerPC/POWER9.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#define MEMCPY __memcpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/power9/memcpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> index 9f4286c..4c16fa0 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
> @@ -35,8 +35,11 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_power9 attribute_hidden;
>  
>  libc_ifunc (__libc_memcpy,
> +	   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +	   ? __memcpy_power9 :
>              (hwcap & PPC_FEATURE_HAS_VSX)
>              ? __memcpy_power7 :
>  	      (hwcap & PPC_FEATURE_ARCH_2_06)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> index a9435fa..0599a39 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
> @@ -23,7 +23,7 @@
>  #undef libc_hidden_builtin_def
>  #define libc_hidden_builtin_def(name)
>  
> -#undef bcopy
> -#define bcopy __bcopy_power7
> +#undef __bcopy
> +#define __bcopy __bcopy_power7
>  
>  #include <sysdeps/powerpc/powerpc64/power7/memmove.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
> new file mode 100644
> index 0000000..16a2267
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
> @@ -0,0 +1,29 @@
> +/* Optimized memmove implementation for PowerPC64/POWER9.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#define MEMMOVE __memmove_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#undef __bcopy
> +#define __bcopy __bcopy_power9
> +
> +#include <sysdeps/powerpc/powerpc64/power9/memmove.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> index db2bbc7..f02498e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
> @@ -31,9 +31,12 @@ extern __typeof (__redirect_memmove) __libc_memmove;
>  
>  extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
> +extern __typeof (__redirect_memmove) __memmove_power9 attribute_hidden;
>  
>  libc_ifunc (__libc_memmove,
> -            (hwcap & PPC_FEATURE_HAS_VSX)
> +	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +	    ? __memmove_power9
> +	    : (hwcap & PPC_FEATURE_HAS_VSX)
>              ? __memmove_power7
>              : __memmove_ppc);
>  
> diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
> index 93baa69..0bb8ddc 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memmove.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
> @@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
>  	mr	r4,r6
>  	b	L(_memmove)
>  END (__bcopy)
> +#ifndef __bcopy
>  weak_alias (__bcopy, bcopy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/power9/memcpy.S b/sysdeps/powerpc/powerpc64/power9/memcpy.S
> new file mode 100644
> index 0000000..0731bac
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power9/memcpy.S
> @@ -0,0 +1,429 @@
> +/* Optimized memcpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +
> +/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
> +   Returns 'dst'.  */
> +
> +#ifndef MEMCPY
> +# define MEMCPY memcpy
> +#endif
> +
> +#define dst 11		/* Use r11 so r3 kept unchanged.  */
> +#define src 4
> +#define cnt 5
> +
> +	.machine power7
> +ENTRY_TOCLESS (MEMCPY, 5)
> +	CALL_MCOUNT 3
> +
> +	cmpldi	cr1,cnt,31
> +	neg	0,3
> +	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
> +				    code.  */
> +
> +/* Align copies using VSX instructions to quadword. It is to avoid alignment
> +   traps when memcpy is used on non-cacheable memory (for instance, memory
> +   mapped I/O).  */
> +	andi.	10,3,15
> +	clrldi	11,4,60
> +	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
> +
> +	mr	dst,3
> +	bne	cr6,L(copy_GE_32_unaligned)
> +	beq	L(aligned_copy)
> +
> +	mtocrf	0x01,0
> +	clrldi	0,0,60
> +
> +/* Get the DST and SRC aligned to 16 bytes.  */
> +1:
> +	bf	31,2f
> +	lbz	6,0(src)
> +	addi	src,src,1
> +	stb	6,0(dst)
> +	addi	dst,dst,1
> +2:
> +	bf	30,4f
> +	lhz	6,0(src)
> +	addi	src,src,2
> +	sth	6,0(dst)
> +	addi	dst,dst,2
> +4:
> +	bf	29,8f
> +	lwz	6,0(src)
> +	addi	src,src,4
> +	stw	6,0(dst)
> +	addi	dst,dst,4
> +8:
> +	bf	28,16f
> +	ld	6,0(src)
> +	addi	src,src,8
> +	std	6,0(dst)
> +	addi	dst,dst,8
> +16:
> +	subf	cnt,0,cnt
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy):
> +	li	6,16
> +	li	7,32
> +	li	8,48
> +	mtocrf	0x02,cnt
> +	srdi	12,cnt,7
> +	cmpdi	12,0
> +	beq	L(aligned_tail)
> +	lvx	6,0,src
> +	lvx	7,src,6
> +	mtctr	12
> +	b	L(aligned_128loop)
> +
> +	.align  4
> +L(aligned_128head):
> +	/* for the 2nd + iteration of this loop. */
> +	lvx	6,0,src
> +	lvx	7,src,6
> +L(aligned_128loop):
> +	lvx	8,src,7
> +	lvx	9,src,8
> +	stvx	6,0,dst
> +	addi	src,src,64
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
> +	lvx	6,0,src
> +	lvx	7,src,6
> +	addi	dst,dst,64
> +	lvx	8,src,7
> +	lvx	9,src,8
> +	addi	src,src,64
> +	stvx	6,0,dst
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
> +	addi	dst,dst,64
> +	bdnz	L(aligned_128head)
> +
> +L(aligned_tail):
> +	mtocrf	0x01,cnt
> +	bf	25,32f
> +	lvx	6,0,src
> +	lvx	7,src,6
> +	lvx	8,src,7
> +	lvx	9,src,8
> +	addi	src,src,64
> +	stvx	6,0,dst
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
> +	addi	dst,dst,64
> +32:
> +	bf	26,16f
> +	lvx	6,0,src
> +	lvx	7,src,6
> +	addi	src,src,32
> +	stvx	6,0,dst
> +	stvx	7,dst,6
> +	addi	dst,dst,32
> +16:
> +	bf	27,8f
> +	lvx	6,0,src
> +	addi	src,src,16
> +	stvx	6,0,dst
> +	addi	dst,dst,16
> +8:
> +	bf	28,4f
> +	ld	6,0(src)
> +	addi	src,src,8
> +	std     6,0(dst)
> +	addi	dst,dst,8
> +4:	/* Copies 4~7 bytes.  */
> +	bf	29,L(tail2)
> +	lwz	6,0(src)
> +	stw     6,0(dst)
> +	bf      30,L(tail5)
> +	lhz     7,4(src)
> +	sth     7,4(dst)
> +	bflr	31
> +	lbz     8,6(src)
> +	stb     8,6(dst)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +
> +/* Handle copies of 0~31 bytes.  */
> +	.align	4
> +L(copy_LT_32):
> +	mr	dst,3
> +	cmpldi	cr6,cnt,8
> +	mtocrf	0x01,cnt
> +	ble	cr6,L(copy_LE_8)
> +
> +	/* At least 9 bytes to go.  */
> +	neg	8,4
> +	andi.	0,8,3
> +	cmpldi	cr1,cnt,16
> +	beq	L(copy_LT_32_aligned)
> +
> +	/* Force 4-byte alignment for SRC.  */
> +	mtocrf	0x01,0
> +	subf	cnt,0,cnt
> +2:
> +	bf	30,1f
> +	lhz	6,0(src)
> +	addi	src,src,2
> +	sth	6,0(dst)
> +	addi	dst,dst,2
> +1:
> +	bf	31,L(end_4bytes_alignment)
> +	lbz	6,0(src)
> +	addi	src,src,1
> +	stb	6,0(dst)
> +	addi	dst,dst,1
> +
> +	.align	4
> +L(end_4bytes_alignment):
> +	cmpldi	cr1,cnt,16
> +	mtocrf	0x01,cnt
> +
> +L(copy_LT_32_aligned):
> +	/* At least 6 bytes to go, and SRC is word-aligned.  */
> +	blt	cr1,8f
> +
> +	/* Copy 16 bytes.  */
> +	lwz	6,0(src)
> +	lwz	7,4(src)
> +	stw	6,0(dst)
> +	lwz	8,8(src)
> +	stw	7,4(dst)
> +	lwz	6,12(src)
> +	addi	src,src,16
> +	stw	8,8(dst)
> +	stw	6,12(dst)
> +	addi	dst,dst,16
> +8:	/* Copy 8 bytes.  */
> +	bf	28,L(tail4)
> +	lwz	6,0(src)
> +	lwz	7,4(src)
> +	addi	src,src,8
> +	stw	6,0(dst)
> +	stw	7,4(dst)
> +	addi	dst,dst,8
> +
> +	.align	4
> +/* Copies 4~7 bytes.  */
> +L(tail4):
> +	bf	29,L(tail2)
> +	lwz	6,0(src)
> +	stw	6,0(dst)
> +	bf	30,L(tail5)
> +	lhz	7,4(src)
> +	sth	7,4(dst)
> +	bflr	31
> +	lbz	8,6(src)
> +	stb	8,6(dst)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +	.align	4
> +/* Copies 2~3 bytes.  */
> +L(tail2):
> +	bf	30,1f
> +	lhz	6,0(src)
> +	sth	6,0(dst)
> +	bflr	31
> +	lbz	7,2(src)
> +	stb	7,2(dst)
> +	blr
> +
> +	.align	4
> +L(tail5):
> +	bflr	31
> +	lbz	6,4(src)
> +	stb	6,4(dst)
> +	blr
> +
> +	.align	4
> +1:
> +	bflr	31
> +	lbz	6,0(src)
> +	stb	6,0(dst)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +
> +/* Handles copies of 0~8 bytes.  */
> +	.align	4
> +L(copy_LE_8):
> +	bne	cr6,L(tail4)
> +
> +	/* Though we could've used ld/std here, they are still
> +	slow for unaligned cases.  */
> +
> +	lwz	6,0(src)
> +	lwz	7,4(src)
> +	stw	6,0(dst)
> +	stw	7,4(dst)
> +	blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> +   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
> +   the data, allowing for aligned DST stores.  */
> +	.align	4
> +L(copy_GE_32_unaligned):
> +	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
> +	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
> +
> +	beq	L(copy_GE_32_unaligned_cont)
> +
> +	/* DST is not quadword aligned, get it aligned.  */
> +
> +	mtocrf	0x01,0
> +	subf	cnt,0,cnt
> +
> +	/* Vector instructions work best when proper alignment (16-bytes)
> +	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
> +1:
> +	bf	31,2f
> +	lbz	6,0(src)
> +	addi	src,src,1
> +	stb	6,0(dst)
> +	addi	dst,dst,1
> +2:
> +	bf	30,4f
> +	lhz	6,0(src)
> +	addi	src,src,2
> +	sth	6,0(dst)
> +	addi	dst,dst,2
> +4:
> +	bf	29,8f
> +	lwz	6,0(src)
> +	addi	src,src,4
> +	stw	6,0(dst)
> +	addi	dst,dst,4
> +8:
> +	bf	28,0f
> +	ld	6,0(src)
> +	addi	src,src,8
> +	std	6,0(dst)
> +	addi	dst,dst,8
> +0:
> +	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
> +
> +	/* The proper alignment is present, it is OK to copy the bytes now.  */
> +L(copy_GE_32_unaligned_cont):
> +
> +	/* Setup two indexes to speed up the indexed vector operations.  */
> +	clrldi	10,cnt,60
> +	li	6,16	      /* Index for 16-bytes offsets.  */
> +	li	7,32	      /* Index for 32-bytes offsets.  */
> +	cmpldi	cr1,10,0
> +	srdi	8,cnt,5	      /* Setup the loop counter.  */
> +	mtocrf	0x01,9
> +	cmpldi	cr6,9,1
> +#ifdef __LITTLE_ENDIAN__
> +	lvsr	5,0,src
> +#else
> +	lvsl	5,0,src
> +#endif
> +	lvx	3,0,src
> +	li	0,0
> +	bf	31,L(setup_unaligned_loop)
> +
> +	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
> +	lvx	4,src,6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	6,4,3,5
> +#else
> +	vperm	6,3,4,5
> +#endif
> +	addi	src,src,16
> +	stvx	6,0,dst
> +	addi	dst,dst,16
> +	vor	3,4,4
> +	clrrdi	0,src,60
> +
> +L(setup_unaligned_loop):
> +	mtctr	8
> +	ble	cr6,L(end_unaligned_loop)
> +
> +	/* Copy 32 bytes at a time using vector instructions.  */
> +	.align	4
> +L(unaligned_loop):
> +
> +	/* Note: vr6/vr10 may contain data that was already copied,
> +	but in order to get proper alignment, we may have to copy
> +	some portions again. This is faster than having unaligned
> +	vector instructions though.  */
> +
> +	lvx	4,src,6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	6,4,3,5
> +#else
> +	vperm	6,3,4,5
> +#endif
> +	lvx	3,src,7
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	10,3,4,5
> +#else
> +	vperm	10,4,3,5
> +#endif
> +	addi	src,src,32
> +	stvx	6,0,dst
> +	stvx	10,dst,6
> +	addi	dst,dst,32
> +	bdnz	L(unaligned_loop)
> +
> +	clrrdi	0,src,60
> +
> +	.align	4
> +L(end_unaligned_loop):
> +
> +	/* Check for tail bytes.  */
> +	mtocrf	0x01,cnt
> +	beqlr	cr1
> +
> +	add	src,src,0
> +
> +	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
> +	/* Copy 8 bytes.  */
> +	bf	28,4f
> +	lwz	6,0(src)
> +	lwz	7,4(src)
> +	addi	src,src,8
> +	stw	6,0(dst)
> +	stw	7,4(dst)
> +	addi	dst,dst,8
> +4:	/* Copy 4~7 bytes.  */
> +	bf	29,L(tail2)
> +	lwz	6,0(src)
> +	stw	6,0(dst)
> +	bf	30,L(tail5)
> +	lhz	7,4(src)
> +	sth	7,4(dst)
> +	bflr	31
> +	lbz	8,6(src)
> +	stb	8,6(dst)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +END_GEN_TB (MEMCPY,TB_TOCLESS)
> +libc_hidden_builtin_def (memcpy)
> diff --git a/sysdeps/powerpc/powerpc64/power9/memmove.S b/sysdeps/powerpc/powerpc64/power9/memmove.S
> new file mode 100644
> index 0000000..9ed8f77
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/power9/memmove.S
> @@ -0,0 +1,837 @@
> +/* Optimized memmove implementation for PowerPC64/POWER9.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +
> +/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
> +
> +   This optimization checks if memory 'dest' overlaps with 'src'.  If it does
> +   not, it calls an optimized memcpy (similar to memcpy for POWER7,
> +   embedded here to gain some cycles).
> +   If source and destination overlap, an optimized backwards memcpy is used
> +   instead.  */
> +
> +#ifndef MEMMOVE
> +# define MEMMOVE memmove
> +#endif
> +	.machine power7
> +ENTRY_TOCLESS (MEMMOVE, 5)
> +	CALL_MCOUNT 3
> +
> +L(_memmove):
> +	subf    r9,r4,r3
> +	cmpld   cr7,r9,r5
> +	blt	cr7,L(memmove_bwd)
> +
> +	cmpldi	cr1,r5,31
> +	neg	0,3
> +	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
> +				       code.  */
> +
> +	andi.	10,3,15
> +	clrldi	11,4,60
> +	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
> +
> +	mr	r11,3
> +	bne	cr6,L(copy_GE_32_unaligned)
> +	beq	L(aligned_copy)
> +
> +	mtocrf	0x01,0
> +	clrldi	0,0,60
> +
> +/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
> +1:
> +	bf	31,2f
> +	lbz	6,0(r4)
> +	addi	r4,r4,1
> +	stb	6,0(r11)
> +	addi	r11,r11,1
> +2:
> +	bf	30,4f
> +	lhz	6,0(r4)
> +	addi	r4,r4,2
> +	sth	6,0(r11)
> +	addi	r11,r11,2
> +4:
> +	bf	29,8f
> +	lwz	6,0(r4)
> +	addi	r4,r4,4
> +	stw	6,0(r11)
> +	addi	r11,r11,4
> +8:
> +	bf	28,16f
> +	ld	6,0(r4)
> +	addi	r4,r4,8
> +	std	6,0(r11)
> +	addi	r11,r11,8
> +16:
> +	subf	r5,0,r5
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy):
> +	li	6,16
> +	li	7,32
> +	li	8,48
> +	mtocrf	0x02,r5
> +	srdi	12,r5,7
> +	cmpdi	12,0
> +	beq	L(aligned_tail)
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +	mtctr	12
> +	b	L(aligned_128loop)
> +
> +	.align  4
> +L(aligned_128head):
> +	/* for the 2nd + iteration of this loop. */
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +L(aligned_128loop):
> +	lvx	8,r4,7
> +	lvx	9,r4,8
> +	stvx	6,0,r11
> +	addi	r4,r4,64
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +	addi	r11,r11,64
> +	lvx	8,r4,7
> +	lvx	9,r4,8
> +	addi	r4,r4,64
> +	stvx	6,0,r11
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
> +	addi	r11,r11,64
> +	bdnz	L(aligned_128head)
> +
> +L(aligned_tail):
> +	mtocrf	0x01,r5
> +	bf	25,32f
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +	lvx	8,r4,7
> +	lvx	9,r4,8
> +	addi	r4,r4,64
> +	stvx	6,0,r11
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
> +	addi	r11,r11,64
> +32:
> +	bf	26,16f
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +	addi	r4,r4,32
> +	stvx	6,0,r11
> +	stvx	7,r11,6
> +	addi	r11,r11,32
> +16:
> +	bf	27,8f
> +	lvx	6,0,r4
> +	addi	r4,r4,16
> +	stvx	6,0,r11
> +	addi	r11,r11,16
> +8:
> +	bf	28,4f
> +	ld	6,0(r4)
> +	addi	r4,r4,8
> +	std     6,0(r11)
> +	addi	r11,r11,8
> +4:	/* Copies 4~7 bytes.  */
> +	bf	29,L(tail2)
> +	lwz	6,0(r4)
> +	stw     6,0(r11)
> +	bf      30,L(tail5)
> +	lhz     7,4(r4)
> +	sth     7,4(r11)
> +	bflr	31
> +	lbz     8,6(r4)
> +	stb     8,6(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +/* Handle copies of 0~31 bytes.  */
> +	.align	4
> +L(copy_LT_32):
> +	mr	r11,3
> +	cmpldi	cr6,r5,8
> +	mtocrf	0x01,r5
> +	ble	cr6,L(copy_LE_8)
> +
> +	/* At least 9 bytes to go.  */
> +	neg	8,4
> +	andi.	0,8,3
> +	cmpldi	cr1,r5,16
> +	beq	L(copy_LT_32_aligned)
> +
> +	/* Force 4-byte alignment for SRC.  */
> +	mtocrf	0x01,0
> +	subf	r5,0,r5
> +2:
> +	bf	30,1f
> +	lhz	6,0(r4)
> +	addi	r4,r4,2
> +	sth	6,0(r11)
> +	addi	r11,r11,2
> +1:
> +	bf	31,L(end_4bytes_alignment)
> +	lbz	6,0(r4)
> +	addi	r4,r4,1
> +	stb	6,0(r11)
> +	addi	r11,r11,1
> +
> +	.align	4
> +L(end_4bytes_alignment):
> +	cmpldi	cr1,r5,16
> +	mtocrf	0x01,r5
> +
> +L(copy_LT_32_aligned):
> +	/* At least 6 bytes to go, and SRC is word-aligned.  */
> +	blt	cr1,8f
> +
> +	/* Copy 16 bytes.  */
> +	lwz	6,0(r4)
> +	lwz	7,4(r4)
> +	stw	6,0(r11)
> +	lwz	8,8(r4)
> +	stw	7,4(r11)
> +	lwz	6,12(r4)
> +	addi	r4,r4,16
> +	stw	8,8(r11)
> +	stw	6,12(r11)
> +	addi	r11,r11,16
> +8:	/* Copy 8 bytes.  */
> +	bf	28,L(tail4)
> +	lwz	6,0(r4)
> +	lwz	7,4(r4)
> +	addi	r4,r4,8
> +	stw	6,0(r11)
> +	stw	7,4(r11)
> +	addi	r11,r11,8
> +
> +	.align	4
> +/* Copies 4~7 bytes.  */
> +L(tail4):
> +	bf	29,L(tail2)
> +	lwz	6,0(r4)
> +	stw	6,0(r11)
> +	bf	30,L(tail5)
> +	lhz	7,4(r4)
> +	sth	7,4(r11)
> +	bflr	31
> +	lbz	8,6(r4)
> +	stb	8,6(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +	.align	4
> +/* Copies 2~3 bytes.  */
> +L(tail2):
> +	bf	30,1f
> +	lhz	6,0(r4)
> +	sth	6,0(r11)
> +	bflr	31
> +	lbz	7,2(r4)
> +	stb	7,2(r11)
> +	blr
> +
> +	.align	4
> +L(tail5):
> +	bflr	31
> +	lbz	6,4(r4)
> +	stb	6,4(r11)
> +	blr
> +
> +	.align	4
> +1:
> +	bflr	31
> +	lbz	6,0(r4)
> +	stb	6,0(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +/* Handles copies of 0~8 bytes.  */
> +	.align	4
> +L(copy_LE_8):
> +	bne	cr6,L(tail4)
> +
> +	/* Though we could've used ld/std here, they are still
> +	slow for unaligned cases.  */
> +
> +	lwz	6,0(r4)
> +	lwz	7,4(r4)
> +	stw	6,0(r11)
> +	stw	7,4(r11)
> +	blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> +   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
> +   the data, allowing for aligned DST stores.  */
> +	.align	4
> +L(copy_GE_32_unaligned):
> +	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
> +	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
> +
> +	beq	L(copy_GE_32_unaligned_cont)
> +
> +	/* DST is not quadword aligned, get it aligned.  */
> +
> +	mtocrf	0x01,0
> +	subf	r5,0,r5
> +
> +	/* Vector instructions work best when proper alignment (16-bytes)
> +	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
> +1:
> +	bf	31,2f
> +	lbz	6,0(r4)
> +	addi	r4,r4,1
> +	stb	6,0(r11)
> +	addi	r11,r11,1
> +2:
> +	bf	30,4f
> +	lhz	6,0(r4)
> +	addi	r4,r4,2
> +	sth	6,0(r11)
> +	addi	r11,r11,2
> +4:
> +	bf	29,8f
> +	lwz	6,0(r4)
> +	addi	r4,r4,4
> +	stw	6,0(r11)
> +	addi	r11,r11,4
> +8:
> +	bf	28,0f
> +	ld	6,0(r4)
> +	addi	r4,r4,8
> +	std	6,0(r11)
> +	addi	r11,r11,8
> +0:
> +	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
> +
> +	/* The proper alignment is present, it is OK to copy the bytes now.  */
> +L(copy_GE_32_unaligned_cont):
> +
> +	/* Setup two indexes to speed up the indexed vector operations.  */
> +	clrldi	10,r5,60
> +	li	6,16	      /* Index for 16-bytes offsets.  */
> +	li	7,32	      /* Index for 32-bytes offsets.  */
> +	cmpldi	cr1,10,0
> +	srdi	8,r5,5	      /* Setup the loop counter.  */
> +	mtocrf	0x01,9
> +	cmpldi	cr6,9,1
> +#ifdef __LITTLE_ENDIAN__
> +	lvsr	5,0,r4
> +#else
> +	lvsl	5,0,r4
> +#endif
> +	lvx	3,0,r4
> +	li	0,0
> +	bf	31,L(setup_unaligned_loop)
> +
> +	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
> +	lvx	4,r4,6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	6,4,3,5
> +#else
> +	vperm	6,3,4,5
> +#endif
> +	addi	r4,r4,16
> +	stvx	6,0,r11
> +	addi	r11,r11,16
> +	vor	3,4,4
> +	clrrdi	0,r4,60
> +
> +L(setup_unaligned_loop):
> +	mtctr	8
> +	ble	cr6,L(end_unaligned_loop)
> +
> +	/* Copy 32 bytes at a time using vector instructions.  */
> +	.align	4
> +L(unaligned_loop):
> +
> +	/* Note: vr6/vr10 may contain data that was already copied,
> +	but in order to get proper alignment, we may have to copy
> +	some portions again. This is faster than having unaligned
> +	vector instructions though.  */
> +
> +	lvx	4,r4,6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	6,4,3,5
> +#else
> +	vperm	6,3,4,5
> +#endif
> +	lvx	3,r4,7
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	10,3,4,5
> +#else
> +	vperm	10,4,3,5
> +#endif
> +	addi	r4,r4,32
> +	stvx	6,0,r11
> +	stvx	10,r11,6
> +	addi	r11,r11,32
> +	bdnz	L(unaligned_loop)
> +
> +	clrrdi	0,r4,60
> +
> +	.align	4
> +L(end_unaligned_loop):
> +
> +	/* Check for tail bytes.  */
> +	mtocrf	0x01,r5
> +	beqlr	cr1
> +
> +	add	r4,r4,0
> +
> +	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
> +	/* Copy 8 bytes.  */
> +	bf	28,4f
> +	lwz	6,0(r4)
> +	lwz	7,4(r4)
> +	addi	r4,r4,8
> +	stw	6,0(r11)
> +	stw	7,4(r11)
> +	addi	r11,r11,8
> +4:	/* Copy 4~7 bytes.  */
> +	bf	29,L(tail2)
> +	lwz	6,0(r4)
> +	stw	6,0(r11)
> +	bf	30,L(tail5)
> +	lhz	7,4(r4)
> +	sth	7,4(r11)
> +	bflr	31
> +	lbz	8,6(r4)
> +	stb	8,6(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +	/* Start of the backward memcpy implementation: the algorithm first
> +	   checks if src and dest have the same alignment and, if they do,
> +	   aligns both to 16 bytes and copies using VSX instructions.
> +	   If they do not, it aligns dest to 16 bytes and uses VMX (altivec)
> +	   instructions to read two 16-byte blocks at a time, shift/permute
> +	   the bytes read, and write aligned to dest.  */
> +L(memmove_bwd):
> +	cmpldi	cr1,r5,31
> +	/* Copy is done backwards: update the pointers and check alignment.  */
> +	add	r11,r3,r5
> +	add	r4,r4,r5
> +	mr	r0,r11
> +	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
> +				           code.  */
> +
> +	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
> +	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
> +	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */
> +
> +	bne     cr6,L(copy_GE_32_unaligned_bwd)
> +	beq     L(aligned_copy_bwd)
> +
> +	mtocrf	0x01,r0
> +	clrldi	r0,r0,60
> +
> +/* Get the DST and SRC aligned to 16 bytes.  */
> +1:
> +	bf	31,2f
> +	lbz	r6,-1(r4)
> +	subi	r4,r4,1
> +	stb	r6,-1(r11)
> +	subi	r11,r11,1
> +2:
> +	bf	30,4f
> +	lhz	r6,-2(r4)
> +	subi	r4,r4,2
> +	sth	r6,-2(r11)
> +	subi	r11,r11,2
> +4:
> +	bf	29,8f
> +	lwz	r6,-4(r4)
> +	subi	r4,r4,4
> +	stw	r6,-4(r11)
> +	subi	r11,r11,4
> +8:
> +	bf	28,16f
> +	ld	r6,-8(r4)
> +	subi	r4,r4,8
> +	std	r6,-8(r11)
> +	subi	r11,r11,8
> +16:
> +	subf	r5,0,r5
> +
> +/* Main aligned copy loop. Copies 128 bytes at a time. */
> +L(aligned_copy_bwd):
> +	li	r6,-16
> +	li	r7,-32
> +	li	r8,-48
> +	li	r9,-64
> +	mtocrf	0x02,r5
> +	srdi	r12,r5,7
> +	cmpdi	r12,0
> +	beq	L(aligned_tail_bwd)
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
> +	mtctr	12
> +	b	L(aligned_128loop_bwd)
> +
> +	.align  4
> +L(aligned_128head_bwd):
> +	/* for the 2nd + iteration of this loop. */
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
> +L(aligned_128loop_bwd):
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
> +	stvx	v6,r11,r6
> +	subi	r4,r4,64
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,7
> +	subi	r11,r11,64
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
> +	subi	r4,r4,64
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
> +	subi	r11,r11,64
> +	bdnz	L(aligned_128head_bwd)
> +
> +L(aligned_tail_bwd):
> +	mtocrf	0x01,r5
> +	bf	25,32f
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
> +	subi	r4,r4,64
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
> +	subi	r11,r11,64
> +32:
> +	bf	26,16f
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
> +	subi	r4,r4,32
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
> +	subi	r11,r11,32
> +16:
> +	bf	27,8f
> +	lvx	v6,r4,r6
> +	subi	r4,r4,16
> +	stvx	v6,r11,r6
> +	subi	r11,r11,16
> +8:
> +	bf	28,4f
> +	ld	r6,-8(r4)
> +	subi	r4,r4,8
> +	std     r6,-8(r11)
> +	subi	r11,r11,8
> +4:	/* Copies 4~7 bytes.  */
> +	bf	29,L(tail2_bwd)
> +	lwz	r6,-4(r4)
> +	stw     r6,-4(r11)
> +	bf      30,L(tail5_bwd)
> +	lhz     r7,-6(r4)
> +	sth     r7,-6(r11)
> +	bflr	31
> +	lbz     r8,-7(r4)
> +	stb     r8,-7(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +/* Handle copies of 0~31 bytes.  */
> +	.align	4
> +L(copy_LT_32_bwd):
> +	cmpldi	cr6,r5,8
> +	mtocrf	0x01,r5
> +	ble	cr6,L(copy_LE_8_bwd)
> +
> +	/* At least 9 bytes to go.  */
> +	neg	r8,r4
> +	andi.	r0,r8,3
> +	cmpldi	cr1,r5,16
> +	beq	L(copy_LT_32_aligned_bwd)
> +
> +	/* Force 4-byte alignment for SRC.  */
> +	mtocrf	0x01,0
> +	subf	r5,0,r5
> +2:
> +	bf	30,1f
> +	lhz	r6,-2(r4)
> +	subi	r4,r4,2
> +	sth	r6,-2(r11)
> +	subi	r11,r11,2
> +1:
> +	bf	31,L(end_4bytes_alignment_bwd)
> +	lbz	6,-1(r4)
> +	subi	r4,r4,1
> +	stb	6,-1(r11)
> +	subi	r11,r11,1
> +
> +	.align	4
> +L(end_4bytes_alignment_bwd):
> +	cmpldi	cr1,r5,16
> +	mtocrf	0x01,r5
> +
> +L(copy_LT_32_aligned_bwd):
> +	/* At least 6 bytes to go, and SRC is word-aligned.  */
> +	blt	cr1,8f
> +
> +	/* Copy 16 bytes.  */
> +	lwz	r6,-4(r4)
> +	lwz	r7,-8(r4)
> +	stw	r6,-4(r11)
> +	lwz	r8,-12(r4)
> +	stw	r7,-8(r11)
> +	lwz	r6,-16(r4)
> +	subi	r4,r4,16
> +	stw	r8,-12(r11)
> +	stw	r6,-16(r11)
> +	subi	r11,r11,16
> +8:	/* Copy 8 bytes.  */
> +	bf	28,L(tail4_bwd)
> +	lwz	r6,-4(r4)
> +	lwz	r7,-8(r4)
> +	subi	r4,r4,8
> +	stw	r6,-4(r11)
> +	stw	r7,-8(r11)
> +	subi	r11,r11,8
> +
> +	.align	4
> +/* Copies 4~7 bytes.  */
> +L(tail4_bwd):
> +	bf	29,L(tail2_bwd)
> +	lwz	6,-4(r4)
> +	stw	6,-4(r11)
> +	bf	30,L(tail5_bwd)
> +	lhz	7,-6(r4)
> +	sth	7,-6(r11)
> +	bflr	31
> +	lbz	8,-7(r4)
> +	stb	8,-7(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +	.align	4
> +/* Copies 2~3 bytes.  */
> +L(tail2_bwd):
> +	bf	30,1f
> +	lhz	6,-2(r4)
> +	sth	6,-2(r11)
> +	bflr	31
> +	lbz	7,-3(r4)
> +	stb	7,-3(r11)
> +	blr
> +
> +	.align	4
> +L(tail5_bwd):
> +	bflr	31
> +	lbz	6,-5(r4)
> +	stb	6,-5(r11)
> +	blr
> +
> +	.align	4
> +1:
> +	bflr	31
> +	lbz	6,-1(r4)
> +	stb	6,-1(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +
> +
> +/* Handles copies of 0~8 bytes.  */
> +	.align	4
> +L(copy_LE_8_bwd):
> +	bne	cr6,L(tail4_bwd)
> +
> +	/* Though we could've used ld/std here, they are still
> +	   slow for unaligned cases.  */
> +	lwz	6,-8(r4)
> +	lwz	7,-4(r4)
> +	stw	6,-8(r11)
> +	stw	7,-4(r11)
> +	blr
> +
> +
> +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
> +   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
> +   the data, allowing for aligned DST stores.  */
> +	.align	4
> +L(copy_GE_32_unaligned_bwd):
> +	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes..  */
> +	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
> +
> +	beq	L(copy_GE_32_unaligned_cont_bwd)
> +
> +	/* DST is not quadword aligned and r10 holds the address masked to
> +           compare alignments.  */
> +	mtocrf	0x01,r10
> +	subf	r5,r10,r5
> +
> +	/* Vector instructions work best when proper alignment (16-bytes)
> +	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
> +1:
> +	bf	31,2f
> +	lbz	r6,-1(r4)
> +	subi	r4,r4,1
> +	stb	r6,-1(r11)
> +	subi	r11,r11,1
> +2:
> +	bf	30,4f
> +	lhz	r6,-2(r4)
> +	subi	r4,r4,2
> +	sth	r6,-2(r11)
> +	subi	r11,r11,2
> +4:
> +	bf	29,8f
> +	lwz	r6,-4(r4)
> +	subi	r4,r4,4
> +	stw	r6,-4(r11)
> +	subi	r11,r11,4
> +8:
> +	bf	28,0f
> +	ld	r6,-8(r4)
> +	subi	r4,r4,8
> +	std	r6,-8(r11)
> +	subi	r11,r11,8
> +0:
> +	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
> +
> +	/* The proper alignment is present, it is OK to copy the bytes now.  */
> +L(copy_GE_32_unaligned_cont_bwd):
> +
> +	/* Setup two indexes to speed up the indexed vector operations.  */
> +	clrldi	r10,r5,60
> +	li	r6,-16	      /* Index for 16-bytes offsets.  */
> +	li	r7,-32	      /* Index for 32-bytes offsets.  */
> +	cmpldi	cr1,10,0
> +	srdi	r8,r5,5	      /* Setup the loop counter.  */
> +	mtocrf	0x01,9
> +	cmpldi	cr6,r9,1
> +#ifdef __LITTLE_ENDIAN__
> +	lvsr	v5,r0,r4
> +#else
> +	lvsl	v5,r0,r4
> +#endif
> +	lvx	v3,0,r4
> +	li	r0,0
> +	bf	31,L(setup_unaligned_loop_bwd)
> +
> +	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
> +	lvx	v4,r4,r6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	v6,v3,v4,v5
> +#else
> +	vperm	v6,v4,v3,v5
> +#endif
> +	subi	r4,r4,16
> +	stvx	v6,r11,r6
> +	subi	r11,r11,16
> +	vor	v3,v4,v4
> +	clrrdi	r0,r4,60
> +
> +L(setup_unaligned_loop_bwd):
> +	mtctr	r8
> +	ble	cr6,L(end_unaligned_loop_bwd)
> +
> +	/* Copy 32 bytes at a time using vector instructions.  */
> +	.align	4
> +L(unaligned_loop_bwd):
> +
> +	/* Note: vr6/vr10 may contain data that was already copied,
> +	but in order to get proper alignment, we may have to copy
> +	some portions again. This is faster than having unaligned
> +	vector instructions though.  */
> +
> +	lvx	v4,r4,r6
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	v6,v3,v4,v5
> +#else
> +	vperm	v6,v4,v3,v5
> +#endif
> +	lvx	v3,r4,r7
> +#ifdef __LITTLE_ENDIAN__
> +	vperm	v10,v4,v3,v5
> +#else
> +	vperm	v10,v3,v4,v5
> +#endif
> +	subi	r4,r4,32
> +	stvx	v6,r11,r6
> +	stvx	v10,r11,r7
> +	subi	r11,r11,32
> +	bdnz	L(unaligned_loop_bwd)
> +
> +	clrrdi	r0,r4,60
> +
> +	.align	4
> +L(end_unaligned_loop_bwd):
> +
> +	/* Check for tail bytes.  */
> +	mtocrf	0x01,r5
> +	beqlr	cr1
> +
> +	add	r4,r4,0
> +
> +	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
> +	/* Copy 8 bytes.  */
> +	bf	28,4f
> +	lwz	r6,-4(r4)
> +	lwz	r7,-8(r4)
> +	subi	r4,r4,8
> +	stw	r6,-4(r11)
> +	stw	r7,-8(r11)
> +	subi	r11,r11,8
> +4:	/* Copy 4~7 bytes.  */
> +	bf	29,L(tail2_bwd)
> +	lwz	r6,-4(r4)
> +	stw	r6,-4(r11)
> +	bf	30,L(tail5_bwd)
> +	lhz	r7,-6(r4)
> +	sth	r7,-6(r11)
> +	bflr	31
> +	lbz	r8,-7(r4)
> +	stb	r8,-7(r11)
> +	/* Return original DST pointer.  */
> +	blr
> +END_GEN_TB (MEMMOVE, TB_TOCLESS)
> +libc_hidden_builtin_def (memmove)
> +
> +
> +/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
> +   Implemented in this file to avoid linker create a stub function call
> +   in the branch to '_memmove'.  */
> +ENTRY_TOCLESS (__bcopy)
> +	mr	r6,r3
> +	mr	r3,r4
> +	mr	r4,r6
> +	b	L(_memmove)
> +END (__bcopy)
> +#ifndef __bcopy
> +weak_alias (__bcopy, bcopy)
> +#endif
>
  
Tulio Magno Quites Machado Filho Oct. 19, 2017, 3:56 p.m. UTC | #3
Florian Weimer <fweimer@redhat.com> writes:

> On 10/19/2017 05:25 PM, Tulio Magno Quites Machado Filho wrote:
>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
>> index 05d46e2..4a4ee6e 100644
>> --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
>> +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
>> @@ -22,8 +22,12 @@
>>   extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
>>   /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
>>   extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
>> +/* __bcopy_power9 symbol is implemented at memmove-power9.S.  */
>> +extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
>>   
>>   libc_ifunc (bcopy,
>> -            (hwcap & PPC_FEATURE_HAS_VSX)
>> +	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
>> +	    ? __bcopy_power9
>> +	    : (hwcap & PPC_FEATURE_HAS_VSX)
>>               ? __bcopy_power7
>>               : __bcopy_ppc);
>> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>> index 6a88536..9040bbc 100644
>> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
>> @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>   #ifdef SHARED
>>     /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
>>     IFUNC_IMPL (i, name, memcpy,
>> +	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
>> +			      __memcpy_power9)
>>   	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
>>   			      __memcpy_power7)
>>   	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
>> @@ -65,6 +67,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>   
>>     /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
>>     IFUNC_IMPL (i, name, memmove,
>> +	      IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
>> +			      __memmove_power9)
>>   	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
>>   			      __memmove_power7)
>>   	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
>> @@ -168,6 +172,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>   
>>     /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
>>     IFUNC_IMPL (i, name, bcopy,
>> +	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
>> +			      __bcopy_power9)
>>   	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
>>   			      __bcopy_power7)
>>   	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
>
> I'm concerned that this needs *another* change to recognize post-DD2.1 
> POWER 9 hardware which has the fix (or perhaps OpenPOWER implementations 
> which never had the bug).
>
> Is there any other way you could select the workaround that is more 
> specific?

In userspace, fast, and simple to use in an IFUNC resolver?  No.
AFAIK, it would require parsing /proc/cpuinfo.
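
A hypothetical sketch of what such parsing could look like (the
"revision : X.Y (pvr ...)" line format and the DD >= 2.2 threshold are
assumptions drawn from the commit message; stdio-based parsing like this is
not viable inside an IFUNC resolver, which runs before the C runtime is
fully set up):

#include <stdio.h>
#include <string.h>

/* Hypothetical: return nonzero if /proc/cpuinfo reports a POWER9 whose
   DD level is at least 2.2, i.e. newer than the affected DD2.1 parts.  */
static int
power9_dd_ge_2_2 (void)
{
  FILE *f = fopen ("/proc/cpuinfo", "r");
  char line[256];
  int major = 0, minor = 0, is_p9 = 0;

  if (f == NULL)
    return 0;
  while (fgets (line, sizeof line, f) != NULL)
    {
      if (strncmp (line, "cpu", 3) == 0 && strstr (line, "POWER9") != NULL)
	is_p9 = 1;
      else if (strncmp (line, "revision", 8) == 0)
	sscanf (line, "revision : %d.%d", &major, &minor);
    }
  fclose (f);
  return is_p9 && (major > 2 || (major == 2 && minor >= 2));
}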

> Or is the performance hit from avoiding the affected vector 
> instructions not so severe that it would be an immediate concern for 
> future silicon?

The instructions Raji chose to use here do not impact performance on
current silicon and are not expected to impact future silicon either.
  
Florian Weimer Oct. 19, 2017, 5:12 p.m. UTC | #4
On 10/19/2017 05:56 PM, Tulio Magno Quites Machado Filho wrote:
>> Or is the performance hit from avoiding the affected vector
>> instructions not so severe that it would be an immediate concern for
>> future silicon?
> The instructions Raji chose to use here do not impact the performance on
> current silicon and is not expected to impact future silicon as well.

Okay, this addresses my concern.  Unfortunately, I don't feel confident
enough to review the new implementation myself.

Thanks,
Florian
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index dea49ac..82728fa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -1,6 +1,6 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
-		   memcpy-power4 memcpy-ppc64 \
+sysdep_routines += memcpy-power9 memcpy-power7 memcpy-a2 memcpy-power6 \
+		   memcpy-cell memcpy-power4 memcpy-ppc64 \
 		   memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \
 		   memset-power7 memset-power6 memset-power4 \
 		   memset-ppc64 memset-power8 \
@@ -24,7 +24,8 @@  sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
 		   strcmp-power9 strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 \
-		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
+		   memmove-power9 memmove-power7 memmove-ppc64 \
+		   wordcopy-ppc64 bcopy-ppc64 \
 		   strncpy-power8 strstr-power7 strstr-ppc64 \
 		   strspn-power8 strspn-ppc64 strcspn-power8 strcspn-ppc64 \
 		   strlen-power8 strcasestr-power8 strcasestr-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 05d46e2..4a4ee6e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -22,8 +22,12 @@ 
 extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
 /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
 extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+/* __bcopy_power9 symbol is implemented at memmove-power9.S.  */
+extern __typeof (bcopy) __bcopy_power9 attribute_hidden;
 
 libc_ifunc (bcopy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
+	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	    ? __bcopy_power9
+	    : (hwcap & PPC_FEATURE_HAS_VSX)
             ? __bcopy_power7
             : __bcopy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6a88536..9040bbc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -51,6 +51,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
   IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __memcpy_power9)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
@@ -65,6 +67,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
   IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __memmove_power9)
 	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memmove_power7)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -168,6 +172,8 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
   IFUNC_IMPL (i, name, bcopy,
+	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __bcopy_power9)
 	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bcopy_power7)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
new file mode 100644
index 0000000..fbd0788
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power9.S
@@ -0,0 +1,26 @@ 
+/* Optimized memcpy implementation for PowerPC/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MEMCPY __memcpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power9/memcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
index 9f4286c..4c16fa0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -35,8 +35,11 @@  extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_power9 attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
+	   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	   ? __memcpy_power9 :
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __memcpy_power7 :
 	      (hwcap & PPC_FEATURE_ARCH_2_06)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
index a9435fa..0599a39 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -23,7 +23,7 @@ 
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#undef bcopy
-#define bcopy __bcopy_power7
+#undef __bcopy
+#define __bcopy __bcopy_power7
 
 #include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
new file mode 100644
index 0000000..16a2267
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power9.S
@@ -0,0 +1,29 @@ 
+/* Optimized memmove implementation for PowerPC64/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MEMMOVE __memmove_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bcopy
+#define __bcopy __bcopy_power9
+
+#include <sysdeps/powerpc/powerpc64/power9/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index db2bbc7..f02498e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -31,9 +31,12 @@  extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_power9 attribute_hidden;
 
 libc_ifunc (__libc_memmove,
-            (hwcap & PPC_FEATURE_HAS_VSX)
+	    (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	    ? __memmove_power9
+	    : (hwcap & PPC_FEATURE_HAS_VSX)
             ? __memmove_power7
             : __memmove_ppc);
 
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
index 93baa69..0bb8ddc 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -832,4 +832,6 @@  ENTRY_TOCLESS (__bcopy)
 	mr	r4,r6
 	b	L(_memmove)
 END (__bcopy)
+#ifndef __bcopy
 weak_alias (__bcopy, bcopy)
+#endif
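
For clarity (illustration only, not part of the patch): the #ifndef above is
effective because each multiarch wrapper renames __bcopy with the preprocessor
before including the generic implementation, so only the default, non-renamed
build emits the public weak alias:

    /* multiarch wrapper (e.g. memmove-power9.S)  */
    #define __bcopy __bcopy_power9
    #include <sysdeps/powerpc/powerpc64/power9/memmove.S>

    /* generic implementation (power7/memmove.S or power9/memmove.S)  */
    #ifndef __bcopy                /* true only for the default build  */
    weak_alias (__bcopy, bcopy)
    #endif
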
diff --git a/sysdeps/powerpc/powerpc64/power9/memcpy.S b/sysdeps/powerpc/powerpc64/power9/memcpy.S
new file mode 100644
index 0000000..0731bac
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memcpy.S
@@ -0,0 +1,429 @@ 
+/* Optimized memcpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
+   Returns 'dst'.  */
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
+#define src 4
+#define cnt 5
+
+	.machine power7
+ENTRY_TOCLESS (MEMCPY, 5)
+	CALL_MCOUNT 3
+
+	cmpldi	cr1,cnt,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				    code.  */
+
+/* Quadword-align copies that use vector instructions, in order to avoid
+   alignment traps when memcpy is used on non-cacheable memory (for
+   instance, memory-mapped I/O).  */
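+/* Note that lvx/stvx ignore the low four bits of the effective address,
+   so every vector access below is 16-byte aligned by construction.  */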
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	dst,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+	subf	cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt
+	srdi	12,cnt,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lvx	6,0,src
+	lvx	7,src,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* For the 2nd and subsequent iterations of this loop.  */
+	lvx	6,0,src
+	lvx	7,src,6
+L(aligned_128loop):
+	lvx	8,src,7
+	lvx	9,src,8
+	stvx	6,0,dst
+	addi	src,src,64
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	lvx	6,0,src
+	lvx	7,src,6
+	addi	dst,dst,64
+	lvx	8,src,7
+	lvx	9,src,8
+	addi	src,src,64
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	addi	dst,dst,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,cnt
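+	/* The low bits of cnt were transferred into the CR (mtocrf 0x02
+	   above and 0x01 here), so each bf below tests one bit of the
+	   remaining length: bit 25 selects a 64-byte copy, bit 26 a 32-byte
+	   copy, bit 27 a 16-byte copy, bit 28 an 8-byte copy, and bits
+	   29-31 the 4/2/1-byte tail.  */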
+	bf	25,32f
+	lvx	6,0,src
+	lvx	7,src,6
+	lvx	8,src,7
+	lvx	9,src,8
+	addi	src,src,64
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f
+	lvx	6,0,src
+	lvx	7,src,6
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f
+	lvx	6,0,src
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f
+	ld	6,0(src)
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,0f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+0:
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src
+#else
+	lvsl	5,0,src
+#endif
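+	/* vr5 is a permute control derived from the misalignment of SRC;
+	   each vperm below merges two aligned 16-byte loads into the value
+	   an unaligned 16-byte load would have produced.  */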
+	lvx	3,0,src
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
+	vor	3,4,4
+	clrrdi	0,src,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,src,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,src,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,cnt
+	beqlr	cr1
+
+	add	src,src,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/powerpc/powerpc64/power9/memmove.S b/sysdeps/powerpc/powerpc64/power9/memmove.S
new file mode 100644
index 0000000..9ed8f77
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power9/memmove.S
@@ -0,0 +1,837 @@ 
+/* Optimized memmove implementation for PowerPC64/POWER9.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This optimization checks whether 'dest' overlaps with 'src'.  If it does
+   not, an optimized forward copy is used (similar to the POWER7 memcpy,
+   embedded here to gain some cycles).
+   If source and destination overlap, an optimized backwards copy is used
+   instead.  */
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+	.machine power7
+ENTRY_TOCLESS (MEMMOVE, 5)
+	CALL_MCOUNT 3
+
+L(_memmove):
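+	/* Overlap test: r9 = dest - src as an unsigned value.  A forward
+	   copy is safe whenever r9 >= len, i.e. whenever dest does not lie
+	   inside [src, src + len); otherwise branch to the backward copy.  */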
+	subf    r9,r4,r3
+	cmpld   cr7,r9,r5
+	blt	cr7,L(memmove_bwd)
+
+	cmpldi	cr1,r5,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				       code.  */
+
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	r11,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,16f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,r5
+	srdi	12,r5,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lvx	6,0,r4
+	lvx	7,r4,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* For the 2nd and subsequent iterations of this loop.  */
+	lvx	6,0,r4
+	lvx	7,r4,6
+L(aligned_128loop):
+	lvx	8,r4,7
+	lvx	9,r4,8
+	stvx	6,0,r11
+	addi	r4,r4,64
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	lvx	6,0,r4
+	lvx	7,r4,6
+	addi	r11,r11,64
+	lvx	8,r4,7
+	lvx	9,r4,8
+	addi	r4,r4,64
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	addi	r11,r11,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lvx	6,0,r4
+	lvx	7,r4,6
+	lvx	8,r4,7
+	lvx	9,r4,8
+	addi	r4,r4,64
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	addi	r11,r11,64
+32:
+	bf	26,16f
+	lvx	6,0,r4
+	lvx	7,r4,6
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	7,r11,6
+	addi	r11,r11,32
+16:
+	bf	27,8f
+	lvx	6,0,r4
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+8:
+	bf	28,4f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std     6,0(r11)
+	addi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw     6,0(r11)
+	bf      30,L(tail5)
+	lhz     7,4(r4)
+	sth     7,4(r11)
+	bflr	31
+	lbz     8,6(r4)
+	stb     8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	r11,3
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	lwz	8,8(r4)
+	stw	7,4(r11)
+	lwz	6,12(r4)
+	addi	r4,r4,16
+	stw	8,8(r11)
+	stw	6,12(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(r4)
+	sth	6,0(r11)
+	bflr	31
+	lbz	7,2(r4)
+	stb	7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(r4)
+	stb	6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(r4)
+	stb	6,0(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	stw	7,4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst (r11) quadword.  */
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	r5,0,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,0f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+0:
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,r5,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,r4
+#else
+	lvsl	5,0,r4
+#endif
+	lvx	3,0,r4
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+	vor	3,4,4
+	clrrdi	0,r4,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,r4,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	10,r11,6
+	addi	r11,r11,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,r4,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Start of the backward memcpy implementation: the algorithm first
+	   checks whether src and dest have the same alignment; if they do, it
+	   aligns both to 16 bytes and copies using vector instructions.
+	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
+	   instructions to read two 16-byte chunks at a time, shift/permute
+	   the bytes read, and write them aligned to dest.  */
+L(memmove_bwd):
+	cmpldi	cr1,r5,31
+	/* Copy is done backwards: update the pointers and check alignment.  */
+	add	r11,r3,r5
+	add	r4,r4,r5
+	mr	r0,r11
+	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
+				           code.  */
+
+	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes.  */
+	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes.  */
+	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */
+
+	bne     cr6,L(copy_GE_32_unaligned_bwd)
+	beq     L(aligned_copy_bwd)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,16f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+	li	r6,-16
+	li	r7,-32
+	li	r8,-48
+	li	r9,-64
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail_bwd)
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	mtctr	r12
+	b	L(aligned_128loop_bwd)
+
+	.align  4
+L(aligned_128head_bwd):
+	/* For the 2nd and subsequent iterations of this loop.  */
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+L(aligned_128loop_bwd):
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	stvx	v6,r11,r6
+	subi	r4,r4,64
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	subi	r11,r11,64
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	subi	r4,r4,64
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	subi	r11,r11,64
+	bdnz	L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	subi	r4,r4,64
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	subi	r11,r11,64
+32:
+	bf	26,16f
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	subi	r11,r11,32
+16:
+	bf	27,8f
+	lvx	v6,r4,r6
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+8:
+	bf	28,4f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std     r6,-8(r11)
+	subi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw     r6,-4(r11)
+	bf      30,L(tail5_bwd)
+	lhz     r7,-6(r4)
+	sth     r7,-6(r11)
+	bflr	31
+	lbz     r8,-7(r4)
+	stb     r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32_bwd):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8_bwd)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned_bwd)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment_bwd)
+	lbz	6,-1(r4)
+	subi	r4,r4,1
+	stb	6,-1(r11)
+	subi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment_bwd):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	stw	r6,-4(r11)
+	lwz	r8,-12(r4)
+	stw	r7,-8(r11)
+	lwz	r6,-16(r4)
+	subi	r4,r4,16
+	stw	r8,-12(r11)
+	stw	r6,-16(r11)
+	subi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4_bwd)
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4_bwd):
+	bf	29,L(tail2_bwd)
+	lwz	6,-4(r4)
+	stw	6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	7,-6(r4)
+	sth	7,-6(r11)
+	bflr	31
+	lbz	8,-7(r4)
+	stb	8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2_bwd):
+	bf	30,1f
+	lhz	6,-2(r4)
+	sth	6,-2(r11)
+	bflr	31
+	lbz	7,-3(r4)
+	stb	7,-3(r11)
+	blr
+
+	.align	4
+L(tail5_bwd):
+	bflr	31
+	lbz	6,-5(r4)
+	stb	6,-5(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,-1(r4)
+	stb	6,-1(r11)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8_bwd):
+	bne	cr6,L(tail4_bwd)
+
+	/* Though we could've used ld/std here, they are still
+	   slow for unaligned cases.  */
+	lwz	6,-8(r4)
+	lwz	7,-4(r4)
+	stw	6,-8(r11)
+	stw	7,-4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15      /* Check if DST is aligned to 16 bytes.  */
+	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned; r10 holds its low-order bits, i.e. the
+	   number of bytes to copy backwards until DST is 16-byte aligned.  */
+	mtocrf	0x01,r10
+	subf	r5,r10,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,0f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+0:
+	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	r10,r5,60
+	li	r6,-16	      /* Index for 16-bytes offsets.  */
+	li	r7,-32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,r10,0
+	srdi	r8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,r9
+	cmpldi	cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v5,r0,r4
+#else
+	lvsl	v5,r0,r4
+#endif
+	lvx	v3,0,r4
+	li	r0,0
+	bf	31,L(setup_unaligned_loop_bwd)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+	vor	v3,v4,v4
+	clrrdi	r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+	mtctr	r8
+	ble	cr6,L(end_unaligned_loop_bwd)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop_bwd):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again. This is faster than having unaligned
+	vector instructions though.  */
+
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	lvx	v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+	vperm	v10,v4,v3,v5
+#else
+	vperm	v10,v3,v4,v5
+#endif
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v10,r11,r7
+	subi	r11,r11,32
+	bdnz	L(unaligned_loop_bwd)
+
+	clrrdi	r0,r4,60
+
+	.align	4
+L(end_unaligned_loop_bwd):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw	r6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	r7,-6(r4)
+	sth	r7,-6(r11)
+	bflr	31
+	lbz	r8,-7(r4)
+	stb	r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to avoid the linker creating a stub function
+   call for the branch to '_memmove'.  */
+ENTRY_TOCLESS (__bcopy)
+	mr	r6,r3
+	mr	r3,r4
+	mr	r4,r6
+	b	L(_memmove)
+END (__bcopy)
+#ifndef __bcopy
+weak_alias (__bcopy, bcopy)
+#endif