[4/4] S390: Implement mempcpy with help of memcpy. [BZ #19765]

Message ID 1461672469-2107-4-git-send-email-stli@linux.vnet.ibm.com
State Superseded
Headers

Commit Message

Stefan Liebler April 26, 2016, 12:07 p.m. UTC
  There exist optimized memcpy functions on s390, but no optimized mempcpy.
This patch adds mempcpy entry points in memcpy.S files, which
use the memcpy implementation. Now mempcpy itself is also an IFUNC function
as memcpy is and the variants are listed in ifunc-impl-list.c.

The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
function, which uses memcpy and returns dest + n. In this constant case,
GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
If n is not constant, then __mempcpy is called.

ChangeLog:

	[BZ #19765]
	* sysdeps/s390/bits/string.h (mempcpy) Redirect to
	__mempcpy_inline_memcpy or __mempcpy_inline_mempcpy.
	(__mempcpy) Likewise.
	(__mempcpy_inline_mempcpy): New inline function.
	(__mempcpy_inline_memcpy): Likewise.
	(_HAVE_STRING_ARCH_mempcpy): New define.
	* sysdeps/s390/mempcpy.S: New File.
	* sysdeps/s390/multiarch/mempcpy.c: Likewise.
	* sysdeps/s390/multiarch/Makefile (sysdep_routines): Add mempcpy.
	* sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
	Add mempcpy variants.
	* sysdeps/s390/s390-32/memcpy.S: Add mempcpy entry point.
	(memcpy): Adjust to be usable from mempcpy entry point.
	(__memcpy_mvcle): Likewise.
	* sysdeps/s390/s390-64/memcpy.S: Likewise.
	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add entry points
	____mempcpy_z196, ____mempcpy_z10 and add __GI_ symbols for mempcpy.
	(__memcpy_z196): Adjust to be usable from mempcpy entry point.
	(__memcpy_z10): Likewise.
	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
---
 sysdeps/s390/bits/string.h                    | 36 +++++++++++++++++++
 sysdeps/s390/mempcpy.S                        | 19 ++++++++++
 sysdeps/s390/multiarch/Makefile               |  3 +-
 sysdeps/s390/multiarch/ifunc-impl-list.c      |  7 ++++
 sysdeps/s390/multiarch/mempcpy.c              | 26 ++++++++++++++
 sysdeps/s390/s390-32/memcpy.S                 | 50 ++++++++++++++++-----------
 sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 31 +++++++++++++++--
 sysdeps/s390/s390-64/memcpy.S                 | 47 ++++++++++++++-----------
 sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 29 ++++++++++++++--
 9 files changed, 203 insertions(+), 45 deletions(-)
 create mode 100644 sysdeps/s390/mempcpy.S
 create mode 100644 sysdeps/s390/multiarch/mempcpy.c
  

Comments

Adhemerval Zanella Netto April 26, 2016, 1:32 p.m. UTC | #1
On 26/04/2016 09:07, Stefan Liebler wrote:
> There exist optimized memcpy functions on s390, but no optimized mempcpy.
> This patch adds mempcpy entry points in memcpy.S files, which
> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
> as memcpy is and the variants are listed in ifunc-impl-list.c.
> 
> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
> function, which uses memcpy and returns dest + n. In this constant case,
> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
> If n is not constant, then __mempcpy is called.
> 

AFAIk GLIBC does not have any ports that implements mempcpy and not
memcpy and IMHO the inline version to convert mempcpy to memcpy is 
always a gain.

So I would suggest to just avoid adding arch-specific bits and move the
mempcpy inline optimization at string/string.h to string/string2.h as

--

#define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
#define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)

__extern_always_inline void *
__mempcpy_inline (void *__restrict __dest,
                  const void *__restrict __src, size_t __n)
{
  return (char *) memcpy (__dest, __src, __n) + __n;
}

--

This is convert any mempcpy to mempcy plus length and provide
the symbol only for older binaries.

> ChangeLog:
> 
> 	[BZ #19765]
> 	* sysdeps/s390/bits/string.h (mempcpy) Redirect to
> 	__mempcpy_inline_memcpy or __mempcpy_inline_mempcpy.
> 	(__mempcpy) Likewise.
> 	(__mempcpy_inline_mempcpy): New inline function.
> 	(__mempcpy_inline_memcpy): Likewise.
> 	(_HAVE_STRING_ARCH_mempcpy): New define.
> 	* sysdeps/s390/mempcpy.S: New File.
> 	* sysdeps/s390/multiarch/mempcpy.c: Likewise.
> 	* sysdeps/s390/multiarch/Makefile (sysdep_routines): Add mempcpy.
> 	* sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> 	Add mempcpy variants.
> 	* sysdeps/s390/s390-32/memcpy.S: Add mempcpy entry point.
> 	(memcpy): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_mvcle): Likewise.
> 	* sysdeps/s390/s390-64/memcpy.S: Likewise.
> 	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add entry points
> 	____mempcpy_z196, ____mempcpy_z10 and add __GI_ symbols for mempcpy.
> 	(__memcpy_z196): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_z10): Likewise.
> 	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
> ---
>  sysdeps/s390/bits/string.h                    | 36 +++++++++++++++++++
>  sysdeps/s390/mempcpy.S                        | 19 ++++++++++
>  sysdeps/s390/multiarch/Makefile               |  3 +-
>  sysdeps/s390/multiarch/ifunc-impl-list.c      |  7 ++++
>  sysdeps/s390/multiarch/mempcpy.c              | 26 ++++++++++++++
>  sysdeps/s390/s390-32/memcpy.S                 | 50 ++++++++++++++++-----------
>  sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 31 +++++++++++++++--
>  sysdeps/s390/s390-64/memcpy.S                 | 47 ++++++++++++++-----------
>  sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 29 ++++++++++++++--
>  9 files changed, 203 insertions(+), 45 deletions(-)
>  create mode 100644 sysdeps/s390/mempcpy.S
>  create mode 100644 sysdeps/s390/multiarch/mempcpy.c
> 
> diff --git a/sysdeps/s390/bits/string.h b/sysdeps/s390/bits/string.h
> index 39e0b7f..eae3c8e 100644
> --- a/sysdeps/s390/bits/string.h
> +++ b/sysdeps/s390/bits/string.h
> @@ -24,6 +24,42 @@
>  /* Use the unaligned string inline ABI.  */
>  #define _STRING_INLINE_unaligned 1
>  
> +#if defined __USE_GNU && defined __OPTIMIZE__ \
> +    && defined __extern_always_inline && __GNUC_PREREQ (3,2)
> +# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy
> +
> +/* Don't inline mempcpy into memcpy as s390 has an optimized mempcpy.  */
> +#  define _HAVE_STRING_ARCH_mempcpy 1
> +
> +__extern_always_inline void *
> +__mempcpy_inline_memcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return (char *) memcpy (__dest, __src, __n) + __n;
> +}
> +
> +__extern_always_inline void *
> +__mempcpy_inline_mempcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return __mempcpy (__dest, __src, __n);
> +}
> +
> +/* If n is constant and small enough, __mempcpy/mempcpy is an inlined function,
> +   which uses memcpy and returns dest + n. In this constant case, GCC emits
> +   instructions like mvi or mvc and avoids the function call to memcpy.
> +   If n is not constant, then __mempcpy is called.  */
> +#  undef mempcpy
> +#  undef __mempcpy
> +#  define __mempcpy(dest, src, n)					\
> +  (__extension__ (__builtin_constant_p (n) && n <= 65536		\
> +		  ? __mempcpy_inline_memcpy (dest, src, n)		\
> +		  : __mempcpy_inline_mempcpy (dest, src, n)))
> +#  define mempcpy(dest, src, n) __mempcpy (dest, src, n)
> +
> +# endif
> +#endif
> +
>  /* We only provide optimizations if the user selects them and if
>     GNU CC is used.  */
>  #if !defined __NO_STRING_INLINES && defined __USE_STRING_INLINES \
> diff --git a/sysdeps/s390/mempcpy.S b/sysdeps/s390/mempcpy.S
> new file mode 100644
> index 0000000..c62074b
> --- /dev/null
> +++ b/sysdeps/s390/mempcpy.S
> @@ -0,0 +1,19 @@
> +/* CPU specific mempcpy without multiarch - 32/64 bit S/390 version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* mempcpy is implemented in memcpy.S.  */
> diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
> index 0805b07..89324ca 100644
> --- a/sysdeps/s390/multiarch/Makefile
> +++ b/sysdeps/s390/multiarch/Makefile
> @@ -18,7 +18,8 @@ sysdep_routines += strlen strlen-vx strlen-c \
>  		   memchr memchr-vx \
>  		   rawmemchr rawmemchr-vx rawmemchr-c \
>  		   memccpy memccpy-vx memccpy-c \
> -		   memrchr memrchr-vx memrchr-c
> +		   memrchr memrchr-vx memrchr-c \
> +		   mempcpy
>  endif
>  
>  ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
> index 62a4359..89898ec 100644
> --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
> @@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  			      S390_IS_Z10 (stfle_bits), __memcpy_z10)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_default))
>  
> +  IFUNC_IMPL (i, name, mempcpy,
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z196 (stfle_bits), ____mempcpy_z196)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z10 (stfle_bits), ____mempcpy_z10)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, ____mempcpy_default))
> +
>  #endif /* SHARED */
>  
>  #ifdef HAVE_S390_VX_ASM_SUPPORT
> diff --git a/sysdeps/s390/multiarch/mempcpy.c b/sysdeps/s390/multiarch/mempcpy.c
> new file mode 100644
> index 0000000..34d8329
> --- /dev/null
> +++ b/sysdeps/s390/multiarch/mempcpy.c
> @@ -0,0 +1,26 @@
> +/* Multiple versions of mempcpy.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +#if defined SHARED && IS_IN (libc)
> +# include <ifunc-resolve.h>
> +s390_libc_ifunc (__mempcpy)
> +
> +__asm__ (".weak mempcpy\n\t"
> +	 ".set mempcpy,__mempcpy\n\t");
> +#endif
> diff --git a/sysdeps/s390/s390-32/memcpy.S b/sysdeps/s390/s390-32/memcpy.S
> index 2ac51ab..6be5104 100644
> --- a/sysdeps/s390/s390-32/memcpy.S
> +++ b/sysdeps/s390/s390-32/memcpy.S
> @@ -25,12 +25,23 @@
>       %r3 = address of source memory area
>       %r4 = number of bytes to copy.  */
>  
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
> -ENTRY(memcpy)
> +       .text
> +ENTRY(__mempcpy)
> +	.machine "g5"
> +	lr      %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_G5_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
>  #endif
> +
> +ENTRY(memcpy)
>  	.machine "g5"
> +	lr      %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_G5_start:
>  	st      %r13,52(%r15)
>  	.cfi_offset 13, -44
>  	basr    %r13,0
> @@ -41,14 +52,13 @@ ENTRY(memcpy)
>  	lr      %r5,%r4
>  	srl     %r5,8
>  	ltr     %r5,%r5
> -	lr      %r1,%r2
>  	jne     .L_G5_13
>  	ex      %r4,.L_G5_17-.L_G5_16(%r13)
>  .L_G5_4:
>  	l       %r13,52(%r15)
>  	br      %r14
>  .L_G5_13:
> -	chi	%r5,4096             # Switch to mvcle for copies >1MB
> +	chi	%r5,4096            # Switch to mvcle for copies >1MB
>  	jh	__memcpy_mvcle
>  .L_G5_12:
>  	mvc     0(256,%r1),0(%r3)
> @@ -59,24 +69,24 @@ ENTRY(memcpy)
>  	j       .L_G5_4
>  .L_G5_17:
>  	mvc     0(1,%r1),0(%r3)
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>  END(memcpy)
> +#ifndef USE_MULTIARCH
>  libc_hidden_builtin_def (memcpy)
>  #endif
>  
>  ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       ahi     %r4,1               # length + 1
> -       lr      %r5,%r4             # source length
> -       lr      %r4,%r3             # source address
> -       lr      %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lr      %r0,%r2             # backup return dest [ + n ]
> +	ahi     %r4,1               # length + 1
> +	lr      %r5,%r4             # source length
> +	lr      %r4,%r3             # source address
> +	lr      %r2,%r1             # destination address
> +	lr      %r3,%r5             # destination length = source length
>  .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lr      %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lr      %r2,%r0             # return destination address
> +	br      %r14
>  END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> index 92ffaea..297a894 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> @@ -29,14 +29,23 @@
>  
>  #if defined SHARED && IS_IN (libc)
>  
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>  ENTRY(__memcpy_z196)
>  	.machine "z196"
>  	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>  	llgfr   %r4,%r4
>  	ltgr    %r4,%r4
>  	je      .L_Z196_4
>  	aghi    %r4,-1
> -	lr      %r1,%r2
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
>  	jne     .L_Z196_5
> @@ -60,13 +69,22 @@ ENTRY(__memcpy_z196)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z196)
>  
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>  ENTRY(__memcpy_z10)
>  	.machine "z10"
>  	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>  	llgfr   %r4,%r4
>  	cgije   %r4,0,.L_Z10_4
>  	aghi    %r4,-1
> -	lr      %r1,%r2
>  	srlg    %r5,%r4,8
>  	cgijlh  %r5,0,.L_Z10_13
>  .L_Z10_3:
> @@ -88,14 +106,23 @@ ENTRY(__memcpy_z10)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z10)
>  
> +# define __mempcpy ____mempcpy_default
>  #endif /* SHARED && IS_IN (libc) */
>  
> +#define memcpy __memcpy_default
>  #include "../memcpy.S"
> +#undef memcpy
>  
>  #if defined SHARED && IS_IN (libc)
>  .globl   __GI_memcpy
>  .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>  #else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>  #endif
> diff --git a/sysdeps/s390/s390-64/memcpy.S b/sysdeps/s390/s390-64/memcpy.S
> index 9d60a14..e3ace0c 100644
> --- a/sysdeps/s390/s390-64/memcpy.S
> +++ b/sysdeps/s390/s390-64/memcpy.S
> @@ -27,19 +27,27 @@
>  
>  
>         .text
> +ENTRY(__mempcpy)
> +	.machine "z900"
> +	lgr     %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_Z900_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
> +#endif
>  
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
>  ENTRY(memcpy)
> -#endif
>  	.machine "z900"
> +	lgr     %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_Z900_start:
>  	ltgr    %r4,%r4
>  	je      .L_Z900_4
>  	aghi    %r4,-1
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
> -	lgr     %r1,%r2
>  	jne     .L_Z900_13
>  .L_Z900_3:
>  	larl    %r5,.L_Z900_15
> @@ -57,25 +65,24 @@ ENTRY(memcpy)
>  	j       .L_Z900_3
>  .L_Z900_15:
>  	mvc     0(1,%r1),0(%r3)
> -
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>  END(memcpy)
> +#ifndef USE_MULTIARCH
>  libc_hidden_builtin_def (memcpy)
>  #endif
>  
>  ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       aghi    %r4,1               # length + 1
> -       lgr     %r5,%r4             # source length
> -       lgr     %r4,%r3             # source address
> -       lgr     %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lgr     %r0,%r2             # backup return dest [ + n ]
> +	aghi    %r4,1               # length + 1
> +	lgr     %r5,%r4             # source length
> +	lgr     %r4,%r3             # source address
> +	lgr     %r2,%r1             # destination address
> +	lgr     %r3,%r5             # destination length = source length
>  .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lgr     %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lgr     %r2,%r0             # return destination address
> +	br      %r14
>  END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> index 8f54526..0f5a36e 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> @@ -29,12 +29,20 @@
>  
>  #if defined SHARED && IS_IN (libc)
>  
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>  ENTRY(__memcpy_z196)
>  	.machine "z196"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>  	ltgr    %r4,%r4
>  	je      .L_Z196_4
>  	aghi    %r4,-1
> -	lgr     %r1,%r2
>  	srlg    %r5,%r4,8
>  	ltgr    %r5,%r5
>  	jne     .L_Z196_5
> @@ -58,11 +66,19 @@ ENTRY(__memcpy_z196)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z196)
>  
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>  ENTRY(__memcpy_z10)
>  	.machine "z10"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>  	cgije   %r4,0,.L_Z10_4
>  	aghi    %r4,-1
> -	lgr     %r1,%r2
>  	srlg    %r5,%r4,8
>  	cgijlh  %r5,0,.L_Z10_13
>  .L_Z10_3:
> @@ -84,14 +100,23 @@ ENTRY(__memcpy_z10)
>  	mvc     0(1,%r1),0(%r3)
>  END(__memcpy_z10)
>  
> +# define __mempcpy ____mempcpy_default
>  #endif /* SHARED && IS_IN (libc) */
>  
> +#define memcpy __memcpy_default
>  #include "../memcpy.S"
> +#undef memcpy
>  
>  #if defined SHARED && IS_IN (libc)
>  .globl   __GI_memcpy
>  .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>  #else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>  #endif
>
  
Stefan Liebler April 27, 2016, 8:14 a.m. UTC | #2
On 04/26/2016 03:32 PM, Adhemerval Zanella wrote:
>
>
> On 26/04/2016 09:07, Stefan Liebler wrote:
>> There exist optimized memcpy functions on s390, but no optimized mempcpy.
>> This patch adds mempcpy entry points in memcpy.S files, which
>> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
>> as memcpy is and the variants are listed in ifunc-impl-list.c.
>>
>> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
>> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
>> function, which uses memcpy and returns dest + n. In this constant case,
>> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
>> If n is not constant, then __mempcpy is called.
>>
>
> AFAIk GLIBC does not have any ports that implements mempcpy and not
> memcpy and IMHO the inline version to convert mempcpy to memcpy is
> always a gain.
>
> So I would suggest to just avoid adding arch-specific bits and move the
> mempcpy inline optimization at string/string.h to string/string2.h as
>
> --
>
> #define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
> #define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
>
> __extern_always_inline void *
> __mempcpy_inline (void *__restrict __dest,
>                    const void *__restrict __src, size_t __n)
> {
>    return (char *) memcpy (__dest, __src, __n) + __n;
> }
>
> --
>
> This is convert any mempcpy to mempcy plus length and provide
> the symbol only for older binaries.
>

According to the current behaviour on s390, a call to memcpy with 
constant byte-length below 65536 is done by emitting mvc, mvi or
a suitable instruction sequence.
If gcc don't know about a constant byte-length, then memcpy function
in glibc is called.

For mempcpy cascades like in iconv/gconv_conf.c:
466:  __mempcpy (__mempcpy (__mempcpy (gconv_path, __gconv_path_envvar,
					   user_len),
				":", 1),
		     default_gconv_path, sizeof (default_gconv_path));

The current behaviour is to perform a call to memcpy(), add user_len to
the return value of the call and use mvi, mvc for the two calls with 
constant byte-lengths.

If only _HAVE_STRING_ARCH_mempcpy is defined, there would be three calls
to mempcpy. Here you are right, this is not what you want.

The behaviour with this patch is to call mempcpy() once and use mvi, mvc
directly without the need to add user_len after the mempcpy call.
This way, there is no need to store user_len in an extra register or on 
stack.

I don't know the conditions on other archs, whether memcpy is inlined or 
called, thus I added the s390-specific condition to s390-string.h.
  
Stefan Liebler May 4, 2016, 1:15 p.m. UTC | #3
ping.

Any objection to the mempcpy macro in sysdeps/s390/bits/string.h?

On 04/26/2016 02:07 PM, Stefan Liebler wrote:
> There exist optimized memcpy functions on s390, but no optimized mempcpy.
> This patch adds mempcpy entry points in memcpy.S files, which
> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
> as memcpy is and the variants are listed in ifunc-impl-list.c.
>
> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
> function, which uses memcpy and returns dest + n. In this constant case,
> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
> If n is not constant, then __mempcpy is called.
>
> ChangeLog:
>
> 	[BZ #19765]
> 	* sysdeps/s390/bits/string.h (mempcpy) Redirect to
> 	__mempcpy_inline_memcpy or __mempcpy_inline_mempcpy.
> 	(__mempcpy) Likewise.
> 	(__mempcpy_inline_mempcpy): New inline function.
> 	(__mempcpy_inline_memcpy): Likewise.
> 	(_HAVE_STRING_ARCH_mempcpy): New define.
> 	* sysdeps/s390/mempcpy.S: New File.
> 	* sysdeps/s390/multiarch/mempcpy.c: Likewise.
> 	* sysdeps/s390/multiarch/Makefile (sysdep_routines): Add mempcpy.
> 	* sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> 	Add mempcpy variants.
> 	* sysdeps/s390/s390-32/memcpy.S: Add mempcpy entry point.
> 	(memcpy): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_mvcle): Likewise.
> 	* sysdeps/s390/s390-64/memcpy.S: Likewise.
> 	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add entry points
> 	____mempcpy_z196, ____mempcpy_z10 and add __GI_ symbols for mempcpy.
> 	(__memcpy_z196): Adjust to be usable from mempcpy entry point.
> 	(__memcpy_z10): Likewise.
> 	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
> ---
>   sysdeps/s390/bits/string.h                    | 36 +++++++++++++++++++
>   sysdeps/s390/mempcpy.S                        | 19 ++++++++++
>   sysdeps/s390/multiarch/Makefile               |  3 +-
>   sysdeps/s390/multiarch/ifunc-impl-list.c      |  7 ++++
>   sysdeps/s390/multiarch/mempcpy.c              | 26 ++++++++++++++
>   sysdeps/s390/s390-32/memcpy.S                 | 50 ++++++++++++++++-----------
>   sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 31 +++++++++++++++--
>   sysdeps/s390/s390-64/memcpy.S                 | 47 ++++++++++++++-----------
>   sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 29 ++++++++++++++--
>   9 files changed, 203 insertions(+), 45 deletions(-)
>   create mode 100644 sysdeps/s390/mempcpy.S
>   create mode 100644 sysdeps/s390/multiarch/mempcpy.c
>
> diff --git a/sysdeps/s390/bits/string.h b/sysdeps/s390/bits/string.h
> index 39e0b7f..eae3c8e 100644
> --- a/sysdeps/s390/bits/string.h
> +++ b/sysdeps/s390/bits/string.h
> @@ -24,6 +24,42 @@
>   /* Use the unaligned string inline ABI.  */
>   #define _STRING_INLINE_unaligned 1
>
> +#if defined __USE_GNU && defined __OPTIMIZE__ \
> +    && defined __extern_always_inline && __GNUC_PREREQ (3,2)
> +# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy
> +
> +/* Don't inline mempcpy into memcpy as s390 has an optimized mempcpy.  */
> +#  define _HAVE_STRING_ARCH_mempcpy 1
> +
> +__extern_always_inline void *
> +__mempcpy_inline_memcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return (char *) memcpy (__dest, __src, __n) + __n;
> +}
> +
> +__extern_always_inline void *
> +__mempcpy_inline_mempcpy (void *__restrict __dest,
> +			 const void *__restrict __src, size_t __n)
> +{
> +  return __mempcpy (__dest, __src, __n);
> +}
> +
> +/* If n is constant and small enough, __mempcpy/mempcpy is an inlined function,
> +   which uses memcpy and returns dest + n. In this constant case, GCC emits
> +   instructions like mvi or mvc and avoids the function call to memcpy.
> +   If n is not constant, then __mempcpy is called.  */
> +#  undef mempcpy
> +#  undef __mempcpy
> +#  define __mempcpy(dest, src, n)					\
> +  (__extension__ (__builtin_constant_p (n) && n <= 65536		\
> +		  ? __mempcpy_inline_memcpy (dest, src, n)		\
> +		  : __mempcpy_inline_mempcpy (dest, src, n)))
> +#  define mempcpy(dest, src, n) __mempcpy (dest, src, n)
> +
> +# endif
> +#endif
> +
>   /* We only provide optimizations if the user selects them and if
>      GNU CC is used.  */
>   #if !defined __NO_STRING_INLINES && defined __USE_STRING_INLINES \
> diff --git a/sysdeps/s390/mempcpy.S b/sysdeps/s390/mempcpy.S
> new file mode 100644
> index 0000000..c62074b
> --- /dev/null
> +++ b/sysdeps/s390/mempcpy.S
> @@ -0,0 +1,19 @@
> +/* CPU specific mempcpy without multiarch - 32/64 bit S/390 version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* mempcpy is implemented in memcpy.S.  */
> diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
> index 0805b07..89324ca 100644
> --- a/sysdeps/s390/multiarch/Makefile
> +++ b/sysdeps/s390/multiarch/Makefile
> @@ -18,7 +18,8 @@ sysdep_routines += strlen strlen-vx strlen-c \
>   		   memchr memchr-vx \
>   		   rawmemchr rawmemchr-vx rawmemchr-c \
>   		   memccpy memccpy-vx memccpy-c \
> -		   memrchr memrchr-vx memrchr-c
> +		   memrchr memrchr-vx memrchr-c \
> +		   mempcpy
>   endif
>
>   ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
> index 62a4359..89898ec 100644
> --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
> @@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   			      S390_IS_Z10 (stfle_bits), __memcpy_z10)
>   	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_default))
>
> +  IFUNC_IMPL (i, name, mempcpy,
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z196 (stfle_bits), ____mempcpy_z196)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy,
> +			      S390_IS_Z10 (stfle_bits), ____mempcpy_z10)
> +	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, ____mempcpy_default))
> +
>   #endif /* SHARED */
>
>   #ifdef HAVE_S390_VX_ASM_SUPPORT
> diff --git a/sysdeps/s390/multiarch/mempcpy.c b/sysdeps/s390/multiarch/mempcpy.c
> new file mode 100644
> index 0000000..34d8329
> --- /dev/null
> +++ b/sysdeps/s390/multiarch/mempcpy.c
> @@ -0,0 +1,26 @@
> +/* Multiple versions of mempcpy.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +#if defined SHARED && IS_IN (libc)
> +# include <ifunc-resolve.h>
> +s390_libc_ifunc (__mempcpy)
> +
> +__asm__ (".weak mempcpy\n\t"
> +	 ".set mempcpy,__mempcpy\n\t");
> +#endif
> diff --git a/sysdeps/s390/s390-32/memcpy.S b/sysdeps/s390/s390-32/memcpy.S
> index 2ac51ab..6be5104 100644
> --- a/sysdeps/s390/s390-32/memcpy.S
> +++ b/sysdeps/s390/s390-32/memcpy.S
> @@ -25,12 +25,23 @@
>        %r3 = address of source memory area
>        %r4 = number of bytes to copy.  */
>
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
> -ENTRY(memcpy)
> +       .text
> +ENTRY(__mempcpy)
> +	.machine "g5"
> +	lr      %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_G5_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
>   #endif
> +
> +ENTRY(memcpy)
>   	.machine "g5"
> +	lr      %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_G5_start:
>   	st      %r13,52(%r15)
>   	.cfi_offset 13, -44
>   	basr    %r13,0
> @@ -41,14 +52,13 @@ ENTRY(memcpy)
>   	lr      %r5,%r4
>   	srl     %r5,8
>   	ltr     %r5,%r5
> -	lr      %r1,%r2
>   	jne     .L_G5_13
>   	ex      %r4,.L_G5_17-.L_G5_16(%r13)
>   .L_G5_4:
>   	l       %r13,52(%r15)
>   	br      %r14
>   .L_G5_13:
> -	chi	%r5,4096             # Switch to mvcle for copies >1MB
> +	chi	%r5,4096            # Switch to mvcle for copies >1MB
>   	jh	__memcpy_mvcle
>   .L_G5_12:
>   	mvc     0(256,%r1),0(%r3)
> @@ -59,24 +69,24 @@ ENTRY(memcpy)
>   	j       .L_G5_4
>   .L_G5_17:
>   	mvc     0(1,%r1),0(%r3)
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>   END(memcpy)
> +#ifndef USE_MULTIARCH
>   libc_hidden_builtin_def (memcpy)
>   #endif
>
>   ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       ahi     %r4,1               # length + 1
> -       lr      %r5,%r4             # source length
> -       lr      %r4,%r3             # source address
> -       lr      %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lr      %r0,%r2             # backup return dest [ + n ]
> +	ahi     %r4,1               # length + 1
> +	lr      %r5,%r4             # source length
> +	lr      %r4,%r3             # source address
> +	lr      %r2,%r1             # destination address
> +	lr      %r3,%r5             # destination length = source length
>   .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lr      %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lr      %r2,%r0             # return destination address
> +	br      %r14
>   END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> index 92ffaea..297a894 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> @@ -29,14 +29,23 @@
>
>   #if defined SHARED && IS_IN (libc)
>
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>   ENTRY(__memcpy_z196)
>   	.machine "z196"
>   	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>   	llgfr   %r4,%r4
>   	ltgr    %r4,%r4
>   	je      .L_Z196_4
>   	aghi    %r4,-1
> -	lr      %r1,%r2
>   	srlg    %r5,%r4,8
>   	ltgr    %r5,%r5
>   	jne     .L_Z196_5
> @@ -60,13 +69,22 @@ ENTRY(__memcpy_z196)
>   	mvc     0(1,%r1),0(%r3)
>   END(__memcpy_z196)
>
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>   ENTRY(__memcpy_z10)
>   	.machine "z10"
>   	.machinemode "zarch_nohighgprs"
> +	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>   	llgfr   %r4,%r4
>   	cgije   %r4,0,.L_Z10_4
>   	aghi    %r4,-1
> -	lr      %r1,%r2
>   	srlg    %r5,%r4,8
>   	cgijlh  %r5,0,.L_Z10_13
>   .L_Z10_3:
> @@ -88,14 +106,23 @@ ENTRY(__memcpy_z10)
>   	mvc     0(1,%r1),0(%r3)
>   END(__memcpy_z10)
>
> +# define __mempcpy ____mempcpy_default
>   #endif /* SHARED && IS_IN (libc) */
>
> +#define memcpy __memcpy_default
>   #include "../memcpy.S"
> +#undef memcpy
>
>   #if defined SHARED && IS_IN (libc)
>   .globl   __GI_memcpy
>   .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>   #else
>   .globl   memcpy
>   .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>   #endif
> diff --git a/sysdeps/s390/s390-64/memcpy.S b/sysdeps/s390/s390-64/memcpy.S
> index 9d60a14..e3ace0c 100644
> --- a/sysdeps/s390/s390-64/memcpy.S
> +++ b/sysdeps/s390/s390-64/memcpy.S
> @@ -27,19 +27,27 @@
>
>
>          .text
> +ENTRY(__mempcpy)
> +	.machine "z900"
> +	lgr     %r1,%r2             # Use as dest
> +	la      %r2,0(%r4,%r2)      # Return dest + n
> +	j	.L_Z900_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
> +#endif
>
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
>   ENTRY(memcpy)
> -#endif
>   	.machine "z900"
> +	lgr     %r1,%r2             # r1: Use as dest ; r2: Return dest
> +.L_Z900_start:
>   	ltgr    %r4,%r4
>   	je      .L_Z900_4
>   	aghi    %r4,-1
>   	srlg    %r5,%r4,8
>   	ltgr    %r5,%r5
> -	lgr     %r1,%r2
>   	jne     .L_Z900_13
>   .L_Z900_3:
>   	larl    %r5,.L_Z900_15
> @@ -57,25 +65,24 @@ ENTRY(memcpy)
>   	j       .L_Z900_3
>   .L_Z900_15:
>   	mvc     0(1,%r1),0(%r3)
> -
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
>   END(memcpy)
> +#ifndef USE_MULTIARCH
>   libc_hidden_builtin_def (memcpy)
>   #endif
>
>   ENTRY(__memcpy_mvcle)
> -       # Using as standalone function will result in unexpected
> -       # results since the length field is incremented by 1 in order to
> -       # compensate the changes already done in the functions above.
> -       aghi    %r4,1               # length + 1
> -       lgr     %r5,%r4             # source length
> -       lgr     %r4,%r3             # source address
> -       lgr     %r3,%r5             # destination length = source length
> +	# Using as standalone function will result in unexpected
> +	# results since the length field is incremented by 1 in order to
> +	# compensate the changes already done in the functions above.
> +	lgr     %r0,%r2             # backup return dest [ + n ]
> +	aghi    %r4,1               # length + 1
> +	lgr     %r5,%r4             # source length
> +	lgr     %r4,%r3             # source address
> +	lgr     %r2,%r1             # destination address
> +	lgr     %r3,%r5             # destination length = source length
>   .L_MVCLE_1:
> -       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> -       jo      .L_MVCLE_1
> -       lgr     %r2,%r1             # return destination address
> -       br      %r14
> +	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
> +	jo      .L_MVCLE_1
> +	lgr     %r2,%r0             # return destination address
> +	br      %r14
>   END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> index 8f54526..0f5a36e 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> @@ -29,12 +29,20 @@
>
>   #if defined SHARED && IS_IN (libc)
>
> +ENTRY(____mempcpy_z196)
> +	.machine "z196"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z196_start
> +END(____mempcpy_z196)
> +
>   ENTRY(__memcpy_z196)
>   	.machine "z196"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
>   	ltgr    %r4,%r4
>   	je      .L_Z196_4
>   	aghi    %r4,-1
> -	lgr     %r1,%r2
>   	srlg    %r5,%r4,8
>   	ltgr    %r5,%r5
>   	jne     .L_Z196_5
> @@ -58,11 +66,19 @@ ENTRY(__memcpy_z196)
>   	mvc     0(1,%r1),0(%r3)
>   END(__memcpy_z196)
>
> +ENTRY(____mempcpy_z10)
> +	.machine "z10"
> +	lgr     %r1,%r2         # Use as dest
> +	la      %r2,0(%r4,%r2)  # Return dest + n
> +	j	.L_Z10_start
> +END(____mempcpy_z10)
> +
>   ENTRY(__memcpy_z10)
>   	.machine "z10"
> +	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
>   	cgije   %r4,0,.L_Z10_4
>   	aghi    %r4,-1
> -	lgr     %r1,%r2
>   	srlg    %r5,%r4,8
>   	cgijlh  %r5,0,.L_Z10_13
>   .L_Z10_3:
> @@ -84,14 +100,23 @@ ENTRY(__memcpy_z10)
>   	mvc     0(1,%r1),0(%r3)
>   END(__memcpy_z10)
>
> +# define __mempcpy ____mempcpy_default
>   #endif /* SHARED && IS_IN (libc) */
>
> +#define memcpy __memcpy_default
>   #include "../memcpy.S"
> +#undef memcpy
>
>   #if defined SHARED && IS_IN (libc)
>   .globl   __GI_memcpy
>   .set     __GI_memcpy,__memcpy_default
> +.globl   __GI_mempcpy
> +.set     __GI_mempcpy,____mempcpy_default
> +.globl   __GI___mempcpy
> +.set     __GI___mempcpy,____mempcpy_default
>   #else
>   .globl   memcpy
>   .set     memcpy,__memcpy_default
> +.weak    mempcpy
> +.set     mempcpy,__mempcpy
>   #endif
>
  
Adhemerval Zanella Netto May 4, 2016, 3:41 p.m. UTC | #4
On 27/04/2016 05:14, Stefan Liebler wrote:
> On 04/26/2016 03:32 PM, Adhemerval Zanella wrote:
>>
>>
>> On 26/04/2016 09:07, Stefan Liebler wrote:
>>> There exist optimized memcpy functions on s390, but no optimized mempcpy.
>>> This patch adds mempcpy entry points in memcpy.S files, which
>>> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
>>> as memcpy is and the variants are listed in ifunc-impl-list.c.
>>>
>>> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
>>> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
>>> function, which uses memcpy and returns dest + n. In this constant case,
>>> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
>>> If n is not constant, then __mempcpy is called.
>>>
>>
>> AFAIk GLIBC does not have any ports that implements mempcpy and not
>> memcpy and IMHO the inline version to convert mempcpy to memcpy is
>> always a gain.
>>
>> So I would suggest to just avoid adding arch-specific bits and move the
>> mempcpy inline optimization at string/string.h to string/string2.h as
>>
>> -- 
>>
>> #define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
>> #define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
>>
>> __extern_always_inline void *
>> __mempcpy_inline (void *__restrict __dest,
>>                    const void *__restrict __src, size_t __n)
>> {
>>    return (char *) memcpy (__dest, __src, __n) + __n;
>> }
>>
>> -- 
>>
>> This is convert any mempcpy to mempcy plus length and provide
>> the symbol only for older binaries.
>>
> 
> According to the current behaviour on s390, a call to memcpy with constant byte-length below 65536 is done by emitting mvc, mvi or
> a suitable instruction sequence.
> If gcc don't know about a constant byte-length, then memcpy function
> in glibc is called.
> 
> For mempcpy cascades like in iconv/gconv_conf.c:
> 466:  __mempcpy (__mempcpy (__mempcpy (gconv_path, __gconv_path_envvar,
>                        user_len),
>                 ":", 1),
>              default_gconv_path, sizeof (default_gconv_path));
> 
> The current behaviour is to perform a call to memcpy(), add user_len to
> the return value of the call and use mvi, mvc for the two calls with constant byte-lengths.
> 
> If only _HAVE_STRING_ARCH_mempcpy is defined, there would be three calls
> to mempcpy. Here you are right, this is not what you want.
> 
> The behaviour with this patch is to call mempcpy() once and use mvi, mvc
> directly without the need to add user_len after the mempcpy call.
> This way, there is no need to store user_len in an extra register or on stack.
> 
> I don't know the conditions on other archs, whether memcpy is inlined or called, thus I added the s390-specific condition to s390-string.h.
> 

Right, but I *think* compiler would be smart enough to just avoid the
extra spilling. Take this example for instance [1], using GCC 5.3
for s390x I see no difference in generated assembly if I the strategy
I proposed (-DMEMPCPY_TO_MEMCPY) to the s390 specific you are suggesting.

In the end, I am proposing that architecture specific micro-optimization
should be avoid in favor of a more specific one.  Specially the one that
tend to avoid one or two extra spilling based on quite complex macro
expansion.

[1] http://pastie.org/10824072
  

Patch

diff --git a/sysdeps/s390/bits/string.h b/sysdeps/s390/bits/string.h
index 39e0b7f..eae3c8e 100644
--- a/sysdeps/s390/bits/string.h
+++ b/sysdeps/s390/bits/string.h
@@ -24,6 +24,42 @@ 
 /* Use the unaligned string inline ABI.  */
 #define _STRING_INLINE_unaligned 1
 
+#if defined __USE_GNU && defined __OPTIMIZE__ \
+    && defined __extern_always_inline && __GNUC_PREREQ (3,2)
+# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy
+
+/* Don't inline mempcpy into memcpy as s390 has an optimized mempcpy.  */
+#  define _HAVE_STRING_ARCH_mempcpy 1
+
+__extern_always_inline void *
+__mempcpy_inline_memcpy (void *__restrict __dest,
+			 const void *__restrict __src, size_t __n)
+{
+  return (char *) memcpy (__dest, __src, __n) + __n;
+}
+
+__extern_always_inline void *
+__mempcpy_inline_mempcpy (void *__restrict __dest,
+			 const void *__restrict __src, size_t __n)
+{
+  return __mempcpy (__dest, __src, __n);
+}
+
+/* If n is constant and small enough, __mempcpy/mempcpy is an inlined function,
+   which uses memcpy and returns dest + n. In this constant case, GCC emits
+   instructions like mvi or mvc and avoids the function call to memcpy.
+   If n is not constant, then __mempcpy is called.  */
+#  undef mempcpy
+#  undef __mempcpy
+#  define __mempcpy(dest, src, n)					\
+  (__extension__ (__builtin_constant_p (n) && n <= 65536		\
+		  ? __mempcpy_inline_memcpy (dest, src, n)		\
+		  : __mempcpy_inline_mempcpy (dest, src, n)))
+#  define mempcpy(dest, src, n) __mempcpy (dest, src, n)
+
+# endif
+#endif
+
 /* We only provide optimizations if the user selects them and if
    GNU CC is used.  */
 #if !defined __NO_STRING_INLINES && defined __USE_STRING_INLINES \
diff --git a/sysdeps/s390/mempcpy.S b/sysdeps/s390/mempcpy.S
new file mode 100644
index 0000000..c62074b
--- /dev/null
+++ b/sysdeps/s390/mempcpy.S
@@ -0,0 +1,19 @@ 
+/* CPU specific mempcpy without multiarch - 32/64 bit S/390 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* mempcpy is implemented in memcpy.S.  */
diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
index 0805b07..89324ca 100644
--- a/sysdeps/s390/multiarch/Makefile
+++ b/sysdeps/s390/multiarch/Makefile
@@ -18,7 +18,8 @@  sysdep_routines += strlen strlen-vx strlen-c \
 		   memchr memchr-vx \
 		   rawmemchr rawmemchr-vx rawmemchr-c \
 		   memccpy memccpy-vx memccpy-c \
-		   memrchr memrchr-vx memrchr-c
+		   memrchr memrchr-vx memrchr-c \
+		   mempcpy
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
index 62a4359..89898ec 100644
--- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
@@ -69,6 +69,13 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      S390_IS_Z10 (stfle_bits), __memcpy_z10)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_default))
 
+  IFUNC_IMPL (i, name, mempcpy,
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      S390_IS_Z196 (stfle_bits), ____mempcpy_z196)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      S390_IS_Z10 (stfle_bits), ____mempcpy_z10)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, ____mempcpy_default))
+
 #endif /* SHARED */
 
 #ifdef HAVE_S390_VX_ASM_SUPPORT
diff --git a/sysdeps/s390/multiarch/mempcpy.c b/sysdeps/s390/multiarch/mempcpy.c
new file mode 100644
index 0000000..34d8329
--- /dev/null
+++ b/sysdeps/s390/multiarch/mempcpy.c
@@ -0,0 +1,26 @@ 
+/* Multiple versions of mempcpy.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+#if defined SHARED && IS_IN (libc)
+# include <ifunc-resolve.h>
+s390_libc_ifunc (__mempcpy)
+
+__asm__ (".weak mempcpy\n\t"
+	 ".set mempcpy,__mempcpy\n\t");
+#endif
diff --git a/sysdeps/s390/s390-32/memcpy.S b/sysdeps/s390/s390-32/memcpy.S
index 2ac51ab..6be5104 100644
--- a/sysdeps/s390/s390-32/memcpy.S
+++ b/sysdeps/s390/s390-32/memcpy.S
@@ -25,12 +25,23 @@ 
      %r3 = address of source memory area
      %r4 = number of bytes to copy.  */
 
-#ifdef USE_MULTIARCH
-ENTRY(__memcpy_default)
-#else
-ENTRY(memcpy)
+       .text
+ENTRY(__mempcpy)
+	.machine "g5"
+	lr      %r1,%r2             # Use as dest
+	la      %r2,0(%r4,%r2)      # Return dest + n
+	j	.L_G5_start
+END(__mempcpy)
+#ifndef USE_MULTIARCH
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
 #endif
+
+ENTRY(memcpy)
 	.machine "g5"
+	lr      %r1,%r2             # r1: Use as dest ; r2: Return dest
+.L_G5_start:
 	st      %r13,52(%r15)
 	.cfi_offset 13, -44
 	basr    %r13,0
@@ -41,14 +52,13 @@  ENTRY(memcpy)
 	lr      %r5,%r4
 	srl     %r5,8
 	ltr     %r5,%r5
-	lr      %r1,%r2
 	jne     .L_G5_13
 	ex      %r4,.L_G5_17-.L_G5_16(%r13)
 .L_G5_4:
 	l       %r13,52(%r15)
 	br      %r14
 .L_G5_13:
-	chi	%r5,4096             # Switch to mvcle for copies >1MB
+	chi	%r5,4096            # Switch to mvcle for copies >1MB
 	jh	__memcpy_mvcle
 .L_G5_12:
 	mvc     0(256,%r1),0(%r3)
@@ -59,24 +69,24 @@  ENTRY(memcpy)
 	j       .L_G5_4
 .L_G5_17:
 	mvc     0(1,%r1),0(%r3)
-#ifdef USE_MULTIARCH
-END(__memcpy_default)
-#else
 END(memcpy)
+#ifndef USE_MULTIARCH
 libc_hidden_builtin_def (memcpy)
 #endif
 
 ENTRY(__memcpy_mvcle)
-       # Using as standalone function will result in unexpected
-       # results since the length field is incremented by 1 in order to
-       # compensate the changes already done in the functions above.
-       ahi     %r4,1               # length + 1
-       lr      %r5,%r4             # source length
-       lr      %r4,%r3             # source address
-       lr      %r3,%r5             # destination length = source length
+	# Using as standalone function will result in unexpected
+	# results since the length field is incremented by 1 in order to
+	# compensate the changes already done in the functions above.
+	lr      %r0,%r2             # backup return dest [ + n ]
+	ahi     %r4,1               # length + 1
+	lr      %r5,%r4             # source length
+	lr      %r4,%r3             # source address
+	lr      %r2,%r1             # destination address
+	lr      %r3,%r5             # destination length = source length
 .L_MVCLE_1:
-       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
-       jo      .L_MVCLE_1
-       lr      %r2,%r1             # return destination address
-       br      %r14
+	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
+	jo      .L_MVCLE_1
+	lr      %r2,%r0             # return destination address
+	br      %r14
 END(__memcpy_mvcle)
diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
index 92ffaea..297a894 100644
--- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
+++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
@@ -29,14 +29,23 @@ 
 
 #if defined SHARED && IS_IN (libc)
 
+ENTRY(____mempcpy_z196)
+	.machine "z196"
+	.machinemode "zarch_nohighgprs"
+	lr      %r1,%r2         # Use as dest
+	la      %r2,0(%r4,%r2)  # Return dest + n
+	j	.L_Z196_start
+END(____mempcpy_z196)
+
 ENTRY(__memcpy_z196)
 	.machine "z196"
 	.machinemode "zarch_nohighgprs"
+	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
+.L_Z196_start:
 	llgfr   %r4,%r4
 	ltgr    %r4,%r4
 	je      .L_Z196_4
 	aghi    %r4,-1
-	lr      %r1,%r2
 	srlg    %r5,%r4,8
 	ltgr    %r5,%r5
 	jne     .L_Z196_5
@@ -60,13 +69,22 @@  ENTRY(__memcpy_z196)
 	mvc     0(1,%r1),0(%r3)
 END(__memcpy_z196)
 
+ENTRY(____mempcpy_z10)
+	.machine "z10"
+	.machinemode "zarch_nohighgprs"
+	lr      %r1,%r2         # Use as dest
+	la      %r2,0(%r4,%r2)  # Return dest + n
+	j	.L_Z10_start
+END(____mempcpy_z10)
+
 ENTRY(__memcpy_z10)
 	.machine "z10"
 	.machinemode "zarch_nohighgprs"
+	lr      %r1,%r2         # r1: Use as dest ; r2: Return dest
+.L_Z10_start:
 	llgfr   %r4,%r4
 	cgije   %r4,0,.L_Z10_4
 	aghi    %r4,-1
-	lr      %r1,%r2
 	srlg    %r5,%r4,8
 	cgijlh  %r5,0,.L_Z10_13
 .L_Z10_3:
@@ -88,14 +106,23 @@  ENTRY(__memcpy_z10)
 	mvc     0(1,%r1),0(%r3)
 END(__memcpy_z10)
 
+# define __mempcpy ____mempcpy_default
 #endif /* SHARED && IS_IN (libc) */
 
+#define memcpy __memcpy_default
 #include "../memcpy.S"
+#undef memcpy
 
 #if defined SHARED && IS_IN (libc)
 .globl   __GI_memcpy
 .set     __GI_memcpy,__memcpy_default
+.globl   __GI_mempcpy
+.set     __GI_mempcpy,____mempcpy_default
+.globl   __GI___mempcpy
+.set     __GI___mempcpy,____mempcpy_default
 #else
 .globl   memcpy
 .set     memcpy,__memcpy_default
+.weak    mempcpy
+.set     mempcpy,__mempcpy
 #endif
diff --git a/sysdeps/s390/s390-64/memcpy.S b/sysdeps/s390/s390-64/memcpy.S
index 9d60a14..e3ace0c 100644
--- a/sysdeps/s390/s390-64/memcpy.S
+++ b/sysdeps/s390/s390-64/memcpy.S
@@ -27,19 +27,27 @@ 
 
 
        .text
+ENTRY(__mempcpy)
+	.machine "z900"
+	lgr     %r1,%r2             # Use as dest
+	la      %r2,0(%r4,%r2)      # Return dest + n
+	j	.L_Z900_start
+END(__mempcpy)
+#ifndef USE_MULTIARCH
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+#endif
 
-#ifdef USE_MULTIARCH
-ENTRY(__memcpy_default)
-#else
 ENTRY(memcpy)
-#endif
 	.machine "z900"
+	lgr     %r1,%r2             # r1: Use as dest ; r2: Return dest
+.L_Z900_start:
 	ltgr    %r4,%r4
 	je      .L_Z900_4
 	aghi    %r4,-1
 	srlg    %r5,%r4,8
 	ltgr    %r5,%r5
-	lgr     %r1,%r2
 	jne     .L_Z900_13
 .L_Z900_3:
 	larl    %r5,.L_Z900_15
@@ -57,25 +65,24 @@  ENTRY(memcpy)
 	j       .L_Z900_3
 .L_Z900_15:
 	mvc     0(1,%r1),0(%r3)
-
-#ifdef USE_MULTIARCH
-END(__memcpy_default)
-#else
 END(memcpy)
+#ifndef USE_MULTIARCH
 libc_hidden_builtin_def (memcpy)
 #endif
 
 ENTRY(__memcpy_mvcle)
-       # Using as standalone function will result in unexpected
-       # results since the length field is incremented by 1 in order to
-       # compensate the changes already done in the functions above.
-       aghi    %r4,1               # length + 1
-       lgr     %r5,%r4             # source length
-       lgr     %r4,%r3             # source address
-       lgr     %r3,%r5             # destination length = source length
+	# Using as standalone function will result in unexpected
+	# results since the length field is incremented by 1 in order to
+	# compensate the changes already done in the functions above.
+	lgr     %r0,%r2             # backup return dest [ + n ]
+	aghi    %r4,1               # length + 1
+	lgr     %r5,%r4             # source length
+	lgr     %r4,%r3             # source address
+	lgr     %r2,%r1             # destination address
+	lgr     %r3,%r5             # destination length = source length
 .L_MVCLE_1:
-       mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
-       jo      .L_MVCLE_1
-       lgr     %r2,%r1             # return destination address
-       br      %r14
+	mvcle   %r2,%r4,0           # thats it, MVCLE is your friend
+	jo      .L_MVCLE_1
+	lgr     %r2,%r0             # return destination address
+	br      %r14
 END(__memcpy_mvcle)
diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
index 8f54526..0f5a36e 100644
--- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
+++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
@@ -29,12 +29,20 @@ 
 
 #if defined SHARED && IS_IN (libc)
 
+ENTRY(____mempcpy_z196)
+	.machine "z196"
+	lgr     %r1,%r2         # Use as dest
+	la      %r2,0(%r4,%r2)  # Return dest + n
+	j	.L_Z196_start
+END(____mempcpy_z196)
+
 ENTRY(__memcpy_z196)
 	.machine "z196"
+	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
+.L_Z196_start:
 	ltgr    %r4,%r4
 	je      .L_Z196_4
 	aghi    %r4,-1
-	lgr     %r1,%r2
 	srlg    %r5,%r4,8
 	ltgr    %r5,%r5
 	jne     .L_Z196_5
@@ -58,11 +66,19 @@  ENTRY(__memcpy_z196)
 	mvc     0(1,%r1),0(%r3)
 END(__memcpy_z196)
 
+ENTRY(____mempcpy_z10)
+	.machine "z10"
+	lgr     %r1,%r2         # Use as dest
+	la      %r2,0(%r4,%r2)  # Return dest + n
+	j	.L_Z10_start
+END(____mempcpy_z10)
+
 ENTRY(__memcpy_z10)
 	.machine "z10"
+	lgr     %r1,%r2         # r1: Use as dest ; r2: Return dest
+.L_Z10_start:
 	cgije   %r4,0,.L_Z10_4
 	aghi    %r4,-1
-	lgr     %r1,%r2
 	srlg    %r5,%r4,8
 	cgijlh  %r5,0,.L_Z10_13
 .L_Z10_3:
@@ -84,14 +100,23 @@  ENTRY(__memcpy_z10)
 	mvc     0(1,%r1),0(%r3)
 END(__memcpy_z10)
 
+# define __mempcpy ____mempcpy_default
 #endif /* SHARED && IS_IN (libc) */
 
+#define memcpy __memcpy_default
 #include "../memcpy.S"
+#undef memcpy
 
 #if defined SHARED && IS_IN (libc)
 .globl   __GI_memcpy
 .set     __GI_memcpy,__memcpy_default
+.globl   __GI_mempcpy
+.set     __GI_mempcpy,____mempcpy_default
+.globl   __GI___mempcpy
+.set     __GI___mempcpy,____mempcpy_default
 #else
 .globl   memcpy
 .set     memcpy,__memcpy_default
+.weak    mempcpy
+.set     mempcpy,__mempcpy
 #endif