[3/4] S390: Do not call memcpy, memcmp, memset within libc.so via ifunc-plt.

Message ID 1461672469-2107-3-git-send-email-stli@linux.vnet.ibm.com
State Committed
Headers

Commit Message

Stefan Liebler April 26, 2016, 12:07 p.m. UTC
  On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
which are created with s390_libc_ifunc-macro.
This macro creates a __GI_ symbol which is set to the
ifunced symbol. Thus calls within libc.so to e.g. memcpy
result in a call to *ABS*+0x954c0@plt stub and afterwards
to the resolved memcpy-ifunc-variant.

This patch sets the __GI_ symbol to the default-ifunc-variant
to avoid the plt call. The __GI_ symbols are now created at the
default variant of ifunced function.

ChangeLog:

	* sysdeps/s390/multiarch/ifunc-resolve.h (s390_libc_ifunc):
	Remove __GI_ symbol.
	* sysdeps/s390/s390-32/multiarch/memcmp-s390.S: Add __GI_memcmp symbol.
	* sysdeps/s390/s390-64/multiarch/memcmp-s390x.S: Likewise.
	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add __GI_memcpy symbol.
	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
	* sysdeps/s390/s390-32/multiarch/memset-s390.S: Add __GI_memset symbol.
	* sysdeps/s390/s390-64/multiarch/memset-s390x.S: Likewise.
---
 sysdeps/s390/multiarch/ifunc-resolve.h        | 4 +---
 sysdeps/s390/s390-32/multiarch/memcmp-s390.S  | 3 +++
 sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 5 ++++-
 sysdeps/s390/s390-32/multiarch/memset-s390.S  | 3 +++
 sysdeps/s390/s390-64/multiarch/memcmp-s390x.S | 3 +++
 sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 5 ++++-
 sysdeps/s390/s390-64/multiarch/memset-s390x.S | 3 +++
 7 files changed, 21 insertions(+), 5 deletions(-)
  

Comments

Adhemerval Zanella April 26, 2016, 1:35 p.m. UTC | #1
On 26/04/2016 09:07, Stefan Liebler wrote:
> On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
> which are created with s390_libc_ifunc-macro.
> This macro creates a __GI_ symbol which is set to the
> ifunced symbol. Thus calls within libc.so to e.g. memcpy
> result in a call to *ABS*+0x954c0@plt stub and afterwards
> to the resolved memcpy-ifunc-variant.
> 
> This patch sets the __GI_ symbol to the default-ifunc-variant
> to avoid the plt call. The __GI_ symbols are now created at the
> default variant of ifunced function.

Is the internal ifunc plt usage leading to a failure in s390/s390x
(as for powerpc32 and i686) or is it an optimization fix?

> 
> ChangeLog:
> 
> 	* sysdeps/s390/multiarch/ifunc-resolve.h (s390_libc_ifunc):
> 	Remove __GI_ symbol.
> 	* sysdeps/s390/s390-32/multiarch/memcmp-s390.S: Add __GI_memcmp symbol.
> 	* sysdeps/s390/s390-64/multiarch/memcmp-s390x.S: Likewise.
> 	* sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add __GI_memcpy symbol.
> 	* sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
> 	* sysdeps/s390/s390-32/multiarch/memset-s390.S: Add __GI_memset symbol.
> 	* sysdeps/s390/s390-64/multiarch/memset-s390x.S: Likewise.
> ---
>  sysdeps/s390/multiarch/ifunc-resolve.h        | 4 +---
>  sysdeps/s390/s390-32/multiarch/memcmp-s390.S  | 3 +++
>  sysdeps/s390/s390-32/multiarch/memcpy-s390.S  | 5 ++++-
>  sysdeps/s390/s390-32/multiarch/memset-s390.S  | 3 +++
>  sysdeps/s390/s390-64/multiarch/memcmp-s390x.S | 3 +++
>  sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 5 ++++-
>  sysdeps/s390/s390-64/multiarch/memset-s390x.S | 3 +++
>  7 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/sysdeps/s390/multiarch/ifunc-resolve.h b/sysdeps/s390/multiarch/ifunc-resolve.h
> index 744a0d8..26e097a 100644
> --- a/sysdeps/s390/multiarch/ifunc-resolve.h
> +++ b/sysdeps/s390/multiarch/ifunc-resolve.h
> @@ -44,9 +44,7 @@
>  #define s390_libc_ifunc(FUNC)						\
>    __asm__ (".globl " #FUNC "\n\t"					\
>  	   ".type  " #FUNC ",@gnu_indirect_function\n\t"		\
> -	   ".set   " #FUNC ",__resolve_" #FUNC "\n\t"			\
> -	   ".globl __GI_" #FUNC "\n\t"					\
> -	   ".set   __GI_" #FUNC "," #FUNC "\n");			\
> +	   ".set   " #FUNC ",__resolve_" #FUNC "\n\t");			\
>  									\
>    /* Make the declarations of the optimized functions hidden in order
>       to prevent GOT slots being generated for them. */			\
> diff --git a/sysdeps/s390/s390-32/multiarch/memcmp-s390.S b/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
> index e9ee6d2..a01f3b7 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
> @@ -101,4 +101,7 @@ END(__memcmp_z10)
>  .set     memcmp,__memcmp_default
>  .weak    bcmp
>  .set	 bcmp,__memcmp_default
> +#elif defined SHARED && IS_IN (libc)
> +.globl   __GI_memcmp
> +.set     __GI_memcmp,__memcmp_default
>  #endif
> diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> index 4e30cdf..92ffaea 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> @@ -92,7 +92,10 @@ END(__memcpy_z10)
>  
>  #include "../memcpy.S"
>  
> -#if !defined SHARED || !IS_IN (libc)
> +#if defined SHARED && IS_IN (libc)
> +.globl   __GI_memcpy
> +.set     __GI_memcpy,__memcpy_default
> +#else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
>  #endif
> diff --git a/sysdeps/s390/s390-32/multiarch/memset-s390.S b/sysdeps/s390/s390-32/multiarch/memset-s390.S
> index 47277c1..a2ddd98 100644
> --- a/sysdeps/s390/s390-32/multiarch/memset-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memset-s390.S
> @@ -110,4 +110,7 @@ END(__memset_mvcle)
>  #if !IS_IN (libc)
>  .globl   memset
>  .set     memset,__memset_default
> +#elif defined SHARED && IS_IN (libc)
> +.globl   __GI_memset
> +.set     __GI_memset,__memset_default
>  #endif
> diff --git a/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S b/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
> index 2a4c0ae..b28ccaf 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
> @@ -98,4 +98,7 @@ END(__memcmp_z10)
>  .set     memcmp,__memcmp_default
>  .weak    bcmp
>  .set	 bcmp,__memcmp_default
> +#elif defined SHARED && IS_IN (libc)
> +.globl   __GI_memcmp
> +.set     __GI_memcmp,__memcmp_default
>  #endif
> diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> index 69fa562..8f54526 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> @@ -88,7 +88,10 @@ END(__memcpy_z10)
>  
>  #include "../memcpy.S"
>  
> -#if !defined SHARED || !IS_IN (libc)
> +#if defined SHARED && IS_IN (libc)
> +.globl   __GI_memcpy
> +.set     __GI_memcpy,__memcpy_default
> +#else
>  .globl   memcpy
>  .set     memcpy,__memcpy_default
>  #endif
> diff --git a/sysdeps/s390/s390-64/multiarch/memset-s390x.S b/sysdeps/s390/s390-64/multiarch/memset-s390x.S
> index 05e0682..a77e798 100644
> --- a/sysdeps/s390/s390-64/multiarch/memset-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memset-s390x.S
> @@ -106,4 +106,7 @@ END(__memset_mvcle)
>  #if !IS_IN (libc)
>  .globl   memset
>  .set     memset,__memset_default
> +#elif defined SHARED && IS_IN (libc)
> +.globl   __GI_memset
> +.set     __GI_memset,__memset_default
>  #endif
>
  
Stefan Liebler April 27, 2016, 8:14 a.m. UTC | #2
On 04/26/2016 03:35 PM, Adhemerval Zanella wrote:
>
>
> On 26/04/2016 09:07, Stefan Liebler wrote:
>> On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
>> which are created with s390_libc_ifunc-macro.
>> This macro creates a __GI_ symbol which is set to the
>> ifunced symbol. Thus calls within libc.so to e.g. memcpy
>> result in a call to *ABS*+0x954c0@plt stub and afterwards
>> to the resolved memcpy-ifunc-variant.
>>
>> This patch sets the __GI_ symbol to the default-ifunc-variant
>> to avoid the plt call. The __GI_ symbols are now created at the
>> default variant of ifunced function.
>
> Is the internal ifunc plt usage leading to a failure in s390/s390x
> (as for powerpc32 and i686) or is it an optimization fix?

No it does not lead to a failure on s390.
It is an optimization to avoid the extra call to the plt-stub if called 
within libc.so.
  
Adhemerval Zanella May 4, 2016, 3:22 p.m. UTC | #3
On 27/04/2016 05:14, Stefan Liebler wrote:
> On 04/26/2016 03:35 PM, Adhemerval Zanella wrote:
>>
>>
>> On 26/04/2016 09:07, Stefan Liebler wrote:
>>> On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
>>> which are created with s390_libc_ifunc-macro.
>>> This macro creates a __GI_ symbol which is set to the
>>> ifunced symbol. Thus calls within libc.so to e.g. memcpy
>>> result in a call to *ABS*+0x954c0@plt stub and afterwards
>>> to the resolved memcpy-ifunc-variant.
>>>
>>> This patch sets the __GI_ symbol to the default-ifunc-variant
>>> to avoid the plt call. The __GI_ symbols are now created at the
>>> default variant of ifunced function.
>>
>> Is the internal ifunc plt usage leading to a failure in s390/s390x
>> (as for powerpc32 and i686) or is it an optimization fix?
> 
> No it does not lead to a failure on s390.
> It is an optimization to avoid the extra call to the plt-stub if called within libc.so.
> 

Right because on 19c4bec0f43599eecc2f32de96ae179cd7d64053 I did the
exact opposite because for POWER it shows that the gains of using
optimized version over default one shows a good improvement in
algorithms that use these symbol internally (like regex).

It might be the case where the default s390 version is good enough
and shows no performance difference.
  
Stefan Liebler May 11, 2016, 1:40 p.m. UTC | #4
On 05/04/2016 05:22 PM, Adhemerval Zanella wrote:
>
>
> On 27/04/2016 05:14, Stefan Liebler wrote:
>> On 04/26/2016 03:35 PM, Adhemerval Zanella wrote:
>>>
>>>
>>> On 26/04/2016 09:07, Stefan Liebler wrote:
>>>> On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
>>>> which are created with s390_libc_ifunc-macro.
>>>> This macro creates a __GI_ symbol which is set to the
>>>> ifunced symbol. Thus calls within libc.so to e.g. memcpy
>>>> result in a call to *ABS*+0x954c0@plt stub and afterwards
>>>> to the resolved memcpy-ifunc-variant.
>>>>
>>>> This patch sets the __GI_ symbol to the default-ifunc-variant
>>>> to avoid the plt call. The __GI_ symbols are now created at the
>>>> default variant of ifunced function.
>>>
>>> Is the internal ifunc plt usage leading to a failure in s390/s390x
>>> (as for powerpc32 and i686) or is it an optimization fix?
>>
>> No it does not lead to a failure on s390.
>> It is an optimization to avoid the extra call to the plt-stub if called within libc.so.
>>
>
> Right because on 19c4bec0f43599eecc2f32de96ae179cd7d64053 I did the
> exact opposite because for POWER it shows that the gains of using
> optimized version over default one shows a good improvement in
> algorithms that use these symbol internally (like regex).
>
> It might be the case where the default s390 version is good enough
> and shows no performance difference.
>

Hi Adhemerval,

I've retested the patchset in context of regex with these functions 
called with/without ifunc-plt.
The version without ifunc-plt is slightly faster.

But you are right. This decision has to be redetermined with further 
variants or newer machines.

In case of the string-functions, calling them via ifunc within libc.so 
could gain improvement.

Bye
Stefan
  
Adhemerval Zanella May 11, 2016, 7:27 p.m. UTC | #5
On 11/05/2016 10:40, Stefan Liebler wrote:
> On 05/04/2016 05:22 PM, Adhemerval Zanella wrote:
>>
>>
>> On 27/04/2016 05:14, Stefan Liebler wrote:
>>> On 04/26/2016 03:35 PM, Adhemerval Zanella wrote:
>>>>
>>>>
>>>> On 26/04/2016 09:07, Stefan Liebler wrote:
>>>>> On s390, the memcpy, memcmp, memset functions are IFUNC symbols,
>>>>> which are created with s390_libc_ifunc-macro.
>>>>> This macro creates a __GI_ symbol which is set to the
>>>>> ifunced symbol. Thus calls within libc.so to e.g. memcpy
>>>>> result in a call to *ABS*+0x954c0@plt stub and afterwards
>>>>> to the resolved memcpy-ifunc-variant.
>>>>>
>>>>> This patch sets the __GI_ symbol to the default-ifunc-variant
>>>>> to avoid the plt call. The __GI_ symbols are now created at the
>>>>> default variant of ifunced function.
>>>>
>>>> Is the internal ifunc plt usage leading to a failure in s390/s390x
>>>> (as for powerpc32 and i686) or is it an optimization fix?
>>>
>>> No it does not lead to a failure on s390.
>>> It is an optimization to avoid the extra call to the plt-stub if called within libc.so.
>>>
>>
>> Right because on 19c4bec0f43599eecc2f32de96ae179cd7d64053 I did the
>> exact opposite because for POWER it shows that the gains of using
>> optimized version over default one shows a good improvement in
>> algorithms that use these symbol internally (like regex).
>>
>> It might be the case where the default s390 version is good enough
>> and shows no performance difference.
>>
> 
> Hi Adhemerval,
> 
> I've retested the patchset in context of regex with these functions called with/without ifunc-plt.
> The version without ifunc-plt is slightly faster.
> 
> But you are right. This decision has to be redetermined with further variants or newer machines.
> 
> In case of the string-functions, calling them via ifunc within libc.so could gain improvement.
> 
> Bye
> Stefan
> 

Fair enough them.  I would expect without ifunc-plt to be slower in newer
architectures, but it could be case where the performance differences
between the base implementation and the arch optimized one is that large
(for instance powerpc64 memcpy the base one only uses 8-bytes store/loads,
which show large differences depending of the workload compared to power7
one).
  

Patch

diff --git a/sysdeps/s390/multiarch/ifunc-resolve.h b/sysdeps/s390/multiarch/ifunc-resolve.h
index 744a0d8..26e097a 100644
--- a/sysdeps/s390/multiarch/ifunc-resolve.h
+++ b/sysdeps/s390/multiarch/ifunc-resolve.h
@@ -44,9 +44,7 @@ 
 #define s390_libc_ifunc(FUNC)						\
   __asm__ (".globl " #FUNC "\n\t"					\
 	   ".type  " #FUNC ",@gnu_indirect_function\n\t"		\
-	   ".set   " #FUNC ",__resolve_" #FUNC "\n\t"			\
-	   ".globl __GI_" #FUNC "\n\t"					\
-	   ".set   __GI_" #FUNC "," #FUNC "\n");			\
+	   ".set   " #FUNC ",__resolve_" #FUNC "\n\t");			\
 									\
   /* Make the declarations of the optimized functions hidden in order
      to prevent GOT slots being generated for them. */			\
diff --git a/sysdeps/s390/s390-32/multiarch/memcmp-s390.S b/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
index e9ee6d2..a01f3b7 100644
--- a/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
+++ b/sysdeps/s390/s390-32/multiarch/memcmp-s390.S
@@ -101,4 +101,7 @@  END(__memcmp_z10)
 .set     memcmp,__memcmp_default
 .weak    bcmp
 .set	 bcmp,__memcmp_default
+#elif defined SHARED && IS_IN (libc)
+.globl   __GI_memcmp
+.set     __GI_memcmp,__memcmp_default
 #endif
diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
index 4e30cdf..92ffaea 100644
--- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
+++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
@@ -92,7 +92,10 @@  END(__memcpy_z10)
 
 #include "../memcpy.S"
 
-#if !defined SHARED || !IS_IN (libc)
+#if defined SHARED && IS_IN (libc)
+.globl   __GI_memcpy
+.set     __GI_memcpy,__memcpy_default
+#else
 .globl   memcpy
 .set     memcpy,__memcpy_default
 #endif
diff --git a/sysdeps/s390/s390-32/multiarch/memset-s390.S b/sysdeps/s390/s390-32/multiarch/memset-s390.S
index 47277c1..a2ddd98 100644
--- a/sysdeps/s390/s390-32/multiarch/memset-s390.S
+++ b/sysdeps/s390/s390-32/multiarch/memset-s390.S
@@ -110,4 +110,7 @@  END(__memset_mvcle)
 #if !IS_IN (libc)
 .globl   memset
 .set     memset,__memset_default
+#elif defined SHARED && IS_IN (libc)
+.globl   __GI_memset
+.set     __GI_memset,__memset_default
 #endif
diff --git a/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S b/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
index 2a4c0ae..b28ccaf 100644
--- a/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
+++ b/sysdeps/s390/s390-64/multiarch/memcmp-s390x.S
@@ -98,4 +98,7 @@  END(__memcmp_z10)
 .set     memcmp,__memcmp_default
 .weak    bcmp
 .set	 bcmp,__memcmp_default
+#elif defined SHARED && IS_IN (libc)
+.globl   __GI_memcmp
+.set     __GI_memcmp,__memcmp_default
 #endif
diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
index 69fa562..8f54526 100644
--- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
+++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
@@ -88,7 +88,10 @@  END(__memcpy_z10)
 
 #include "../memcpy.S"
 
-#if !defined SHARED || !IS_IN (libc)
+#if defined SHARED && IS_IN (libc)
+.globl   __GI_memcpy
+.set     __GI_memcpy,__memcpy_default
+#else
 .globl   memcpy
 .set     memcpy,__memcpy_default
 #endif
diff --git a/sysdeps/s390/s390-64/multiarch/memset-s390x.S b/sysdeps/s390/s390-64/multiarch/memset-s390x.S
index 05e0682..a77e798 100644
--- a/sysdeps/s390/s390-64/multiarch/memset-s390x.S
+++ b/sysdeps/s390/s390-64/multiarch/memset-s390x.S
@@ -106,4 +106,7 @@  END(__memset_mvcle)
 #if !IS_IN (libc)
 .globl   memset
 .set     memset,__memset_default
+#elif defined SHARED && IS_IN (libc)
+.globl   __GI_memset
+.set     __GI_memset,__memset_default
 #endif