[1/2] S390: Save and restore fprs/vrs while resolving symbols.

Message ID 1458645925-28632-1-git-send-email-stli@linux.vnet.ibm.com
State Committed
Headers

Commit Message

Stefan Liebler March 22, 2016, 11:25 a.m. UTC
  On s390, no fpr/vrs were saved while resolving a symbol
via _dl_runtime_resolve/_dl_runtime_profile.

According to the ABI, the fpr argument registers are call-clobbered.
In leaf functions, gcc 4.9 and newer can use fprs for saving/restoring gprs
instead of saving them to the stack.
If gcc does this in one of the resolver functions, then the floating-point
arguments of a library function are invalid for the first call to that function.
Thus, this patch saves/restores the fprs around the resolving code.

The same could occur for vector registers. Furthermore, an ifunc resolver
could also clobber the vector/floating-point argument registers.
Thus, this patch provides the additional variants _dl_runtime_resolve_vx/
_dl_runtime_profile_vx, which are used if the kernel reports that
we are running on a machine with vector registers.

Furthermore, when _dl_runtime_profile called _dl_call_pltexit,
the pointers to the inregs/outregs structs were set up incorrectly.
They now point to the correct locations in the stack frame.
Before branching back to the caller, the return values of the resolved
function are now restored, so the return registers no longer contain
the return values of the _dl_call_pltexit() call.
On s390-32, an endless loop occurred whenever _dl_call_pltexit() was to be
called. This code path now branches to that function instead of to the
address just after the preceding basr instruction.

ChangeLog:

	* sysdeps/s390/s390-32/dl-trampoline.S: Include dl-trampoline.h twice
	to create a non-vector/vector version for _dl_runtime_resolve and
	_dl_runtime_profile. Move implementation to ...
	* sysdeps/s390/s390-32/dl-trampoline.h: ... here.
	(_dl_runtime_resolve) Save and restore fpr/vrs.
	(_dl_runtime_profile) Save and restore vrs and fix some issues
	if _dl_call_pltexit is called.
	* sysdeps/s390/s390-32/dl-machine.h (elf_machine_runtime_setup):
	Choose the correct resolver function if running on a machine with vx.
	* sysdeps/s390/s390-64/dl-trampoline.S: Include dl-trampoline.h twice
	to create a non-vector/vector version for _dl_runtime_resolve and
	_dl_runtime_profile. Move implementation to ...
	* sysdeps/s390/s390-64/dl-trampoline.h: ... here.
	(_dl_runtime_resolve) Save and restore fpr/vrs.
	(_dl_runtime_profile) Save and restore vrs and fix some issues
	if _dl_call_pltexit is called.
	* sysdeps/s390/s390-64/dl-machine.h (elf_machine_runtime_setup):
	Choose the correct resolver function if running on a machine with vx.
---
 sysdeps/s390/s390-32/dl-machine.h    |  27 ++++-
 sysdeps/s390/s390-32/dl-trampoline.S | 134 ++--------------------
 sysdeps/s390/s390-32/dl-trampoline.h | 215 +++++++++++++++++++++++++++++++++++
 sysdeps/s390/s390-64/dl-machine.h    |  27 ++++-
 sysdeps/s390/s390-64/dl-trampoline.S | 130 ++-------------------
 sysdeps/s390/s390-64/dl-trampoline.h | 211 ++++++++++++++++++++++++++++++++++
 6 files changed, 496 insertions(+), 248 deletions(-)
 create mode 100644 sysdeps/s390/s390-32/dl-trampoline.h
 create mode 100644 sysdeps/s390/s390-64/dl-trampoline.h
  

Comments

Stefan Liebler March 31, 2016, 3:40 p.m. UTC | #1
Committed

On 03/22/2016 12:25 PM, Stefan Liebler wrote:
> On s390, no fpr/vrs were saved while resolving a symbol
> via _dl_runtime_resolve/_dl_runtime_profile.
>
> According to the abi, the fpr-arguments are defined as call clobbered.
> In leaf-functions, gcc 4.9 and newer can use fprs for saving/restoring gprs
> instead of saving them to the stack.
> If gcc do this in one of the resolver-functions, then the floating point
> arguments of a library-function are invalid for the first library-function-call.
> Thus, this patch saves/restores the fprs around the resolving code.
>
> The same could occur for vector registers. Furthermore an ifunc-resolver
> could also clobber the vector/floating point argument registers.
> Thus this patch provides the further variants _dl_runtime_resolve_vx/
> _dl_runtime_profile_vx, which are used if the kernel claims, that
> we run on a machine with vector registers.
>
> Furthermore, if _dl_runtime_profile calls _dl_call_pltexit,
> the pointers to inregs-/outregs-structs were setup invalid.
> Now they point to the correct location in the stack-frame.
> Before branching back to the caller, the return values are now
> restored instead of containing the return values of the
> _dl_call_pltexit() call.
> On s390-32, an endless loop occurs if _dl_call_pltexit() should be called.
> Now, this code-path branches to this function instead of just after the
> preceding basr-instruction.
>
> ChangeLog:
>
> 	* sysdeps/s390/s390-32/dl-trampoline.S: Include dl-trampoline.h twice
> 	to create a non-vector/vector version for _dl_runtime_resolve and
> 	_dl_runtime_profile. Move implementation to ...
> 	* sysdeps/s390/s390-32/dl-trampoline.h: ... here.
> 	(_dl_runtime_resolve) Save and restore fpr/vrs.
> 	(_dl_runtime_profile) Save and restore vrs and fix some issues
> 	if _dl_call_pltexit is called.
> 	* sysdeps/s390/s390-32/dl-machine.h (elf_machine_runtime_setup):
> 	Choose the correct resolver function if running on a machine with vx.
> 	* sysdeps/s390/s390-64/dl-trampoline.S: Include dl-trampoline.h twice
> 	to create a non-vector/vector version for _dl_runtime_resolve and
> 	_dl_runtime_profile. Move implementation to ...
> 	* sysdeps/s390/s390-64/dl-trampoline.h: ... here.
> 	(_dl_runtime_resolve) Save and restore fpr/vrs.
> 	(_dl_runtime_profile) Save and restore vrs and fix some issues
> 	* sysdeps/s390/s390-64/dl-machine.h: (elf_machine_runtime_setup):
> 	Choose the correct resolver function if running on a machine with vx.
> ---
>   sysdeps/s390/s390-32/dl-machine.h    |  27 ++++-
>   sysdeps/s390/s390-32/dl-trampoline.S | 134 ++--------------------
>   sysdeps/s390/s390-32/dl-trampoline.h | 215 +++++++++++++++++++++++++++++++++++
>   sysdeps/s390/s390-64/dl-machine.h    |  27 ++++-
>   sysdeps/s390/s390-64/dl-trampoline.S | 130 ++-------------------
>   sysdeps/s390/s390-64/dl-trampoline.h | 211 ++++++++++++++++++++++++++++++++++
>   6 files changed, 496 insertions(+), 248 deletions(-)
>   create mode 100644 sysdeps/s390/s390-32/dl-trampoline.h
>   create mode 100644 sysdeps/s390/s390-64/dl-trampoline.h
>
> diff --git a/sysdeps/s390/s390-32/dl-machine.h b/sysdeps/s390/s390-32/dl-machine.h
> index 14bde3b..ec0ae4a 100644
> --- a/sysdeps/s390/s390-32/dl-machine.h
> +++ b/sysdeps/s390/s390-32/dl-machine.h
> @@ -89,6 +89,11 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   {
>     extern void _dl_runtime_resolve (Elf32_Word);
>     extern void _dl_runtime_profile (Elf32_Word);
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +  extern void _dl_runtime_resolve_vx (Elf32_Word);
> +  extern void _dl_runtime_profile_vx (Elf32_Word);
> +#endif
> +
>
>     if (l->l_info[DT_JMPREL] && lazy)
>       {
> @@ -116,7 +121,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   	 end in this function.  */
>         if (__glibc_unlikely (profile))
>   	{
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
> +	    got[2] = (Elf32_Addr) &_dl_runtime_profile_vx;
> +	  else
> +	    got[2] = (Elf32_Addr) &_dl_runtime_profile;
> +#else
>   	  got[2] = (Elf32_Addr) &_dl_runtime_profile;
> +#endif
>
>   	  if (GLRO(dl_profile) != NULL
>   	      && _dl_name_match_p (GLRO(dl_profile), l))
> @@ -125,9 +137,18 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   	    GL(dl_profile_map) = l;
>   	}
>         else
> -	/* This function will get called to fix up the GOT entry indicated by
> -	   the offset on the stack, and then jump to the resolved address.  */
> -	got[2] = (Elf32_Addr) &_dl_runtime_resolve;
> +	{
> +	  /* This function will get called to fix up the GOT entry indicated by
> +	     the offset on the stack, and then jump to the resolved address.  */
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
> +	    got[2] = (Elf32_Addr) &_dl_runtime_resolve_vx;
> +	  else
> +	    got[2] = (Elf32_Addr) &_dl_runtime_resolve;
> +#else
> +	  got[2] = (Elf32_Addr) &_dl_runtime_resolve;
> +#endif
> +	}
>       }
>
>     return lazy;
> diff --git a/sysdeps/s390/s390-32/dl-trampoline.S b/sysdeps/s390/s390-32/dl-trampoline.S
> index 1645610..859183c 100644
> --- a/sysdeps/s390/s390-32/dl-trampoline.S
> +++ b/sysdeps/s390/s390-32/dl-trampoline.S
> @@ -16,130 +16,18 @@
>      License along with the GNU C Library; if not, see
>      <http://www.gnu.org/licenses/>.  */
>
> -/* This code is used in dl-runtime.c to call the `fixup' function
> -   and then redirect to the address it returns.  */
> -
> -/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
> - * with the following linkage:
> - *   r2 - r6 : parameter registers
> - *   f0, f2 : floating point parameter registers
> - *   24(r15), 28(r15) : PLT arguments PLT1, PLT2
> - *   96(r15) : additional stack parameters
> - * The normal clobber rules for function calls apply:
> - *   r0 - r5 : call clobbered
> - *   r6 - r13 :	 call saved
> - *   r14 : return address (call clobbered)
> - *   r15 : stack pointer (call saved)
> - *   f4, f6 : call saved
> - *   f0 - f3, f5, f7 - f15 : call clobbered
> - */
> -
>   #include <sysdep.h>
>
>   	.text
> -	.globl _dl_runtime_resolve
> -	.type _dl_runtime_resolve, @function
> -	cfi_startproc
> -	.align 16
> -_dl_runtime_resolve:
> -	stm    %r2,%r5,32(%r15)		# save registers
> -	st     %r14,8(%r15)
> -	cfi_offset (r14, -88)
> -	lr     %r0,%r15			# create stack frame
> -	ahi    %r15,-96
> -	cfi_adjust_cfa_offset (96)
> -	st     0,0(%r15)
> -	lm     %r2,%r3,120(%r15)	# load args saved by PLT
> -	basr   %r1,0
> -0:	l      %r14,1f-0b(%r1)
> -	bas    %r14,0(%r14,%r1)		# call resolver
> -	lr     %r1,%r2			# function addr returned in r2
> -	ahi    %r15,96			# remove stack frame
> -	cfi_adjust_cfa_offset (-96)
> -	l      %r14,8(15)		# restore registers
> -	lm     %r2,%r5,32(%r15)
> -	br     %r1
> -1:	.long  _dl_fixup - 0b
> -	cfi_endproc
> -	.size _dl_runtime_resolve, .-_dl_runtime_resolve
> -
> -
> -#ifndef PROF
> -	.globl _dl_runtime_profile
> -	.type _dl_runtime_profile, @function
> -	cfi_startproc
> -	.align 16
> -_dl_runtime_profile:
> -	stm    %r2,%r6,32(%r15)		# save registers
> -	std    %f0,56(%r15)
> -	std    %f2,64(%r15)
> -	st     %r6,8(%r15)
> -	st     %r12,12(%r15)
> -	st     %r14,16(%r15)
> -	cfi_offset (r6, -64)
> -	cfi_offset (f0, -40)
> -	cfi_offset (f2, -32)
> -	cfi_offset (r12, -84)
> -	cfi_offset (r14, -80)
> -	lr     %r12,%r15		# create stack frame
> -	cfi_def_cfa_register (12)
> -	ahi    %r15,-96
> -	st     %r12,0(%r15)
> -	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
> -	lr     %r4,%r14			# return address as third parameter
> -	basr   %r1,0
> -0:	l      %r14,6f-0b(%r1)
> -	la     %r5,32(%r12)		# pointer to struct La_s390_32_regs
> -	la     %r6,20(%r12)		# long int * framesize
> -	bas    %r14,0(%r14,%r1)		# call resolver
> -	lr     %r1,%r2			# function addr returned in r2
> -	icm    %r0,15,20(%r12)		# load & test framesize
> -	jnm    2f
> -
> -	lm     %r2,%r6,32(%r12)
> -	ld     %f0,56(%r12)
> -	ld     %f2,64(%r12)
> -	lr     %r15,%r12		# remove stack frame
> -	cfi_def_cfa_register (15)
> -	l      %r14,16(%r15)		# restore registers
> -	l      %r12,12(%r15)
> -	br     %r1			# tail-call to the resolved function
> -
> -	cfi_def_cfa_register (12)
> -2:	jz     4f			# framesize == 0 ?
> -	ahi    %r0,7			# align framesize to 8
> -	lhi    %r2,-8
> -	nr     %r0,%r2
> -	slr    %r15,%r0			# make room for framesize bytes
> -	st     %r12,0(%r15)
> -	la     %r2,96(%r15)
> -	la     %r3,96(%r12)
> -	srl    %r0,3
> -3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
> -	la     %r2,8(%r2)
> -	la     %r3,8(%r3)
> -	brct   %r0,3b
> -4:	lm     %r2,%r6,32(%r12)		# load register parameters
> -	ld     %f0,56(%r12)
> -	ld     %f2,64(%r12)
> -	basr   %r14,%r1			# call resolved function
> -	stm    %r2,%r3,72(%r12)
> -	std    %f0,80(%r12)
> -	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
> -	basr   %r1,0
> -5:	l      %r14,7f-5b(%r1)
> -	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
> -	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
> -	basr   %r14,%r1			# call _dl_call_pltexit
> -
> -	lr     %r15,%r12		# remove stack frame
> -	cfi_def_cfa_register (15)
> -	l      %r14,16(%r15)		# restore registers
> -	l      %r12,12(%r15)
> -	br     %r14
> -
> -6:	.long  _dl_profile_fixup - 0b
> -7:	.long  _dl_call_pltexit - 5b
> -	cfi_endproc
> -	.size _dl_runtime_profile, .-_dl_runtime_profile
> +/* Create variant of _dl_runtime_resolve/profile for machines before z13.
> +   No vector registers are saved/restored.  */
> +#include <dl-trampoline.h>
> +
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +/* Create variant of _dl_runtime_resolve/profile for z13 and newer.
> +   The vector registers are saved/restored, too.*/
> +# define _dl_runtime_resolve _dl_runtime_resolve_vx
> +# define _dl_runtime_profile _dl_runtime_profile_vx
> +# define RESTORE_VRS
> +# include <dl-trampoline.h>
>   #endif
> diff --git a/sysdeps/s390/s390-32/dl-trampoline.h b/sysdeps/s390/s390-32/dl-trampoline.h
> new file mode 100644
> index 0000000..a152a7b
> --- /dev/null
> +++ b/sysdeps/s390/s390-32/dl-trampoline.h
> @@ -0,0 +1,215 @@
> +/* PLT trampolines.  s390 version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* This code is used in dl-runtime.c to call the `fixup' function
> +   and then redirect to the address it returns.  */
> +
> +/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
> + * with the following linkage:
> + *   r2 - r6 : parameter registers
> + *   f0, f2 : floating point parameter registers
> + *   v24, v26, v28, v30, v25, v27, v29, v31 : vector parameter registers
> + *   24(r15), 28(r15) : PLT arguments PLT1, PLT2
> + *   96(r15) : additional stack parameters
> + * The normal clobber rules for function calls apply:
> + *   r0 - r5 : call clobbered
> + *   r6 - r13 :	call saved
> + *   r14 : return address (call clobbered)
> + *   r15 : stack pointer (call saved)
> + *   f4, f6 : call saved
> + *   f0 - f3, f5, f7 - f15 : call clobbered
> + *   v0 - v3, v5, v7 - v15 : bytes 0-7 overlap with fprs: call clobbered
> +               bytes 8-15: call clobbered
> + *   v4, v6 : bytes 0-7 overlap with f4, f6: call saved
> +              bytes 8-15: call clobbered
> + *   v16 - v31 : call clobbered
> + */
> +
> +
> +	.globl _dl_runtime_resolve
> +	.type _dl_runtime_resolve, @function
> +	cfi_startproc
> +	.align 16
> +_dl_runtime_resolve:
> +	stm    %r2,%r5,32(%r15)		# save registers
> +	cfi_offset (r2, -64)
> +	cfi_offset (r3, -60)
> +	cfi_offset (r4, -56)
> +	cfi_offset (r5, -52)
> +	std    %f0,56(%r15)
> +	cfi_offset (f0, -40)
> +	std    %f2,64(%r15)
> +	cfi_offset (f2, -32)
> +	st     %r14,8(%r15)
> +	cfi_offset (r14, -88)
> +	lr     %r0,%r15
> +	lm     %r2,%r3,24(%r15)		# load args saved by PLT
> +#ifdef RESTORE_VRS
> +	ahi    %r15,-224		# create stack frame
> +	cfi_adjust_cfa_offset (224)
> +	.machine push
> +	.machine "z13"
> +	.machinemode "zarch_nohighgprs"
> +	vstm   %v24,%v31,96(%r15)	# store call-clobbered vr arguments
> +	cfi_offset (v24, -224)
> +	cfi_offset (v25, -208)
> +	cfi_offset (v26, -192)
> +	cfi_offset (v27, -176)
> +	cfi_offset (v28, -160)
> +	cfi_offset (v29, -144)
> +	cfi_offset (v30, -128)
> +	cfi_offset (v31, -112)
> +	.machine pop
> +#else
> +	ahi    %r15,-96			# create stack frame
> +	cfi_adjust_cfa_offset (96)
> +#endif
> +	st     %r0,0(%r15)		# write backchain
> +	basr   %r1,0
> +0:	l      %r14,1f-0b(%r1)
> +	bas    %r14,0(%r14,%r1)		# call _dl_fixup
> +	lr     %r1,%r2			# function addr returned in r2
> +#ifdef RESTORE_VRS
> +	.machine push
> +	.machine "z13"
> +	.machinemode "zarch_nohighgprs"
> +	vlm    %v24,%v31,96(%r15)	# restore vector registers
> +	.machine pop
> +	aghi   %r15,224			# remove stack frame
> +	cfi_adjust_cfa_offset (-224)
> +#else
> +	ahi    %r15,96			# remove stack frame
> +	cfi_adjust_cfa_offset (-96)
> +#endif
> +	l      %r14,8(15)		# restore registers
> +	ld     %f0,56(%r15)
> +	ld     %f2,64(%r15)
> +	lm     %r2,%r5,32(%r15)
> +	br     %r1
> +1:	.long  _dl_fixup - 0b
> +	cfi_endproc
> +	.size _dl_runtime_resolve, .-_dl_runtime_resolve
> +
> +
> +#ifndef PROF
> +	.globl _dl_runtime_profile
> +	.type _dl_runtime_profile, @function
> +	cfi_startproc
> +	.align 16
> +_dl_runtime_profile:
> +	stm    %r2,%r6,32(%r15)		# save registers
> +	cfi_offset (r2, -64)		# + r6 needed as arg for
> +	cfi_offset (r3, -60)		#  _dl_profile_fixup
> +	cfi_offset (r4, -56)
> +	cfi_offset (r5, -52)
> +	cfi_offset (r6, -48)
> +	std    %f0,56(%r15)
> +	cfi_offset (f0, -40)
> +	std    %f2,64(%r15)
> +	cfi_offset (f2, -32)
> +	st     %r12,12(%r15)		# r12 is used as backup of r15
> +	cfi_offset (r12, -84)
> +	st     %r14,16(%r15)
> +	cfi_offset (r14, -80)
> +	lr     %r12,%r15		# backup stack pointer
> +	cfi_def_cfa_register (12)
> +#ifdef RESTORE_VRS
> +	ahi    %r15,-224		# create stack frame
> +	.machine push
> +	.machine "z13"
> +	.machinemode "zarch_nohighgprs"
> +	vstm   %v24,%v31,96(%r15)	# store call-clobbered vr arguments
> +	cfi_offset (v24, -224)
> +	cfi_offset (v25, -208)
> +	cfi_offset (v26, -192)
> +	cfi_offset (v27, -176)
> +	cfi_offset (v28, -160)
> +	cfi_offset (v29, -144)
> +	cfi_offset (v30, -128)
> +	cfi_offset (v31, -112)
> +	.machine pop
> +#else
> +	ahi    %r15,-96			# create stack frame
> +#endif
> +	st     %r12,0(%r15)		# save backchain
> +	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
> +	lr     %r4,%r14			# return address as third parameter
> +	basr   %r1,0
> +0:	l      %r14,6f-0b(%r1)
> +	la     %r5,32(%r12)		# pointer to struct La_s390_32_regs
> +	la     %r6,20(%r12)		# long int * framesize
> +	bas    %r14,0(%r14,%r1)		# call resolver
> +	lr     %r1,%r2			# function addr returned in r2
> +	ld     %f0,56(%r12)		# restore call-clobbered arg fprs
> +	ld     %f2,64(%r12)
> +#ifdef RESTORE_VRS
> +	.machine push
> +	.machine "z13"
> +	.machinemode "zarch_nohighgprs"
> +	vlm    %v24,%v31,96(%r15)	# restore call-clobbered arg vrs
> +	.machine pop
> +#endif
> +	icm    %r0,15,20(%r12)		# load & test framesize
> +	jnm    2f
> +
> +	lm     %r2,%r6,32(%r12)
> +	lr     %r15,%r12		# remove stack frame
> +	cfi_def_cfa_register (15)
> +	l      %r14,16(%r15)		# restore registers
> +	l      %r12,12(%r15)
> +	br     %r1			# tail-call to the resolved function
> +
> +	cfi_def_cfa_register (12)
> +2:	jz     4f			# framesize == 0 ?
> +	ahi    %r0,7			# align framesize to 8
> +	lhi    %r2,-8
> +	nr     %r0,%r2
> +	slr    %r15,%r0			# make room for framesize bytes
> +	st     %r12,0(%r15)		# save backchain
> +	la     %r2,96(%r15)
> +	la     %r3,96(%r12)
> +	srl    %r0,3
> +3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
> +	la     %r2,8(%r2)
> +	la     %r3,8(%r3)
> +	brct   %r0,3b
> +4:	lm     %r2,%r6,32(%r12)		# load register parameters
> +	basr   %r14,%r1			# call resolved function
> +	stm    %r2,%r3,72(%r12)		# store return values r2, r3, f0
> +	std    %f0,80(%r12)		# to struct La_s390_32_retval
> +	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
> +	basr   %r1,0
> +5:	l      %r14,7f-5b(%r1)
> +	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
> +	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
> +	bas    %r14,0(%r14,%r1)		# call _dl_call_pltexit
> +
> +	lr     %r15,%r12		# remove stack frame
> +	cfi_def_cfa_register (15)
> +	l      %r14,16(%r15)		# restore registers
> +	l      %r12,12(%r15)
> +	l      %r2,72(%r15)		# restore return values
> +	l      %r3,76(%r15)
> +	ld     %f0,80(%r15)
> +	br     %r14
> +
> +6:	.long  _dl_profile_fixup - 0b
> +7:	.long  _dl_call_pltexit - 5b
> +	cfi_endproc
> +	.size _dl_runtime_profile, .-_dl_runtime_profile
> +#endif
> diff --git a/sysdeps/s390/s390-64/dl-machine.h b/sysdeps/s390/s390-64/dl-machine.h
> index cb81aaf..9ee7c92 100644
> --- a/sysdeps/s390/s390-64/dl-machine.h
> +++ b/sysdeps/s390/s390-64/dl-machine.h
> @@ -26,6 +26,7 @@
>   #include <sys/param.h>
>   #include <string.h>
>   #include <link.h>
> +#include <sysdeps/s390/dl-procinfo.h>
>   #include <dl-irel.h>
>
>   #define ELF_MACHINE_IRELATIVE       R_390_IRELATIVE
> @@ -78,6 +79,10 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   {
>     extern void _dl_runtime_resolve (Elf64_Word);
>     extern void _dl_runtime_profile (Elf64_Word);
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +  extern void _dl_runtime_resolve_vx (Elf64_Word);
> +  extern void _dl_runtime_profile_vx (Elf64_Word);
> +#endif
>
>     if (l->l_info[DT_JMPREL] && lazy)
>       {
> @@ -105,7 +110,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   	 end in this function.	*/
>         if (__glibc_unlikely (profile))
>   	{
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
> +	    got[2] = (Elf64_Addr) &_dl_runtime_profile_vx;
> +	  else
> +	    got[2] = (Elf64_Addr) &_dl_runtime_profile;
> +#else
>   	  got[2] = (Elf64_Addr) &_dl_runtime_profile;
> +#endif
>
>   	  if (GLRO(dl_profile) != NULL
>   	      && _dl_name_match_p (GLRO(dl_profile), l))
> @@ -114,9 +126,18 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
>   	    GL(dl_profile_map) = l;
>   	}
>         else
> -	/* This function will get called to fix up the GOT entry indicated by
> -	   the offset on the stack, and then jump to the resolved address.  */
> -	got[2] = (Elf64_Addr) &_dl_runtime_resolve;
> +	{
> +	  /* This function will get called to fix up the GOT entry indicated by
> +	     the offset on the stack, and then jump to the resolved address.  */
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
> +	    got[2] = (Elf64_Addr) &_dl_runtime_resolve_vx;
> +	  else
> +	    got[2] = (Elf64_Addr) &_dl_runtime_resolve;
> +#else
> +	  got[2] = (Elf64_Addr) &_dl_runtime_resolve;
> +#endif
> +	}
>       }
>
>     return lazy;
> diff --git a/sysdeps/s390/s390-64/dl-trampoline.S b/sysdeps/s390/s390-64/dl-trampoline.S
> index 6919ed0..1b0c9e2 100644
> --- a/sysdeps/s390/s390-64/dl-trampoline.S
> +++ b/sysdeps/s390/s390-64/dl-trampoline.S
> @@ -16,126 +16,18 @@
>      License along with the GNU C Library; if not, see
>      <http://www.gnu.org/licenses/>.  */
>
> -/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
> - * with the following linkage:
> - *   r2 - r6 : parameter registers
> - *   f0, f2, f4, f6 : floating point parameter registers
> - *   48(r15), 56(r15) : PLT arguments PLT1, PLT2
> - *   160(r15) : additional stack parameters
> - * The normal clobber rules for function calls apply:
> - *   r0 - r5 : call clobbered
> - *   r6 - r13 :	 call saved
> - *   r14 : return address (call clobbered)
> - *   r15 : stack pointer (call saved)
> - *   f1, f3, f5, f7 : call saved
> - *   f0 - f3, f5, f7 - f15 : call clobbered
> - */
> -
>   #include <sysdep.h>
>
>   	.text
> -	.globl _dl_runtime_resolve
> -	.type _dl_runtime_resolve, @function
> -	cfi_startproc
> -	.align 16
> -_dl_runtime_resolve:
> -	stmg   %r2,%r5,64(15)	# save call-clobbered argument registers
> -	stg    %r14,96(15)
> -	cfi_offset (r14, -64)
> -	lgr    %r0,%r15
> -	aghi   %r15,-160	# create stack frame
> -	cfi_adjust_cfa_offset (160)
> -	stg    %r0,0(%r15)      # write backchain
> -	lmg    %r2,%r3,208(%r15)# load args saved by PLT
> -	brasl  %r14,_dl_fixup	# call fixup
> -	lgr    %r1,%r2		# function addr returned in r2
> -	aghi   %r15,160		# remove stack frame
> -	cfi_adjust_cfa_offset (-160)
> -	lg     %r14,96(15)	# restore registers
> -	lmg    %r2,%r5,64(15)
> -	br     %r1
> -	cfi_endproc
> -	.size _dl_runtime_resolve, .-_dl_runtime_resolve
> -
> -
> -#ifndef PROF
> -	.globl _dl_runtime_profile
> -	.type _dl_runtime_profile, @function
> -	cfi_startproc
> -	.align 16
> -_dl_runtime_profile:
> -	stmg   %r2,%r6,64(%r15)		# save call-clobbered arg regs
> -	std    %f0,104(%r15)		# + r6 needed as arg for
> -	std    %f2,112(%r15)		#  _dl_profile_fixup
> -	std    %f4,120(%r15)
> -	std    %f6,128(%r15)
> -	stg    %r12,24(%r15)		# r12 is used as backup of r15
> -	stg    %r14,32(%r15)
> -	cfi_offset (r6, -96)
> -	cfi_offset (f0, -56)
> -	cfi_offset (f2, -48)
> -	cfi_offset (f4, -40)
> -	cfi_offset (f6, -32)
> -	cfi_offset (r12, -136)
> -	cfi_offset (r14, -128)
> -	lgr    %r12,%r15		# backup stack pointer
> -	cfi_def_cfa_register (12)
> -	aghi   %r15,-160		# create stack frame
> -	stg    %r12,0(%r15)		# save backchain
> -	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
> -	lgr    %r4,%r14			# return address as third parameter
> -	la     %r5,64(%r12)		# pointer to struct La_s390_32_regs
> -	la     %r6,40(%r12)		# long int * framesize
> -	brasl  %r14,_dl_profile_fixup	# call resolver
> -	lgr    %r1,%r2			# function addr returned in r2
> -	lg     %r0,40(%r12)		# load framesize
> -	ltgr   %r0,%r0
> -	jnm    1f
> -
> -	lmg    %r2,%r6,64(%r12)		# framesize < 0 means no pltexit call
> -	ld     %f0,104(%r12)		# so we can do a tail call without
> -	ld     %f2,112(%r12)		# copying the arg overflow area
> -	ld     %f4,120(%r12)
> -	ld     %f6,128(%r12)
> -
> -	lgr    %r15,%r12		# remove stack frame
> -	cfi_def_cfa_register (15)
> -	lg     %r14,32(%r15)		# restore registers
> -	lg     %r12,24(%r15)
> -	br     %r1			# tail-call to resolved function
> -
> -	cfi_def_cfa_register (12)
> -1:	jz     4f			# framesize == 0 ?
> -	aghi   %r0,7			# align framesize to 8
> -	nill   %r0,0xfff8
> -	slgr   %r15,%r0			# make room for framesize bytes
> -	stg    %r12,0(%r15)
> -	la     %r2,160(%r15)
> -	la     %r3,160(%r12)
> -	srlg   %r0,%r0,3
> -3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
> -	la     %r2,8(%r2)
> -	la     %r3,8(%r3)
> -	brctg  %r0,3b
> -4:	lmg    %r2,%r6,64(%r12)		# load register parameters
> -	ld     %f0,104(%r12)            # restore call-clobbered arg regs
> -	ld     %f2,112(%r12)
> -	ld     %f4,120(%r12)
> -	ld     %f6,128(%r12)
> -	basr   %r14,%r1			# call resolved function
> -	stg    %r2,136(%r12)
> -	std    %f0,144(%r12)
> -	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
> -	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
> -	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
> -	brasl  %r14,_dl_call_pltexit
> -
> -	lgr    %r15,%r12		# remove stack frame
> -	cfi_def_cfa_register (15)
> -	lg     %r14,32(%r15)		# restore registers
> -	lg     %r12,24(%r15)
> -	br     %r14
> -
> -	cfi_endproc
> -	.size _dl_runtime_profile, .-_dl_runtime_profile
> +/* Create variant of _dl_runtime_resolve/profile for machines before z13.
> +   No vector registers are saved/restored.  */
> +#include <dl-trampoline.h>
> +
> +#if defined HAVE_S390_VX_ASM_SUPPORT
> +/* Create variant of _dl_runtime_resolve/profile for z13 and newer.
> +   The vector registers are saved/restored, too.*/
> +# define _dl_runtime_resolve _dl_runtime_resolve_vx
> +# define _dl_runtime_profile _dl_runtime_profile_vx
> +# define RESTORE_VRS
> +# include <dl-trampoline.h>
>   #endif
> diff --git a/sysdeps/s390/s390-64/dl-trampoline.h b/sysdeps/s390/s390-64/dl-trampoline.h
> new file mode 100644
> index 0000000..658e3a3
> --- /dev/null
> +++ b/sysdeps/s390/s390-64/dl-trampoline.h
> @@ -0,0 +1,211 @@
> +/* PLT trampolines.  s390x version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
> + * with the following linkage:
> + *   r2 - r6 : parameter registers
> + *   f0, f2, f4, f6 : floating point parameter registers
> + *   v24, v26, v28, v30, v25, v27, v29, v31 : vector parameter registers
> + *   48(r15), 56(r15) : PLT arguments PLT1, PLT2
> + *   160(r15) : additional stack parameters
> + * The normal clobber rules for function calls apply:
> + *   r0 - r5 : call clobbered
> + *   r6 - r13 :	 call saved
> + *   r14 : return address (call clobbered)
> + *   r15 : stack pointer (call saved)
> + *   f0 - f7 : call clobbered
> + *   f8 - f15 : call saved
> + *   v0 - v7 : bytes 0-7 overlap with f0-f7: call clobbered
> +               bytes 8-15: call clobbered
> + *   v8 - v15 : bytes 0-7 overlap with f8-f15: call saved
> +                bytes 8-15: call clobbered
> + *   v16 - v31 : call clobbered
> + */
> +
> +	.globl _dl_runtime_resolve
> +	.type _dl_runtime_resolve, @function
> +	cfi_startproc
> +	.align 16
> +_dl_runtime_resolve:
> +	stmg   %r2,%r5,64(%r15)	# save call-clobbered argument registers
> +	cfi_offset (r2, -96)
> +	cfi_offset (r3, -88)
> +	cfi_offset (r4, -80)
> +	cfi_offset (r5, -72)
> +	std    %f0,104(%r15)
> +	cfi_offset (f0, -56)
> +	std    %f2,112(%r15)
> +	cfi_offset (f2, -48)
> +	std    %f4,120(%r15)
> +	cfi_offset (f4, -40)
> +	std    %f6,128(%r15)
> +	cfi_offset (f6, -32)
> +	stg    %r14,96(15)
> +	cfi_offset (r14, -64)
> +	lmg    %r2,%r3,48(%r15) # load args for fixup saved by PLT
> +	lgr    %r0,%r15
> +#ifdef RESTORE_VRS
> +	aghi   %r15,-288        # create stack frame
> +	cfi_adjust_cfa_offset (288)
> +	.machine push
> +	.machine "z13"
> +	vstm   %v24,%v31,160(%r15)# store call-clobbered vector argument registers
> +	cfi_offset (v24, -288)
> +	cfi_offset (v25, -272)
> +	cfi_offset (v26, -256)
> +	cfi_offset (v27, -240)
> +	cfi_offset (v28, -224)
> +	cfi_offset (v29, -208)
> +	cfi_offset (v30, -192)
> +	cfi_offset (v31, -176)
> +	.machine pop
> +#else
> +	aghi   %r15,-160        # create stack frame
> +	cfi_adjust_cfa_offset (160)
> +#endif
> +	stg    %r0,0(%r15)      # write backchain
> +	brasl  %r14,_dl_fixup	# call _dl_fixup
> +	lgr    %r1,%r2		# function addr returned in r2
> +#ifdef RESTORE_VRS
> +	.machine push
> +	.machine "z13"
> +	vlm    %v24,%v31,160(%r15)# restore vector registers
> +	.machine pop
> +	aghi   %r15,288         # remove stack frame
> +	cfi_adjust_cfa_offset (-288)
> +#else
> +	aghi   %r15,160         # remove stack frame
> +	cfi_adjust_cfa_offset (-160)
> +#endif
> +	lg     %r14,96(%r15)	# restore registers
> +	ld     %f0,104(%r15)
> +	ld     %f2,112(%r15)
> +	ld     %f4,120(%r15)
> +	ld     %f6,128(%r15)
> +	lmg    %r2,%r5,64(%r15)
> +	br     %r1
> +	cfi_endproc
> +	.size _dl_runtime_resolve, .-_dl_runtime_resolve
> +
> +
> +#ifndef PROF
> +	.globl _dl_runtime_profile
> +	.type _dl_runtime_profile, @function
> +	cfi_startproc
> +	.align 16
> +_dl_runtime_profile:
> +	stmg   %r2,%r6,64(%r15)		# save call-clobbered arg regs
> +	cfi_offset (r2, -96)		# + r6 needed as arg for
> +	cfi_offset (r3, -88)		#  _dl_profile_fixup
> +	cfi_offset (r4, -80)
> +	cfi_offset (r5, -72)
> +	cfi_offset (r6, -64)
> +	std    %f0,104(%r15)
> +	cfi_offset (f0, -56)
> +	std    %f2,112(%r15)
> +	cfi_offset (f2, -48)
> +	std    %f4,120(%r15)
> +	cfi_offset (f4, -40)
> +	std    %f6,128(%r15)
> +	cfi_offset (f6, -32)
> +	stg    %r12,24(%r15)		# r12 is used as backup of r15
> +	cfi_offset (r12, -136)
> +	stg    %r14,32(%r15)
> +	cfi_offset (r14, -128)
> +	lgr    %r12,%r15		# backup stack pointer
> +	cfi_def_cfa_register (12)
> +#ifdef RESTORE_VRS
> +	aghi   %r15,-288		# create stack frame
> +	.machine push
> +	.machine "z13"
> +	vstm   %v24,%v31,160(%r15)# store call-clobbered vector argument registers
> +	cfi_offset (v24, -288)
> +	cfi_offset (v25, -272)
> +	cfi_offset (v26, -256)
> +	cfi_offset (v27, -240)
> +	cfi_offset (v28, -224)
> +	cfi_offset (v29, -208)
> +	cfi_offset (v30, -192)
> +	cfi_offset (v31, -176)
> +	.machine pop
> +#else
> +	aghi   %r15,-160		# create stack frame
> +#endif
> +	stg    %r12,0(%r15)		# save backchain
> +	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
> +	lgr    %r4,%r14			# return address as third parameter
> +	la     %r5,64(%r12)		# pointer to struct La_s390_64_regs
> +	la     %r6,40(%r12)		# long int * framesize
> +	brasl  %r14,_dl_profile_fixup	# call resolver
> +	lgr    %r1,%r2			# function addr returned in r2
> +	ld     %f0,104(%r12)		# restore call-clobbered arg fprs
> +	ld     %f2,112(%r12)
> +	ld     %f4,120(%r12)
> +	ld     %f6,128(%r12)
> +#ifdef RESTORE_VRS
> +	.machine push
> +	.machine "z13"
> +	vlm    %v24,%v31,160(%r15)	# restore call-clobbered arg vrs
> +	.machine pop
> +#endif
> +	lg     %r0,40(%r12)		# load framesize
> +	ltgr   %r0,%r0
> +	jnm    1f
> +
> +	lmg    %r2,%r6,64(%r12)		# framesize < 0 means no pltexit call
> +					# so we can do a tail call without
> +					# copying the arg overflow area
> +	lgr    %r15,%r12		# remove stack frame
> +	cfi_def_cfa_register (15)
> +	lg     %r14,32(%r15)		# restore registers
> +	lg     %r12,24(%r15)
> +	br     %r1			# tail-call to resolved function
> +
> +	cfi_def_cfa_register (12)
> +1:	jz     4f			# framesize == 0 ?
> +	aghi   %r0,7			# align framesize to 8
> +	nill   %r0,0xfff8
> +	slgr   %r15,%r0			# make room for framesize bytes
> +	stg    %r12,0(%r15)		# save backchain
> +	la     %r2,160(%r15)
> +	la     %r3,160(%r12)
> +	srlg   %r0,%r0,3
> +3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
> +	la     %r2,8(%r2)		# depending on framesize
> +	la     %r3,8(%r3)
> +	brctg  %r0,3b
> +4:	lmg    %r2,%r6,64(%r12)		# restore call-clobbered arg gprs
> +	basr   %r14,%r1			# call resolved function
> +	stg    %r2,136(%r12)		# store return values r2, f0
> +	std    %f0,144(%r12)		# to struct La_s390_64_retval
> +	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
> +	la     %r4,64(%r12)		# pointer to struct La_s390_64_regs
> +	la     %r5,136(%r12)		# pointer to struct La_s390_64_retval
> +	brasl  %r14,_dl_call_pltexit
> +
> +	lgr    %r15,%r12		# remove stack frame
> +	cfi_def_cfa_register (15)
> +	lg     %r14,32(%r15)		# restore registers
> +	lg     %r12,24(%r15)
> +	lg     %r2,136(%r15)		# restore return values
> +	ld     %f0,144(%r15)
> +	br     %r14			# Jump back to caller
> +
> +	cfi_endproc
> +	.size _dl_runtime_profile, .-_dl_runtime_profile
> +#endif
>
  
Florian Weimer April 6, 2016, 11:56 a.m. UTC | #2
On 03/22/2016 12:25 PM, Stefan Liebler wrote:
> On s390, no fpr/vrs were saved while resolving a symbol
> via _dl_runtime_resolve/_dl_runtime_profile.
> 
> According to the abi, the fpr-arguments are defined as call clobbered.
> In leaf-functions, gcc 4.9 and newer can use fprs for saving/restoring gprs
> instead of saving them to the stack.
> If gcc do this in one of the resolver-functions, then the floating point
> arguments of a library-function are invalid for the first library-function-call.
> Thus, this patch saves/restores the fprs around the resolving code.

I think this bug is end-user-visible because it is due to an incorrect
implementation of the original ABI.  Can you file a bug to track this
and add the number to the ChangeLog retroactively?

(The other patch, “Extend structs La_s390_regs / La_s390_retval” seems
different in this regard; I believe applications would have to check
that glibc supports the new ABI before they use the new features.)

Thanks,
Florian
  
Stefan Liebler April 6, 2016, 1:45 p.m. UTC | #3
On 04/06/2016 01:56 PM, Florian Weimer wrote:
> On 03/22/2016 12:25 PM, Stefan Liebler wrote:
>> On s390, no fpr/vrs were saved while resolving a symbol
>> via _dl_runtime_resolve/_dl_runtime_profile.
>>
>> According to the abi, the fpr-arguments are defined as call clobbered.
>> In leaf-functions, gcc 4.9 and newer can use fprs for saving/restoring gprs
>> instead of saving them to the stack.
>> If gcc do this in one of the resolver-functions, then the floating point
>> arguments of a library-function are invalid for the first library-function-call.
>> Thus, this patch saves/restores the fprs around the resolving code.
>
> I think this bug is end-user-visible because it is due to an incorrect
> implementation of the original ABI.  Can you file a bug to track this
> and add the number to the ChangeLog retroactively?
>
> (The other patch, “Extend structs La_s390_regs / La_s390_retval” seems
> different in this regard; I believe applications would have to check
> that glibc supports the new ABI before they use the new features.)
>
> Thanks,
> Florian
>
Okay, done. I have filed Bugzilla bug 19916:
https://sourceware.org/bugzilla/show_bug.cgi?id=19916
  

Patch

diff --git a/sysdeps/s390/s390-32/dl-machine.h b/sysdeps/s390/s390-32/dl-machine.h
index 14bde3b..ec0ae4a 100644
--- a/sysdeps/s390/s390-32/dl-machine.h
+++ b/sysdeps/s390/s390-32/dl-machine.h
@@ -89,6 +89,11 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   extern void _dl_runtime_resolve (Elf32_Word);
   extern void _dl_runtime_profile (Elf32_Word);
+#if defined HAVE_S390_VX_ASM_SUPPORT
+  extern void _dl_runtime_resolve_vx (Elf32_Word);
+  extern void _dl_runtime_profile_vx (Elf32_Word);
+#endif
+
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -116,7 +121,14 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
+#if defined HAVE_S390_VX_ASM_SUPPORT
+	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
+	    got[2] = (Elf32_Addr) &_dl_runtime_profile_vx;
+	  else
+	    got[2] = (Elf32_Addr) &_dl_runtime_profile;
+#else
 	  got[2] = (Elf32_Addr) &_dl_runtime_profile;
+#endif
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -125,9 +137,18 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry indicated by
+	     the offset on the stack, and then jump to the resolved address.  */
+#if defined HAVE_S390_VX_ASM_SUPPORT
+	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
+	    got[2] = (Elf32_Addr) &_dl_runtime_resolve_vx;
+	  else
+	    got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+#else
+	  got[2] = (Elf32_Addr) &_dl_runtime_resolve;
+#endif
+	}
     }
 
   return lazy;
diff --git a/sysdeps/s390/s390-32/dl-trampoline.S b/sysdeps/s390/s390-32/dl-trampoline.S
index 1645610..859183c 100644
--- a/sysdeps/s390/s390-32/dl-trampoline.S
+++ b/sysdeps/s390/s390-32/dl-trampoline.S
@@ -16,130 +16,18 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This code is used in dl-runtime.c to call the `fixup' function
-   and then redirect to the address it returns.  */
-
-/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
- * with the following linkage:
- *   r2 - r6 : parameter registers
- *   f0, f2 : floating point parameter registers
- *   24(r15), 28(r15) : PLT arguments PLT1, PLT2
- *   96(r15) : additional stack parameters
- * The normal clobber rules for function calls apply:
- *   r0 - r5 : call clobbered
- *   r6 - r13 :	 call saved
- *   r14 : return address (call clobbered)
- *   r15 : stack pointer (call saved)
- *   f4, f6 : call saved
- *   f0 - f3, f5, f7 - f15 : call clobbered
- */
-
 #include <sysdep.h>
 
 	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	cfi_startproc
-	.align 16
-_dl_runtime_resolve:
-	stm    %r2,%r5,32(%r15)		# save registers
-	st     %r14,8(%r15)
-	cfi_offset (r14, -88)
-	lr     %r0,%r15			# create stack frame
-	ahi    %r15,-96
-	cfi_adjust_cfa_offset (96)
-	st     0,0(%r15)
-	lm     %r2,%r3,120(%r15)	# load args saved by PLT
-	basr   %r1,0
-0:	l      %r14,1f-0b(%r1)
-	bas    %r14,0(%r14,%r1)		# call resolver
-	lr     %r1,%r2			# function addr returned in r2
-	ahi    %r15,96			# remove stack frame
-	cfi_adjust_cfa_offset (-96)
-	l      %r14,8(15)		# restore registers
-	lm     %r2,%r5,32(%r15)
-	br     %r1
-1:	.long  _dl_fixup - 0b
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	cfi_startproc
-	.align 16
-_dl_runtime_profile:
-	stm    %r2,%r6,32(%r15)		# save registers
-	std    %f0,56(%r15)
-	std    %f2,64(%r15)
-	st     %r6,8(%r15)
-	st     %r12,12(%r15)
-	st     %r14,16(%r15)
-	cfi_offset (r6, -64)
-	cfi_offset (f0, -40)
-	cfi_offset (f2, -32)
-	cfi_offset (r12, -84)
-	cfi_offset (r14, -80)
-	lr     %r12,%r15		# create stack frame
-	cfi_def_cfa_register (12)
-	ahi    %r15,-96
-	st     %r12,0(%r15)
-	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
-	lr     %r4,%r14			# return address as third parameter
-	basr   %r1,0
-0:	l      %r14,6f-0b(%r1)
-	la     %r5,32(%r12)		# pointer to struct La_s390_32_regs
-	la     %r6,20(%r12)		# long int * framesize
-	bas    %r14,0(%r14,%r1)		# call resolver
-	lr     %r1,%r2			# function addr returned in r2
-	icm    %r0,15,20(%r12)		# load & test framesize
-	jnm    2f
-
-	lm     %r2,%r6,32(%r12)
-	ld     %f0,56(%r12)
-	ld     %f2,64(%r12)
-	lr     %r15,%r12		# remove stack frame
-	cfi_def_cfa_register (15)
-	l      %r14,16(%r15)		# restore registers
-	l      %r12,12(%r15)
-	br     %r1			# tail-call to the resolved function
-
-	cfi_def_cfa_register (12)
-2:	jz     4f			# framesize == 0 ?
-	ahi    %r0,7			# align framesize to 8
-	lhi    %r2,-8
-	nr     %r0,%r2
-	slr    %r15,%r0			# make room for framesize bytes
-	st     %r12,0(%r15)
-	la     %r2,96(%r15)
-	la     %r3,96(%r12)
-	srl    %r0,3
-3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
-	la     %r2,8(%r2)
-	la     %r3,8(%r3)
-	brct   %r0,3b
-4:	lm     %r2,%r6,32(%r12)		# load register parameters
-	ld     %f0,56(%r12)
-	ld     %f2,64(%r12)
-	basr   %r14,%r1			# call resolved function
-	stm    %r2,%r3,72(%r12)
-	std    %f0,80(%r12)
-	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
-	basr   %r1,0
-5:	l      %r14,7f-5b(%r1)
-	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
-	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
-	basr   %r14,%r1			# call _dl_call_pltexit
-
-	lr     %r15,%r12		# remove stack frame
-	cfi_def_cfa_register (15)
-	l      %r14,16(%r15)		# restore registers
-	l      %r12,12(%r15)
-	br     %r14
-
-6:	.long  _dl_profile_fixup - 0b
-7:	.long  _dl_call_pltexit - 5b
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+/* Create variant of _dl_runtime_resolve/profile for machines before z13.
+   No vector registers are saved/restored.  */
+#include <dl-trampoline.h>
+
+#if defined HAVE_S390_VX_ASM_SUPPORT
+/* Create variant of _dl_runtime_resolve/profile for z13 and newer.
+   The vector registers are saved/restored, too.  */
+# define _dl_runtime_resolve _dl_runtime_resolve_vx
+# define _dl_runtime_profile _dl_runtime_profile_vx
+# define RESTORE_VRS
+# include <dl-trampoline.h>
 #endif
diff --git a/sysdeps/s390/s390-32/dl-trampoline.h b/sysdeps/s390/s390-32/dl-trampoline.h
new file mode 100644
index 0000000..a152a7b
--- /dev/null
+++ b/sysdeps/s390/s390-32/dl-trampoline.h
@@ -0,0 +1,215 @@ 
+/* PLT trampolines.  s390 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This code is used in dl-runtime.c to call the `fixup' function
+   and then redirect to the address it returns.  */
+
+/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
+ * with the following linkage:
+ *   r2 - r6 : parameter registers
+ *   f0, f2 : floating point parameter registers
+ *   v24, v26, v28, v30, v25, v27, v29, v31 : vector parameter registers
+ *   24(r15), 28(r15) : PLT arguments PLT1, PLT2
+ *   96(r15) : additional stack parameters
+ * The normal clobber rules for function calls apply:
+ *   r0 - r5 : call clobbered
+ *   r6 - r13 :	call saved
+ *   r14 : return address (call clobbered)
+ *   r15 : stack pointer (call saved)
+ *   f4, f6 : call saved
+ *   f0 - f3, f5, f7 - f15 : call clobbered
+ *   v0 - v3, v5, v7 - v15 : bytes 0-7 overlap with fprs: call clobbered
+               bytes 8-15: call clobbered
+ *   v4, v6 : bytes 0-7 overlap with f4, f6: call saved
+              bytes 8-15: call clobbered
+ *   v16 - v31 : call clobbered
+ */
+
+
+	.globl _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_resolve:
+	stm    %r2,%r5,32(%r15)		# save registers
+	cfi_offset (r2, -64)
+	cfi_offset (r3, -60)
+	cfi_offset (r4, -56)
+	cfi_offset (r5, -52)
+	std    %f0,56(%r15)
+	cfi_offset (f0, -40)
+	std    %f2,64(%r15)
+	cfi_offset (f2, -32)
+	st     %r14,8(%r15)
+	cfi_offset (r14, -88)
+	lr     %r0,%r15
+	lm     %r2,%r3,24(%r15)		# load args saved by PLT
+#ifdef RESTORE_VRS
+	ahi    %r15,-224		# create stack frame
+	cfi_adjust_cfa_offset (224)
+	.machine push
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+	vstm   %v24,%v31,96(%r15)	# store call-clobbered vr arguments
+	cfi_offset (v24, -224)
+	cfi_offset (v25, -208)
+	cfi_offset (v26, -192)
+	cfi_offset (v27, -176)
+	cfi_offset (v28, -160)
+	cfi_offset (v29, -144)
+	cfi_offset (v30, -128)
+	cfi_offset (v31, -112)
+	.machine pop
+#else
+	ahi    %r15,-96			# create stack frame
+	cfi_adjust_cfa_offset (96)
+#endif
+	st     %r0,0(%r15)		# write backchain
+	basr   %r1,0
+0:	l      %r14,1f-0b(%r1)
+	bas    %r14,0(%r14,%r1)		# call _dl_fixup
+	lr     %r1,%r2			# function addr returned in r2
+#ifdef RESTORE_VRS
+	.machine push
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+	vlm    %v24,%v31,96(%r15)	# restore vector registers
+	.machine pop
+	aghi   %r15,224			# remove stack frame
+	cfi_adjust_cfa_offset (-224)
+#else
+	ahi    %r15,96			# remove stack frame
+	cfi_adjust_cfa_offset (-96)
+#endif
+	l      %r14,8(15)		# restore registers
+	ld     %f0,56(%r15)
+	ld     %f2,64(%r15)
+	lm     %r2,%r5,32(%r15)
+	br     %r1
+1:	.long  _dl_fixup - 0b
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_profile:
+	stm    %r2,%r6,32(%r15)		# save registers
+	cfi_offset (r2, -64)		# + r6 needed as arg for
+	cfi_offset (r3, -60)		#  _dl_profile_fixup
+	cfi_offset (r4, -56)
+	cfi_offset (r5, -52)
+	cfi_offset (r6, -48)
+	std    %f0,56(%r15)
+	cfi_offset (f0, -40)
+	std    %f2,64(%r15)
+	cfi_offset (f2, -32)
+	st     %r12,12(%r15)		# r12 is used as backup of r15
+	cfi_offset (r12, -84)
+	st     %r14,16(%r15)
+	cfi_offset (r14, -80)
+	lr     %r12,%r15		# backup stack pointer
+	cfi_def_cfa_register (12)
+#ifdef RESTORE_VRS
+	ahi    %r15,-224		# create stack frame
+	.machine push
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+	vstm   %v24,%v31,96(%r15)	# store call-clobbered vr arguments
+	cfi_offset (v24, -224)
+	cfi_offset (v25, -208)
+	cfi_offset (v26, -192)
+	cfi_offset (v27, -176)
+	cfi_offset (v28, -160)
+	cfi_offset (v29, -144)
+	cfi_offset (v30, -128)
+	cfi_offset (v31, -112)
+	.machine pop
+#else
+	ahi    %r15,-96			# create stack frame
+#endif
+	st     %r12,0(%r15)		# save backchain
+	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
+	lr     %r4,%r14			# return address as third parameter
+	basr   %r1,0
+0:	l      %r14,6f-0b(%r1)
+	la     %r5,32(%r12)		# pointer to struct La_s390_32_regs
+	la     %r6,20(%r12)		# long int * framesize
+	bas    %r14,0(%r14,%r1)		# call resolver
+	lr     %r1,%r2			# function addr returned in r2
+	ld     %f0,56(%r12)		# restore call-clobbered arg fprs
+	ld     %f2,64(%r12)
+#ifdef RESTORE_VRS
+	.machine push
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+	vlm    %v24,%v31,96(%r15)	# restore call-clobbered arg vrs
+	.machine pop
+#endif
+	icm    %r0,15,20(%r12)		# load & test framesize
+	jnm    2f
+
+	lm     %r2,%r6,32(%r12)
+	lr     %r15,%r12		# remove stack frame
+	cfi_def_cfa_register (15)
+	l      %r14,16(%r15)		# restore registers
+	l      %r12,12(%r15)
+	br     %r1			# tail-call to the resolved function
+
+	cfi_def_cfa_register (12)
+2:	jz     4f			# framesize == 0 ?
+	ahi    %r0,7			# align framesize to 8
+	lhi    %r2,-8
+	nr     %r0,%r2
+	slr    %r15,%r0			# make room for framesize bytes
+	st     %r12,0(%r15)		# save backchain
+	la     %r2,96(%r15)
+	la     %r3,96(%r12)
+	srl    %r0,3
+3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
+	la     %r2,8(%r2)
+	la     %r3,8(%r3)
+	brct   %r0,3b
+4:	lm     %r2,%r6,32(%r12)		# load register parameters
+	basr   %r14,%r1			# call resolved function
+	stm    %r2,%r3,72(%r12)		# store return values r2, r3, f0
+	std    %f0,80(%r12)		# to struct La_s390_32_retval
+	lm     %r2,%r3,24(%r12)		# load arguments saved by PLT
+	basr   %r1,0
+5:	l      %r14,7f-5b(%r1)
+	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
+	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
+	bas    %r14,0(%r14,%r1)		# call _dl_call_pltexit
+
+	lr     %r15,%r12		# remove stack frame
+	cfi_def_cfa_register (15)
+	l      %r14,16(%r15)		# restore registers
+	l      %r12,12(%r15)
+	l      %r2,72(%r15)		# restore return values
+	l      %r3,76(%r15)
+	ld     %f0,80(%r15)
+	br     %r14
+
+6:	.long  _dl_profile_fixup - 0b
+7:	.long  _dl_call_pltexit - 5b
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
+#endif
diff --git a/sysdeps/s390/s390-64/dl-machine.h b/sysdeps/s390/s390-64/dl-machine.h
index cb81aaf..9ee7c92 100644
--- a/sysdeps/s390/s390-64/dl-machine.h
+++ b/sysdeps/s390/s390-64/dl-machine.h
@@ -26,6 +26,7 @@ 
 #include <sys/param.h>
 #include <string.h>
 #include <link.h>
+#include <sysdeps/s390/dl-procinfo.h>
 #include <dl-irel.h>
 
 #define ELF_MACHINE_IRELATIVE       R_390_IRELATIVE
@@ -78,6 +79,10 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   extern void _dl_runtime_resolve (Elf64_Word);
   extern void _dl_runtime_profile (Elf64_Word);
+#if defined HAVE_S390_VX_ASM_SUPPORT
+  extern void _dl_runtime_resolve_vx (Elf64_Word);
+  extern void _dl_runtime_profile_vx (Elf64_Word);
+#endif
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -105,7 +110,14 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.	*/
       if (__glibc_unlikely (profile))
 	{
+#if defined HAVE_S390_VX_ASM_SUPPORT
+	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
+	    got[2] = (Elf64_Addr) &_dl_runtime_profile_vx;
+	  else
+	    got[2] = (Elf64_Addr) &_dl_runtime_profile;
+#else
 	  got[2] = (Elf64_Addr) &_dl_runtime_profile;
+#endif
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -114,9 +126,18 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	got[2] = (Elf64_Addr) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry indicated by
+	     the offset on the stack, and then jump to the resolved address.  */
+#if defined HAVE_S390_VX_ASM_SUPPORT
+	  if (GLRO(dl_hwcap) & HWCAP_S390_VX)
+	    got[2] = (Elf64_Addr) &_dl_runtime_resolve_vx;
+	  else
+	    got[2] = (Elf64_Addr) &_dl_runtime_resolve;
+#else
+	  got[2] = (Elf64_Addr) &_dl_runtime_resolve;
+#endif
+	}
     }
 
   return lazy;
diff --git a/sysdeps/s390/s390-64/dl-trampoline.S b/sysdeps/s390/s390-64/dl-trampoline.S
index 6919ed0..1b0c9e2 100644
--- a/sysdeps/s390/s390-64/dl-trampoline.S
+++ b/sysdeps/s390/s390-64/dl-trampoline.S
@@ -16,126 +16,18 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
- * with the following linkage:
- *   r2 - r6 : parameter registers
- *   f0, f2, f4, f6 : floating point parameter registers
- *   48(r15), 56(r15) : PLT arguments PLT1, PLT2
- *   160(r15) : additional stack parameters
- * The normal clobber rules for function calls apply:
- *   r0 - r5 : call clobbered
- *   r6 - r13 :	 call saved
- *   r14 : return address (call clobbered)
- *   r15 : stack pointer (call saved)
- *   f1, f3, f5, f7 : call saved
- *   f0 - f3, f5, f7 - f15 : call clobbered
- */
-
 #include <sysdep.h>
 
 	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	cfi_startproc
-	.align 16
-_dl_runtime_resolve:
-	stmg   %r2,%r5,64(15)	# save call-clobbered argument registers
-	stg    %r14,96(15)
-	cfi_offset (r14, -64)
-	lgr    %r0,%r15
-	aghi   %r15,-160	# create stack frame
-	cfi_adjust_cfa_offset (160)
-	stg    %r0,0(%r15)      # write backchain
-	lmg    %r2,%r3,208(%r15)# load args saved by PLT
-	brasl  %r14,_dl_fixup	# call fixup
-	lgr    %r1,%r2		# function addr returned in r2
-	aghi   %r15,160		# remove stack frame
-	cfi_adjust_cfa_offset (-160)
-	lg     %r14,96(15)	# restore registers
-	lmg    %r2,%r5,64(15)
-	br     %r1
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	cfi_startproc
-	.align 16
-_dl_runtime_profile:
-	stmg   %r2,%r6,64(%r15)		# save call-clobbered arg regs
-	std    %f0,104(%r15)		# + r6 needed as arg for
-	std    %f2,112(%r15)		#  _dl_profile_fixup
-	std    %f4,120(%r15)
-	std    %f6,128(%r15)
-	stg    %r12,24(%r15)		# r12 is used as backup of r15
-	stg    %r14,32(%r15)
-	cfi_offset (r6, -96)
-	cfi_offset (f0, -56)
-	cfi_offset (f2, -48)
-	cfi_offset (f4, -40)
-	cfi_offset (f6, -32)
-	cfi_offset (r12, -136)
-	cfi_offset (r14, -128)
-	lgr    %r12,%r15		# backup stack pointer
-	cfi_def_cfa_register (12)
-	aghi   %r15,-160		# create stack frame
-	stg    %r12,0(%r15)		# save backchain
-	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
-	lgr    %r4,%r14			# return address as third parameter
-	la     %r5,64(%r12)		# pointer to struct La_s390_32_regs
-	la     %r6,40(%r12)		# long int * framesize
-	brasl  %r14,_dl_profile_fixup	# call resolver
-	lgr    %r1,%r2			# function addr returned in r2
-	lg     %r0,40(%r12)		# load framesize
-	ltgr   %r0,%r0
-	jnm    1f
-
-	lmg    %r2,%r6,64(%r12)		# framesize < 0 means no pltexit call
-	ld     %f0,104(%r12)		# so we can do a tail call without
-	ld     %f2,112(%r12)		# copying the arg overflow area
-	ld     %f4,120(%r12)
-	ld     %f6,128(%r12)
-
-	lgr    %r15,%r12		# remove stack frame
-	cfi_def_cfa_register (15)
-	lg     %r14,32(%r15)		# restore registers
-	lg     %r12,24(%r15)
-	br     %r1			# tail-call to resolved function
-
-	cfi_def_cfa_register (12)
-1:	jz     4f			# framesize == 0 ?
-	aghi   %r0,7			# align framesize to 8
-	nill   %r0,0xfff8
-	slgr   %r15,%r0			# make room for framesize bytes
-	stg    %r12,0(%r15)
-	la     %r2,160(%r15)
-	la     %r3,160(%r12)
-	srlg   %r0,%r0,3
-3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
-	la     %r2,8(%r2)
-	la     %r3,8(%r3)
-	brctg  %r0,3b
-4:	lmg    %r2,%r6,64(%r12)		# load register parameters
-	ld     %f0,104(%r12)            # restore call-clobbered arg regs
-	ld     %f2,112(%r12)
-	ld     %f4,120(%r12)
-	ld     %f6,128(%r12)
-	basr   %r14,%r1			# call resolved function
-	stg    %r2,136(%r12)
-	std    %f0,144(%r12)
-	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
-	la     %r4,32(%r12)		# pointer to struct La_s390_32_regs
-	la     %r5,72(%r12)		# pointer to struct La_s390_32_retval
-	brasl  %r14,_dl_call_pltexit
-
-	lgr    %r15,%r12		# remove stack frame
-	cfi_def_cfa_register (15)
-	lg     %r14,32(%r15)		# restore registers
-	lg     %r12,24(%r15)
-	br     %r14
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+/* Create variant of _dl_runtime_resolve/profile for machines before z13.
+   No vector registers are saved/restored.  */
+#include <dl-trampoline.h>
+
+#if defined HAVE_S390_VX_ASM_SUPPORT
+/* Create variant of _dl_runtime_resolve/profile for z13 and newer.
+   The vector registers are saved/restored, too.  */
+# define _dl_runtime_resolve _dl_runtime_resolve_vx
+# define _dl_runtime_profile _dl_runtime_profile_vx
+# define RESTORE_VRS
+# include <dl-trampoline.h>
 #endif
diff --git a/sysdeps/s390/s390-64/dl-trampoline.h b/sysdeps/s390/s390-64/dl-trampoline.h
new file mode 100644
index 0000000..658e3a3
--- /dev/null
+++ b/sysdeps/s390/s390-64/dl-trampoline.h
@@ -0,0 +1,211 @@ 
+/* PLT trampolines.  s390x version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The PLT stubs will call _dl_runtime_resolve/_dl_runtime_profile
+ * with the following linkage:
+ *   r2 - r6 : parameter registers
+ *   f0, f2, f4, f6 : floating point parameter registers
+ *   v24, v26, v28, v30, v25, v27, v29, v31 : vector parameter registers
+ *   48(r15), 56(r15) : PLT arguments PLT1, PLT2
+ *   160(r15) : additional stack parameters
+ * The normal clobber rules for function calls apply:
+ *   r0 - r5 : call clobbered
+ *   r6 - r13 :	 call saved
+ *   r14 : return address (call clobbered)
+ *   r15 : stack pointer (call saved)
+ *   f0 - f7 : call clobbered
+ *   f8 - f15 : call saved
+ *   v0 - v7 : bytes 0-7 overlap with f0-f7: call clobbered
+               bytes 8-15: call clobbered
+ *   v8 - v15 : bytes 0-7 overlap with f8-f15: call saved
+                bytes 8-15: call clobbered
+ *   v16 - v31 : call clobbered
+ */
+
+	.globl _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_resolve:
+	stmg   %r2,%r5,64(%r15)	# save call-clobbered argument registers
+	cfi_offset (r2, -96)
+	cfi_offset (r3, -88)
+	cfi_offset (r4, -80)
+	cfi_offset (r5, -72)
+	std    %f0,104(%r15)
+	cfi_offset (f0, -56)
+	std    %f2,112(%r15)
+	cfi_offset (f2, -48)
+	std    %f4,120(%r15)
+	cfi_offset (f4, -40)
+	std    %f6,128(%r15)
+	cfi_offset (f6, -32)
+	stg    %r14,96(15)
+	cfi_offset (r14, -64)
+	lmg    %r2,%r3,48(%r15) # load args for fixup saved by PLT
+	lgr    %r0,%r15
+#ifdef RESTORE_VRS
+	aghi   %r15,-288        # create stack frame
+	cfi_adjust_cfa_offset (288)
+	.machine push
+	.machine "z13"
+	vstm   %v24,%v31,160(%r15)# store call-clobbered vector argument registers
+	cfi_offset (v24, -288)
+	cfi_offset (v25, -272)
+	cfi_offset (v26, -256)
+	cfi_offset (v27, -240)
+	cfi_offset (v28, -224)
+	cfi_offset (v29, -208)
+	cfi_offset (v30, -192)
+	cfi_offset (v31, -176)
+	.machine pop
+#else
+	aghi   %r15,-160        # create stack frame
+	cfi_adjust_cfa_offset (160)
+#endif
+	stg    %r0,0(%r15)      # write backchain
+	brasl  %r14,_dl_fixup	# call _dl_fixup
+	lgr    %r1,%r2		# function addr returned in r2
+#ifdef RESTORE_VRS
+	.machine push
+	.machine "z13"
+	vlm    %v24,%v31,160(%r15)# restore vector registers
+	.machine pop
+	aghi   %r15,288         # remove stack frame
+	cfi_adjust_cfa_offset (-288)
+#else
+	aghi   %r15,160         # remove stack frame
+	cfi_adjust_cfa_offset (-160)
+#endif
+	lg     %r14,96(%r15)	# restore registers
+	ld     %f0,104(%r15)
+	ld     %f2,112(%r15)
+	ld     %f4,120(%r15)
+	ld     %f6,128(%r15)
+	lmg    %r2,%r5,64(%r15)
+	br     %r1
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	cfi_startproc
+	.align 16
+_dl_runtime_profile:
+	stmg   %r2,%r6,64(%r15)		# save call-clobbered arg regs
+	cfi_offset (r2, -96)		# + r6 needed as arg for
+	cfi_offset (r3, -88)		#  _dl_profile_fixup
+	cfi_offset (r4, -80)
+	cfi_offset (r5, -72)
+	cfi_offset (r6, -64)
+	std    %f0,104(%r15)
+	cfi_offset (f0, -56)
+	std    %f2,112(%r15)
+	cfi_offset (f2, -48)
+	std    %f4,120(%r15)
+	cfi_offset (f4, -40)
+	std    %f6,128(%r15)
+	cfi_offset (f6, -32)
+	stg    %r12,24(%r15)		# r12 is used as backup of r15
+	cfi_offset (r12, -136)
+	stg    %r14,32(%r15)
+	cfi_offset (r14, -128)
+	lgr    %r12,%r15		# backup stack pointer
+	cfi_def_cfa_register (12)
+#ifdef RESTORE_VRS
+	aghi   %r15,-288		# create stack frame
+	.machine push
+	.machine "z13"
+	vstm   %v24,%v31,160(%r15)# store call-clobbered vector argument registers
+	cfi_offset (v24, -288)
+	cfi_offset (v25, -272)
+	cfi_offset (v26, -256)
+	cfi_offset (v27, -240)
+	cfi_offset (v28, -224)
+	cfi_offset (v29, -208)
+	cfi_offset (v30, -192)
+	cfi_offset (v31, -176)
+	.machine pop
+#else
+	aghi   %r15,-160		# create stack frame
+#endif
+	stg    %r12,0(%r15)		# save backchain
+	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
+	lgr    %r4,%r14			# return address as third parameter
+	la     %r5,64(%r12)		# pointer to struct La_s390_64_regs
+	la     %r6,40(%r12)		# long int * framesize
+	brasl  %r14,_dl_profile_fixup	# call resolver
+	lgr    %r1,%r2			# function addr returned in r2
+	ld     %f0,104(%r12)		# restore call-clobbered arg fprs
+	ld     %f2,112(%r12)
+	ld     %f4,120(%r12)
+	ld     %f6,128(%r12)
+#ifdef RESTORE_VRS
+	.machine push
+	.machine "z13"
+	vlm    %v24,%v31,160(%r15)	# restore call-clobbered arg vrs
+	.machine pop
+#endif
+	lg     %r0,40(%r12)		# load framesize
+	ltgr   %r0,%r0
+	jnm    1f
+
+	lmg    %r2,%r6,64(%r12)		# framesize < 0 means no pltexit call
+					# so we can do a tail call without
+					# copying the arg overflow area
+	lgr    %r15,%r12		# remove stack frame
+	cfi_def_cfa_register (15)
+	lg     %r14,32(%r15)		# restore registers
+	lg     %r12,24(%r15)
+	br     %r1			# tail-call to resolved function
+
+	cfi_def_cfa_register (12)
+1:	jz     4f			# framesize == 0 ?
+	aghi   %r0,7			# align framesize to 8
+	nill   %r0,0xfff8
+	slgr   %r15,%r0			# make room for framesize bytes
+	stg    %r12,0(%r15)		# save backchain
+	la     %r2,160(%r15)
+	la     %r3,160(%r12)
+	srlg   %r0,%r0,3
+3:	mvc    0(8,%r2),0(%r3)		# copy additional parameters
+	la     %r2,8(%r2)		# depending on framesize
+	la     %r3,8(%r3)
+	brctg  %r0,3b
+4:	lmg    %r2,%r6,64(%r12)		# restore call-clobbered arg gprs
+	basr   %r14,%r1			# call resolved function
+	stg    %r2,136(%r12)		# store return values r2, f0
+	std    %f0,144(%r12)		# to struct La_s390_64_retval
+	lmg    %r2,%r3,48(%r12)		# load arguments saved by PLT
+	la     %r4,64(%r12)		# pointer to struct La_s390_64_regs
+	la     %r5,136(%r12)		# pointer to struct La_s390_64_retval
+	brasl  %r14,_dl_call_pltexit
+
+	lgr    %r15,%r12		# remove stack frame
+	cfi_def_cfa_register (15)
+	lg     %r14,32(%r15)		# restore registers
+	lg     %r12,24(%r15)
+	lg     %r2,136(%r15)		# restore return values
+	ld     %f0,144(%r15)
+	br     %r14			# Jump back to caller
+
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
+#endif