diff mbox series

[v6,20/20] elf: Add SVE support for aarch64 rtld-audit

Message ID 20211115183734.531155-21-adhemerval.zanella@linaro.org
State Superseded
Headers show
Series Multiple rtld-audit fixes | expand

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Adhemerval Zanella Nov. 15, 2021, 6:37 p.m. UTC
Lazy binding is enabled when profiling or auditing is used, even when
STO_AARCH64_VARIANT_PCS is set.  Also, to avoid incurring performance
penalties on architectures without SVE, the PLT entrypoint is set to a
newer one, _dl_runtime_profile_sve, which is used iff 'hwcap' has the
HWCAP_SVE bit set.

This should be a fair assumption since SVE has a defined set of
registers for argument passing and return values.  A new ABI with either
different argument passing or different registers would require a
different PLT entry, but I assume this would require another symbol flag
anyway (or at least a different ELF mark to indicate so).

The profile '_dl_runtime_profile_sve' entrypoint assumes the largest SVE
register size possible (2048 bits) and thus it requires a quite large
stack (8976 bytes).  I think it would be possible to make the stack
requirement dynamic depending on the vector length, but it would make
the PLT audit function way more complex.

It extends the La_aarch64_vector with a long double pointer to a
stack-allocated buffer to hold the SVE Z registers, along with a pointer
to hold the P registers on La_aarch64_regs.

It means that if 'lr_sve' is 0 in either La_aarch64_regs or
La_aarch64_retval, the La_aarch64_vector contains the floating-point
registers, which can be accessed directly (non-SVE hardware).  Otherwise,
'La_aarch64_vector.z' points to a memory area that holds up to 'lr_sve'
bytes for the Z registers, which can be loaded with the svld1 intrinsic
for instance (as tst-audit28.c does).  The P registers follow the same
logic, with each La_aarch64_regs.lr_sve_pregs pointing to an area of
memory 'lr_sve/8' bytes in size.

So, to access the FP register as float you can use:

 static inline float regs_vec_to_float (const La_aarch64_regs *regs,
				        int idx)
 {
   float r;
   if (regs->lr_sve == 0)
     r = regs->lr_vreg[idx].s;
   else
     memcpy (&r, &regs->lr_vreg[idx].z[0], sizeof (r));
   return r;
 }

This patch is not complete yet: tst-audit28 does not check whether the
compiler supports SVE (we would need a configure check to disable it in
that case), I need to add a proper comment for the
_dl_runtime_profile_sve stack layout, and the test needs to check for P
register state clobbering.

I also haven't checked the performance penalties of this approach, and
maybe the way I am saving/restoring the SVE registers might be optimized.

In any case, I checked on an SVE machine and at least the testcase works
as expected without any regressions.  I also did a sniff test on a
non-SVE machine.

Checked on aarch64-linux-gnu with SVE support.
---
 elf/do-rel.h                           |   6 +-
 elf/dynamic-link.h                     |  26 ++-
 sysdeps/aarch64/Makefile               |  16 +-
 sysdeps/aarch64/bits/link.h            |   4 +
 sysdeps/aarch64/dl-link.sym            |   3 +
 sysdeps/aarch64/dl-machine.h           |  14 +-
 sysdeps/aarch64/dl-trampoline.S        | 299 ++++++++++++++++++++++++-
 sysdeps/aarch64/tst-audit28.c          |  44 ++++
 sysdeps/aarch64/tst-audit28mod.c       |  48 ++++
 sysdeps/aarch64/tst-audit28mod.h       |  74 ++++++
 sysdeps/aarch64/tst-auditmod27.c       | 187 +++++++++++-----
 sysdeps/aarch64/tst-auditmod28.c       | 193 ++++++++++++++++
 sysdeps/alpha/dl-machine.h             |   2 +-
 sysdeps/arc/dl-machine.h               |   2 +-
 sysdeps/arm/dl-machine.h               |   2 +-
 sysdeps/csky/dl-machine.h              |   2 +-
 sysdeps/hppa/dl-machine.h              |   2 +-
 sysdeps/i386/dl-machine.h              |   2 +-
 sysdeps/ia64/dl-machine.h              |   2 +-
 sysdeps/m68k/dl-machine.h              |   2 +-
 sysdeps/microblaze/dl-machine.h        |   2 +-
 sysdeps/mips/dl-machine.h              |   2 +-
 sysdeps/nios2/dl-machine.h             |   2 +-
 sysdeps/powerpc/powerpc32/dl-machine.h |   2 +-
 sysdeps/powerpc/powerpc64/dl-machine.h |   2 +-
 sysdeps/riscv/dl-machine.h             |   2 +-
 sysdeps/s390/s390-32/dl-machine.h      |   2 +-
 sysdeps/s390/s390-64/dl-machine.h      |   2 +-
 sysdeps/sh/dl-machine.h                |   2 +-
 sysdeps/sparc/sparc32/dl-machine.h     |   2 +-
 sysdeps/sparc/sparc64/dl-machine.h     |   2 +-
 sysdeps/x86_64/dl-machine.h            |   2 +-
 32 files changed, 862 insertions(+), 92 deletions(-)
 create mode 100644 sysdeps/aarch64/tst-audit28.c
 create mode 100644 sysdeps/aarch64/tst-audit28mod.c
 create mode 100644 sysdeps/aarch64/tst-audit28mod.h
 create mode 100644 sysdeps/aarch64/tst-auditmod28.c

Comments

Florian Weimer Dec. 21, 2021, 2:27 p.m. UTC | #1
* Adhemerval Zanella:

> To implement lazy binding is enabled when profiling or auditing used,
> even when STO_AARCH64_VARIANT_PCS is set.  Also, to not incur in
> performance penalties on architecture without SVE, the PLT entrypoint
> is set to a newer one, _dl_runtime_profile_sve, which is used iff
> 'hwcap' has HWCAP_SVE bit set.
>
> This should be a fair assumption since SVE has a defined set of
> registers for argument passing and return values.  A new ABI with either
> different argument passing or different registers would require a
> different PLT entry, but I assume this would require another symbol flag
> anyway (or at least a different ELF mark to indicate so).

Someone familiar with SVE needs to review this.

The restriction to the currently-specified vector calling convention
seems to be a bit arbitrary to me.  To me, STO_AARCH64_VARIANT_PCS
strongly suggests that the toolchain makes every conceivable attempt to
support arbitrary calling conventions, but that's probably just me.

Thanks,
Florian
Adhemerval Zanella Dec. 21, 2021, 2:37 p.m. UTC | #2
On 21/12/2021 11:27, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> To implement lazy binding is enabled when profiling or auditing used,
>> even when STO_AARCH64_VARIANT_PCS is set.  Also, to not incur in
>> performance penalties on architecture without SVE, the PLT entrypoint
>> is set to a newer one, _dl_runtime_profile_sve, which is used iff
>> 'hwcap' has HWCAP_SVE bit set.
>>
>> This should be a fair assumption since SVE has a defined set of
>> registers for argument passing and return values.  A new ABI with either
>> different argument passing or different registers would require a
>> different PLT entry, but I assume this would require another symbol flag
>> anyway (or at least a different ELF mark to indicate so).
> 
> Someone familiar with SVE needs to review this.
> 
> The restriction to the currently-specified vector calling convention
> seems to be a bit arbitrary to me.  To me, STO_AARCH64_VARIANT_PCS
> strongly suggests that the toolchain makes every conceivable attempt to
> support arbitrary calling conventions, but that's probably just me.

The STO_AARCH64_VARIANT_PCS does suggest the compiler might use *any*
calling convention it sees fit for the symbol.  I sent the patch
initially as an RFC, hoping other ARM developers would suggest that
either assuming the SVE calling convention would be fine or that we
should not make any assumptions and save/restore everything.

I can remove it from this patchset and aim to fix it in 2.36.  The main
problem is that it would require bumping LAV_VERSION again for aarch64,
since it would require changing both the La_aarch64_regs and
La_aarch64_retval layouts.
Szabolcs Nagy Dec. 21, 2021, 4:45 p.m. UTC | #3
The 12/21/2021 15:27, Florian Weimer via Libc-alpha wrote:
> * Adhemerval Zanella:
> 
> > To implement lazy binding is enabled when profiling or auditing used,
> > even when STO_AARCH64_VARIANT_PCS is set.  Also, to not incur in
> > performance penalties on architecture without SVE, the PLT entrypoint
> > is set to a newer one, _dl_runtime_profile_sve, which is used iff
> > 'hwcap' has HWCAP_SVE bit set.
> >
> > This should be a fair assumption since SVE has a defined set of
> > registers for argument passing and return values.  A new ABI with either
> > different argument passing or different registers would require a
> > different PLT entry, but I assume this would require another symbol flag
> > anyway (or at least a different ELF mark to indicate so).
> 
> Someone familiar with SVE needs to review this.
> 
> The restriction to the currently-specified vector calling convention
> seems to be a bit arbitrary to me.  To me, STO_AARCH64_VARIANT_PCS
> strongly suggests that the toolchain makes every conceivable attempt to
> support arbitrary calling conventions, but that's probably just me.


thanks for working on this, but yeah this approach does not work:

A PLT hook for STO_AARCH64_VARIANT_PCS symbol has to preserve *all*
registers (except x16,x17,cc), this is the same strict requirement
as linker inserted veneers follow. (historically the dynamic linker
lazy binding code made assumptions about the call convention, but if
there are extern calls with unusual call conventions then this does
not work and it has to behave like the static linker. this is the
meaning of variant_pcs, it is not just for sve calls.)

there is no way to tell which call is an sve call, so i'd suggest
having 2 profile entries: normal and variant_pcs and the latter is
used if a module is marked with DT_AARCH64_VARIANT_PCS and it
preserves all registers.

when "all registers" increase via arch extensions then the PLT hook
code has to be updated, but ideally only the variant_pcs hooks
will require a glibc update to work on a new arch.

e.g. the user callback can take

struct regs {
  // base arg regs inline;
  ...
  void *vpcs_regs;
};

where vpcs_regs!=0 for variant_pcs calls and contains all the regs
(in a way that can represent various combinations of arch extensions,
i don't think we have to make it userfriendly to access).

(i think we don't have to get the aarch64 fixes done in 2.35 as this
is a big design work.)
Adhemerval Zanella Dec. 21, 2021, 5:08 p.m. UTC | #4
On 21/12/2021 13:45, Szabolcs Nagy wrote:
> The 12/21/2021 15:27, Florian Weimer via Libc-alpha wrote:
>> * Adhemerval Zanella:
>>
>>> To implement lazy binding is enabled when profiling or auditing used,
>>> even when STO_AARCH64_VARIANT_PCS is set.  Also, to not incur in
>>> performance penalties on architecture without SVE, the PLT entrypoint
>>> is set to a newer one, _dl_runtime_profile_sve, which is used iff
>>> 'hwcap' has HWCAP_SVE bit set.
>>>
>>> This should be a fair assumption since SVE has a defined set of
>>> registers for argument passing and return values.  A new ABI with either
>>> different argument passing or different registers would require a
>>> different PLT entry, but I assume this would require another symbol flag
>>> anyway (or at least a different ELF mark to indicate so).
>>
>> Someone familiar with SVE needs to review this.
>>
>> The restriction to the currently-specified vector calling convention
>> seems to be a bit arbitrary to me.  To me, STO_AARCH64_VARIANT_PCS
>> strongly suggests that the toolchain makes every conceivable attempt to
>> support arbitrary calling conventions, but that's probably just me.
> 
> 
> thanks for working on this, but yeah this approach does not work:
> 
> A PLT hook for STO_AARCH64_VARIANT_PCS symbol has to preserve *all*
> registers (except x16,x17,cc), this is the same strict requirement
> as linker inserted veneers follow. (historically the dynamic linker
> lazy binding code made assumptions about the call convention, but if
> there are extern calls with unusual call conventions then this does
> not work and it has to behave like the static linker. this is the
> meaning of variant_pcs, it is not just for sve calls.)
> 
> there is no way to tell which call is an sve call, so i'd suggest
> having 2 profile entries: normal and variant_pcs and the latter is
> used if a module is marked with DT_AARCH64_VARIANT_PCS and it
> preserves all registers.
> 
> when "all registers" increase via arch extensions then the PLT hook
> code has to be updated, but ideally only the variant_pcs hooks
> will require a glibc update to work on a new arch.

I was afraid we would require a change like that, and based on the
STO_AARCH64_VARIANT_PCS definition it does seem to be the only option.

> 
> e.g. the user callback can take
> 
> struct regs {
>   // base arg regs inline;
>   ...
>   void *vpcs_regs;
> };
> > where vpcs_regs!=0 for variant_pcs calls and contains all the regs
> (in a way that can represent various combinations of arch extensions,
> i don't think we have to make it userfriendly to access).

Yeah, that was one idea I had when coding it.  It will make
_dl_runtime_profile somewhat more complex for SVE, but that is the cost
of having a non-specific ABI for lazy binding.

I think we will need to have this new field for the BZ #26643 fix at
least, and change the current code to zero it.  It will make it easier
to provide SVE support in 2.36.

> 
> (i think we don't have to get the aarch64 fixes done in 2.35 as this
> is a big design work.)

Agreed, I will drop this patch for v7.
diff mbox series

Patch

diff --git a/elf/do-rel.h b/elf/do-rel.h
index c3368f09ab..8ae7bacebe 100644
--- a/elf/do-rel.h
+++ b/elf/do-rel.h
@@ -43,7 +43,7 @@  static inline void __attribute__ ((always_inline))
 elf_dynamic_do_Rel (struct link_map *map, struct r_scope_elem *scope[],
 		    ElfW(Addr) reladdr, ElfW(Addr) relsize,
 		    __typeof (((ElfW(Dyn) *) 0)->d_un.d_val) nrelative,
-		    int lazy, int skip_ifunc)
+		    int lazy, int profile, int skip_ifunc)
 {
   const ElfW(Rel) *r = (const void *) reladdr;
   const ElfW(Rel) *end = (const void *) (reladdr + relsize);
@@ -70,13 +70,13 @@  elf_dynamic_do_Rel (struct link_map *map, struct r_scope_elem *scope[],
 	  }
 	else
 # endif
-	  elf_machine_lazy_rel (map, scope, l_addr, r, skip_ifunc);
+	  elf_machine_lazy_rel (map, scope, l_addr, r, profile, skip_ifunc);
 
 # ifdef ELF_MACHINE_IRELATIVE
       if (r2 != NULL)
 	for (; r2 <= end2; ++r2)
 	  if (ELFW(R_TYPE) (r2->r_info) == ELF_MACHINE_IRELATIVE)
-	    elf_machine_lazy_rel (map, scope, l_addr, r2, skip_ifunc);
+	    elf_machine_lazy_rel (map, scope, l_addr, r2, profile, skip_ifunc);
 # endif
     }
   else
diff --git a/elf/dynamic-link.h b/elf/dynamic-link.h
index f619615e5c..d1e83d8b8c 100644
--- a/elf/dynamic-link.h
+++ b/elf/dynamic-link.h
@@ -54,11 +54,13 @@  elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
 static inline void __attribute__((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rel) *reloc,
+		      int profile,
 		      int skip_ifunc);
 # else
 static inline void __attribute__((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
+		      int profile,
 		      int skip_ifunc);
 # endif
 #endif
@@ -78,7 +80,8 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
    consumes precisely the very end of the DT_REL*, or DT_JMPREL and DT_REL*
    are completely separate and there is a gap between them.  */
 
-# define _ELF_DYNAMIC_DO_RELOC(RELOC, reloc, map, scope, do_lazy, skip_ifunc, test_rel) \
+# define _ELF_DYNAMIC_DO_RELOC(RELOC, reloc, map, scope, do_lazy, do_profile, \
+			       skip_ifunc, test_rel) \
   do {									      \
     struct { ElfW(Addr) start, size;					      \
 	     __typeof (((ElfW(Dyn) *) 0)->d_un.d_val) nrelative; int lazy; }  \
@@ -120,6 +123,7 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 				ranges[ranges_index].size,		      \
 				ranges[ranges_index].nrelative,		      \
 				ranges[ranges_index].lazy,		      \
+				do_profile,				      \
 				skip_ifunc);				      \
   } while (0)
 
@@ -131,19 +135,21 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 
 # if ! ELF_MACHINE_NO_REL
 #  include "do-rel.h"
-#  define ELF_DYNAMIC_DO_REL(map, scope, lazy, skip_ifunc)	      \
-  _ELF_DYNAMIC_DO_RELOC (REL, Rel, map, scope, lazy, skip_ifunc, _ELF_CHECK_REL)
+#  define ELF_DYNAMIC_DO_REL(map, scope, lazy, profiling, skip_ifunc)	      \
+  _ELF_DYNAMIC_DO_RELOC (REL, Rel, map, scope, lazy, profiling,	      	      \
+			 skip_ifunc, _ELF_CHECK_REL)
 # else
-#  define ELF_DYNAMIC_DO_REL(map, scope, lazy, skip_ifunc) /* Nothing to do.  */
+#  define ELF_DYNAMIC_DO_REL(map, scope, lazy, profiling, skip_ifunc) /* Nothing to do.  */
 # endif
 
 # if ! ELF_MACHINE_NO_RELA
 #  define DO_RELA
 #  include "do-rel.h"
-#  define ELF_DYNAMIC_DO_RELA(map, scope, lazy, skip_ifunc)	      \
-  _ELF_DYNAMIC_DO_RELOC (RELA, Rela, map, scope, lazy, skip_ifunc, _ELF_CHECK_REL)
+#  define ELF_DYNAMIC_DO_RELA(map, scope, lazy, profiling, skip_ifunc)	      \
+  _ELF_DYNAMIC_DO_RELOC (RELA, Rela, map, scope, lazy, profiling, skip_ifunc, \
+			 _ELF_CHECK_REL)
 # else
-#  define ELF_DYNAMIC_DO_RELA(map, scope, lazy, skip_ifunc) /* Nothing to do.  */
+#  define ELF_DYNAMIC_DO_RELA(map, scope, lazy, profiling, skip_ifunc) /* Nothing to do.  */
 # endif
 
 /* This can't just be an inline function because GCC is too dumb
@@ -152,8 +158,10 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
   do {									      \
     int edr_lazy = elf_machine_runtime_setup ((map), (scope), (lazy),	      \
 					      (consider_profile));	      \
-    ELF_DYNAMIC_DO_REL ((map), (scope), edr_lazy, skip_ifunc);		      \
-    ELF_DYNAMIC_DO_RELA ((map), (scope), edr_lazy, skip_ifunc);		      \
+    ELF_DYNAMIC_DO_REL ((map), (scope), edr_lazy, (consider_profile),	      \
+		       	skip_ifunc);					      \
+    ELF_DYNAMIC_DO_RELA ((map), (scope), edr_lazy, (consider_profile),	      \
+			 skip_ifunc);		      \
   } while (0)
 
 #endif
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
index 7183895d04..a2c64d7d5d 100644
--- a/sysdeps/aarch64/Makefile
+++ b/sysdeps/aarch64/Makefile
@@ -12,13 +12,16 @@  ifeq ($(subdir),elf)
 sysdep-dl-routines += dl-bti
 
 tests += tst-audit26 \
-	 tst-audit27
+	 tst-audit27 \
+	 tst-audit28
 
 modules-names += \
     tst-audit26mod \
     tst-auditmod26 \
     tst-audit27mod \
-    tst-auditmod27
+    tst-auditmod27 \
+    tst-audit28mod \
+    tst-auditmod28
 
 $(objpfx)tst-audit26: $(objpfx)tst-audit26mod.so \
 		      $(objpfx)tst-auditmod26.so
@@ -30,6 +33,15 @@  $(objpfx)tst-audit27: $(objpfx)tst-audit27mod.so \
 $(objpfx)tst-audit27mod.so: $(libsupport)
 LDFLAGS-tst-audit27 += -Wl,-z,lazy
 tst-audit27-ENV = LD_AUDIT=$(objpfx)tst-auditmod27.so
+
+$(objpfx)tst-audit28: $(objpfx)tst-audit28mod.so \
+		      $(objpfx)tst-auditmod28.so
+CFLAGS-tst-audit28.c += -march=armv8.2-a+sve
+CFLAGS-tst-audit28mod.c += -march=armv8.2-a+sve
+CFLAGS-tst-auditmod28.c += -march=armv8.2-a+sve
+$(objpfx)tst-audit28mod.so: $(libsupport)
+LDFLAGS-tst-audit28 += -Wl,-z,lazy
+tst-audit28-ENV = LD_AUDIT=$(objpfx)tst-auditmod28.so
 endif
 
 ifeq ($(subdir),elf)
diff --git a/sysdeps/aarch64/bits/link.h b/sysdeps/aarch64/bits/link.h
index 2af90ca6be..2ad0f5d500 100644
--- a/sysdeps/aarch64/bits/link.h
+++ b/sysdeps/aarch64/bits/link.h
@@ -25,6 +25,7 @@  typedef union
   float s;
   double d;
   long double q;
+  long double *z;
 } La_aarch64_vector;
 
 /* Registers for entry into PLT on AArch64.  */
@@ -34,6 +35,8 @@  typedef struct La_aarch64_regs
   La_aarch64_vector lr_vreg[8];
   uint64_t          lr_sp;
   uint64_t          lr_lr;
+  uint8_t           lr_sve;
+  uint16_t          *lr_sve_pregs[4];
 } La_aarch64_regs;
 
 /* Return values for calls from PLT on AArch64.  */
@@ -43,6 +46,7 @@  typedef struct La_aarch64_retval
   uint64_t          lrv_xreg[8];
   /* Up to eight V registers can be used for a return value.  */
   La_aarch64_vector lrv_vreg[8];
+  uint8_t           lrv_sve;
 } La_aarch64_retval;
 __BEGIN_DECLS
 
diff --git a/sysdeps/aarch64/dl-link.sym b/sysdeps/aarch64/dl-link.sym
index 70d153a1d5..9bc56b98ec 100644
--- a/sysdeps/aarch64/dl-link.sym
+++ b/sysdeps/aarch64/dl-link.sym
@@ -10,6 +10,9 @@  DL_OFFSET_RG_X0		offsetof(struct La_aarch64_regs, lr_xreg)
 DL_OFFSET_RG_V0		offsetof(struct La_aarch64_regs, lr_vreg)
 DL_OFFSET_RG_SP		offsetof(struct La_aarch64_regs, lr_sp)
 DL_OFFSET_RG_LR		offsetof(struct La_aarch64_regs, lr_lr)
+DL_OFFSET_RG_SVE        offsetof(struct La_aarch64_regs, lr_sve)
+DL_OFFSET_RG_SVE_PREGS  offsetof(struct La_aarch64_regs, lr_sve_pregs)
 
 DL_OFFSET_RV_X0		offsetof(struct La_aarch64_retval, lrv_xreg)
 DL_OFFSET_RV_V0		offsetof(struct La_aarch64_retval, lrv_vreg)
+DL_OFFSET_RV_SVE	offsetof(struct La_aarch64_retval, lrv_sve)
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 0d5ad218c1..a039a0c32c 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -69,6 +69,9 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       ElfW(Addr) *got;
       extern void _dl_runtime_resolve (ElfW(Word));
       extern void _dl_runtime_profile (ElfW(Word));
+#if HAVE_AARCH64_SVE_ASM
+      extern void _dl_runtime_profile_sve (ElfW(Word));
+#endif
 
       got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
       if (got[1])
@@ -85,7 +88,12 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if ( profile)
 	{
-	   got[2] = (ElfW(Addr)) &_dl_runtime_profile;
+#if HAVE_AARCH64_SVE_ASM
+	  if (GLRO(dl_hwcap) & HWCAP_SVE)
+	    got[2] = (ElfW(Addr)) &_dl_runtime_profile_sve;
+	  else
+#endif
+	    got[2] = (ElfW(Addr)) &_dl_runtime_profile;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -383,6 +391,7 @@  __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr,
 		      const ElfW(Rela) *reloc,
+		      int profile,
 		      int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
@@ -390,7 +399,8 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
   /* Check for unexpected PLT reloc type.  */
   if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
     {
-      if (__glibc_unlikely (map->l_info[DT_AARCH64 (VARIANT_PCS)] != NULL))
+      if (__glibc_unlikely (map->l_info[DT_AARCH64 (VARIANT_PCS)] != NULL)
+	  && profile == 0)
 	{
 	  /* Check the symbol table for variant PCS symbols.  */
 	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 0d540651d4..0aee1fbf3d 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -205,6 +205,9 @@  _dl_runtime_profile:
 	cfi_rel_offset (q6, OFFSET_RG + DL_OFFSET_RG_V0 + 32*3 + 0)
 	cfi_rel_offset (q7, OFFSET_RG + DL_OFFSET_RG_V0 + 32*3 + 16)
 
+	strb	wzr, [x29, #OFFSET_RG + DL_OFFSET_RG_SVE]
+	strb	wzr, [x29, #OFFSET_RV + DL_OFFSET_RV_SVE]
+
 	add     x0, x29, #SF_SIZE + 16
 	ldr	x1, [x29, #OFFSET_LR]
 	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
@@ -339,5 +342,299 @@  _dl_runtime_profile:
 
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
-#endif
 	.previous
+
+
+# define HWCAP_SVE		22
+# define ZCR_ELx_LEN_MASK        0x1ff
+
+# if HAVE_AARCH64_SVE_ASM
+	.arch armv8.2-a+sve
+	.globl _dl_runtime_profile_sve
+	.type _dl_runtime_profile_sve, #function
+	cfi_startproc
+	.align 2
+_dl_runtime_profile_sve:
+#  if HAVE_AARCH64_PAC_RET
+	PACIASP
+	cfi_window_save
+#  else
+	BTI_C
+#  endif
+        /* AArch64 we get called with:
+           ip0          &PLTGOT[2]
+           ip1          temp(dl resolver entry point)
+           [sp, #8]     lr
+           [sp, #0]     &PLTGOT[n]
+
+           Stack frame layout:
+           [sp,   #...] lr
+           [sp,   #...] &PLTGOT[n]
+           -------------------------
+	   TODO
+           [sp,   #  0] x29, lr   <- x29
+	 */
+
+#  define OFFSET_SVE_T1			16
+#  define OFFSET_SVE_SAVED_CALL_X0	OFFSET_SVE_T1 + 8
+#  define OFFSET_SVE_FS			OFFSET_SVE_SAVED_CALL_X0 + 16
+#  define OFFSET_SVE_RV			OFFSET_SVE_FS + 8
+#  define OFFSET_SVE_RG			OFFSET_SVE_RV + DL_SIZEOF_RV
+/* Maximum supported z and pregisters size in bytes.  */
+#  define SIZEOF_SVE_Z_REG		512
+#  define SIZEOF_SVE_P_REG		SIZEOF_SVE_Z_REG / 8
+/* z0-z7 for argument passing.  */
+#  define SIZEOF_SVE_RG_Z		8 * SIZEOF_SVE_Z_REG
+/* p0-p3 for argument passing.  */
+#  define SIZEOF_SVE_RG_P		4 * SIZEOF_SVE_P_REG
+/* z0-z7 for function return.  */
+#  define SIZEOF_SVE_RV_P		8 * SIZEOF_SVE_Z_REG
+/* SVE registers contents for La_aarch64_regs.lr_vreg  */
+#  define OFFSET_SVE_RG_Z		OFFSET_SVE_RG + DL_SIZEOF_RG
+/* SVE registers contents for La_aarch64.regs.lr_sve_pregs  */
+#  define OFFSET_SVE_RG_P		OFFSET_SVE_RG_Z + SIZEOF_SVE_RG_Z
+/* SVE registers contents for La_aarch64_retval.lrv_vreg  */
+#  define OFFSET_SVE_RV_Z		OFFSET_SVE_RG_P + SIZEOF_SVE_RG_P
+
+#  define SF_SVE_SIZE			OFFSET_SVE_RV_Z + SIZEOF_SVE_RV_P
+
+#  define OFFSET_SVE_PLTGOTN		SF_SVE_SIZE
+#  define OFFSET_SVE_LR			OFFSET_SVE_PLTGOTN + 8
+
+	.macro save_sve_z_reg zreg idx offset save_addr
+	.ifc \zreg, z0
+	.if \offset < 4096
+	add	x0, x29, \offset
+	.else
+	mov	x0, \offset
+	add	x0, x29, x0
+	.endif
+	.else
+	add	x0, x0, SIZEOF_SVE_Z_REG
+	.endif
+	str	\zreg, [x0, #0]
+	.if \save_addr == 1
+	str	x0, [X29, #OFFSET_RG + DL_OFFSET_RG_V0 + 16*\idx]
+	.else
+	str	x0, [X29, #OFFSET_RV + DL_OFFSET_RV_V0 + 16*\idx]
+	.endif
+	.endm
+
+	.macro save_sve_regs offset save_addr
+	save_sve_z_reg z0 0 \offset \save_addr
+	save_sve_z_reg z1 1 \offset \save_addr
+	save_sve_z_reg z2 2 \offset \save_addr
+	save_sve_z_reg z3 3 \offset \save_addr
+	save_sve_z_reg z4 4 \offset \save_addr
+	save_sve_z_reg z5 5 \offset \save_addr
+	save_sve_z_reg z6 6 \offset \save_addr
+	save_sve_z_reg z7 7 \offset \save_addr
+	.if \save_addr  == 1
+	add	x0, x0, SIZEOF_SVE_P_REG
+	str	p0, [x0, #0]
+	add	x0, x0, SIZEOF_SVE_P_REG
+	str	p1, [x0, #0]
+	add	x0, x0, SIZEOF_SVE_P_REG
+	str	p2, [x0, #0]
+	add	x0, x0, SIZEOF_SVE_P_REG
+	str	p3, [x0, #0]
+	.endif
+	.endm
+
+	.macro load_sve_regs offset
+	.if \offset < 4096
+	add	x12, x29, \offset
+	.else
+	mov	x12, \offset
+	add	x12, x29, x12
+	.endif
+	ldr	z0, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z1, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z2, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z3, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z4, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z5, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z6, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_Z_REG
+	ldr	z7, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_P_REG
+	ldr	p0, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_P_REG
+	ldr	p1, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_P_REG
+	ldr	p2, [x12, #0]
+	add	x12, x12, SIZEOF_SVE_P_REG
+	ldr	p3, [x12, #0]
+	.endm
+
+
+	/* Save arguments.  */
+	mov	x12, #SF_SVE_SIZE
+	sub	sp, sp, x12
+	cfi_adjust_cfa_offset (SF_SVE_SIZE)
+	stp	x29, x30, [SP, #0]
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (lr, 8)
+
+	stp	x0, x1, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*0]
+	cfi_rel_offset (x0, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*0 + 0)
+	cfi_rel_offset (x1, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*0 + 8)
+	stp	x2, x3, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*1]
+	cfi_rel_offset (x2, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*1 + 0)
+	cfi_rel_offset (x3, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*1 + 8)
+	stp	x4, x5, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*2]
+	cfi_rel_offset (x4, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*2 + 0)
+	cfi_rel_offset (x5, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*2 + 8)
+	stp	x6, x7, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*3]
+	cfi_rel_offset (x6, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*3 + 0)
+	cfi_rel_offset (x7, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*3 + 8)
+	str	x8, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*4 + 0]
+	cfi_rel_offset (x8, OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*4 + 0)
+	/* Note 8 bytes of padding is in the stack frame for alignment */
+	save_sve_regs OFFSET_SVE_RG_Z 1
+
+	/* Store the vector length on lr_sve  */
+	mov	x0, #0
+	addvl	x0, x0, #1
+	strb	w0, [x29, #OFFSET_RG + DL_OFFSET_RG_SVE]
+	strb	w0, [x29, #OFFSET_RV + DL_OFFSET_RV_SVE]
+
+	mov	x0, #SF_SVE_SIZE + 16
+	add     x0, x29, x0
+	ldr	x1, [x29, #OFFSET_SVE_LR]
+	stp	x0, x1, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_SP]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_profile_fixup().  */
+	ldr	x1, [x29, OFFSET_SVE_PLTGOTN]	/* Recover &PLTGOT[n] */
+
+	sub     x1, x1, ip0
+	add     x1, x1, x1, lsl #1
+	lsl     x1, x1, #3
+	sub     x1, x1, #(RELA_SIZE<<3)
+	lsr     x1, x1, #3
+
+	stp	x0, x1, [x29, #OFFSET_SVE_SAVED_CALL_X0]
+
+	/* Set up extra args for _dl_profile_fixup */
+	ldr	x2, [x29, #OFFSET_SVE_LR]	/* load saved LR */
+	add	x3, x29, #OFFSET_SVE_RG		/* address of La_aarch64_reg */
+	add	x4, x29, #OFFSET_SVE_FS		/* address of framesize */
+	bl	_dl_profile_fixup
+
+	ldr	ip0l, [x29, #OFFSET_SVE_FS]	/* framesize == 0 */
+	cmp	ip0l, #0
+	bge	1f
+	cfi_remember_state
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldp	x0, x1, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*3]
+	ldr	x8,     [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*4]
+	load_sve_regs OFFSET_SVE_RG_Z
+
+	cfi_def_cfa_register (sp)
+	ldp	x29, x30, [x29, #0]
+	cfi_restore (x29)
+	cfi_restore (x30)
+
+#  if HAVE_AARCH64_PAC_RET
+	mov	x12, SF_SVE_SIZE
+	add	sp, sp, x12
+	cfi_adjust_cfa_offset (-SF_SVE_SIZE)
+	AUTIASP
+	cfi_window_save
+	add	sp, sp, 16
+	cfi_adjust_cfa_offset (-16)
+#  else
+	mov	x12, SF_SVE_SIZE + 16
+	add	sp, sp, x12
+	cfi_adjust_cfa_offset (- SF_SVE_SIZE - 16)
+#  endif
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_restore_state
+1:
+	/* The new frame size is in ip0.  */
+
+	sub	PTR_REG (1), PTR_REG (29), ip0l
+	and	sp, x1, #0xfffffffffffffff0
+
+	str	x0, [x29, #OFFSET_SVE_T1]
+
+	mov	x0, sp
+	mov	x1, #SF_SVE_SIZE + 16
+	add	x1, x29, x1
+	mov	x2, ip0
+	bl	memcpy
+
+	ldr	ip0, [x29, #OFFSET_SVE_T1]
+
+	/* Call the function.  */
+	ldp	x0, x1, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*3]
+	ldr	x8,     [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*4]
+	load_sve_regs OFFSET_SVE_RG_Z
+	blr	ip0
+	stp	x0, x1, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*0]
+	stp	x2, x3, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*1]
+	stp	x4, x5, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*2]
+	stp	x6, x7, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*3]
+	str	x8,     [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_X0 + 16*4]
+	save_sve_regs OFFSET_SVE_RV_Z 0
+
+	/* Setup call to pltexit  */
+	ldp	x0, x1, [x29, #OFFSET_SVE_SAVED_CALL_X0]
+	add	x2, x29, #OFFSET_SVE_RG
+	add	x3, x29, #OFFSET_SVE_RV
+	bl	_dl_audit_pltexit
+
+	ldp	x0, x1, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*3]
+	ldr	x8,     [x29, #OFFSET_SVE_RV + DL_OFFSET_RV_X0 + 16*4]
+	load_sve_regs OFFSET_SVE_RV_Z
+
+	/* LR from within La_aarch64_reg */
+	ldr	lr, [x29, #OFFSET_SVE_RG + DL_OFFSET_RG_LR]
+	cfi_restore(lr)
+#  if HAVE_AARCH64_PAC_RET
+	/* Note: LR restored from La_aarch64_reg has no PAC.  */
+	cfi_window_save
+#  endif
+	mov	sp, x29
+	cfi_def_cfa_register (sp)
+	ldr	x29, [x29, #0]
+	cfi_restore(x29)
+	mov	x12, SF_SVE_SIZE + 16
+	add	sp, sp, x12
+	cfi_adjust_cfa_offset (- SF_SVE_SIZE - 16)
+
+	br	lr
+
+	cfi_endproc
+	.size _dl_runtime_profile_sve, .-_dl_runtime_profile_sve
+	.previous
+# endif /* HAVE_AARCH64_SVE_ASM  */
+
+#endif /* !PROF  */
diff --git a/sysdeps/aarch64/tst-audit28.c b/sysdeps/aarch64/tst-audit28.c
new file mode 100644
index 0000000000..28ffbd141f
--- /dev/null
+++ b/sysdeps/aarch64/tst-audit28.c
@@ -0,0 +1,44 @@ 
+/* Check DT_AUDIT for aarch64 ABI specifics.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <string.h>
+#include <support/check.h>
+#include <sys/auxv.h>
+#include "tst-audit28mod.h"
+
+int
+do_test (void)
+{
+  unsigned long hwcap = getauxval (AT_HWCAP);
+  if ((hwcap & HWCAP_SVE) == 0)
+    FAIL_UNSUPPORTED ("system does not support SVE");
+
+  {
+    svint8_t r = tst_audit28_func_sve_args (sve_args_z0 (), sve_args_z1 (),
+					    sve_args_z2 (), sve_args_z3 (),
+					    sve_args_z4 (), sve_args_z5 (),
+					    sve_args_z6 (), sve_args_z7 ());
+    if (!svptest_any (svptrue_b8  (),  svcmpeq_s8  (svptrue_b8 (), r, sve_ret ())))
+      FAIL_EXIT1 ("tst_audit28_func_sve_args(): wrong return value");
+  }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/aarch64/tst-audit28mod.c b/sysdeps/aarch64/tst-audit28mod.c
new file mode 100644
index 0000000000..f5e24346b4
--- /dev/null
+++ b/sysdeps/aarch64/tst-audit28mod.c
@@ -0,0 +1,48 @@ 
+/* Check DT_AUDIT for aarch64 ABI specifics.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <support/check.h>
+#include "tst-audit28mod.h"
+
+svint8_t
+tst_audit28_func_sve_args (svint8_t z0, svint16_t z1, svint32_t z2,
+			   svint64_t z3, svuint8_t z4, svuint16_t z5,
+			   svuint32_t z6, svuint64_t z7)
+{
+  assert (svptest_any (svptrue_b8 (),  svcmpeq_s8  (svptrue_b8 (),
+						    z0, sve_args_z0 ())));
+  assert (svptest_any (svptrue_b16 (), svcmpeq_s16 (svptrue_b16 (),
+						    z1, sve_args_z1 ())));
+  assert (svptest_any (svptrue_b32 (), svcmpeq_s32 (svptrue_b32 (),
+						    z2, sve_args_z2 ())));
+  assert (svptest_any (svptrue_b64 (), svcmpeq_s64 (svptrue_b64 (),
+						    z3, sve_args_z3 ())));
+  assert (svptest_any (svptrue_b8  (), svcmpeq_u8  (svptrue_b8  (),
+						    z4, sve_args_z4 ())));
+  assert (svptest_any (svptrue_b16 (), svcmpeq_u16 (svptrue_b16 (),
+						    z5, sve_args_z5 ())));
+  assert (svptest_any (svptrue_b32 (), svcmpeq_u32 (svptrue_b32 (),
+						    z6, sve_args_z6 ())));
+  assert (svptest_any (svptrue_b64 (), svcmpeq_u64 (svptrue_b64 (),
+						    z7, sve_args_z7 ())));
+
+  return sve_ret ();
+}
diff --git a/sysdeps/aarch64/tst-audit28mod.h b/sysdeps/aarch64/tst-audit28mod.h
new file mode 100644
index 0000000000..55e3cdbbc6
--- /dev/null
+++ b/sysdeps/aarch64/tst-audit28mod.h
@@ -0,0 +1,74 @@ 
+/* Check DT_AUDIT for aarch64 specific ABI.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _TST_AUDIT28MOD_H
+#define _TST_AUDIT28MOD_H 1
+
+#include <arm_sve.h>
+
+static inline svint8_t sve_args_z0 (void)
+{
+  return svdup_s8 (INT8_MAX);
+}
+
+static inline svint16_t sve_args_z1 (void)
+{
+  return svdup_s16 (INT16_MAX);
+}
+
+static inline svint32_t sve_args_z2 (void)
+{
+  return svdup_s32 (INT32_MAX);
+}
+
+static inline svint64_t sve_args_z3 (void)
+{
+  return svdup_s64 (INT64_MAX);
+}
+
+static inline svuint8_t sve_args_z4 (void)
+{
+  return svdup_u8 (UINT8_MAX);
+}
+
+static inline svuint16_t sve_args_z5 (void)
+{
+  return svdup_u16 (UINT16_MAX);
+}
+
+static inline svuint32_t sve_args_z6 (void)
+{
+  return svdup_u32 (UINT32_MAX);
+}
+
+static inline svuint64_t sve_args_z7 (void)
+{
+  return svdup_u64 (UINT64_MAX);
+}
+
+static inline svint8_t sve_ret (void)
+{
+  return svdup_s8 (INT8_MIN);
+}
+
+#define INT_ARGS_RET 0x21
+
+svint8_t tst_audit28_func_sve_args (svint8_t z0, svint16_t z1, svint32_t z2, svint64_t z3,
+				    svuint8_t z4, svuint16_t z5, svuint32_t z6, svuint64_t z7);
+
+#endif
diff --git a/sysdeps/aarch64/tst-auditmod27.c b/sysdeps/aarch64/tst-auditmod27.c
index b1dbff8330..9a6e23d752 100644
--- a/sysdeps/aarch64/tst-auditmod27.c
+++ b/sysdeps/aarch64/tst-auditmod27.c
@@ -16,18 +16,82 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#include <array_length.h>
 #include <assert.h>
 #include <link.h>
 #include <string.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include "tst-audit.h"
 #include "tst-audit27mod.h"
 
 #define TEST_NAME  "tst-audit27"
 
 #define AUDIT27_COOKIE 0
 
+static inline float regs_vec_to_float (const La_aarch64_regs *regs, int i)
+{
+  float r;
+  if (regs->lr_sve == 0)
+    r = regs->lr_vreg[i].s;
+  else
+    memcpy (&r, &regs->lr_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
+static inline double regs_vec_to_double (const La_aarch64_regs *regs, int i)
+{
+  double r;
+  if (regs->lr_sve == 0)
+    r = regs->lr_vreg[i].d;
+  else
+    memcpy (&r, &regs->lr_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
+static inline long double regs_vec_to_ldouble (const La_aarch64_regs *regs,
+					       int i)
+{
+  long double r;
+  if (regs->lr_sve == 0)
+    r = regs->lr_vreg[i].q;
+  else
+    memcpy (&r, &regs->lr_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
+static inline float ret_vec_to_float (const La_aarch64_retval *regs, int i)
+{
+  float r;
+  if (regs->lrv_sve == 0)
+    r = regs->lrv_vreg[i].s;
+  else
+    memcpy (&r, &regs->lrv_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
+static inline double ret_vec_to_double (const La_aarch64_retval *regs, int i)
+{
+  double r;
+  if (regs->lrv_sve == 0)
+    r = regs->lrv_vreg[i].d;
+  else
+    memcpy (&r, &regs->lrv_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
+static inline long double ret_vec_to_ldouble (const La_aarch64_retval *regs,
+					      int i)
+{
+  long double r;
+  if (regs->lrv_sve == 0)
+    r = regs->lrv_vreg[i].q;
+  else
+    memcpy (&r, &regs->lrv_vreg[i].z[0], sizeof (r));
+  return r;
+}
+
 unsigned int
 la_version (unsigned int v)
 {
@@ -47,6 +111,7 @@  la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
   return ck == -1 ? 0 : LA_FLG_BINDFROM | LA_FLG_BINDTO;
 }
 
+
 ElfW(Addr)
 la_aarch64_gnu_pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
 			 uintptr_t *defcook, La_aarch64_regs *regs,
@@ -55,39 +120,43 @@  la_aarch64_gnu_pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
 {
   printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
 	  symname, (long int) sym->st_value, ndx, *flags);
+  printf ("  regs->lr_sve=%d\n", regs->lr_sve);
+  if (regs->lr_sve > 0)
+    for (int i = 0; i < array_length (regs->lr_vreg); i++)
+      printf ("  regs->lr_vreg[%d]=%p\n", i, regs->lr_vreg[i].z);
 
   if (strcmp (symname, "tst_audit27_func_float") == 0)
     {
-      assert (regs->lr_vreg[0].s == FUNC_FLOAT_ARG0);
-      assert (regs->lr_vreg[1].s == FUNC_FLOAT_ARG1);
-      assert (regs->lr_vreg[2].s == FUNC_FLOAT_ARG2);
-      assert (regs->lr_vreg[3].s == FUNC_FLOAT_ARG3);
-      assert (regs->lr_vreg[4].s == FUNC_FLOAT_ARG4);
-      assert (regs->lr_vreg[5].s == FUNC_FLOAT_ARG5);
-      assert (regs->lr_vreg[6].s == FUNC_FLOAT_ARG6);
-      assert (regs->lr_vreg[7].s == FUNC_FLOAT_ARG7);
+      assert (regs_vec_to_float (regs, 0) == FUNC_FLOAT_ARG0);
+      assert (regs_vec_to_float (regs, 1) == FUNC_FLOAT_ARG1);
+      assert (regs_vec_to_float (regs, 2) == FUNC_FLOAT_ARG2);
+      assert (regs_vec_to_float (regs, 3) == FUNC_FLOAT_ARG3);
+      assert (regs_vec_to_float (regs, 4) == FUNC_FLOAT_ARG4);
+      assert (regs_vec_to_float (regs, 5) == FUNC_FLOAT_ARG5);
+      assert (regs_vec_to_float (regs, 6) == FUNC_FLOAT_ARG6);
+      assert (regs_vec_to_float (regs, 7) == FUNC_FLOAT_ARG7);
     }
   else if (strcmp (symname, "tst_audit27_func_double") == 0)
     {
-      assert (regs->lr_vreg[0].d == FUNC_DOUBLE_ARG0);
-      assert (regs->lr_vreg[1].d == FUNC_DOUBLE_ARG1);
-      assert (regs->lr_vreg[2].d == FUNC_DOUBLE_ARG2);
-      assert (regs->lr_vreg[3].d == FUNC_DOUBLE_ARG3);
-      assert (regs->lr_vreg[4].d == FUNC_DOUBLE_ARG4);
-      assert (regs->lr_vreg[5].d == FUNC_DOUBLE_ARG5);
-      assert (regs->lr_vreg[6].d == FUNC_DOUBLE_ARG6);
-      assert (regs->lr_vreg[7].d == FUNC_DOUBLE_ARG7);
+      assert (regs_vec_to_double (regs, 0) == FUNC_DOUBLE_ARG0);
+      assert (regs_vec_to_double (regs, 1) == FUNC_DOUBLE_ARG1);
+      assert (regs_vec_to_double (regs, 2) == FUNC_DOUBLE_ARG2);
+      assert (regs_vec_to_double (regs, 3) == FUNC_DOUBLE_ARG3);
+      assert (regs_vec_to_double (regs, 4) == FUNC_DOUBLE_ARG4);
+      assert (regs_vec_to_double (regs, 5) == FUNC_DOUBLE_ARG5);
+      assert (regs_vec_to_double (regs, 6) == FUNC_DOUBLE_ARG6);
+      assert (regs_vec_to_double (regs, 7) == FUNC_DOUBLE_ARG7);
     }
   else if (strcmp (symname, "tst_audit27_func_ldouble") == 0)
     {
-      assert (regs->lr_vreg[0].q == FUNC_LDOUBLE_ARG0);
-      assert (regs->lr_vreg[1].q == FUNC_LDOUBLE_ARG1);
-      assert (regs->lr_vreg[2].q == FUNC_LDOUBLE_ARG2);
-      assert (regs->lr_vreg[3].q == FUNC_LDOUBLE_ARG3);
-      assert (regs->lr_vreg[4].q == FUNC_LDOUBLE_ARG4);
-      assert (regs->lr_vreg[5].q == FUNC_LDOUBLE_ARG5);
-      assert (regs->lr_vreg[6].q == FUNC_LDOUBLE_ARG6);
-      assert (regs->lr_vreg[7].q == FUNC_LDOUBLE_ARG7);
+      assert (regs_vec_to_ldouble (regs, 0) == FUNC_LDOUBLE_ARG0);
+      assert (regs_vec_to_ldouble (regs, 1) == FUNC_LDOUBLE_ARG1);
+      assert (regs_vec_to_ldouble (regs, 2) == FUNC_LDOUBLE_ARG2);
+      assert (regs_vec_to_ldouble (regs, 3) == FUNC_LDOUBLE_ARG3);
+      assert (regs_vec_to_ldouble (regs, 4) == FUNC_LDOUBLE_ARG4);
+      assert (regs_vec_to_ldouble (regs, 5) == FUNC_LDOUBLE_ARG5);
+      assert (regs_vec_to_ldouble (regs, 6) == FUNC_LDOUBLE_ARG6);
+      assert (regs_vec_to_ldouble (regs, 7) == FUNC_LDOUBLE_ARG7);
     }
   else
     abort ();
@@ -117,48 +186,56 @@  la_aarch64_gnu_pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
 {
   printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u\n",
 	  symname, (long int) sym->st_value, ndx);
+  printf ("  inregs->lr_sve=%d\n", inregs->lr_sve);
+  if (inregs->lr_sve > 0)
+    for (int i = 0; i < array_length (inregs->lr_vreg); i++)
+      printf ("  inregs->lr_vreg[%d]=%p\n", i, inregs->lr_vreg[i].z);
+  printf ("  outregs->lrv_sve=%d\n", outregs->lrv_sve);
+  if (outregs->lrv_sve > 0)
+    for (int i = 0; i < array_length (outregs->lrv_vreg); i++)
+      printf ("  outregs->lrv_vreg[%d]=%p\n", i, outregs->lrv_vreg[i].z);
 
   if (strcmp (symname, "tst_audit27_func_float") == 0)
     {
-      assert (inregs->lr_vreg[0].s == FUNC_FLOAT_ARG0);
-      assert (inregs->lr_vreg[1].s == FUNC_FLOAT_ARG1);
-      assert (inregs->lr_vreg[2].s == FUNC_FLOAT_ARG2);
-      assert (inregs->lr_vreg[3].s == FUNC_FLOAT_ARG3);
-      assert (inregs->lr_vreg[4].s == FUNC_FLOAT_ARG4);
-      assert (inregs->lr_vreg[5].s == FUNC_FLOAT_ARG5);
-      assert (inregs->lr_vreg[6].s == FUNC_FLOAT_ARG6);
-      assert (inregs->lr_vreg[7].s == FUNC_FLOAT_ARG7);
-
-      assert (outregs->lrv_vreg[0].s == FUNC_FLOAT_RET);
+      assert (regs_vec_to_float (inregs, 0) == FUNC_FLOAT_ARG0);
+      assert (regs_vec_to_float (inregs, 1) == FUNC_FLOAT_ARG1);
+      assert (regs_vec_to_float (inregs, 2) == FUNC_FLOAT_ARG2);
+      assert (regs_vec_to_float (inregs, 3) == FUNC_FLOAT_ARG3);
+      assert (regs_vec_to_float (inregs, 4) == FUNC_FLOAT_ARG4);
+      assert (regs_vec_to_float (inregs, 5) == FUNC_FLOAT_ARG5);
+      assert (regs_vec_to_float (inregs, 6) == FUNC_FLOAT_ARG6);
+      assert (regs_vec_to_float (inregs, 7) == FUNC_FLOAT_ARG7);
+
+      assert (ret_vec_to_float (outregs, 0) == FUNC_FLOAT_RET);
     }
   else if (strcmp (symname, "tst_audit27_func_double") == 0)
     {
-      assert (inregs->lr_vreg[0].d == FUNC_DOUBLE_ARG0);
-      assert (inregs->lr_vreg[1].d == FUNC_DOUBLE_ARG1);
-      assert (inregs->lr_vreg[2].d == FUNC_DOUBLE_ARG2);
-      assert (inregs->lr_vreg[3].d == FUNC_DOUBLE_ARG3);
-      assert (inregs->lr_vreg[4].d == FUNC_DOUBLE_ARG4);
-      assert (inregs->lr_vreg[5].d == FUNC_DOUBLE_ARG5);
-      assert (inregs->lr_vreg[6].d == FUNC_DOUBLE_ARG6);
-      assert (inregs->lr_vreg[7].d == FUNC_DOUBLE_ARG7);
-
-      assert (outregs->lrv_vreg[0].d == FUNC_DOUBLE_RET);
+      assert (regs_vec_to_double (inregs, 0) == FUNC_DOUBLE_ARG0);
+      assert (regs_vec_to_double (inregs, 1) == FUNC_DOUBLE_ARG1);
+      assert (regs_vec_to_double (inregs, 2) == FUNC_DOUBLE_ARG2);
+      assert (regs_vec_to_double (inregs, 3) == FUNC_DOUBLE_ARG3);
+      assert (regs_vec_to_double (inregs, 4) == FUNC_DOUBLE_ARG4);
+      assert (regs_vec_to_double (inregs, 5) == FUNC_DOUBLE_ARG5);
+      assert (regs_vec_to_double (inregs, 6) == FUNC_DOUBLE_ARG6);
+      assert (regs_vec_to_double (inregs, 7) == FUNC_DOUBLE_ARG7);
+
+      assert (ret_vec_to_double (outregs, 0) == FUNC_DOUBLE_RET);
     }
   else if (strcmp (symname, "tst_audit27_func_ldouble") == 0)
     {
-      assert (inregs->lr_vreg[0].q == FUNC_LDOUBLE_ARG0);
-      assert (inregs->lr_vreg[1].q == FUNC_LDOUBLE_ARG1);
-      assert (inregs->lr_vreg[2].q == FUNC_LDOUBLE_ARG2);
-      assert (inregs->lr_vreg[3].q == FUNC_LDOUBLE_ARG3);
-      assert (inregs->lr_vreg[4].q == FUNC_LDOUBLE_ARG4);
-      assert (inregs->lr_vreg[5].q == FUNC_LDOUBLE_ARG5);
-      assert (inregs->lr_vreg[6].q == FUNC_LDOUBLE_ARG6);
-      assert (inregs->lr_vreg[7].q == FUNC_LDOUBLE_ARG7);
-
-      assert (outregs->lrv_vreg[0].q == FUNC_LDOUBLE_RET);
+      assert (regs_vec_to_ldouble (inregs, 0) == FUNC_LDOUBLE_ARG0);
+      assert (regs_vec_to_ldouble (inregs, 1) == FUNC_LDOUBLE_ARG1);
+      assert (regs_vec_to_ldouble (inregs, 2) == FUNC_LDOUBLE_ARG2);
+      assert (regs_vec_to_ldouble (inregs, 3) == FUNC_LDOUBLE_ARG3);
+      assert (regs_vec_to_ldouble (inregs, 4) == FUNC_LDOUBLE_ARG4);
+      assert (regs_vec_to_ldouble (inregs, 5) == FUNC_LDOUBLE_ARG5);
+      assert (regs_vec_to_ldouble (inregs, 6) == FUNC_LDOUBLE_ARG6);
+      assert (regs_vec_to_ldouble (inregs, 7) == FUNC_LDOUBLE_ARG7);
+
+      assert (ret_vec_to_ldouble (outregs, 0) == FUNC_LDOUBLE_RET);
     }
   else
-    abort ();
+    return 0;
 
   /* Clobber the q registers on exit.  */
   uint8_t v = 0xff;
diff --git a/sysdeps/aarch64/tst-auditmod28.c b/sysdeps/aarch64/tst-auditmod28.c
new file mode 100644
index 0000000000..53a2162bfb
--- /dev/null
+++ b/sysdeps/aarch64/tst-auditmod28.c
@@ -0,0 +1,193 @@ 
+/* Check DT_AUDIT for aarch64 specific ABI.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <assert.h>
+#include <link.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "tst-audit28mod.h"
+
+#define TEST_NAME  "tst-audit28"
+#define TEST_FUNC  "tst_audit28_func"
+
+#define AUDIT28_COOKIE 0
+
+unsigned int
+la_version (unsigned int v)
+{
+  return v;
+}
+
+unsigned int
+la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie)
+{
+  const char *p = strrchr (map->l_name, '/');
+  const char *l_name = p == NULL ? map->l_name : p + 1;
+  uintptr_t ck = -1;
+  if (strncmp (l_name, TEST_NAME, strlen (TEST_NAME)) == 0)
+    ck = AUDIT28_COOKIE;
+  *cookie = ck;
+  printf ("objopen: %ld, %s [%ld]\n", lmid, l_name, ck);
+  return ck == -1 ? 0 : LA_FLG_BINDFROM | LA_FLG_BINDTO;
+}
+
+ElfW(Addr)
+la_aarch64_gnu_pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+			 uintptr_t *defcook, La_aarch64_regs *regs,
+			 unsigned int *flags, const char *symname,
+			 long int *framesizep)
+{
+  printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
+	  symname, (long int) sym->st_value, ndx, *flags);
+  printf ("  regs->lr_sve=%d\n", regs->lr_sve);
+  if (regs->lr_sve > 0)
+    for (int i = 0; i < array_length (regs->lr_vreg); i++)
+      printf ("  regs->lr_vreg[%d]=%p\n", i, regs->lr_vreg[i].z);
+
+
+  if (strcmp (symname, TEST_FUNC "_sve_args") == 0)
+    {
+      svint8_t z0 = svld1_s8 (svptrue_b8  (),
+			      (const int8_t *) regs->lr_vreg[0].z);
+      svint16_t z1 = svld1_s16 (svptrue_b16 (),
+				(const int16_t *) regs->lr_vreg[1].z);
+      svint32_t z2 = svld1_s32 (svptrue_b32 (),
+				(const int32_t *) regs->lr_vreg[2].z);
+      svint64_t z3 = svld1_s64 (svptrue_b64 (),
+				(const int64_t *) regs->lr_vreg[3].z);
+      svuint8_t z4 = svld1_u8 (svptrue_b8  (),
+			       (const uint8_t *) regs->lr_vreg[4].z);
+      svuint16_t z5 = svld1_u16 (svptrue_b16 (),
+				 (const uint16_t *) regs->lr_vreg[5].z);
+      svuint32_t z6 = svld1_u32 (svptrue_b32 (),
+				 (const uint32_t *) regs->lr_vreg[6].z);
+      svuint64_t z7 = svld1_u64 (svptrue_b64 (),
+				 (const uint64_t *) regs->lr_vreg[7].z);
+      assert (svptest_any (svptrue_b8  (),  svcmpeq_s8  (svptrue_b8 (),
+							 z0, sve_args_z0 ())));
+      assert (svptest_any (svptrue_b16 (),  svcmpeq_s16 (svptrue_b16 (),
+							 z1, sve_args_z1 ())));
+      assert (svptest_any (svptrue_b32 (),  svcmpeq_s32 (svptrue_b32 (),
+							 z2, sve_args_z2 ())));
+      assert (svptest_any (svptrue_b64 (),  svcmpeq_s64 (svptrue_b64 (),
+							 z3, sve_args_z3 ())));
+      assert (svptest_any (svptrue_b8  (),  svcmpeq_u8  (svptrue_b8 (),
+							 z4, sve_args_z4 ())));
+      assert (svptest_any (svptrue_b16 (),  svcmpeq_u16 (svptrue_b16 (),
+							 z5, sve_args_z5 ())));
+      assert (svptest_any (svptrue_b32 (),  svcmpeq_u32 (svptrue_b32 (),
+							 z6, sve_args_z6 ())));
+      assert (svptest_any (svptrue_b64 (),  svcmpeq_u64 (svptrue_b64 (),
+							 z7, sve_args_z7 ())));
+    }
+  else
+    abort ();
+
+  /* Clobber the q registers on exit.  */
+  uint8_t v = 0xff;
+  asm volatile ("dup z0.b, %w0" : : "r" (v) : "z0");
+  asm volatile ("dup z1.b, %w0" : : "r" (v) : "z1");
+  asm volatile ("dup z2.b, %w0" : : "r" (v) : "z2");
+  asm volatile ("dup z3.b, %w0" : : "r" (v) : "z3");
+  asm volatile ("dup z4.b, %w0" : : "r" (v) : "z4");
+  asm volatile ("dup z5.b, %w0" : : "r" (v) : "z5");
+  asm volatile ("dup z6.b, %w0" : : "r" (v) : "z6");
+  asm volatile ("dup z7.b, %w0" : : "r" (v) : "z7");
+
+  *framesizep = 1024;
+
+  return sym->st_value;
+}
+
+unsigned int
+la_aarch64_gnu_pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
+                        uintptr_t *defcook,
+			const struct La_aarch64_regs *inregs,
+                        struct La_aarch64_retval *outregs,
+			const char *symname)
+{
+  printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u\n",
+          symname, (long int) sym->st_value, ndx);
+  printf ("  inregs->lr_sve=%d\n", inregs->lr_sve);
+  if (inregs->lr_sve > 0)
+    for (int i = 0; i < array_length (inregs->lr_vreg); i++)
+      printf ("  inregs->lr_vreg[%d]=%p\n", i, inregs->lr_vreg[i].z);
+  printf ("  outregs->lrv_sve=%d\n", outregs->lrv_sve);
+  if (outregs->lrv_sve > 0)
+    for (int i = 0; i < array_length (outregs->lrv_vreg); i++)
+      printf ("  outregs->lrv_vreg[%d]=%p\n", i, outregs->lrv_vreg[i].z);
+
+  if (strcmp (symname, TEST_FUNC "_sve_args") == 0)
+    {
+      svint8_t z0 = svld1_s8 (svptrue_b8  (),
+			      (const int8_t *) inregs->lr_vreg[0].z);
+      svint16_t z1 = svld1_s16 (svptrue_b16 (),
+				(const int16_t *) inregs->lr_vreg[1].z);
+      svint32_t z2 = svld1_s32 (svptrue_b32 (),
+				(const int32_t *) inregs->lr_vreg[2].z);
+      svint64_t z3 = svld1_s64 (svptrue_b64 (),
+				(const int64_t *) inregs->lr_vreg[3].z);
+      svuint8_t z4 = svld1_u8 (svptrue_b8  (),
+			       (const uint8_t *) inregs->lr_vreg[4].z);
+      svuint16_t z5 = svld1_u16 (svptrue_b16 (),
+				 (const uint16_t *) inregs->lr_vreg[5].z);
+      svuint32_t z6 = svld1_u32 (svptrue_b32 (),
+				 (const uint32_t *) inregs->lr_vreg[6].z);
+      svuint64_t z7 = svld1_u64 (svptrue_b64 (),
+				 (const uint64_t *) inregs->lr_vreg[7].z);
+      assert (svptest_any (svptrue_b8  (),  svcmpeq_s8  (svptrue_b8 (),
+							 z0, sve_args_z0 ())));
+      assert (svptest_any (svptrue_b16 (),  svcmpeq_s16 (svptrue_b16 (),
+							 z1, sve_args_z1 ())));
+      assert (svptest_any (svptrue_b32 (),  svcmpeq_s32 (svptrue_b32 (),
+							 z2, sve_args_z2 ())));
+      assert (svptest_any (svptrue_b64 (),  svcmpeq_s64 (svptrue_b64 (),
+							 z3, sve_args_z3 ())));
+      assert (svptest_any (svptrue_b8  (),  svcmpeq_u8  (svptrue_b8 (),
+							 z4, sve_args_z4 ())));
+      assert (svptest_any (svptrue_b16 (),  svcmpeq_u16 (svptrue_b16 (),
+							 z5, sve_args_z5 ())));
+      assert (svptest_any (svptrue_b32 (),  svcmpeq_u32 (svptrue_b32 (),
+							 z6, sve_args_z6 ())));
+      assert (svptest_any (svptrue_b64 (),  svcmpeq_u64 (svptrue_b64 (),
+							 z7, sve_args_z7 ())));
+
+      svint8_t r0 = svld1_s8 (svptrue_b8  (),
+			      (const int8_t *) outregs->lrv_vreg[0].z);
+      assert (svptest_any (svptrue_b8  (),  svcmpeq_s8  (svptrue_b8 (),
+							 r0, sve_ret ())));
+    }
+  else
+    abort ();
+
+  /* Clobber the q registers on exit.  */
+  uint8_t v = 0xff;
+  asm volatile ("dup z0.b, %w0" : : "r" (v) : "z0");
+  asm volatile ("dup z1.b, %w0" : : "r" (v) : "z1");
+  asm volatile ("dup z2.b, %w0" : : "r" (v) : "z2");
+  asm volatile ("dup z3.b, %w0" : : "r" (v) : "z3");
+  asm volatile ("dup z4.b, %w0" : : "r" (v) : "z4");
+  asm volatile ("dup z5.b, %w0" : : "r" (v) : "z5");
+  asm volatile ("dup z6.b, %w0" : : "r" (v) : "z6");
+  asm volatile ("dup z7.b, %w0" : : "r" (v) : "z7");
+
+  return 0;
+}
diff --git a/sysdeps/alpha/dl-machine.h b/sysdeps/alpha/dl-machine.h
index 7d6282e599..b9fc286059 100644
--- a/sysdeps/alpha/dl-machine.h
+++ b/sysdeps/alpha/dl-machine.h
@@ -509,7 +509,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf64_Addr * const reloc_addr = (void *)(l_addr + reloc->r_offset);
   unsigned long int const r_type = ELF64_R_TYPE (reloc->r_info);
diff --git a/sysdeps/arc/dl-machine.h b/sysdeps/arc/dl-machine.h
index f843ed9bd6..c2ca04181f 100644
--- a/sysdeps/arc/dl-machine.h
+++ b/sysdeps/arc/dl-machine.h
@@ -329,7 +329,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
diff --git a/sysdeps/arm/dl-machine.h b/sysdeps/arm/dl-machine.h
index 4ecdde4355..b747f5cc3b 100644
--- a/sysdeps/arm/dl-machine.h
+++ b/sysdeps/arm/dl-machine.h
@@ -623,7 +623,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rel *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
diff --git a/sysdeps/csky/dl-machine.h b/sysdeps/csky/dl-machine.h
index 4dfd957877..fa5d42413a 100644
--- a/sysdeps/csky/dl-machine.h
+++ b/sysdeps/csky/dl-machine.h
@@ -343,7 +343,7 @@  elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
 static inline void __attribute__ ((unused, always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h
index 7c5d0e9430..d51df3e444 100644
--- a/sysdeps/hppa/dl-machine.h
+++ b/sysdeps/hppa/dl-machine.h
@@ -793,7 +793,7 @@  elf_machine_rela_relative (Elf32_Addr l_addr,
 static void __attribute__((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   /* We don't have anything to do here.  elf_machine_runtime_setup has
      done all the relocs already.  */
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index 2f0dbc27a9..071e4e3d55 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -641,7 +641,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rel *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
diff --git a/sysdeps/ia64/dl-machine.h b/sysdeps/ia64/dl-machine.h
index c9608a51b0..d13c3aa732 100644
--- a/sysdeps/ia64/dl-machine.h
+++ b/sysdeps/ia64/dl-machine.h
@@ -493,7 +493,7 @@  static inline void
 __attribute ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned long int r_type = ELF64_R_TYPE (reloc->r_info);
diff --git a/sysdeps/m68k/dl-machine.h b/sysdeps/m68k/dl-machine.h
index 30323d62d4..f14e63f8da 100644
--- a/sysdeps/m68k/dl-machine.h
+++ b/sysdeps/m68k/dl-machine.h
@@ -315,7 +315,7 @@  elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
 static inline void __attribute__ ((unused, always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   if (ELF32_R_TYPE (reloc->r_info) == R_68K_JMP_SLOT)
diff --git a/sysdeps/microblaze/dl-machine.h b/sysdeps/microblaze/dl-machine.h
index b8cc5a7fe6..f926f27883 100644
--- a/sysdeps/microblaze/dl-machine.h
+++ b/sysdeps/microblaze/dl-machine.h
@@ -289,7 +289,7 @@  elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
 static inline void
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   if (ELF32_R_TYPE (reloc->r_info) == R_MICROBLAZE_JUMP_SLOT)
diff --git a/sysdeps/mips/dl-machine.h b/sysdeps/mips/dl-machine.h
index d7b8341b74..2d19baf84a 100644
--- a/sysdeps/mips/dl-machine.h
+++ b/sysdeps/mips/dl-machine.h
@@ -729,7 +729,7 @@  static inline void
 __attribute__((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rel) *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELFW(R_TYPE) (reloc->r_info);
diff --git a/sysdeps/nios2/dl-machine.h b/sysdeps/nios2/dl-machine.h
index 430ca5d7ae..b76659d442 100644
--- a/sysdeps/nios2/dl-machine.h
+++ b/sysdeps/nios2/dl-machine.h
@@ -326,7 +326,7 @@  elf_machine_rela_relative (ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
 static inline void __attribute__((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   if (ELF32_R_TYPE (reloc->r_info) == R_NIOS2_JUMP_SLOT)
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.h b/sysdeps/powerpc/powerpc32/dl-machine.h
index 8d062951ce..a64ff303f0 100644
--- a/sysdeps/powerpc/powerpc32/dl-machine.h
+++ b/sysdeps/powerpc/powerpc32/dl-machine.h
@@ -449,7 +449,7 @@  elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc,
 static inline void __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   /* elf_machine_runtime_setup handles this. */
 }
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index 3a4a21a412..e33e2a231d 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -1026,7 +1026,7 @@  elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
 static inline void __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   /* elf_machine_runtime_setup handles this.  */
 }
diff --git a/sysdeps/riscv/dl-machine.h b/sysdeps/riscv/dl-machine.h
index ce2b3c3875..944213a939 100644
--- a/sysdeps/riscv/dl-machine.h
+++ b/sysdeps/riscv/dl-machine.h
@@ -291,7 +291,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
diff --git a/sysdeps/s390/s390-32/dl-machine.h b/sysdeps/s390/s390-32/dl-machine.h
index c1d9bb41db..41ec0bab02 100644
--- a/sysdeps/s390/s390-32/dl-machine.h
+++ b/sysdeps/s390/s390-32/dl-machine.h
@@ -497,7 +497,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
diff --git a/sysdeps/s390/s390-64/dl-machine.h b/sysdeps/s390/s390-64/dl-machine.h
index d405f01a03..057825f309 100644
--- a/sysdeps/s390/s390-64/dl-machine.h
+++ b/sysdeps/s390/s390-64/dl-machine.h
@@ -451,7 +451,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
diff --git a/sysdeps/sh/dl-machine.h b/sysdeps/sh/dl-machine.h
index d14023e749..66c5046631 100644
--- a/sysdeps/sh/dl-machine.h
+++ b/sysdeps/sh/dl-machine.h
@@ -448,7 +448,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   /* Check for unexpected PLT reloc type.  */
diff --git a/sysdeps/sparc/sparc32/dl-machine.h b/sysdeps/sparc/sparc32/dl-machine.h
index 78f53bc499..f0c531d9b7 100644
--- a/sysdeps/sparc/sparc32/dl-machine.h
+++ b/sysdeps/sparc/sparc32/dl-machine.h
@@ -549,7 +549,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf32_Addr l_addr, const Elf32_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF32_R_TYPE (reloc->r_info);
diff --git a/sysdeps/sparc/sparc64/dl-machine.h b/sysdeps/sparc/sparc64/dl-machine.h
index 3fa79d038f..b280a74c9a 100644
--- a/sysdeps/sparc/sparc64/dl-machine.h
+++ b/sysdeps/sparc/sparc64/dl-machine.h
@@ -659,7 +659,7 @@  static inline void
 __attribute__ ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      Elf64_Addr l_addr, const Elf64_Rela *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   Elf64_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index b1a5297b66..101fc71b84 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -540,7 +540,7 @@  static inline void
 __attribute ((always_inline))
 elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
 		      ElfW(Addr) l_addr, const ElfW(Rela) *reloc,
-		      int skip_ifunc)
+		      int profile, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
   const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);