LoongArch: Add lasx/lsx support for _dl_runtime_profile.

Message ID 20230911103401.2485168-1-caiyinyu@loongson.cn
State New
Headers
Series LoongArch: Add lasx/lsx support for _dl_runtime_profile. |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
redhat-pt-bot/TryBot-still_applies warning Patch no longer applies to master

Commit Message

caiyinyu Sept. 11, 2023, 10:34 a.m. UTC
  ---
 sysdeps/loongarch/bits/link.h     |  24 ++-
 sysdeps/loongarch/dl-link.sym     |   8 +-
 sysdeps/loongarch/dl-machine.h    |  11 +-
 sysdeps/loongarch/dl-trampoline.S | 175 +--------------------
 sysdeps/loongarch/dl-trampoline.h | 242 ++++++++++++++++++++++++++++++
 5 files changed, 283 insertions(+), 177 deletions(-)
  

Comments

Adhemerval Zanella Netto Sept. 12, 2023, 1:37 p.m. UTC | #1
I think this characterize as a ABI break, so it should follow what aarch64
did when we fixed BZ#26643 (ce9a68c57c260c8417afc93972849ac9ad243ec4) and
bump LAV_CURRENT (it was not done on this specific commit, but rather on
32612615c58b394c3eb09f020f31310797ad3854 to fix BZ #23734).

So add a loongarch link_lavcurrent.h with value of 3 for 
!__loongarch_soft_float.

On 11/09/23 07:34, caiyinyu wrote:
> ---
>  sysdeps/loongarch/bits/link.h     |  24 ++-
>  sysdeps/loongarch/dl-link.sym     |   8 +-
>  sysdeps/loongarch/dl-machine.h    |  11 +-
>  sysdeps/loongarch/dl-trampoline.S | 175 +--------------------
>  sysdeps/loongarch/dl-trampoline.h | 242 ++++++++++++++++++++++++++++++
>  5 files changed, 283 insertions(+), 177 deletions(-)
> 
> diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h
> index 7fa6131280..00f6f25f2d 100644
> --- a/sysdeps/loongarch/bits/link.h
> +++ b/sysdeps/loongarch/bits/link.h
> @@ -20,10 +20,26 @@
>  #error "Never include <bits/link.h> directly; use <link.h> instead."
>  #endif
>  
> +#ifndef __loongarch_soft_float
> +typedef float La_loongarch_vr
> +    __attribute__ ((__vector_size__ (16), __aligned__ (16)));
> +typedef float La_loongarch_xr
> +    __attribute__ ((__vector_size__ (32), __aligned__ (16)));
> +
> +typedef union
> +{
> +  double fpreg[4];
> +  La_loongarch_vr vr[2];
> +  La_loongarch_xr xr[1];
> +} La_loongarch_vector __attribute__ ((__aligned__ (16)));
> +#endif
> +
>  typedef struct La_loongarch_regs
>  {
>    unsigned long int lr_reg[8]; /* a0 - a7 */
> -  double lr_fpreg[8];	       /* fa0 - fa7 */
> +#ifndef __loongarch_soft_float
> +  La_loongarch_vector lr_vec[8]; /* fa0 - fa7 or vr0 - vr7 or xr0 - xr7*/
> +#endif
>    unsigned long int lr_ra;
>    unsigned long int lr_sp;
>  } La_loongarch_regs;
> @@ -33,8 +49,10 @@ typedef struct La_loongarch_retval
>  {
>    unsigned long int lrv_a0;
>    unsigned long int lrv_a1;
> -  double lrv_fa0;
> -  double lrv_fa1;
> +#ifndef __loongarch_soft_float
> +  La_loongarch_vector lrv_vec0;
> +  La_loongarch_vector lrv_vec1;
> +#endif
>  } La_loongarch_retval;
>  
>  __BEGIN_DECLS
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> index 868ab7c6eb..b534968e30 100644
> --- a/sysdeps/loongarch/dl-link.sym
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -6,9 +6,13 @@ DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
>  DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
>  
>  DL_OFFSET_RG_A0         offsetof(struct La_loongarch_regs, lr_reg)
> -DL_OFFSET_RG_FA0        offsetof(struct La_loongarch_regs, lr_fpreg)
> +#ifndef __loongarch_soft_float
> +DL_OFFSET_RG_VEC0       offsetof(struct La_loongarch_regs, lr_vec)
> +#endif
>  DL_OFFSET_RG_RA         offsetof(struct La_loongarch_regs, lr_ra)
>  DL_OFFSET_RG_SP         offsetof(struct La_loongarch_regs, lr_sp)
>  
>  DL_OFFSET_RV_A0         offsetof(struct La_loongarch_retval, lrv_a0)
> -DL_OFFSET_RV_FA0        offsetof(struct La_loongarch_retval, lrv_a1)
> +#ifndef __loongarch_soft_float
> +DL_OFFSET_RV_VEC0       offsetof(struct La_loongarch_retval, lrv_vec0)
> +#endif
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 066bb233ac..8a2db9de3c 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -273,6 +273,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>  #if !defined __loongarch_soft_float
>        extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
>        extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
> +      extern void _dl_runtime_profile_lasx (void) attribute_hidden;
> +      extern void _dl_runtime_profile_lsx (void) attribute_hidden;
>  #endif
>        extern void _dl_runtime_resolve (void) attribute_hidden;
>        extern void _dl_runtime_profile (void) attribute_hidden;
> @@ -287,7 +289,14 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>  	 end in this function.  */
>        if (profile != 0)
>  	{
> -	   gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
> +#if !defined __loongarch_soft_float
> +	  if (SUPPORT_LASX)
> +	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
> +	  else if (SUPPORT_LSX)
> +	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
> +	  else
> +#endif
> +	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
>  
>  	  if (GLRO(dl_profile) != NULL
>  	      && _dl_name_match_p (GLRO(dl_profile), l))
> diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
> index 8fd9146978..dce1c2f122 100644
> --- a/sysdeps/loongarch/dl-trampoline.S
> +++ b/sysdeps/loongarch/dl-trampoline.S
> @@ -22,190 +22,23 @@
>  #if !defined __loongarch_soft_float
>  #define USE_LASX
>  #define _dl_runtime_resolve _dl_runtime_resolve_lasx
> +#define _dl_runtime_profile _dl_runtime_profile_lasx
>  #include "dl-trampoline.h"
>  #undef FRAME_SIZE
>  #undef USE_LASX
>  #undef _dl_runtime_resolve
> +#undef _dl_runtime_profile
>  
>  #define USE_LSX
>  #define _dl_runtime_resolve _dl_runtime_resolve_lsx
> +#define _dl_runtime_profile _dl_runtime_profile_lsx
>  #include "dl-trampoline.h"
>  #undef FRAME_SIZE
>  #undef USE_LSX
>  #undef _dl_runtime_resolve
> +#undef _dl_runtime_profile
>  #endif
>  
>  #include "dl-trampoline.h"
>  
> -#include "dl-link.h"
>  
> -ENTRY (_dl_runtime_profile)
> -       /* LoongArch we get called with:
> -	t0	      linkr_map pointer
> -	t1	      the scaled offset stored in t0, which can be used
> -		      to calculate the offset of the current symbol in .rela.plt
> -	t2	      %hi(%pcrel(.got.plt)) stored in t2, no use in this function
> -	t3	      dl resolver entry point, no use in this function
> -
> -	Stack frame layout:
> -	[sp,    #96] La_loongarch_regs
> -	[sp,    #48] La_loongarch_retval
> -	[sp,    #40] frame size return from pltenter
> -	[sp,    #32] dl_profile_call saved a1
> -	[sp,    #24] dl_profile_call saved a0
> -	[sp,    #16] T1
> -	[sp,     #0] ra, fp   <- fp
> -       */
> -
> -# define OFFSET_T1              16
> -# define OFFSET_SAVED_CALL_A0   OFFSET_T1 + 8
> -# define OFFSET_FS              OFFSET_SAVED_CALL_A0 + 16
> -# define OFFSET_RV              OFFSET_FS + 8
> -# define OFFSET_RG              OFFSET_RV + DL_SIZEOF_RV
> -
> -# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
> -
> -	/* Save arguments to stack. */
> -	ADDI	sp, sp, -SF_SIZE
> -	REG_S	ra, sp, 0
> -	REG_S	fp, sp, 8
> -
> -	or	fp, sp, zero
> -
> -	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> -	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> -	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> -	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> -	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> -	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> -	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> -	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> -
> -#ifndef __loongarch_soft_float
> -	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> -	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> -	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> -	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> -	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> -	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> -	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> -	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> -#endif
> -
> -	/* Update .got.plt and obtain runtime address of callee.  */
> -	SLLI	a1, t1, 1
> -	or	a0, t0, zero
> -	ADD	a1, a1, t1
> -	or	a2, ra, zero		/* return addr */
> -	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
> -	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
> -
> -	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
> -	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> -
> -	la	t2, _dl_profile_fixup
> -	jirl	ra, t2, 0
> -
> -	REG_L	t3, fp, OFFSET_FS
> -	bge	t3, zero, 1f
> -
> -	/* Save the return.  */
> -	or	t4, v0, zero
> -
> -	/* Restore arguments from stack.  */
> -	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> -	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> -	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> -	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> -	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> -	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> -	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> -	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> -
> -#ifndef __loongarch_soft_float
> -	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> -	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> -	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> -	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> -	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> -	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> -	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> -	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> -#endif
> -
> -	REG_L   ra, fp, 0
> -	REG_L   fp, fp, SZREG
> -
> -	ADDI	sp, sp, SF_SIZE
> -	jirl	zero, t4, 0
> -
> -1:
> -	/* The new frame size is in t3.  */
> -	SUB	sp, fp, t3
> -	BSTRINS sp, zero, 3, 0
> -
> -	REG_S	a0, fp, OFFSET_T1
> -
> -	or	a0, sp, zero
> -	ADDI	a1, fp, SF_SIZE
> -	or	a2, t3,	zero
> -	la	t5, memcpy
> -	jirl	ra, t5, 0
> -
> -	REG_L	t6, fp, OFFSET_T1
> -
> -	/* Call the function.  */
> -	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> -	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> -	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> -	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> -	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> -	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> -	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> -	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> -
> -#ifndef __loongarch_soft_float
> -	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
> -	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
> -	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
> -	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
> -	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
> -	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
> -	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
> -	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
> -#endif
> -	jirl	ra, t6, 0
> -
> -	REG_S	a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
> -	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
> -
> -#ifndef __loongarch_soft_float
> -	FREG_S	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0
> -	FREG_S	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0 + SZFREG
> -#endif
> -
> -	/* Setup call to pltexit.  */
> -	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
> -	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> -	ADDI	a2, fp, OFFSET_RG
> -	ADDI	a3, fp, OFFSET_RV
> -	la	t7, _dl_audit_pltexit
> -	jirl	ra, t7, 0
> -
> -	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
> -	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
> -
> -#ifndef __loongarch_soft_float
> -	FREG_L	fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
> -	FREG_L	fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
> -#endif
> -
> -	/* RA from within La_loongarch_reg.  */
> -	REG_L   ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
> -	or	sp, fp, zero
> -	ADDI	sp, sp, SF_SIZE
> -	REG_S   fp, fp, SZREG
> -
> -	jirl	zero, ra, 0
> -
> -END (_dl_runtime_profile)
> diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
> index 02375286f8..cb4a287c65 100644
> --- a/sysdeps/loongarch/dl-trampoline.h
> +++ b/sysdeps/loongarch/dl-trampoline.h
> @@ -125,3 +125,245 @@ ENTRY (_dl_runtime_resolve)
>  	/* Invoke the callee. */
>  	jirl	zero, t1, 0
>  END (_dl_runtime_resolve)
> +
> +#include "dl-link.h"
> +
> +ENTRY (_dl_runtime_profile)
> +       /* LoongArch we get called with:
> +	t0	      linkr_map pointer
> +	t1	      the scaled offset stored in t0, which can be used
> +		      to calculate the offset of the current symbol in .rela.plt
> +	t2	      %hi(%pcrel(.got.plt)) stored in t2, no use in this function
> +	t3	      dl resolver entry point, no use in this function
> +
> +	Stack frame layout:
> +	[sp,    #208] La_loongarch_regs
> +	[sp,    #128] La_loongarch_retval // align: 16
> +	[sp,    #112] frame size return from pltenter
> +	[sp,    #80 ] dl_profile_call saved vec1
> +	[sp,    #48 ] dl_profile_call saved vec0 // align: 16
> +	[sp,    #32 ] dl_profile_call saved a1
> +	[sp,    #24 ] dl_profile_call saved a0
> +	[sp,    #16 ] T1
> +	[sp,     #0 ] ra, fp   <- fp
> +       */
> +
> +# define OFFSET_T1              16
> +# define OFFSET_SAVED_CALL_A0   OFFSET_T1 + 8
> +# define OFFSET_FS              OFFSET_SAVED_CALL_A0 + 16 + 8 + 64
> +# define OFFSET_RV              OFFSET_FS + 8 + 8
> +# define OFFSET_RG              OFFSET_RV + DL_SIZEOF_RV
> +
> +# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
> +
> +	/* Save arguments to stack. */
> +	ADDI	sp, sp, -SF_SIZE
> +	REG_S	ra, sp, 0
> +	REG_S	fp, sp, 8
> +
> +	or	fp, sp, zero
> +
> +	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifdef USE_LASX
> +	xvst	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
> +	xvst	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
> +	xvst	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
> +	xvst	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
> +	xvst	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
> +	xvst	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
> +	xvst	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
> +	xvst	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
> +#elif defined USE_LSX
> +	vst	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
> +	vst	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
> +	vst	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
> +	vst	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
> +	vst	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
> +	vst	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
> +	vst	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
> +	vst	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
> +	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
> +	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
> +	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
> +	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
> +	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
> +	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
> +	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
> +#endif
> +
> +	/* Update .got.plt and obtain runtime address of callee.  */
> +	SLLI	a1, t1, 1
> +	or	a0, t0, zero
> +	ADD	a1, a1, t1
> +	or	a2, ra, zero		/* return addr */
> +	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
> +	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
> +
> +	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
> +	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> +
> +	la	t2, _dl_profile_fixup
> +	jirl	ra, t2, 0
> +
> +	REG_L	t3, fp, OFFSET_FS
> +	bge	t3, zero, 1f
> +
> +	/* Save the return.  */
> +	or	t4, v0, zero
> +
> +	/* Restore arguments from stack.  */
> +	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifdef USE_LASX
> +	xvld	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
> +	xvld	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
> +	xvld	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
> +	xvld	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
> +	xvld	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
> +	xvld	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
> +	xvld	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
> +	xvld	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
> +#elif defined USE_LSX
> +	vld	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
> +	vld	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
> +	vld	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
> +	vld	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
> +	vld	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
> +	vld	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
> +	vld	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
> +	vld	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
> +	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
> +	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
> +	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
> +	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
> +	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
> +	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
> +	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
> +#endif
> +
> +	REG_L   ra, fp, 0
> +	REG_L   fp, fp, SZREG
> +
> +	ADDI	sp, sp, SF_SIZE
> +	jirl	zero, t4, 0
> +
> +1:
> +	/* The new frame size is in t3.  */
> +	SUB	sp, fp, t3
> +	BSTRINS sp, zero, 3, 0
> +
> +	REG_S	a0, fp, OFFSET_T1
> +
> +	or	a0, sp, zero
> +	ADDI	a1, fp, SF_SIZE
> +	or	a2, t3,	zero
> +	la	t5, memcpy
> +	jirl	ra, t5, 0
> +
> +	REG_L	t6, fp, OFFSET_T1
> +
> +	/* Call the function.  */
> +	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
> +	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
> +	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
> +	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
> +	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
> +	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
> +	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
> +	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
> +
> +#ifdef USE_LASX
> +	xvld	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
> +	xvld	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
> +	xvld	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
> +	xvld	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
> +	xvld	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
> +	xvld	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
> +	xvld	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
> +	xvld	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
> +#elif defined USE_LSX
> +	vld	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
> +	vld	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
> +	vld	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
> +	vld	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
> +	vld	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
> +	vld	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
> +	vld	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
> +	vld	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
> +	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
> +	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
> +	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
> +	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
> +	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
> +	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
> +	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
> +#endif
> +
> +	jirl	ra, t6, 0
> +
> +	REG_S	a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
> +	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
> +
> +#ifdef USE_LASX
> +	xvst	xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	xvst	xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
> +#elif defined USE_LSX
> +	vst	vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	vst	vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_S	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	FREG_S	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
> +#endif
> +
> +	/* Setup call to pltexit.  */
> +	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
> +	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
> +	ADDI	a2, fp, OFFSET_RG
> +	ADDI	a3, fp, OFFSET_RV
> +	la	t7, _dl_audit_pltexit
> +	jirl	ra, t7, 0
> +
> +	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
> +	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
> +
> +#ifdef USE_LASX
> +	xvld	xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	xvld	xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
> +#elif defined USE_LSX
> +	vld	vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	vld	vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_L	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
> +	FREG_L	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
> +#endif
> +
> +	/* RA from within La_loongarch_reg.  */
> +	REG_L   ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
> +	or	sp, fp, zero
> +	ADDI	sp, sp, SF_SIZE
> +	REG_S   fp, fp, SZREG
> +
> +	jirl	zero, ra, 0
> +
> +END (_dl_runtime_profile)
  

Patch

diff --git a/sysdeps/loongarch/bits/link.h b/sysdeps/loongarch/bits/link.h
index 7fa6131280..00f6f25f2d 100644
--- a/sysdeps/loongarch/bits/link.h
+++ b/sysdeps/loongarch/bits/link.h
@@ -20,10 +20,26 @@ 
 #error "Never include <bits/link.h> directly; use <link.h> instead."
 #endif
 
+#ifndef __loongarch_soft_float
+typedef float La_loongarch_vr
+    __attribute__ ((__vector_size__ (16), __aligned__ (16)));
+typedef float La_loongarch_xr
+    __attribute__ ((__vector_size__ (32), __aligned__ (16)));
+
+typedef union
+{
+  double fpreg[4];
+  La_loongarch_vr vr[2];
+  La_loongarch_xr xr[1];
+} La_loongarch_vector __attribute__ ((__aligned__ (16)));
+#endif
+
 typedef struct La_loongarch_regs
 {
   unsigned long int lr_reg[8]; /* a0 - a7 */
-  double lr_fpreg[8];	       /* fa0 - fa7 */
+#ifndef __loongarch_soft_float
+  La_loongarch_vector lr_vec[8]; /* fa0 - fa7 or vr0 - vr7 or xr0 - xr7*/
+#endif
   unsigned long int lr_ra;
   unsigned long int lr_sp;
 } La_loongarch_regs;
@@ -33,8 +49,10 @@  typedef struct La_loongarch_retval
 {
   unsigned long int lrv_a0;
   unsigned long int lrv_a1;
-  double lrv_fa0;
-  double lrv_fa1;
+#ifndef __loongarch_soft_float
+  La_loongarch_vector lrv_vec0;
+  La_loongarch_vector lrv_vec1;
+#endif
 } La_loongarch_retval;
 
 __BEGIN_DECLS
diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
index 868ab7c6eb..b534968e30 100644
--- a/sysdeps/loongarch/dl-link.sym
+++ b/sysdeps/loongarch/dl-link.sym
@@ -6,9 +6,13 @@  DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
 DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
 
 DL_OFFSET_RG_A0         offsetof(struct La_loongarch_regs, lr_reg)
-DL_OFFSET_RG_FA0        offsetof(struct La_loongarch_regs, lr_fpreg)
+#ifndef __loongarch_soft_float
+DL_OFFSET_RG_VEC0       offsetof(struct La_loongarch_regs, lr_vec)
+#endif
 DL_OFFSET_RG_RA         offsetof(struct La_loongarch_regs, lr_ra)
 DL_OFFSET_RG_SP         offsetof(struct La_loongarch_regs, lr_sp)
 
 DL_OFFSET_RV_A0         offsetof(struct La_loongarch_retval, lrv_a0)
-DL_OFFSET_RV_FA0        offsetof(struct La_loongarch_retval, lrv_a1)
+#ifndef __loongarch_soft_float
+DL_OFFSET_RV_VEC0       offsetof(struct La_loongarch_retval, lrv_vec0)
+#endif
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 066bb233ac..8a2db9de3c 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -273,6 +273,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 #if !defined __loongarch_soft_float
       extern void _dl_runtime_resolve_lasx (void) attribute_hidden;
       extern void _dl_runtime_resolve_lsx (void) attribute_hidden;
+      extern void _dl_runtime_profile_lasx (void) attribute_hidden;
+      extern void _dl_runtime_profile_lsx (void) attribute_hidden;
 #endif
       extern void _dl_runtime_resolve (void) attribute_hidden;
       extern void _dl_runtime_profile (void) attribute_hidden;
@@ -287,7 +289,14 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (profile != 0)
 	{
-	   gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
+#if !defined __loongarch_soft_float
+	  if (SUPPORT_LASX)
+	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lasx;
+	  else if (SUPPORT_LSX)
+	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile_lsx;
+	  else
+#endif
+	    gotplt[0] = (ElfW(Addr)) &_dl_runtime_profile;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
diff --git a/sysdeps/loongarch/dl-trampoline.S b/sysdeps/loongarch/dl-trampoline.S
index 8fd9146978..dce1c2f122 100644
--- a/sysdeps/loongarch/dl-trampoline.S
+++ b/sysdeps/loongarch/dl-trampoline.S
@@ -22,190 +22,23 @@ 
 #if !defined __loongarch_soft_float
 #define USE_LASX
 #define _dl_runtime_resolve _dl_runtime_resolve_lasx
+#define _dl_runtime_profile _dl_runtime_profile_lasx
 #include "dl-trampoline.h"
 #undef FRAME_SIZE
 #undef USE_LASX
 #undef _dl_runtime_resolve
+#undef _dl_runtime_profile
 
 #define USE_LSX
 #define _dl_runtime_resolve _dl_runtime_resolve_lsx
+#define _dl_runtime_profile _dl_runtime_profile_lsx
 #include "dl-trampoline.h"
 #undef FRAME_SIZE
 #undef USE_LSX
 #undef _dl_runtime_resolve
+#undef _dl_runtime_profile
 #endif
 
 #include "dl-trampoline.h"
 
-#include "dl-link.h"
 
-ENTRY (_dl_runtime_profile)
-       /* LoongArch we get called with:
-	t0	      linkr_map pointer
-	t1	      the scaled offset stored in t0, which can be used
-		      to calculate the offset of the current symbol in .rela.plt
-	t2	      %hi(%pcrel(.got.plt)) stored in t2, no use in this function
-	t3	      dl resolver entry point, no use in this function
-
-	Stack frame layout:
-	[sp,    #96] La_loongarch_regs
-	[sp,    #48] La_loongarch_retval
-	[sp,    #40] frame size return from pltenter
-	[sp,    #32] dl_profile_call saved a1
-	[sp,    #24] dl_profile_call saved a0
-	[sp,    #16] T1
-	[sp,     #0] ra, fp   <- fp
-       */
-
-# define OFFSET_T1              16
-# define OFFSET_SAVED_CALL_A0   OFFSET_T1 + 8
-# define OFFSET_FS              OFFSET_SAVED_CALL_A0 + 16
-# define OFFSET_RV              OFFSET_FS + 8
-# define OFFSET_RG              OFFSET_RV + DL_SIZEOF_RV
-
-# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
-
-	/* Save arguments to stack. */
-	ADDI	sp, sp, -SF_SIZE
-	REG_S	ra, sp, 0
-	REG_S	fp, sp, 8
-
-	or	fp, sp, zero
-
-	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
-	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
-	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
-	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
-	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
-	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
-	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
-	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
-	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
-	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
-	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
-	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
-	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
-	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
-	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
-	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
-
-	/* Update .got.plt and obtain runtime address of callee.  */
-	SLLI	a1, t1, 1
-	or	a0, t0, zero
-	ADD	a1, a1, t1
-	or	a2, ra, zero		/* return addr */
-	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
-	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
-
-	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
-	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
-
-	la	t2, _dl_profile_fixup
-	jirl	ra, t2, 0
-
-	REG_L	t3, fp, OFFSET_FS
-	bge	t3, zero, 1f
-
-	/* Save the return.  */
-	or	t4, v0, zero
-
-	/* Restore arguments from stack.  */
-	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
-	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
-	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
-	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
-	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
-	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
-	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
-	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
-	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
-	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
-	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
-	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
-	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
-	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
-	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
-	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
-
-	REG_L   ra, fp, 0
-	REG_L   fp, fp, SZREG
-
-	ADDI	sp, sp, SF_SIZE
-	jirl	zero, t4, 0
-
-1:
-	/* The new frame size is in t3.  */
-	SUB	sp, fp, t3
-	BSTRINS sp, zero, 3, 0
-
-	REG_S	a0, fp, OFFSET_T1
-
-	or	a0, sp, zero
-	ADDI	a1, fp, SF_SIZE
-	or	a2, t3,	zero
-	la	t5, memcpy
-	jirl	ra, t5, 0
-
-	REG_L	t6, fp, OFFSET_T1
-
-	/* Call the function.  */
-	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
-	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
-	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
-	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
-	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
-	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
-	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
-	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
-
-#ifndef __loongarch_soft_float
-	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 0*SZFREG
-	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 1*SZFREG
-	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 2*SZFREG
-	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 3*SZFREG
-	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 4*SZFREG
-	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 5*SZFREG
-	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 6*SZFREG
-	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_FA0 + 7*SZFREG
-#endif
-	jirl	ra, t6, 0
-
-	REG_S	a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
-	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
-
-#ifndef __loongarch_soft_float
-	FREG_S	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0
-	FREG_S	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_FA0 + SZFREG
-#endif
-
-	/* Setup call to pltexit.  */
-	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
-	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
-	ADDI	a2, fp, OFFSET_RG
-	ADDI	a3, fp, OFFSET_RV
-	la	t7, _dl_audit_pltexit
-	jirl	ra, t7, 0
-
-	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
-	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
-
-#ifndef __loongarch_soft_float
-	FREG_L	fa0, fp, OFFSET_RV + DL_OFFSET_RV_FA0
-	FREG_L	fa1, fp, OFFSET_RV + DL_OFFSET_RV_FA0 + SZFREG
-#endif
-
-	/* RA from within La_loongarch_reg.  */
-	REG_L   ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
-	or	sp, fp, zero
-	ADDI	sp, sp, SF_SIZE
-	REG_S   fp, fp, SZREG
-
-	jirl	zero, ra, 0
-
-END (_dl_runtime_profile)
diff --git a/sysdeps/loongarch/dl-trampoline.h b/sysdeps/loongarch/dl-trampoline.h
index 02375286f8..cb4a287c65 100644
--- a/sysdeps/loongarch/dl-trampoline.h
+++ b/sysdeps/loongarch/dl-trampoline.h
@@ -125,3 +125,245 @@  ENTRY (_dl_runtime_resolve)
 	/* Invoke the callee. */
 	jirl	zero, t1, 0
 END (_dl_runtime_resolve)
+
+#include "dl-link.h"
+
+ENTRY (_dl_runtime_profile)
+       /* LoongArch we get called with:
+	t0	      linkr_map pointer
+	t1	      the scaled offset stored in t0, which can be used
+		      to calculate the offset of the current symbol in .rela.plt
+	t2	      %hi(%pcrel(.got.plt)) stored in t2, no use in this function
+	t3	      dl resolver entry point, no use in this function
+
+	Stack frame layout:
+	[sp,    #208] La_loongarch_regs
+	[sp,    #128] La_loongarch_retval // align: 16
+	[sp,    #112] frame size return from pltenter
+	[sp,    #80 ] dl_profile_call saved vec1
+	[sp,    #48 ] dl_profile_call saved vec0 // align: 16
+	[sp,    #32 ] dl_profile_call saved a1
+	[sp,    #24 ] dl_profile_call saved a0
+	[sp,    #16 ] T1
+	[sp,     #0 ] ra, fp   <- fp
+       */
+
+# define OFFSET_T1              16
+# define OFFSET_SAVED_CALL_A0   OFFSET_T1 + 8
+# define OFFSET_FS              OFFSET_SAVED_CALL_A0 + 16 + 8 + 64
+# define OFFSET_RV              OFFSET_FS + 8 + 8
+# define OFFSET_RG              OFFSET_RV + DL_SIZEOF_RV
+
+# define SF_SIZE                (-(-(OFFSET_RG + DL_SIZEOF_RG) & ALMASK))
+
+	/* Save arguments to stack. */
+	ADDI	sp, sp, -SF_SIZE
+	REG_S	ra, sp, 0
+	REG_S	fp, sp, 8
+
+	or	fp, sp, zero
+
+	REG_S	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_S	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_S	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_S	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_S	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_S	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_S	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_S	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+	xvst	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+	xvst	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+	xvst	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+	xvst	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+	xvst	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+	xvst	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+	xvst	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+	xvst	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+	vst	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+	vst	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+	vst	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+	vst	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+	vst	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+	vst	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+	vst	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+	vst	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_S	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+	FREG_S	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+	FREG_S	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+	FREG_S	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+	FREG_S	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+	FREG_S	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+	FREG_S	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+	FREG_S	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+	/* Update .got.plt and obtain runtime address of callee.  */
+	SLLI	a1, t1, 1
+	or	a0, t0, zero
+	ADD	a1, a1, t1
+	or	a2, ra, zero		/* return addr */
+	ADDI	a3, fp, OFFSET_RG	/* La_loongarch_regs pointer */
+	ADDI	a4, fp, OFFSET_FS 	/* frame size return from pltenter */
+
+	REG_S	a0, fp, OFFSET_SAVED_CALL_A0
+	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+
+	la	t2, _dl_profile_fixup
+	jirl	ra, t2, 0
+
+	REG_L	t3, fp, OFFSET_FS
+	bge	t3, zero, 1f
+
+	/* Save the return.  */
+	or	t4, v0, zero
+
+	/* Restore arguments from stack.  */
+	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+	xvld	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+	xvld	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+	xvld	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+	xvld	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+	xvld	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+	xvld	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+	xvld	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+	xvld	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+	vld	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+	vld	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+	vld	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+	vld	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+	vld	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+	vld	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+	vld	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+	vld	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+	REG_L   ra, fp, 0
+	REG_L   fp, fp, SZREG
+
+	ADDI	sp, sp, SF_SIZE
+	jirl	zero, t4, 0
+
+1:
+	/* The new frame size is in t3.  */
+	SUB	sp, fp, t3
+	BSTRINS sp, zero, 3, 0
+
+	REG_S	a0, fp, OFFSET_T1
+
+	or	a0, sp, zero
+	ADDI	a1, fp, SF_SIZE
+	or	a2, t3,	zero
+	la	t5, memcpy
+	jirl	ra, t5, 0
+
+	REG_L	t6, fp, OFFSET_T1
+
+	/* Call the function.  */
+	REG_L	a0, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 0*SZREG
+	REG_L	a1, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 1*SZREG
+	REG_L	a2, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 2*SZREG
+	REG_L	a3, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 3*SZREG
+	REG_L	a4, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 4*SZREG
+	REG_L	a5, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 5*SZREG
+	REG_L	a6, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 6*SZREG
+	REG_L	a7, fp, OFFSET_RG + DL_OFFSET_RG_A0 + 7*SZREG
+
+#ifdef USE_LASX
+	xvld	xr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZXREG
+	xvld	xr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZXREG
+	xvld	xr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZXREG
+	xvld	xr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZXREG
+	xvld	xr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZXREG
+	xvld	xr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZXREG
+	xvld	xr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZXREG
+	xvld	xr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZXREG
+#elif defined USE_LSX
+	vld	vr0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZVREG
+	vld	vr1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZVREG
+	vld	vr2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZVREG
+	vld	vr3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZVREG
+	vld	vr4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZVREG
+	vld	vr5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZVREG
+	vld	vr6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZVREG
+	vld	vr7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 0*SZFREG
+	FREG_L	fa1, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 1*SZFREG
+	FREG_L	fa2, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 2*SZFREG
+	FREG_L	fa3, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 3*SZFREG
+	FREG_L	fa4, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 4*SZFREG
+	FREG_L	fa5, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 5*SZFREG
+	FREG_L	fa6, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 6*SZFREG
+	FREG_L	fa7, fp, OFFSET_RG + DL_OFFSET_RG_VEC0 + 7*SZFREG
+#endif
+
+	jirl	ra, t6, 0
+
+	REG_S	a0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0
+	REG_S	a1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_A0 + SZREG
+
+#ifdef USE_LASX
+	xvst	xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	xvst	xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
+#elif defined USE_LSX
+	vst	vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	vst	vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_S	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	FREG_S	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
+#endif
+
+	/* Setup call to pltexit.  */
+	REG_L	a0, fp, OFFSET_SAVED_CALL_A0
+	REG_L	a1, fp, OFFSET_SAVED_CALL_A0 + SZREG
+	ADDI	a2, fp, OFFSET_RG
+	ADDI	a3, fp, OFFSET_RV
+	la	t7, _dl_audit_pltexit
+	jirl	ra, t7, 0
+
+	REG_L	a0, fp, OFFSET_RV + DL_OFFSET_RV_A0
+	REG_L	a1, fp, OFFSET_RV + DL_OFFSET_RV_A0 + SZREG
+
+#ifdef USE_LASX
+	xvld	xr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	xvld	xr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZXREG
+#elif defined USE_LSX
+	vld	vr0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	vld	vr1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_L	fa0, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0
+	FREG_L	fa1, fp, OFFSET_SAVED_CALL_A0 + DL_OFFSET_RV_VEC0 + SZFREG
+#endif
+
+	/* RA from within La_loongarch_reg.  */
+	REG_L   ra, fp, OFFSET_RG + DL_OFFSET_RG_RA
+	or	sp, fp, zero
+	ADDI	sp, sp, SF_SIZE
+	REG_S   fp, fp, SZREG
+
+	jirl	zero, ra, 0
+
+END (_dl_runtime_profile)