LoongArch: Add support for TLS Descriptors

Message ID 20231201095759.1897728-1-mengqinggang@loongson.cn
State Superseded
Headers
Series LoongArch: Add support for TLS Descriptors |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed

Commit Message

mengqinggang Dec. 1, 2023, 9:57 a.m. UTC
  This is mostly based on AArch64 and RISC-V implementation.

Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.

For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
all vector registers.
---
 elf/elf.h                                     |   2 +
 sysdeps/loongarch/Makefile                    |   6 +
 sysdeps/loongarch/dl-link.sym                 |   1 +
 sysdeps/loongarch/dl-machine.h                |  52 ++-
 sysdeps/loongarch/dl-tls.h                    |   9 +-
 sysdeps/loongarch/dl-tlsdesc.S                | 364 ++++++++++++++++++
 sysdeps/loongarch/dl-tlsdesc.h                |  49 +++
 sysdeps/loongarch/linkmap.h                   |   1 +
 sysdeps/loongarch/tlsdesc.c                   |  39 ++
 sysdeps/loongarch/tlsdesc.sym                 |  19 +
 .../unix/sysv/linux/loongarch/localplt.data   |   2 +
 11 files changed, 541 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
 create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
 create mode 100644 sysdeps/loongarch/tlsdesc.c
 create mode 100644 sysdeps/loongarch/tlsdesc.sym
  

Comments

caiyinyu Dec. 4, 2023, 1:33 a.m. UTC | #1
在 2023/12/1 下午5:57, mengqinggang 写道:
> This is mostly based on AArch64 and RISC-V implementation.
>
> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>
> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
> all vector registers.
> ---
>   elf/elf.h                                     |   2 +
>   sysdeps/loongarch/Makefile                    |   6 +
>   sysdeps/loongarch/dl-link.sym                 |   1 +
>   sysdeps/loongarch/dl-machine.h                |  52 ++-
>   sysdeps/loongarch/dl-tls.h                    |   9 +-
>   sysdeps/loongarch/dl-tlsdesc.S                | 364 ++++++++++++++++++
>   sysdeps/loongarch/dl-tlsdesc.h                |  49 +++
>   sysdeps/loongarch/linkmap.h                   |   1 +
>   sysdeps/loongarch/tlsdesc.c                   |  39 ++
>   sysdeps/loongarch/tlsdesc.sym                 |  19 +
>   .../unix/sysv/linux/loongarch/localplt.data   |   2 +
>   11 files changed, 541 insertions(+), 3 deletions(-)
>   create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
>   create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
>   create mode 100644 sysdeps/loongarch/tlsdesc.c
>   create mode 100644 sysdeps/loongarch/tlsdesc.sym
>
> diff --git a/elf/elf.h b/elf/elf.h
> index 5c1c1972d1..72e90aec30 100644
> --- a/elf/elf.h
> +++ b/elf/elf.h
> @@ -4232,6 +4232,8 @@ enum
>   #define R_LARCH_TLS_TPREL32	10
>   #define R_LARCH_TLS_TPREL64	11
>   #define R_LARCH_IRELATIVE	12
> +#define R_LARCH_TLS_DESC32	13
> +#define R_LARCH_TLS_DESC64	14
>   
>   /* Reserved for future relocs that the dynamic linker must understand.  */
>   
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..181389e787 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
>   endif
>   
>   ifeq ($(subdir),elf)
> +sysdep-dl-routines += tlsdesc dl-tlsdesc
>   gen-as-const-headers += dl-link.sym
>   endif
>   
> +ifeq ($(subdir),csu)
> +gen-as-const-headers += tlsdesc.sym
> +endif
> +
> +
>   # LoongArch's assembler also needs to know about PIC as it changes the
>   # definition of some assembler macros.
>   ASFLAGS-.os += $(pic-ccflag)
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> index b534968e30..fd81ef37d5 100644
> --- a/sysdeps/loongarch/dl-link.sym
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -1,6 +1,7 @@
>   #include <stddef.h>
>   #include <sysdep.h>
>   #include <link.h>
> +#include <dl-tlsdesc.h>
>   
>   DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
>   DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 0d17fd21e3..0dd252a5e5 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -25,7 +25,7 @@
>   #include <entry.h>
>   #include <elf/elf.h>
>   #include <sys/asm.h>
> -#include <dl-tls.h>
> +#include <dl-tlsdesc.h>
>   #include <dl-static-tls.h>
>   #include <dl-machine-rel.h>
>   
> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>         *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>         break;
>   
> +    case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
> +      {
> +	struct tlsdesc volatile *td =
> +	    (struct tlsdesc volatile *)addr_field;
> +	if (! sym)
> +	  {
> +	    td->arg = (void*)reloc->r_addend;
> +	    td->entry = _dl_tlsdesc_undefweak;
> +	  }
> +	else
> +	  {
> +# ifndef SHARED
> +	    CHECK_STATIC_TLS (map, sym_map);
> +# else
> +	    if (!TRY_STATIC_TLS (map, sym_map))
> +	      {
> +		td->arg = _dl_make_tlsdesc_dynamic
> +		  (sym_map, sym->st_value + reloc->r_addend);
> +		td->entry = _dl_tlsdesc_dynamic;
> +	      }
> +	    else
> +# endif
> +	      {
> +		td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
> +			    + reloc->r_addend);
> +		td->entry = _dl_tlsdesc_return;
> +	      }
> +	  }
> +	break;
> +      }
> +
>       case R_LARCH_COPY:
>         {
>   	  if (sym == NULL)
> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>         else
>   	*reloc_addr = map->l_mach.plt;
>       }
> +  else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
> +    {
> +      const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
> +      const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
> +      const ElfW (Sym) *sym = &symtab[symndx];
> +      const struct r_found_version *version = NULL;
> +
> +      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
> +	{
> +	  const ElfW (Half) *vernum =
> +	    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
> +	  version = &map->l_versions[vernum[symndx] & 0x7fff];
> +	}
> +
> +      /* Always initialize TLS descriptors completely, because lazy
> +	 initialization requires synchronization at every TLS access.  */
> +      elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
> +			skip_ifunc);
> +    }
>     else
>       _dl_reloc_bad_type (map, r_type, 1);
>   }
> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
> index a551594b64..1ca376484a 100644
> --- a/sysdeps/loongarch/dl-tls.h
> +++ b/sysdeps/loongarch/dl-tls.h
> @@ -16,6 +16,9 @@
>      License along with the GNU C Library.  If not, see
>      <https://www.gnu.org/licenses/>.  */
>   
> +#ifndef _DL_TLS_H
> +#define _DL_TLS_H
> +
>   /* Type used for the representation of TLS information in the GOT.  */
>   typedef struct
>   {
> @@ -23,6 +26,8 @@ typedef struct
>     unsigned long int ti_offset;
>   } tls_index;
>   
> +extern void *__tls_get_addr (tls_index *ti);
> +
>   /* The thread pointer points to the first static TLS block.  */
>   #define TLS_TP_OFFSET 0
>   
> @@ -37,10 +42,10 @@ typedef struct
>   /* Compute the value for a DTPREL reloc.  */
>   #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>   
> -extern void *__tls_get_addr (tls_index *ti);
> -
>   #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
>   #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>   
>   /* Value used for dtv entries for which the allocation is delayed.  */
>   #define TLS_DTV_UNALLOCATED ((void *) -1l)
> +
> +#endif
> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
> new file mode 100644
> index 0000000000..d2c18ff527
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.S
> @@ -0,0 +1,364 @@
> +/* Thread-local storage handling in the ELF dynamic linker.
> +   LoongArch version.
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <tls.h>
> +#include "tlsdesc.h"
> +
> +	.text
> +
> +	/* Compute the thread pointer offset for symbols in the static
> +	   TLS block. The offset is the same for all threads.
> +	   Prototype:
> +	   _dl_tlsdesc_return (tlsdesc *);  */
> +	.hidden _dl_tlsdesc_return
> +	.global	_dl_tlsdesc_return
> +	.type	_dl_tlsdesc_return,%function
> +	cfi_startproc
> +	.align 2
> +_dl_tlsdesc_return:
> +	REG_L  a0, a0, 8
> +	RET
> +	cfi_endproc
> +	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
> +
> +	/* Handler for undefined weak TLS symbols.
> +	   Prototype:
> +	   _dl_tlsdesc_undefweak (tlsdesc *);
> +
> +	   The second word of the descriptor contains the addend.
> +	   Return the addend minus the thread pointer. This ensures
> +	   that when the caller adds on the thread pointer it gets back
> +	   the addend.  */
> +	.hidden _dl_tlsdesc_undefweak
> +	.global	_dl_tlsdesc_undefweak
> +	.type	_dl_tlsdesc_undefweak,%function
> +	cfi_startproc
> +	.align  2
> +_dl_tlsdesc_undefweak:
> +	REG_L	a0, a0, 8
> +	sub.d	a0, a0, tp
> +	RET
> +	cfi_endproc
> +	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> +

the macros USE_LASX/LSX do not work here.
See :

commit 7f079fdc16e88ebb8020e17b2fd900e8924da29a
Author: caiyinyu <caiyinyu@loongson.cn>
Date:   Wed Jul 5 16:38:05 2023 +0800

     LoongArch: Add vector implementation for _dl_runtime_resolve.

> +#ifdef USE_LASX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
> +#elif defined USE_LSX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
> +#elif !defined __loongarch_soft_float
> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
> +#else
> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
> +#endif
> +
> +#ifdef SHARED
> +	/* Handler for dynamic TLS symbols.
> +	   Prototype:
> +	   _dl_tlsdesc_dynamic (tlsdesc *) ;
> +
> +	   The second word of the descriptor points to a
> +	   tlsdesc_dynamic_arg structure.
> +
> +	   Returns the offset between the thread pointer and the
> +	   object referenced by the argument.
> +
> +	   ptrdiff_t
> +	   __attribute__ ((__regparm__ (1)))
> +	   _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> +	   {
> +	     struct tlsdesc_dynamic_arg *td = tdp->arg;
> +	     dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
> +	     if (__builtin_expect (td->gen_count <= dtv[0].counter
> +		&& (dtv[td->tlsinfo.ti_module].pointer.val
> +		    != TLS_DTV_UNALLOCATED),
> +		1))
> +	       return dtv[td->tlsinfo.ti_module].pointer.val
> +		+ td->tlsinfo.ti_offset
> +		- __thread_pointer;
> +
> +	     return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> +	   }
> +	 */
> +	.hidden _dl_tlsdesc_dynamic
> +	.global	_dl_tlsdesc_dynamic
> +	.type	_dl_tlsdesc_dynamic,%function
> +	cfi_startproc
> +	.align 2
> +_dl_tlsdesc_dynamic:
> +	/* Save just enough registers to support fast path, if we fall
> +	   into slow path we will save additional registers.  */
> +	addi.d  $r3,$r3,-24
> +	REG_S	t0, sp, 0
> +	REG_S	t1, sp, 8
> +	REG_S	t2, sp, 16
> +
> +	REG_L	t0, tp, -SIZE_OF_DTV	  # dtv(t0) = tp + TCBHEAD_DTV dtv start
> +	REG_L	a0, a0, TLSDESC_ARG	  # td(a0) = tdp->arg
> +	REG_L	t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
> +	REG_L	t2, t0, DTV_COUNTER	  # t2 = dtv[0].counter
> +	bltu	t2, t1, .Lslow
> +
> +	REG_L	t1, a0, TLSDESC_MODID	  # t1 = td->tlsinfo.ti_module
> +	slli.d	t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> +	add.d	t1, t1, t0    # t1 = dtv + ti_module * sizeof(dtv_t)
> +	REG_L	t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
> +	li.d	t2, TLS_DTV_UNALLOCATED
> +	beq	t1, t2, .Lslow
> +	REG_L	t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
> +	# dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +	add.d	a0, t1, t2
> +.Lret:
> +	sub.d	a0, a0, tp
> +	REG_L	t0, sp, 0
> +	REG_L	t1, sp, 8
> +	REG_L	t2, sp, 16
> +	addi.d	sp, sp, 24
> +	RET
> +
> +.Lslow:
> +	/* This is the slow path. We need to call __tls_get_addr() which
> +	   means we need to save and restore all the register that the
> +	   callee will trash.  */
> +
> +	/* Save the remaining registers that we must treat as caller save.  */
> +	addi.d	sp, sp, -FRAME_SIZE
> +	REG_S	ra, sp, 0 * SZREG
> +	REG_S	a1, sp, 1 * SZREG
> +	REG_S	a2, sp, 2 * SZREG
> +	REG_S	a3, sp, 3 * SZREG
> +	REG_S	a4, sp, 4 * SZREG
> +	REG_S	a5, sp, 5 * SZREG
> +	REG_S	a6, sp, 6 * SZREG
> +	REG_S	a7, sp, 7 * SZREG
> +	REG_S	t4, sp, 8 * SZREG
> +	REG_S	t5, sp, 9 * SZREG
> +	REG_S	t6, sp, 10 * SZREG
> +	REG_S	t7, sp, 11 * SZREG
> +	REG_S	t8, sp, 12 * SZREG
> +
> +#ifdef USE_LASX
> +	xvst	xr0, sp, 13*SZREG + 0*SZXREG
> +	xvst	xr1, sp, 13*SZREG + 1*SZXREG
> +	xvst	xr2, sp, 13*SZREG + 2*SZXREG
> +	xvst	xr3, sp, 13*SZREG + 3*SZXREG
> +	xvst	xr4, sp, 13*SZREG + 4*SZXREG
> +	xvst	xr5, sp, 13*SZREG + 5*SZXREG
> +	xvst	xr6, sp, 13*SZREG + 6*SZXREG
> +	xvst	xr7, sp, 13*SZREG + 7*SZXREG
> +	xvst	xr8, sp, 13*SZREG + 8*SZXREG
> +	xvst	xr9, sp, 13*SZREG + 9*SZXREG
> +	xvst	xr10, sp, 13*SZREG + 10*SZXREG
> +	xvst	xr11, sp, 13*SZREG + 11*SZXREG
> +	xvst	xr12, sp, 13*SZREG + 12*SZXREG
> +	xvst	xr13, sp, 13*SZREG + 13*SZXREG
> +	xvst	xr14, sp, 13*SZREG + 14*SZXREG
> +	xvst	xr15, sp, 13*SZREG + 15*SZXREG
> +	xvst	xr16, sp, 13*SZREG + 16*SZXREG
> +	xvst	xr17, sp, 13*SZREG + 17*SZXREG
> +	xvst	xr18, sp, 13*SZREG + 18*SZXREG
> +	xvst	xr19, sp, 13*SZREG + 19*SZXREG
> +	xvst	xr20, sp, 13*SZREG + 20*SZXREG
> +	xvst	xr21, sp, 13*SZREG + 21*SZXREG
> +	xvst	xr22, sp, 13*SZREG + 22*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 23*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 24*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 25*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 26*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 27*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 28*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 29*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 30*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> +	vst	vr0, sp, 13*SZREG + 0*SZVREG
> +	vst	vr1, sp, 13*SZREG + 1*SZVREG
> +	vst	vr2, sp, 13*SZREG + 2*SZVREG
> +	vst	vr3, sp, 13*SZREG + 3*SZVREG
> +	vst	vr4, sp, 13*SZREG + 4*SZVREG
> +	vst	vr5, sp, 13*SZREG + 5*SZVREG
> +	vst	vr6, sp, 13*SZREG + 6*SZVREG
> +	vst	vr7, sp, 13*SZREG + 7*SZVREG
> +	vst	vr8, sp, 13*SZREG + 8*SZVREG
> +	vst	vr9, sp, 13*SZREG + 9*SZVREG
> +	vst	vr10, sp, 13*SZREG + 10*SZVREG
> +	vst	vr11, sp, 13*SZREG + 11*SZVREG
> +	vst	vr12, sp, 13*SZREG + 12*SZVREG
> +	vst	vr13, sp, 13*SZREG + 13*SZVREG
> +	vst	vr14, sp, 13*SZREG + 14*SZVREG
> +	vst	vr15, sp, 13*SZREG + 15*SZVREG
> +	vst	vr16, sp, 13*SZREG + 16*SZVREG
> +	vst	vr17, sp, 13*SZREG + 17*SZVREG
> +	vst	vr18, sp, 13*SZREG + 18*SZVREG
> +	vst	vr19, sp, 13*SZREG + 19*SZVREG
> +	vst	vr20, sp, 13*SZREG + 20*SZVREG
> +	vst	vr21, sp, 13*SZREG + 21*SZVREG
> +	vst	vr22, sp, 13*SZREG + 22*SZVREG
> +	vst	vr23, sp, 13*SZREG + 23*SZVREG
> +	vst	vr23, sp, 13*SZREG + 24*SZVREG
> +	vst	vr23, sp, 13*SZREG + 25*SZVREG
> +	vst	vr23, sp, 13*SZREG + 26*SZVREG
> +	vst	vr23, sp, 13*SZREG + 27*SZVREG
> +	vst	vr23, sp, 13*SZREG + 28*SZVREG
> +	vst	vr23, sp, 13*SZREG + 29*SZVREG
> +	vst	vr23, sp, 13*SZREG + 30*SZVREG
> +	vst	vr23, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
> +	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
> +	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
> +	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
> +	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
> +	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
> +	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
> +	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
> +	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
> +	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
> +	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
> +	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
> +	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
> +	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
> +	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
> +	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
> +	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
> +	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
> +	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
> +	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
> +	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
> +	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
> +	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
> +	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX  */
> +
> +	bl	__tls_get_addr
> +	addi.d	a0, a0, -TLS_DTV_OFFSET
> +
> +	REG_L	ra, sp, 0
> +	REG_L	a1, sp, 1 * 8
> +	REG_L	a2, sp, 2 * 8
> +	REG_L	a3, sp, 3 * 8
> +	REG_L	a4, sp, 4 * 8
> +	REG_L	a5, sp, 5 * 8
> +	REG_L	a6, sp, 6 * 8
> +	REG_L	a7, sp, 7 * 8
> +	REG_L	t4, sp, 8 * 8
> +	REG_L	t5, sp, 9 * 8
> +	REG_L	t6, sp, 10 * 8
> +	REG_L	t7, sp, 11 * 8
> +	REG_L	t8, sp, 12 * 8
> +
> +#ifdef USE_LASX
> +	xvld	xr0, sp, 13*SZREG + 0*SZXREG
> +	xvld	xr1, sp, 13*SZREG + 1*SZXREG
> +	xvld	xr2, sp, 13*SZREG + 2*SZXREG
> +	xvld	xr3, sp, 13*SZREG + 3*SZXREG
> +	xvld	xr4, sp, 13*SZREG + 4*SZXREG
> +	xvld	xr5, sp, 13*SZREG + 5*SZXREG
> +	xvld	xr6, sp, 13*SZREG + 6*SZXREG
> +	xvld	xr7, sp, 13*SZREG + 7*SZXREG
> +	xvld	xr8, sp, 13*SZREG + 8*SZXREG
> +	xvld	xr9, sp, 13*SZREG + 9*SZXREG
> +	xvld	xr10, sp, 13*SZREG + 10*SZXREG
> +	xvld	xr11, sp, 13*SZREG + 11*SZXREG
> +	xvld	xr12, sp, 13*SZREG + 12*SZXREG
> +	xvld	xr13, sp, 13*SZREG + 13*SZXREG
> +	xvld	xr14, sp, 13*SZREG + 14*SZXREG
> +	xvld	xr15, sp, 13*SZREG + 15*SZXREG
> +	xvld	xr16, sp, 13*SZREG + 16*SZXREG
> +	xvld	xr17, sp, 13*SZREG + 17*SZXREG
> +	xvld	xr18, sp, 13*SZREG + 18*SZXREG
> +	xvld	xr19, sp, 13*SZREG + 19*SZXREG
> +	xvld	xr20, sp, 13*SZREG + 20*SZXREG
> +	xvld	xr21, sp, 13*SZREG + 21*SZXREG
> +	xvld	xr22, sp, 13*SZREG + 22*SZXREG
> +	xvld	xr23, sp, 13*SZREG + 23*SZXREG
> +	xvld	xr24, sp, 13*SZREG + 24*SZXREG
> +	xvld	xr25, sp, 13*SZREG + 25*SZXREG
> +	xvld	xr26, sp, 13*SZREG + 26*SZXREG
> +	xvld	xr27, sp, 13*SZREG + 27*SZXREG
> +	xvld	xr28, sp, 13*SZREG + 28*SZXREG
> +	xvld	xr29, sp, 13*SZREG + 29*SZXREG
> +	xvld	xr30, sp, 13*SZREG + 30*SZXREG
> +	xvld	xr31, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> +	vld	vr0, sp, 13*SZREG + 0*SZVREG
> +	vld	vr1, sp, 13*SZREG + 1*SZVREG
> +	vld	vr2, sp, 13*SZREG + 2*SZVREG
> +	vld	vr3, sp, 13*SZREG + 3*SZVREG
> +	vld	vr4, sp, 13*SZREG + 4*SZVREG
> +	vld	vr5, sp, 13*SZREG + 5*SZVREG
> +	vld	vr6, sp, 13*SZREG + 6*SZVREG
> +	vld	vr7, sp, 13*SZREG + 7*SZVREG
> +	vld	vr8, sp, 13*SZREG + 8*SZVREG
> +	vld	vr9, sp, 13*SZREG + 9*SZVREG
> +	vld	vr10, sp, 13*SZREG + 10*SZVREG
> +	vld	vr11, sp, 13*SZREG + 11*SZVREG
> +	vld	vr12, sp, 13*SZREG + 12*SZVREG
> +	vld	vr13, sp, 13*SZREG + 13*SZVREG
> +	vld	vr14, sp, 13*SZREG + 14*SZVREG
> +	vld	vr15, sp, 13*SZREG + 15*SZVREG
> +	vld	vr16, sp, 13*SZREG + 16*SZVREG
> +	vld	vr17, sp, 13*SZREG + 17*SZVREG
> +	vld	vr18, sp, 13*SZREG + 18*SZVREG
> +	vld	vr19, sp, 13*SZREG + 19*SZVREG
> +	vld	vr20, sp, 13*SZREG + 20*SZVREG
> +	vld	vr21, sp, 13*SZREG + 21*SZVREG
> +	vld	vr22, sp, 13*SZREG + 22*SZVREG
> +	vld	vr23, sp, 13*SZREG + 23*SZVREG
> +	vld	vr24, sp, 13*SZREG + 24*SZVREG
> +	vld	vr25, sp, 13*SZREG + 25*SZVREG
> +	vld	vr26, sp, 13*SZREG + 26*SZVREG
> +	vld	vr27, sp, 13*SZREG + 27*SZVREG
> +	vld	vr28, sp, 13*SZREG + 28*SZVREG
> +	vld	vr29, sp, 13*SZREG + 29*SZVREG
> +	vld	vr30, sp, 13*SZREG + 30*SZVREG
> +	vld	vr31, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_L	fa0, sp, 13*SZREG + 0*SZFREG
> +	FREG_L	fa1, sp, 13*SZREG + 1*SZFREG
> +	FREG_L	fa2, sp, 13*SZREG + 2*SZFREG
> +	FREG_L	fa3, sp, 13*SZREG + 3*SZFREG
> +	FREG_L	fa4, sp, 13*SZREG + 4*SZFREG
> +	FREG_L	fa5, sp, 13*SZREG + 5*SZFREG
> +	FREG_L	fa6, sp, 13*SZREG + 6*SZFREG
> +	FREG_L	fa7, sp, 13*SZREG + 7*SZFREG
> +	FREG_L	ft0, sp, 13*SZREG + 8*SZFREG
> +	FREG_L	ft1, sp, 13*SZREG + 9*SZFREG
> +	FREG_L	ft2, sp, 13*SZREG + 10*SZFREG
> +	FREG_L	ft3, sp, 13*SZREG + 11*SZFREG
> +	FREG_L	ft4, sp, 13*SZREG + 12*SZFREG
> +	FREG_L	ft5, sp, 13*SZREG + 13*SZFREG
> +	FREG_L	ft6, sp, 13*SZREG + 14*SZFREG
> +	FREG_L	ft7, sp, 13*SZREG + 15*SZFREG
> +	FREG_L	ft8, sp, 13*SZREG + 16*SZFREG
> +	FREG_L	ft9, sp, 13*SZREG + 17*SZFREG
> +	FREG_L	ft10, sp, 13*SZREG + 18*SZFREG
> +	FREG_L	ft11, sp, 13*SZREG + 19*SZFREG
> +	FREG_L	ft12, sp, 13*SZREG + 20*SZFREG
> +	FREG_L	ft13, sp, 13*SZREG + 21*SZFREG
> +	FREG_L	ft14, sp, 13*SZREG + 22*SZFREG
> +	FREG_L	ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX  */
> +
> +	addi.d	sp, sp, FRAME_SIZE
> +	b	.Lret
> +	cfi_endproc
> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +#endif /* #ifdef SHARED  */
> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
> new file mode 100644
> index 0000000000..e1a9365855
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.h
> @@ -0,0 +1,49 @@
> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
> +   LoongArch version.
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_TLSDESC_H
> +#define _DL_TLSDESC_H
> +
> +#include <dl-tls.h>
> +
> +/* Type used to represent a TLS descriptor in the GOT.  */
> +struct tlsdesc
> +{
> +  ptrdiff_t (*entry) (struct tlsdesc *);
> +  void *arg;
> +};
> +
> +/* Type used as the argument in a TLS descriptor for a symbol that
> +   needs dynamic TLS offsets.  */
> +struct tlsdesc_dynamic_arg
> +{
> +  tls_index tlsinfo;
> +  size_t gen_count;
> +};
> +
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
> +
> +# ifdef SHARED
> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
> +#endif
> +
> +#endif
> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
> index 2f5bf53421..40416b1ad4 100644
> --- a/sysdeps/loongarch/linkmap.h
> +++ b/sysdeps/loongarch/linkmap.h
> @@ -19,4 +19,5 @@
>   struct link_map_machine
>   {
>     ElfW (Addr) plt; /* Address of .plt.  */
> +  void *tlsdesc_table;    /* Address of TLS descriptor hash table.  */
>   };
> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
> new file mode 100644
> index 0000000000..a357e7619f
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.c
> @@ -0,0 +1,39 @@
> +/* Manage TLS descriptors.  AArch64 version.
> +
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <tls.h>
> +#include <dl-tlsdesc.h>
> +#include <dl-unmap-segments.h>
> +#include <tlsdeschtab.h>
> +
> +/* Unmap the dynamic object, but also release its TLS descriptor table
> +   if there is one.  */
> +
> +void
> +_dl_unmap (struct link_map *map)
> +{
> +  _dl_unmap_segments (map);
> +
> +#ifdef SHARED
> +  if (map->l_mach.tlsdesc_table)
> +    htab_delete (map->l_mach.tlsdesc_table);
> +#endif
> +}
> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
> new file mode 100644
> index 0000000000..bcab218631
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.sym
> @@ -0,0 +1,19 @@
> +#include <stddef.h>
> +#include <sysdep.h>
> +#include <tls.h>
> +#include <link.h>
> +#include <dl-tlsdesc.h>
> +
> +--
> +
> +-- Abuse tls.h macros to derive offsets relative to the thread register.
> +
> +TLSDESC_ARG		offsetof(struct tlsdesc, arg)
> +TLSDESC_GEN_COUNT	offsetof(struct tlsdesc_dynamic_arg, gen_count)
> +TLSDESC_MODID		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
> +TLSDESC_MODOFF		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
> +TCBHEAD_DTV		offsetof(tcbhead_t, dtv)
> +DTV_COUNTER		offsetof(dtv_t, counter)
> +TLS_DTV_UNALLOCATED	TLS_DTV_UNALLOCATED
> +TLS_DTV_OFFSET		TLS_DTV_OFFSET
> +SIZE_OF_DTV		sizeof(tcbhead_t)
> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> index 547b1c1b7f..ec32e6d13f 100644
> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> @@ -5,3 +5,5 @@ libc.so: calloc
>   libc.so: free
>   libc.so: malloc
>   libc.so: realloc
> +# The dynamic loader needs __tls_get_addr for TLS.
> +ld.so: __tls_get_addr
  
Xi Ruoyao Dec. 4, 2023, 3:28 a.m. UTC | #2
On Fri, 2023-12-01 at 17:57 +0800, mengqinggang wrote:
> +.Lslow:

/* snip */

> +#ifdef USE_LASX
> +	xvst	xr0, sp, 13*SZREG + 0*SZXREG
> +	xvst	xr1, sp, 13*SZREG + 1*SZXREG
> +	xvst	xr2, sp, 13*SZREG + 2*SZXREG
> +	xvst	xr3, sp, 13*SZREG + 3*SZXREG
> +	xvst	xr4, sp, 13*SZREG + 4*SZXREG
> +	xvst	xr5, sp, 13*SZREG + 5*SZXREG
> +	xvst	xr6, sp, 13*SZREG + 6*SZXREG
> +	xvst	xr7, sp, 13*SZREG + 7*SZXREG
> +	xvst	xr8, sp, 13*SZREG + 8*SZXREG
> +	xvst	xr9, sp, 13*SZREG + 9*SZXREG
> +	xvst	xr10, sp, 13*SZREG + 10*SZXREG
> +	xvst	xr11, sp, 13*SZREG + 11*SZXREG
> +	xvst	xr12, sp, 13*SZREG + 12*SZXREG
> +	xvst	xr13, sp, 13*SZREG + 13*SZXREG
> +	xvst	xr14, sp, 13*SZREG + 14*SZXREG
> +	xvst	xr15, sp, 13*SZREG + 15*SZXREG
> +	xvst	xr16, sp, 13*SZREG + 16*SZXREG
> +	xvst	xr17, sp, 13*SZREG + 17*SZXREG
> +	xvst	xr18, sp, 13*SZREG + 18*SZXREG
> +	xvst	xr19, sp, 13*SZREG + 19*SZXREG
> +	xvst	xr20, sp, 13*SZREG + 20*SZXREG
> +	xvst	xr21, sp, 13*SZREG + 21*SZXREG
> +	xvst	xr22, sp, 13*SZREG + 22*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 23*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 24*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 25*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 26*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 27*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 28*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 29*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 30*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 31*SZXREG

The typo here should be obvious.

> +#elif defined USE_LSX
> +	vst	vr0, sp, 13*SZREG + 0*SZVREG
> +	vst	vr1, sp, 13*SZREG + 1*SZVREG
> +	vst	vr2, sp, 13*SZREG + 2*SZVREG
> +	vst	vr3, sp, 13*SZREG + 3*SZVREG
> +	vst	vr4, sp, 13*SZREG + 4*SZVREG
> +	vst	vr5, sp, 13*SZREG + 5*SZVREG
> +	vst	vr6, sp, 13*SZREG + 6*SZVREG
> +	vst	vr7, sp, 13*SZREG + 7*SZVREG
> +	vst	vr8, sp, 13*SZREG + 8*SZVREG
> +	vst	vr9, sp, 13*SZREG + 9*SZVREG
> +	vst	vr10, sp, 13*SZREG + 10*SZVREG
> +	vst	vr11, sp, 13*SZREG + 11*SZVREG
> +	vst	vr12, sp, 13*SZREG + 12*SZVREG
> +	vst	vr13, sp, 13*SZREG + 13*SZVREG
> +	vst	vr14, sp, 13*SZREG + 14*SZVREG
> +	vst	vr15, sp, 13*SZREG + 15*SZVREG
> +	vst	vr16, sp, 13*SZREG + 16*SZVREG
> +	vst	vr17, sp, 13*SZREG + 17*SZVREG
> +	vst	vr18, sp, 13*SZREG + 18*SZVREG
> +	vst	vr19, sp, 13*SZREG + 19*SZVREG
> +	vst	vr20, sp, 13*SZREG + 20*SZVREG
> +	vst	vr21, sp, 13*SZREG + 21*SZVREG
> +	vst	vr22, sp, 13*SZREG + 22*SZVREG
> +	vst	vr23, sp, 13*SZREG + 23*SZVREG
> +	vst	vr23, sp, 13*SZREG + 24*SZVREG
> +	vst	vr23, sp, 13*SZREG + 25*SZVREG
> +	vst	vr23, sp, 13*SZREG + 26*SZVREG
> +	vst	vr23, sp, 13*SZREG + 27*SZVREG
> +	vst	vr23, sp, 13*SZREG + 28*SZVREG
> +	vst	vr23, sp, 13*SZREG + 29*SZVREG
> +	vst	vr23, sp, 13*SZREG + 30*SZVREG
> +	vst	vr23, sp, 13*SZREG + 31*SZVREG

Likewise.

> +#elif !defined __loongarch_soft_float
> +	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
> +	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
> +	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
> +	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
> +	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
> +	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
> +	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
> +	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
> +	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
> +	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
> +	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
> +	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
> +	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
> +	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
> +	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
> +	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
> +	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
> +	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
> +	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
> +	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
> +	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
> +	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
> +	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
> +	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX  */

And generally this seems too expensive.  Would it be better to compile
libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
restoring FPR/VRs?
  
Xi Ruoyao Dec. 4, 2023, 3:45 a.m. UTC | #3
On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
> And generally this seems too expensive.  Would it be better to compile
> libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
> restoring FPR/VRs?

Note that -fcall-saved-f${x} is not enough for us because it only saves
the FPRs, not VRs.  Even if we change it to -fcall-saved-xr${x} it won't
work.

-ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
a VR or FPR, but if I read the code correctly libc-tls.c just should not
perform any floating-point operation, and we can use -mno-lsx to prevent
using vector registers.
  
Xi Ruoyao Dec. 4, 2023, 4:13 a.m. UTC | #4
On Mon, 2023-12-04 at 11:45 +0800, Xi Ruoyao wrote:
> On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
> > And generally this seems too expensive.  Would it be better to compile
> > libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
> > restoring FPR/VRs?
> 
> Note that -fcall-saved-f${x} is not enough for us because it only saves
> the FPRs, not VRs.  Even if we change it to -fcall-saved-xr${x} it won't
> work.
> 
> -ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
> a VR or FPR, but if I read the code correctly libc-tls.c just should not
> perform any floating-point operation, and we can use -mno-lsx to prevent
> using vector registers.

I made up this:

diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 43d2f583cd..64c1ea1294 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
 ifeq (yes,$(have-cmodel-medium))
 CFLAGS-.oS += -mcmodel=medium
 endif
+
+# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
+# from using any FPR.
+#
+# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
+# something wrong: why should it do floating-point operations anyway?!
+# Please fix it instead of complain to GCC maintainers.
+ifeq (yes,$(have-mno-lsx))
+CFLAGS-libc-tls.c += -mno-lsx
+endif
+CFLAGS-libc-tls.c += $(foreach n,30 31 \
+				 $(foreach m,0 1 2 3 4 5 6 7 8 9,$m 1$m 2$m), \
+			       -ffixed-f$n)

# diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
# (Regenerated)

diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
index 28a8ae5486..bacf75808c 100644
--- a/sysdeps/loongarch/configure.ac
+++ b/sysdeps/loongarch/configure.ac
@@ -65,3 +65,16 @@ rm -f conftest*])
 if test $libc_cv_loongarch_vec_asm = no; then
   AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
 fi
+
+# Check if compiler supports -mno-lsx.
+AC_CACHE_CHECK(whether $CC supports -mno-lsx, libc_cv_loongarch_cc_mno_lsx, [dnl
+cat > conftest.c <<\EOF
+        int dummy;
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS -mno-lsx -o conftest 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_loongarch_cc_mno_lsx=yes
+else
+  libc_cv_loongarch_cc_mno_lsx=no
+fi
+rm -f conftest*])
+LIBC_CONFIG_VAR([have-mno-lsx], [$libc_cv_loongarch_cc_mno_lsx])

But then I found the compiled __tls_get_addr is very simple:

000000000000027c <__tls_get_addr>:
 27c:	28ffc04d 	ld.d        	$t1, $tp, -16

0000000000000280 <L0^A>:
 280:	28c0208c 	ld.d        	$t0, $a0, 8
 284:	28c041a4 	ld.d        	$a0, $t1, 16

0000000000000288 <L0^A>:
 288:	0010b084 	add.d       	$a0, $a0, $t0
 28c:	4c000020 	ret     

So I think writing __tls_get_addr in assembly should be easier.  There
are just five instructions, so this is definitely better than messing
around compiler flags or writing 90+ instructions to save/load FPRs/VRs
(and slowing down the execution).
  
Florian Weimer Dec. 4, 2023, 8:13 a.m. UTC | #5
* Xi Ruoyao:

> I made up this:
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..64c1ea1294 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>  ifeq (yes,$(have-cmodel-medium))
>  CFLAGS-.oS += -mcmodel=medium
>  endif
> +
> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> +# from using any FPR.
> +#
> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> +# something wrong: why should it do floating-point operations anyway?!
> +# Please fix it instead of complain to GCC maintainers.
> +ifeq (yes,$(have-mno-lsx))
> +CFLAGS-libc-tls.c += -mno-lsx
> +endif

This is not correct: __tls_get_addr may call malloc, and an interposed
malloc is free to use the full register file.  You need to perform a
context switch here, similar to what the lazy binding trampoline does.

Thanks,
Florian
  
mengqinggang Dec. 4, 2023, 8:20 a.m. UTC | #6
在 2023/12/4 下午12:13, Xi Ruoyao 写道:
> On Mon, 2023-12-04 at 11:45 +0800, Xi Ruoyao wrote:
>> On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
>>> And generally this seems too expensive.  Would it be better to compile
>>> libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
>>> restoring FPR/VRs?
>> Note that -fcall-saved-f${x} is not enough for us because it only saves
>> the FPRs, not VRs.  Even if we change it to -fcall-saved-xr${x} it won't
>> work.
>>
>> -ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
>> a VR or FPR, but if I read the code correctly libc-tls.c just should not
>> perform any floating-point operation, and we can use -mno-lsx to prevent
>> using vector registers.
> I made up this:
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..64c1ea1294 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>   ifeq (yes,$(have-cmodel-medium))
>   CFLAGS-.oS += -mcmodel=medium
>   endif
> +
> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> +# from using any FPR.
> +#
> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> +# something wrong: why should it do floating-point operations anyway?!
> +# Please fix it instead of complain to GCC maintainers.
> +ifeq (yes,$(have-mno-lsx))
> +CFLAGS-libc-tls.c += -mno-lsx
> +endif
> +CFLAGS-libc-tls.c += $(foreach n,30 31 \
> +				 $(foreach m,0 1 2 3 4 5 6 7 8 9,$m 1$m 2$m), \
> +			       -ffixed-f$n)
>
> # diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
> # (Regenerated)
>
> diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
> index 28a8ae5486..bacf75808c 100644
> --- a/sysdeps/loongarch/configure.ac
> +++ b/sysdeps/loongarch/configure.ac
> @@ -65,3 +65,16 @@ rm -f conftest*])
>   if test $libc_cv_loongarch_vec_asm = no; then
>     AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
>   fi
> +
> +# Check if compiler supports -mno-lsx.
> +AC_CACHE_CHECK(whether $CC supports -mno-lsx, libc_cv_loongarch_cc_mno_lsx, [dnl
> +cat > conftest.c <<\EOF
> +        int dummy;
> +EOF
> +if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS -mno-lsx -o conftest 1>&AS_MESSAGE_LOG_FD); then
> +  libc_cv_loongarch_cc_mno_lsx=yes
> +else
> +  libc_cv_loongarch_cc_mno_lsx=no
> +fi
> +rm -f conftest*])
> +LIBC_CONFIG_VAR([have-mno-lsx], [$libc_cv_loongarch_cc_mno_lsx])
>
> But then I found the compiled __tls_get_addr is very simple:


This _tls_get_addr can only be used in static linking.


>
> 000000000000027c <__tls_get_addr>:
>   27c:	28ffc04d 	ld.d        	$t1, $tp, -16
>
> 0000000000000280 <L0^A>:
>   280:	28c0208c 	ld.d        	$t0, $a0, 8
>   284:	28c041a4 	ld.d        	$a0, $t1, 16
>
> 0000000000000288 <L0^A>:
>   288:	0010b084 	add.d       	$a0, $a0, $t0
>   28c:	4c000020 	ret
>
> So I think writing __tls_get_addr in assembly should be easier.  There
> are just five instructions, so this is definitely better than messing
> around compiler flags or writing 90+ instructions to save/load FPRs/VRs
> (and slowing down the execution).
>
  
Xi Ruoyao Dec. 4, 2023, 8:36 a.m. UTC | #7
On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
> * Xi Ruoyao:
> 
> > I made up this:
> > 
> > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> > index 43d2f583cd..64c1ea1294 100644
> > --- a/sysdeps/loongarch/Makefile
> > +++ b/sysdeps/loongarch/Makefile
> > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> >   ifeq (yes,$(have-cmodel-medium))
> >   CFLAGS-.oS += -mcmodel=medium
> >   endif
> > +
> > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> > +# from using any FPR.
> > +#
> > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> > +# something wrong: why should it do floating-point operations anyway?!
> > +# Please fix it instead of complain to GCC maintainers.
> > +ifeq (yes,$(have-mno-lsx))
> > +CFLAGS-libc-tls.c += -mno-lsx
> > +endif
> 
> This is not correct: __tls_get_addr may call malloc, and an interposed
> malloc is free to use the full register file.  You need to perform a
> context switch here, similar to what the lazy binding trampoline does.

Alright, but then do we need to save and restore fcsr and fcc as well? 
AFAIK they should be saved during a context switch, and AFAIK there are
no rules saying "interposed malloc cannot alter floating-point execution
environment".
  
Florian Weimer Dec. 4, 2023, 8:49 a.m. UTC | #8
* Xi Ruoyao:

> On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
>> * Xi Ruoyao:
>> 
>> > I made up this:
>> > 
>> > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> > index 43d2f583cd..64c1ea1294 100644
>> > --- a/sysdeps/loongarch/Makefile
>> > +++ b/sysdeps/loongarch/Makefile
>> > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>> >   ifeq (yes,$(have-cmodel-medium))
>> >   CFLAGS-.oS += -mcmodel=medium
>> >   endif
>> > +
>> > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
>> > +# from using any FPR.
>> > +#
>> > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
>> > +# something wrong: why should it do floating-point operations anyway?!
>> > +# Please fix it instead of complain to GCC maintainers.
>> > +ifeq (yes,$(have-mno-lsx))
>> > +CFLAGS-libc-tls.c += -mno-lsx
>> > +endif
>> 
>> This is not correct: __tls_get_addr may call malloc, and an interposed
>> malloc is free to use the full register file.  You need to perform a
>> context switch here, similar to what the lazy binding trampoline does.
>
> Alright, but then do we need to save and restore fcsr and fcc as well? 
> AFAIK they should be saved during a context switch, and AFAIK there are
> no rules saying "interposed malloc cannot alter floating-point execution
> environment".

Sorry, I'm not familiar with those register names and floating point
matters.

That being said, I don't think malloc may change the rounding mode and
other floating point environment aspects.  Not sure about raising
exceptions, though.

Thanks,
Florian
  
Xi Ruoyao Dec. 4, 2023, 9:01 a.m. UTC | #9
On Mon, 2023-12-04 at 09:49 +0100, Florian Weimer wrote:
> * Xi Ruoyao:
> 
> > On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
> > > * Xi Ruoyao:
> > > 
> > > > I made up this:
> > > > 
> > > > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> > > > index 43d2f583cd..64c1ea1294 100644
> > > > --- a/sysdeps/loongarch/Makefile
> > > > +++ b/sysdeps/loongarch/Makefile
> > > > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> > > >   ifeq (yes,$(have-cmodel-medium))
> > > >   CFLAGS-.oS += -mcmodel=medium
> > > >   endif
> > > > +
> > > > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> > > > +# from using any FPR.
> > > > +#
> > > > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> > > > +# something wrong: why should it do floating-point operations anyway?!
> > > > +# Please fix it instead of complain to GCC maintainers.
> > > > +ifeq (yes,$(have-mno-lsx))
> > > > +CFLAGS-libc-tls.c += -mno-lsx
> > > > +endif
> > > 
> > > This is not correct: __tls_get_addr may call malloc, and an interposed
> > > malloc is free to use the full register file.  You need to perform a
> > > context switch here, similar to what the lazy binding trampoline does.
> > 
> > Alright, but then do we need to save and restore fcsr and fcc as well? 
> > AFAIK they should be saved during a context switch, and AFAIK there are
> > no rules saying "interposed malloc cannot alter floating-point execution
> > environment".
> 
> Sorry, I'm not familiar with those register names and floating point
> matters.
> 
> That being said, I don't think malloc may change the rounding mode and
> other floating point environment aspects.  Not sure about raising
> exceptions, though.

For example, for comparing some floating-point values and storing the
result into a TLS variable, the compiler may generate something like

fcmp.clt.d $fcc0, $f0, $f1
pcalau12i $a0,%desc_pc_hi20(tls_sym)
ld.d $a1,$a0,%desc_ld_pc_lo12(tls_sym)
addi.d $a0,$a0,%desc_add_pc_lo12(tls_sym)
jirl $ra,$a1,%desc_call(tls_sym)
movcf2gr $t0,$fcc0
st.d $t0,$a0,0

Now if the interposed malloc clobbers fcc0 (well, it's simple: just
compile the TU containing malloc with -fzero-call-used-regs=all), this
sequence will blow up.

So we need to either save and restore fcc registers here, or in the
compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
got_load_tls_desc.  Currently the draft GCC patch does not have these
clobbers.

(Note that this is not a problem with RISC-V because they use FPRs for
outputs of floating-point comparisons and they don't have dedicated FCC
registers).
  
Florian Weimer Dec. 4, 2023, 9:54 a.m. UTC | #10
* Xi Ruoyao:

> So we need to either save and restore fcc registers here, or in the
> compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
> got_load_tls_desc.  Currently the draft GCC patch does not have these
> clobbers.

The problem with not having clobbers is that every ISA extension
requires patching the trampoline code.  The downside of clobbers is that
they apply unconditionally.  The trampoline, on the other hand, can save
the registers not preserved by the standard calling convention on the
slow path only, around the call into C code.

Thanks,
Florian
  
mengqinggang Dec. 6, 2023, 7:46 a.m. UTC | #11
在 2023/12/4 下午5:01, Xi Ruoyao 写道:
> On Mon, 2023-12-04 at 09:49 +0100, Florian Weimer wrote:
>> * Xi Ruoyao:
>>
>>> On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
>>>> * Xi Ruoyao:
>>>>
>>>>> I made up this:
>>>>>
>>>>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>>>>> index 43d2f583cd..64c1ea1294 100644
>>>>> --- a/sysdeps/loongarch/Makefile
>>>>> +++ b/sysdeps/loongarch/Makefile
>>>>> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>>>>>    ifeq (yes,$(have-cmodel-medium))
>>>>>    CFLAGS-.oS += -mcmodel=medium
>>>>>    endif
>>>>> +
>>>>> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
>>>>> +# from using any FPR.
>>>>> +#
>>>>> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
>>>>> +# something wrong: why should it do floating-point operations anyway?!
>>>>> +# Please fix it instead of complain to GCC maintainers.
>>>>> +ifeq (yes,$(have-mno-lsx))
>>>>> +CFLAGS-libc-tls.c += -mno-lsx
>>>>> +endif
>>>> This is not correct: __tls_get_addr may call malloc, and an interposed
>>>> malloc is free to use the full register file.  You need to perform a
>>>> context switch here, similar to what the lazy binding trampoline does.
>>> Alright, but then do we need to save and restore fcsr and fcc as well?
>>> AFAIK they should be saved during a context switch, and AFAIK there are
>>> no rules saying "interposed malloc cannot alter floating-point execution
>>> environment".
>> Sorry, I'm not familiar with those register names and floating point
>> matters.
>>
>> That being said, I don't think malloc may change the rounding mode and
>> other floating point environment aspects.  Not sure about raising
>> exceptions, though.
> For example, for comparing some floating-point values and storing the
> result into a TLS variable, the compiler may generate something like
>
> fcmp.clt.d $fcc0, $f0, $f1
> pcalau12i $a0,%desc_pc_hi20(tls_sym)
> ld.d $a1,$a0,%desc_ld_pc_lo12(tls_sym)
> addi.d $a0,$a0,%desc_add_pc_lo12(tls_sym)
> jirl $ra,$a1,%desc_call(tls_sym)
> movcf2gr $t0,$fcc0
> st.d $t0,$a0,0

For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
ADD, STR and LDR can be inserted between FCMP and CSET.
But it seems that there is no BLR (similar to JIRL) inserted between 
FCMP and CSET.
AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic 
function.

>
> Now if the interposed malloc clobbers fcc0 (well, it's simple: just
> compile the TU containing malloc with -fzero-call-used-regs=all), this
> sequence will blow up.
>
> So we need to either save and restore fcc registers here, or in the
> compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
> got_load_tls_desc.  Currently the draft GCC patch does not have these
> clobbers.
>
> (Note that this is not a problem with RISC-V because they use FPRs for
> outputs of floating-point comparisons and they don't have dedicated FCC
> registers).
>
  
Xi Ruoyao Dec. 6, 2023, 8:05 a.m. UTC | #12
On Wed, 2023-12-06 at 15:46 +0800, mengqinggang wrote:
> For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
> ADD, STR and LDR can be inserted between FCMP and CSET.
> But it seems that there is no BLR (similar to JIRL) inserted between 
> FCMP and CSET.
> AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic 
> function.

AArch64 tlsdesc call pattern clobber CC registers:

(define_insn "tlsdesc_small_advsimd_<mode>"
  [(set (reg:PTR R0_REGNUM)
        (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
                    UNSPEC_TLSDESC))
   (clobber (reg:DI LR_REGNUM))
   (clobber (reg:CC CC_REGNUM)) # <=============== !!!!!!!!!!!!!
   (clobber (match_scratch:DI 1 "=r"))
   (use (reg:DI FP_REGNUM))]
  "TARGET_TLS_DESC && !TARGET_SVE"
  "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
  [(set_attr "type" "call")
   (set_attr "length" "16")])

So GCC won't insert it between FCMP and CSET.  If you want the same
effect you need (clobber (reg:CC FCC_REG_FIRST)) (clobber (reg:CC
FCC_REG_FIRST + 1)) ... ... (clobber (reg:CC FCC_REG_LAST)) as I've
already suggested.
  
Xi Ruoyao Dec. 6, 2023, 8:07 a.m. UTC | #13
On Wed, 2023-12-06 at 16:05 +0800, Xi Ruoyao wrote:
> On Wed, 2023-12-06 at 15:46 +0800, mengqinggang wrote:
> > For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
> > ADD, STR and LDR can be inserted between FCMP and CSET.
> > But it seems that there is no BLR (similar to JIRL) inserted between
> > FCMP and CSET.
> > AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic
> > function.
> 
> AArch64 tlsdesc call pattern clobber CC registers:
> 
> (define_insn "tlsdesc_small_advsimd_<mode>"
>   [(set (reg:PTR R0_REGNUM)
>         (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
>                     UNSPEC_TLSDESC))
>    (clobber (reg:DI LR_REGNUM))
>    (clobber (reg:CC CC_REGNUM)) # <=============== !!!!!!!!!!!!!
>    (clobber (match_scratch:DI 1 "=r"))
>    (use (reg:DI FP_REGNUM))]
>   "TARGET_TLS_DESC && !TARGET_SVE"
>   "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
>   [(set_attr "type" "call")
>    (set_attr "length" "16")])
> 
> So GCC won't insert it between FCMP and CSET.  If you want the same
> effect you need (clobber (reg:CC FCC_REG_FIRST)) (clobber (reg:CC
> FCC_REG_FIRST + 1)) ... ... (clobber (reg:CC FCC_REG_LAST)) as I've
> already suggested.

Correction: it should be reg:FCC instead of reg:CC, as we are using
FCCmode instead of CCmode of AArch64.
  
Tatsuyuki Ishi Jan. 7, 2024, 11:03 p.m. UTC | #14
> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
> 
> This is mostly based on AArch64 and RISC-V implementation.
> 
> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
> 
> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
> all vector registers.
> ---
> elf/elf.h                                     |   2 +
> sysdeps/loongarch/Makefile                    |   6 +
> sysdeps/loongarch/dl-link.sym                 |   1 +
> sysdeps/loongarch/dl-machine.h                |  52 ++-
> sysdeps/loongarch/dl-tls.h                    |   9 +-
> sysdeps/loongarch/dl-tlsdesc.S                | 364 ++++++++++++++++++
> sysdeps/loongarch/dl-tlsdesc.h                |  49 +++
> sysdeps/loongarch/linkmap.h                   |   1 +
> sysdeps/loongarch/tlsdesc.c                   |  39 ++
> sysdeps/loongarch/tlsdesc.sym                 |  19 +
> .../unix/sysv/linux/loongarch/localplt.data   |   2 +
> 11 files changed, 541 insertions(+), 3 deletions(-)
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
> create mode 100644 sysdeps/loongarch/tlsdesc.c
> create mode 100644 sysdeps/loongarch/tlsdesc.sym
> 
> diff --git a/elf/elf.h b/elf/elf.h
> index 5c1c1972d1..72e90aec30 100644
> --- a/elf/elf.h
> +++ b/elf/elf.h
> @@ -4232,6 +4232,8 @@ enum
> #define R_LARCH_TLS_TPREL32	10
> #define R_LARCH_TLS_TPREL64	11
> #define R_LARCH_IRELATIVE	12
> +#define R_LARCH_TLS_DESC32	13
> +#define R_LARCH_TLS_DESC64	14

Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].

[1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626

> 
> /* Reserved for future relocs that the dynamic linker must understand.  */
> 
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..181389e787 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
> endif
> 
> ifeq ($(subdir),elf)
> +sysdep-dl-routines += tlsdesc dl-tlsdesc
> gen-as-const-headers += dl-link.sym
> endif
> 
> +ifeq ($(subdir),csu)
> +gen-as-const-headers += tlsdesc.sym
> +endif
> +
> +
> # LoongArch's assembler also needs to know about PIC as it changes the
> # definition of some assembler macros.
> ASFLAGS-.os += $(pic-ccflag)
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> index b534968e30..fd81ef37d5 100644
> --- a/sysdeps/loongarch/dl-link.sym
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -1,6 +1,7 @@
> #include <stddef.h>
> #include <sysdep.h>
> #include <link.h>
> +#include <dl-tlsdesc.h>
> 
> DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
> DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 0d17fd21e3..0dd252a5e5 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -25,7 +25,7 @@
> #include <entry.h>
> #include <elf/elf.h>
> #include <sys/asm.h>
> -#include <dl-tls.h>
> +#include <dl-tlsdesc.h>
> #include <dl-static-tls.h>
> #include <dl-machine-rel.h>
> 
> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>      *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>      break;
> 
> +    case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
> +      {
> +	struct tlsdesc volatile *td =
> +	    (struct tlsdesc volatile *)addr_field;
> +	if (! sym)
> +	  {
> +	    td->arg = (void*)reloc->r_addend;
> +	    td->entry = _dl_tlsdesc_undefweak;
> +	  }
> +	else
> +	  {
> +# ifndef SHARED
> +	    CHECK_STATIC_TLS (map, sym_map);
> +# else
> +	    if (!TRY_STATIC_TLS (map, sym_map))
> +	      {
> +		td->arg = _dl_make_tlsdesc_dynamic
> +		  (sym_map, sym->st_value + reloc->r_addend);
> +		td->entry = _dl_tlsdesc_dynamic;
> +	      }
> +	    else
> +# endif
> +	      {
> +		td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
> +			    + reloc->r_addend);
> +		td->entry = _dl_tlsdesc_return;
> +	      }
> +	  }
> +	break;
> +      }
> +
>    case R_LARCH_COPY:
>      {
> 	  if (sym == NULL)
> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>      else
> 	*reloc_addr = map->l_mach.plt;
>    }
> +  else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
> +    {
> +      const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
> +      const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
> +      const ElfW (Sym) *sym = &symtab[symndx];
> +      const struct r_found_version *version = NULL;
> +
> +      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
> +	{
> +	  const ElfW (Half) *vernum =
> +	    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
> +	  version = &map->l_versions[vernum[symndx] & 0x7fff];
> +	}
> +
> +      /* Always initialize TLS descriptors completely, because lazy
> +	 initialization requires synchronization at every TLS access.  */
> +      elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
> +			skip_ifunc);
> +    }
>  else
>    _dl_reloc_bad_type (map, r_type, 1);
> }
> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
> index a551594b64..1ca376484a 100644
> --- a/sysdeps/loongarch/dl-tls.h
> +++ b/sysdeps/loongarch/dl-tls.h
> @@ -16,6 +16,9 @@
>   License along with the GNU C Library.  If not, see
>   <https://www.gnu.org/licenses/>.  */
> 
> +#ifndef _DL_TLS_H
> +#define _DL_TLS_H
> +
> /* Type used for the representation of TLS information in the GOT.  */
> typedef struct
> {
> @@ -23,6 +26,8 @@ typedef struct
>  unsigned long int ti_offset;
> } tls_index;
> 
> +extern void *__tls_get_addr (tls_index *ti);
> +
> /* The thread pointer points to the first static TLS block.  */
> #define TLS_TP_OFFSET 0
> 
> @@ -37,10 +42,10 @@ typedef struct
> /* Compute the value for a DTPREL reloc.  */
> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
> 
> -extern void *__tls_get_addr (tls_index *ti);
> -
> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
> 
> /* Value used for dtv entries for which the allocation is delayed.  */
> #define TLS_DTV_UNALLOCATED ((void *) -1l)
> +
> +#endif
> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
> new file mode 100644
> index 0000000000..d2c18ff527
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.S
> @@ -0,0 +1,364 @@
> +/* Thread-local storage handling in the ELF dynamic linker.
> +   LoongArch version.
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <tls.h>
> +#include "tlsdesc.h"
> +
> +	.text
> +
> +	/* Compute the thread pointer offset for symbols in the static
> +	   TLS block. The offset is the same for all threads.
> +	   Prototype:
> +	   _dl_tlsdesc_return (tlsdesc *);  */
> +	.hidden _dl_tlsdesc_return
> +	.global	_dl_tlsdesc_return
> +	.type	_dl_tlsdesc_return,%function
> +	cfi_startproc
> +	.align 2
> +_dl_tlsdesc_return:
> +	REG_L  a0, a0, 8
> +	RET
> +	cfi_endproc
> +	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
> +
> +	/* Handler for undefined weak TLS symbols.
> +	   Prototype:
> +	   _dl_tlsdesc_undefweak (tlsdesc *);
> +
> +	   The second word of the descriptor contains the addend.
> +	   Return the addend minus the thread pointer. This ensures
> +	   that when the caller adds on the thread pointer it gets back
> +	   the addend.  */
> +	.hidden _dl_tlsdesc_undefweak
> +	.global	_dl_tlsdesc_undefweak
> +	.type	_dl_tlsdesc_undefweak,%function
> +	cfi_startproc
> +	.align  2
> +_dl_tlsdesc_undefweak:
> +	REG_L	a0, a0, 8
> +	sub.d	a0, a0, tp
> +	RET
> +	cfi_endproc
> +	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> +
> +#ifdef USE_LASX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
> +#elif defined USE_LSX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
> +#elif !defined __loongarch_soft_float
> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
> +#else
> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
> +#endif
> +
> +#ifdef SHARED
> +	/* Handler for dynamic TLS symbols.
> +	   Prototype:
> +	   _dl_tlsdesc_dynamic (tlsdesc *) ;
> +
> +	   The second word of the descriptor points to a
> +	   tlsdesc_dynamic_arg structure.
> +
> +	   Returns the offset between the thread pointer and the
> +	   object referenced by the argument.
> +
> +	   ptrdiff_t
> +	   __attribute__ ((__regparm__ (1)))
> +	   _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> +	   {
> +	     struct tlsdesc_dynamic_arg *td = tdp->arg;
> +	     dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
> +	     if (__builtin_expect (td->gen_count <= dtv[0].counter
> +		&& (dtv[td->tlsinfo.ti_module].pointer.val
> +		    != TLS_DTV_UNALLOCATED),
> +		1))
> +	       return dtv[td->tlsinfo.ti_module].pointer.val
> +		+ td->tlsinfo.ti_offset
> +		- __thread_pointer;
> +
> +	     return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> +	   }
> +	 */
> +	.hidden _dl_tlsdesc_dynamic
> +	.global	_dl_tlsdesc_dynamic
> +	.type	_dl_tlsdesc_dynamic,%function
> +	cfi_startproc
> +	.align 2
> +_dl_tlsdesc_dynamic:
> +	/* Save just enough registers to support fast path, if we fall
> +	   into slow path we will save additional registers.  */
> +	addi.d  $r3,$r3,-24

The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.

> +	REG_S	t0, sp, 0
> +	REG_S	t1, sp, 8
> +	REG_S	t2, sp, 16
> +
> +	REG_L	t0, tp, -SIZE_OF_DTV	  # dtv(t0) = tp + TCBHEAD_DTV dtv start
> +	REG_L	a0, a0, TLSDESC_ARG	  # td(a0) = tdp->arg
> +	REG_L	t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
> +	REG_L	t2, t0, DTV_COUNTER	  # t2 = dtv[0].counter
> +	bltu	t2, t1, .Lslow
> +
> +	REG_L	t1, a0, TLSDESC_MODID	  # t1 = td->tlsinfo.ti_module
> +	slli.d	t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */

32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.

Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.

> +	add.d	t1, t1, t0    # t1 = dtv + ti_module * sizeof(dtv_t)
> +	REG_L	t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
> +	li.d	t2, TLS_DTV_UNALLOCATED
> +	beq	t1, t2, .Lslow
> +	REG_L	t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
> +	# dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +	add.d	a0, t1, t2
> +.Lret:
> +	sub.d	a0, a0, tp
> +	REG_L	t0, sp, 0
> +	REG_L	t1, sp, 8
> +	REG_L	t2, sp, 16
> +	addi.d	sp, sp, 24
> +	RET
> +
> +.Lslow:
> +	/* This is the slow path. We need to call __tls_get_addr() which
> +	   means we need to save and restore all the register that the
> +	   callee will trash.  */
> +
> +	/* Save the remaining registers that we must treat as caller save.  */
> +	addi.d	sp, sp, -FRAME_SIZE
> +	REG_S	ra, sp, 0 * SZREG
> +	REG_S	a1, sp, 1 * SZREG
> +	REG_S	a2, sp, 2 * SZREG
> +	REG_S	a3, sp, 3 * SZREG
> +	REG_S	a4, sp, 4 * SZREG
> +	REG_S	a5, sp, 5 * SZREG
> +	REG_S	a6, sp, 6 * SZREG
> +	REG_S	a7, sp, 7 * SZREG
> +	REG_S	t4, sp, 8 * SZREG
> +	REG_S	t5, sp, 9 * SZREG
> +	REG_S	t6, sp, 10 * SZREG
> +	REG_S	t7, sp, 11 * SZREG
> +	REG_S	t8, sp, 12 * SZREG
> +
> +#ifdef USE_LASX
> +	xvst	xr0, sp, 13*SZREG + 0*SZXREG
> +	xvst	xr1, sp, 13*SZREG + 1*SZXREG
> +	xvst	xr2, sp, 13*SZREG + 2*SZXREG
> +	xvst	xr3, sp, 13*SZREG + 3*SZXREG
> +	xvst	xr4, sp, 13*SZREG + 4*SZXREG
> +	xvst	xr5, sp, 13*SZREG + 5*SZXREG
> +	xvst	xr6, sp, 13*SZREG + 6*SZXREG
> +	xvst	xr7, sp, 13*SZREG + 7*SZXREG
> +	xvst	xr8, sp, 13*SZREG + 8*SZXREG
> +	xvst	xr9, sp, 13*SZREG + 9*SZXREG
> +	xvst	xr10, sp, 13*SZREG + 10*SZXREG
> +	xvst	xr11, sp, 13*SZREG + 11*SZXREG
> +	xvst	xr12, sp, 13*SZREG + 12*SZXREG
> +	xvst	xr13, sp, 13*SZREG + 13*SZXREG
> +	xvst	xr14, sp, 13*SZREG + 14*SZXREG
> +	xvst	xr15, sp, 13*SZREG + 15*SZXREG
> +	xvst	xr16, sp, 13*SZREG + 16*SZXREG
> +	xvst	xr17, sp, 13*SZREG + 17*SZXREG
> +	xvst	xr18, sp, 13*SZREG + 18*SZXREG
> +	xvst	xr19, sp, 13*SZREG + 19*SZXREG
> +	xvst	xr20, sp, 13*SZREG + 20*SZXREG
> +	xvst	xr21, sp, 13*SZREG + 21*SZXREG
> +	xvst	xr22, sp, 13*SZREG + 22*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 23*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 24*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 25*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 26*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 27*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 28*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 29*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 30*SZXREG
> +	xvst	xr23, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> +	vst	vr0, sp, 13*SZREG + 0*SZVREG
> +	vst	vr1, sp, 13*SZREG + 1*SZVREG
> +	vst	vr2, sp, 13*SZREG + 2*SZVREG
> +	vst	vr3, sp, 13*SZREG + 3*SZVREG
> +	vst	vr4, sp, 13*SZREG + 4*SZVREG
> +	vst	vr5, sp, 13*SZREG + 5*SZVREG
> +	vst	vr6, sp, 13*SZREG + 6*SZVREG
> +	vst	vr7, sp, 13*SZREG + 7*SZVREG
> +	vst	vr8, sp, 13*SZREG + 8*SZVREG
> +	vst	vr9, sp, 13*SZREG + 9*SZVREG
> +	vst	vr10, sp, 13*SZREG + 10*SZVREG
> +	vst	vr11, sp, 13*SZREG + 11*SZVREG
> +	vst	vr12, sp, 13*SZREG + 12*SZVREG
> +	vst	vr13, sp, 13*SZREG + 13*SZVREG
> +	vst	vr14, sp, 13*SZREG + 14*SZVREG
> +	vst	vr15, sp, 13*SZREG + 15*SZVREG
> +	vst	vr16, sp, 13*SZREG + 16*SZVREG
> +	vst	vr17, sp, 13*SZREG + 17*SZVREG
> +	vst	vr18, sp, 13*SZREG + 18*SZVREG
> +	vst	vr19, sp, 13*SZREG + 19*SZVREG
> +	vst	vr20, sp, 13*SZREG + 20*SZVREG
> +	vst	vr21, sp, 13*SZREG + 21*SZVREG
> +	vst	vr22, sp, 13*SZREG + 22*SZVREG
> +	vst	vr23, sp, 13*SZREG + 23*SZVREG
> +	vst	vr23, sp, 13*SZREG + 24*SZVREG
> +	vst	vr23, sp, 13*SZREG + 25*SZVREG
> +	vst	vr23, sp, 13*SZREG + 26*SZVREG
> +	vst	vr23, sp, 13*SZREG + 27*SZVREG
> +	vst	vr23, sp, 13*SZREG + 28*SZVREG
> +	vst	vr23, sp, 13*SZREG + 29*SZVREG
> +	vst	vr23, sp, 13*SZREG + 30*SZVREG
> +	vst	vr23, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
> +	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
> +	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
> +	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
> +	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
> +	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
> +	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
> +	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
> +	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
> +	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
> +	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
> +	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
> +	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
> +	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
> +	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
> +	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
> +	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
> +	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
> +	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
> +	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
> +	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
> +	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
> +	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
> +	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX  */
> +
> +	bl	__tls_get_addr
> +	addi.d	a0, a0, -TLS_DTV_OFFSET
> +
> +	REG_L	ra, sp, 0
> +	REG_L	a1, sp, 1 * 8
> +	REG_L	a2, sp, 2 * 8
> +	REG_L	a3, sp, 3 * 8
> +	REG_L	a4, sp, 4 * 8
> +	REG_L	a5, sp, 5 * 8
> +	REG_L	a6, sp, 6 * 8
> +	REG_L	a7, sp, 7 * 8
> +	REG_L	t4, sp, 8 * 8
> +	REG_L	t5, sp, 9 * 8
> +	REG_L	t6, sp, 10 * 8
> +	REG_L	t7, sp, 11 * 8
> +	REG_L	t8, sp, 12 * 8
> +
> +#ifdef USE_LASX
> +	xvld	xr0, sp, 13*SZREG + 0*SZXREG
> +	xvld	xr1, sp, 13*SZREG + 1*SZXREG
> +	xvld	xr2, sp, 13*SZREG + 2*SZXREG
> +	xvld	xr3, sp, 13*SZREG + 3*SZXREG
> +	xvld	xr4, sp, 13*SZREG + 4*SZXREG
> +	xvld	xr5, sp, 13*SZREG + 5*SZXREG
> +	xvld	xr6, sp, 13*SZREG + 6*SZXREG
> +	xvld	xr7, sp, 13*SZREG + 7*SZXREG
> +	xvld	xr8, sp, 13*SZREG + 8*SZXREG
> +	xvld	xr9, sp, 13*SZREG + 9*SZXREG
> +	xvld	xr10, sp, 13*SZREG + 10*SZXREG
> +	xvld	xr11, sp, 13*SZREG + 11*SZXREG
> +	xvld	xr12, sp, 13*SZREG + 12*SZXREG
> +	xvld	xr13, sp, 13*SZREG + 13*SZXREG
> +	xvld	xr14, sp, 13*SZREG + 14*SZXREG
> +	xvld	xr15, sp, 13*SZREG + 15*SZXREG
> +	xvld	xr16, sp, 13*SZREG + 16*SZXREG
> +	xvld	xr17, sp, 13*SZREG + 17*SZXREG
> +	xvld	xr18, sp, 13*SZREG + 18*SZXREG
> +	xvld	xr19, sp, 13*SZREG + 19*SZXREG
> +	xvld	xr20, sp, 13*SZREG + 20*SZXREG
> +	xvld	xr21, sp, 13*SZREG + 21*SZXREG
> +	xvld	xr22, sp, 13*SZREG + 22*SZXREG
> +	xvld	xr23, sp, 13*SZREG + 23*SZXREG
> +	xvld	xr24, sp, 13*SZREG + 24*SZXREG
> +	xvld	xr25, sp, 13*SZREG + 25*SZXREG
> +	xvld	xr26, sp, 13*SZREG + 26*SZXREG
> +	xvld	xr27, sp, 13*SZREG + 27*SZXREG
> +	xvld	xr28, sp, 13*SZREG + 28*SZXREG
> +	xvld	xr29, sp, 13*SZREG + 29*SZXREG
> +	xvld	xr30, sp, 13*SZREG + 30*SZXREG
> +	xvld	xr31, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> +	vld	vr0, sp, 13*SZREG + 0*SZVREG
> +	vld	vr1, sp, 13*SZREG + 1*SZVREG
> +	vld	vr2, sp, 13*SZREG + 2*SZVREG
> +	vld	vr3, sp, 13*SZREG + 3*SZVREG
> +	vld	vr4, sp, 13*SZREG + 4*SZVREG
> +	vld	vr5, sp, 13*SZREG + 5*SZVREG
> +	vld	vr6, sp, 13*SZREG + 6*SZVREG
> +	vld	vr7, sp, 13*SZREG + 7*SZVREG
> +	vld	vr8, sp, 13*SZREG + 8*SZVREG
> +	vld	vr9, sp, 13*SZREG + 9*SZVREG
> +	vld	vr10, sp, 13*SZREG + 10*SZVREG
> +	vld	vr11, sp, 13*SZREG + 11*SZVREG
> +	vld	vr12, sp, 13*SZREG + 12*SZVREG
> +	vld	vr13, sp, 13*SZREG + 13*SZVREG
> +	vld	vr14, sp, 13*SZREG + 14*SZVREG
> +	vld	vr15, sp, 13*SZREG + 15*SZVREG
> +	vld	vr16, sp, 13*SZREG + 16*SZVREG
> +	vld	vr17, sp, 13*SZREG + 17*SZVREG
> +	vld	vr18, sp, 13*SZREG + 18*SZVREG
> +	vld	vr19, sp, 13*SZREG + 19*SZVREG
> +	vld	vr20, sp, 13*SZREG + 20*SZVREG
> +	vld	vr21, sp, 13*SZREG + 21*SZVREG
> +	vld	vr22, sp, 13*SZREG + 22*SZVREG
> +	vld	vr23, sp, 13*SZREG + 23*SZVREG
> +	vld	vr24, sp, 13*SZREG + 24*SZVREG
> +	vld	vr25, sp, 13*SZREG + 25*SZVREG
> +	vld	vr26, sp, 13*SZREG + 26*SZVREG
> +	vld	vr27, sp, 13*SZREG + 27*SZVREG
> +	vld	vr28, sp, 13*SZREG + 28*SZVREG
> +	vld	vr29, sp, 13*SZREG + 29*SZVREG
> +	vld	vr30, sp, 13*SZREG + 30*SZVREG
> +	vld	vr31, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> +	FREG_L	fa0, sp, 13*SZREG + 0*SZFREG
> +	FREG_L	fa1, sp, 13*SZREG + 1*SZFREG
> +	FREG_L	fa2, sp, 13*SZREG + 2*SZFREG
> +	FREG_L	fa3, sp, 13*SZREG + 3*SZFREG
> +	FREG_L	fa4, sp, 13*SZREG + 4*SZFREG
> +	FREG_L	fa5, sp, 13*SZREG + 5*SZFREG
> +	FREG_L	fa6, sp, 13*SZREG + 6*SZFREG
> +	FREG_L	fa7, sp, 13*SZREG + 7*SZFREG
> +	FREG_L	ft0, sp, 13*SZREG + 8*SZFREG
> +	FREG_L	ft1, sp, 13*SZREG + 9*SZFREG
> +	FREG_L	ft2, sp, 13*SZREG + 10*SZFREG
> +	FREG_L	ft3, sp, 13*SZREG + 11*SZFREG
> +	FREG_L	ft4, sp, 13*SZREG + 12*SZFREG
> +	FREG_L	ft5, sp, 13*SZREG + 13*SZFREG
> +	FREG_L	ft6, sp, 13*SZREG + 14*SZFREG
> +	FREG_L	ft7, sp, 13*SZREG + 15*SZFREG
> +	FREG_L	ft8, sp, 13*SZREG + 16*SZFREG
> +	FREG_L	ft9, sp, 13*SZREG + 17*SZFREG
> +	FREG_L	ft10, sp, 13*SZREG + 18*SZFREG
> +	FREG_L	ft11, sp, 13*SZREG + 19*SZFREG
> +	FREG_L	ft12, sp, 13*SZREG + 20*SZFREG
> +	FREG_L	ft13, sp, 13*SZREG + 21*SZFREG
> +	FREG_L	ft14, sp, 13*SZREG + 22*SZFREG
> +	FREG_L	ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX  */
> +
> +	addi.d	sp, sp, FRAME_SIZE
> +	b	.Lret
> +	cfi_endproc
> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +#endif /* #ifdef SHARED  */
> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
> new file mode 100644
> index 0000000000..e1a9365855
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.h
> @@ -0,0 +1,49 @@
> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
> +   LoongArch version.
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _DL_TLSDESC_H
> +#define _DL_TLSDESC_H
> +
> +#include <dl-tls.h>
> +
> +/* Type used to represent a TLS descriptor in the GOT.  */
> +struct tlsdesc
> +{
> +  ptrdiff_t (*entry) (struct tlsdesc *);
> +  void *arg;
> +};
> +
> +/* Type used as the argument in a TLS descriptor for a symbol that
> +   needs dynamic TLS offsets.  */
> +struct tlsdesc_dynamic_arg
> +{
> +  tls_index tlsinfo;
> +  size_t gen_count;
> +};
> +
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
> +
> +# ifdef SHARED
> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
> +#endif
> +
> +#endif
> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
> index 2f5bf53421..40416b1ad4 100644
> --- a/sysdeps/loongarch/linkmap.h
> +++ b/sysdeps/loongarch/linkmap.h
> @@ -19,4 +19,5 @@
> struct link_map_machine
> {
>  ElfW (Addr) plt; /* Address of .plt.  */
> +  void *tlsdesc_table;    /* Address of TLS descriptor hash table.  */
> };
> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
> new file mode 100644
> index 0000000000..a357e7619f
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.c
> @@ -0,0 +1,39 @@
> +/* Manage TLS descriptors.  AArch64 version.
> +
> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <tls.h>
> +#include <dl-tlsdesc.h>
> +#include <dl-unmap-segments.h>
> +#include <tlsdeschtab.h>
> +
> +/* Unmap the dynamic object, but also release its TLS descriptor table
> +   if there is one.  */
> +
> +void
> +_dl_unmap (struct link_map *map)
> +{
> +  _dl_unmap_segments (map);
> +
> +#ifdef SHARED
> +  if (map->l_mach.tlsdesc_table)
> +    htab_delete (map->l_mach.tlsdesc_table);
> +#endif
> +}
> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
> new file mode 100644
> index 0000000000..bcab218631
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.sym
> @@ -0,0 +1,19 @@
> +#include <stddef.h>
> +#include <sysdep.h>
> +#include <tls.h>
> +#include <link.h>
> +#include <dl-tlsdesc.h>
> +
> +--
> +
> +-- Abuse tls.h macros to derive offsets relative to the thread register.
> +
> +TLSDESC_ARG		offsetof(struct tlsdesc, arg)
> +TLSDESC_GEN_COUNT	offsetof(struct tlsdesc_dynamic_arg, gen_count)
> +TLSDESC_MODID		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
> +TLSDESC_MODOFF		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
> +TCBHEAD_DTV		offsetof(tcbhead_t, dtv)
> +DTV_COUNTER		offsetof(dtv_t, counter)
> +TLS_DTV_UNALLOCATED	TLS_DTV_UNALLOCATED
> +TLS_DTV_OFFSET		TLS_DTV_OFFSET
> +SIZE_OF_DTV		sizeof(tcbhead_t)
> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> index 547b1c1b7f..ec32e6d13f 100644
> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> @@ -5,3 +5,5 @@ libc.so: calloc
> libc.so: free
> libc.so: malloc
> libc.so: realloc
> +# The dynamic loader needs __tls_get_addr for TLS.
> +ld.so: __tls_get_addr
> -- 
> 2.36.0
> 
>
  
mengqinggang Jan. 8, 2024, 1:52 a.m. UTC | #15
在 2024/1/8 上午7:03, Tatsuyuki Ishi 写道:
>> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
>>
>> This is mostly based on AArch64 and RISC-V implementation.
>>
>> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>>
>> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
>> all vector registers.
>> ---
>> elf/elf.h                                     |   2 +
>> sysdeps/loongarch/Makefile                    |   6 +
>> sysdeps/loongarch/dl-link.sym                 |   1 +
>> sysdeps/loongarch/dl-machine.h                |  52 ++-
>> sysdeps/loongarch/dl-tls.h                    |   9 +-
>> sysdeps/loongarch/dl-tlsdesc.S                | 364 ++++++++++++++++++
>> sysdeps/loongarch/dl-tlsdesc.h                |  49 +++
>> sysdeps/loongarch/linkmap.h                   |   1 +
>> sysdeps/loongarch/tlsdesc.c                   |  39 ++
>> sysdeps/loongarch/tlsdesc.sym                 |  19 +
>> .../unix/sysv/linux/loongarch/localplt.data   |   2 +
>> 11 files changed, 541 insertions(+), 3 deletions(-)
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
>> create mode 100644 sysdeps/loongarch/tlsdesc.c
>> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>>
>> diff --git a/elf/elf.h b/elf/elf.h
>> index 5c1c1972d1..72e90aec30 100644
>> --- a/elf/elf.h
>> +++ b/elf/elf.h
>> @@ -4232,6 +4232,8 @@ enum
>> #define R_LARCH_TLS_TPREL32	10
>> #define R_LARCH_TLS_TPREL64	11
>> #define R_LARCH_IRELATIVE	12
>> +#define R_LARCH_TLS_DESC32	13
>> +#define R_LARCH_TLS_DESC64	14
> Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].
>
> [1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626


Actually there is no need to separate 32- and 64-bit. Add 32- and 64-bit 
relocations
is mainly to be consistent with other TLS type dynamic relocations.


>> /* Reserved for future relocs that the dynamic linker must understand.  */
>>
>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> index 43d2f583cd..181389e787 100644
>> --- a/sysdeps/loongarch/Makefile
>> +++ b/sysdeps/loongarch/Makefile
>> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
>> endif
>>
>> ifeq ($(subdir),elf)
>> +sysdep-dl-routines += tlsdesc dl-tlsdesc
>> gen-as-const-headers += dl-link.sym
>> endif
>>
>> +ifeq ($(subdir),csu)
>> +gen-as-const-headers += tlsdesc.sym
>> +endif
>> +
>> +
>> # LoongArch's assembler also needs to know about PIC as it changes the
>> # definition of some assembler macros.
>> ASFLAGS-.os += $(pic-ccflag)
>> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
>> index b534968e30..fd81ef37d5 100644
>> --- a/sysdeps/loongarch/dl-link.sym
>> +++ b/sysdeps/loongarch/dl-link.sym
>> @@ -1,6 +1,7 @@
>> #include <stddef.h>
>> #include <sysdep.h>
>> #include <link.h>
>> +#include <dl-tlsdesc.h>
>>
>> DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
>> DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
>> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
>> index 0d17fd21e3..0dd252a5e5 100644
>> --- a/sysdeps/loongarch/dl-machine.h
>> +++ b/sysdeps/loongarch/dl-machine.h
>> @@ -25,7 +25,7 @@
>> #include <entry.h>
>> #include <elf/elf.h>
>> #include <sys/asm.h>
>> -#include <dl-tls.h>
>> +#include <dl-tlsdesc.h>
>> #include <dl-static-tls.h>
>> #include <dl-machine-rel.h>
>>
>> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>>       *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>>       break;
>>
>> +    case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
>> +      {
>> +	struct tlsdesc volatile *td =
>> +	    (struct tlsdesc volatile *)addr_field;
>> +	if (! sym)
>> +	  {
>> +	    td->arg = (void*)reloc->r_addend;
>> +	    td->entry = _dl_tlsdesc_undefweak;
>> +	  }
>> +	else
>> +	  {
>> +# ifndef SHARED
>> +	    CHECK_STATIC_TLS (map, sym_map);
>> +# else
>> +	    if (!TRY_STATIC_TLS (map, sym_map))
>> +	      {
>> +		td->arg = _dl_make_tlsdesc_dynamic
>> +		  (sym_map, sym->st_value + reloc->r_addend);
>> +		td->entry = _dl_tlsdesc_dynamic;
>> +	      }
>> +	    else
>> +# endif
>> +	      {
>> +		td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
>> +			    + reloc->r_addend);
>> +		td->entry = _dl_tlsdesc_return;
>> +	      }
>> +	  }
>> +	break;
>> +      }
>> +
>>     case R_LARCH_COPY:
>>       {
>> 	  if (sym == NULL)
>> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>>       else
>> 	*reloc_addr = map->l_mach.plt;
>>     }
>> +  else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
>> +    {
>> +      const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
>> +      const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
>> +      const ElfW (Sym) *sym = &symtab[symndx];
>> +      const struct r_found_version *version = NULL;
>> +
>> +      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
>> +	{
>> +	  const ElfW (Half) *vernum =
>> +	    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
>> +	  version = &map->l_versions[vernum[symndx] & 0x7fff];
>> +	}
>> +
>> +      /* Always initialize TLS descriptors completely, because lazy
>> +	 initialization requires synchronization at every TLS access.  */
>> +      elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
>> +			skip_ifunc);
>> +    }
>>   else
>>     _dl_reloc_bad_type (map, r_type, 1);
>> }
>> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
>> index a551594b64..1ca376484a 100644
>> --- a/sysdeps/loongarch/dl-tls.h
>> +++ b/sysdeps/loongarch/dl-tls.h
>> @@ -16,6 +16,9 @@
>>    License along with the GNU C Library.  If not, see
>>    <https://www.gnu.org/licenses/>.  */
>>
>> +#ifndef _DL_TLS_H
>> +#define _DL_TLS_H
>> +
>> /* Type used for the representation of TLS information in the GOT.  */
>> typedef struct
>> {
>> @@ -23,6 +26,8 @@ typedef struct
>>   unsigned long int ti_offset;
>> } tls_index;
>>
>> +extern void *__tls_get_addr (tls_index *ti);
>> +
>> /* The thread pointer points to the first static TLS block.  */
>> #define TLS_TP_OFFSET 0
>>
>> @@ -37,10 +42,10 @@ typedef struct
>> /* Compute the value for a DTPREL reloc.  */
>> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>>
>> -extern void *__tls_get_addr (tls_index *ti);
>> -
>> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
>> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>>
>> /* Value used for dtv entries for which the allocation is delayed.  */
>> #define TLS_DTV_UNALLOCATED ((void *) -1l)
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
>> new file mode 100644
>> index 0000000000..d2c18ff527
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.S
>> @@ -0,0 +1,364 @@
>> +/* Thread-local storage handling in the ELF dynamic linker.
>> +   LoongArch version.
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include "tlsdesc.h"
>> +
>> +	.text
>> +
>> +	/* Compute the thread pointer offset for symbols in the static
>> +	   TLS block. The offset is the same for all threads.
>> +	   Prototype:
>> +	   _dl_tlsdesc_return (tlsdesc *);  */
>> +	.hidden _dl_tlsdesc_return
>> +	.global	_dl_tlsdesc_return
>> +	.type	_dl_tlsdesc_return,%function
>> +	cfi_startproc
>> +	.align 2
>> +_dl_tlsdesc_return:
>> +	REG_L  a0, a0, 8
>> +	RET
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
>> +
>> +	/* Handler for undefined weak TLS symbols.
>> +	   Prototype:
>> +	   _dl_tlsdesc_undefweak (tlsdesc *);
>> +
>> +	   The second word of the descriptor contains the addend.
>> +	   Return the addend minus the thread pointer. This ensures
>> +	   that when the caller adds on the thread pointer it gets back
>> +	   the addend.  */
>> +	.hidden _dl_tlsdesc_undefweak
>> +	.global	_dl_tlsdesc_undefweak
>> +	.type	_dl_tlsdesc_undefweak,%function
>> +	cfi_startproc
>> +	.align  2
>> +_dl_tlsdesc_undefweak:
>> +	REG_L	a0, a0, 8
>> +	sub.d	a0, a0, tp
>> +	RET
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>> +
>> +#ifdef USE_LASX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
>> +#elif defined USE_LSX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
>> +#elif !defined __loongarch_soft_float
>> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
>> +#else
>> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
>> +#endif
>> +
>> +#ifdef SHARED
>> +	/* Handler for dynamic TLS symbols.
>> +	   Prototype:
>> +	   _dl_tlsdesc_dynamic (tlsdesc *) ;
>> +
>> +	   The second word of the descriptor points to a
>> +	   tlsdesc_dynamic_arg structure.
>> +
>> +	   Returns the offset between the thread pointer and the
>> +	   object referenced by the argument.
>> +
>> +	   ptrdiff_t
>> +	   __attribute__ ((__regparm__ (1)))
>> +	   _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
>> +	   {
>> +	     struct tlsdesc_dynamic_arg *td = tdp->arg;
>> +	     dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
>> +	     if (__builtin_expect (td->gen_count <= dtv[0].counter
>> +		&& (dtv[td->tlsinfo.ti_module].pointer.val
>> +		    != TLS_DTV_UNALLOCATED),
>> +		1))
>> +	       return dtv[td->tlsinfo.ti_module].pointer.val
>> +		+ td->tlsinfo.ti_offset
>> +		- __thread_pointer;
>> +
>> +	     return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
>> +	   }
>> +	 */
>> +	.hidden _dl_tlsdesc_dynamic
>> +	.global	_dl_tlsdesc_dynamic
>> +	.type	_dl_tlsdesc_dynamic,%function
>> +	cfi_startproc
>> +	.align 2
>> +_dl_tlsdesc_dynamic:
>> +	/* Save just enough registers to support fast path, if we fall
>> +	   into slow path we will save additional registers.  */
>> +	addi.d  $r3,$r3,-24
> The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.
>
>> +	REG_S	t0, sp, 0
>> +	REG_S	t1, sp, 8
>> +	REG_S	t2, sp, 16
>> +
>> +	REG_L	t0, tp, -SIZE_OF_DTV	  # dtv(t0) = tp + TCBHEAD_DTV dtv start
>> +	REG_L	a0, a0, TLSDESC_ARG	  # td(a0) = tdp->arg
>> +	REG_L	t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
>> +	REG_L	t2, t0, DTV_COUNTER	  # t2 = dtv[0].counter
>> +	bltu	t2, t1, .Lslow
>> +
>> +	REG_L	t1, a0, TLSDESC_MODID	  # t1 = td->tlsinfo.ti_module
>> +	slli.d	t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> 32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.
>
> Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.
>
>> +	add.d	t1, t1, t0    # t1 = dtv + ti_module * sizeof(dtv_t)
>> +	REG_L	t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
>> +	li.d	t2, TLS_DTV_UNALLOCATED
>> +	beq	t1, t2, .Lslow
>> +	REG_L	t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
>> +	# dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
>> +	add.d	a0, t1, t2
>> +.Lret:
>> +	sub.d	a0, a0, tp
>> +	REG_L	t0, sp, 0
>> +	REG_L	t1, sp, 8
>> +	REG_L	t2, sp, 16
>> +	addi.d	sp, sp, 24
>> +	RET
>> +
>> +.Lslow:
>> +	/* This is the slow path. We need to call __tls_get_addr() which
>> +	   means we need to save and restore all the register that the
>> +	   callee will trash.  */
>> +
>> +	/* Save the remaining registers that we must treat as caller save.  */
>> +	addi.d	sp, sp, -FRAME_SIZE
>> +	REG_S	ra, sp, 0 * SZREG
>> +	REG_S	a1, sp, 1 * SZREG
>> +	REG_S	a2, sp, 2 * SZREG
>> +	REG_S	a3, sp, 3 * SZREG
>> +	REG_S	a4, sp, 4 * SZREG
>> +	REG_S	a5, sp, 5 * SZREG
>> +	REG_S	a6, sp, 6 * SZREG
>> +	REG_S	a7, sp, 7 * SZREG
>> +	REG_S	t4, sp, 8 * SZREG
>> +	REG_S	t5, sp, 9 * SZREG
>> +	REG_S	t6, sp, 10 * SZREG
>> +	REG_S	t7, sp, 11 * SZREG
>> +	REG_S	t8, sp, 12 * SZREG
>> +
>> +#ifdef USE_LASX
>> +	xvst	xr0, sp, 13*SZREG + 0*SZXREG
>> +	xvst	xr1, sp, 13*SZREG + 1*SZXREG
>> +	xvst	xr2, sp, 13*SZREG + 2*SZXREG
>> +	xvst	xr3, sp, 13*SZREG + 3*SZXREG
>> +	xvst	xr4, sp, 13*SZREG + 4*SZXREG
>> +	xvst	xr5, sp, 13*SZREG + 5*SZXREG
>> +	xvst	xr6, sp, 13*SZREG + 6*SZXREG
>> +	xvst	xr7, sp, 13*SZREG + 7*SZXREG
>> +	xvst	xr8, sp, 13*SZREG + 8*SZXREG
>> +	xvst	xr9, sp, 13*SZREG + 9*SZXREG
>> +	xvst	xr10, sp, 13*SZREG + 10*SZXREG
>> +	xvst	xr11, sp, 13*SZREG + 11*SZXREG
>> +	xvst	xr12, sp, 13*SZREG + 12*SZXREG
>> +	xvst	xr13, sp, 13*SZREG + 13*SZXREG
>> +	xvst	xr14, sp, 13*SZREG + 14*SZXREG
>> +	xvst	xr15, sp, 13*SZREG + 15*SZXREG
>> +	xvst	xr16, sp, 13*SZREG + 16*SZXREG
>> +	xvst	xr17, sp, 13*SZREG + 17*SZXREG
>> +	xvst	xr18, sp, 13*SZREG + 18*SZXREG
>> +	xvst	xr19, sp, 13*SZREG + 19*SZXREG
>> +	xvst	xr20, sp, 13*SZREG + 20*SZXREG
>> +	xvst	xr21, sp, 13*SZREG + 21*SZXREG
>> +	xvst	xr22, sp, 13*SZREG + 22*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 23*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 24*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 25*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 26*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 27*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 28*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 29*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 30*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> +	vst	vr0, sp, 13*SZREG + 0*SZVREG
>> +	vst	vr1, sp, 13*SZREG + 1*SZVREG
>> +	vst	vr2, sp, 13*SZREG + 2*SZVREG
>> +	vst	vr3, sp, 13*SZREG + 3*SZVREG
>> +	vst	vr4, sp, 13*SZREG + 4*SZVREG
>> +	vst	vr5, sp, 13*SZREG + 5*SZVREG
>> +	vst	vr6, sp, 13*SZREG + 6*SZVREG
>> +	vst	vr7, sp, 13*SZREG + 7*SZVREG
>> +	vst	vr8, sp, 13*SZREG + 8*SZVREG
>> +	vst	vr9, sp, 13*SZREG + 9*SZVREG
>> +	vst	vr10, sp, 13*SZREG + 10*SZVREG
>> +	vst	vr11, sp, 13*SZREG + 11*SZVREG
>> +	vst	vr12, sp, 13*SZREG + 12*SZVREG
>> +	vst	vr13, sp, 13*SZREG + 13*SZVREG
>> +	vst	vr14, sp, 13*SZREG + 14*SZVREG
>> +	vst	vr15, sp, 13*SZREG + 15*SZVREG
>> +	vst	vr16, sp, 13*SZREG + 16*SZVREG
>> +	vst	vr17, sp, 13*SZREG + 17*SZVREG
>> +	vst	vr18, sp, 13*SZREG + 18*SZVREG
>> +	vst	vr19, sp, 13*SZREG + 19*SZVREG
>> +	vst	vr20, sp, 13*SZREG + 20*SZVREG
>> +	vst	vr21, sp, 13*SZREG + 21*SZVREG
>> +	vst	vr22, sp, 13*SZREG + 22*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 23*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 24*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 25*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 26*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 27*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 28*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 29*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 30*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> +	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
>> +	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
>> +	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
>> +	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
>> +	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
>> +	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
>> +	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
>> +	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
>> +	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
>> +	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
>> +	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
>> +	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
>> +	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
>> +	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
>> +	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
>> +	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
>> +	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
>> +	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
>> +	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
>> +	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
>> +	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
>> +	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
>> +	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
>> +	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX  */
>> +
>> +	bl	__tls_get_addr
>> +	addi.d	a0, a0, -TLS_DTV_OFFSET
>> +
>> +	REG_L	ra, sp, 0
>> +	REG_L	a1, sp, 1 * 8
>> +	REG_L	a2, sp, 2 * 8
>> +	REG_L	a3, sp, 3 * 8
>> +	REG_L	a4, sp, 4 * 8
>> +	REG_L	a5, sp, 5 * 8
>> +	REG_L	a6, sp, 6 * 8
>> +	REG_L	a7, sp, 7 * 8
>> +	REG_L	t4, sp, 8 * 8
>> +	REG_L	t5, sp, 9 * 8
>> +	REG_L	t6, sp, 10 * 8
>> +	REG_L	t7, sp, 11 * 8
>> +	REG_L	t8, sp, 12 * 8
>> +
>> +#ifdef USE_LASX
>> +	xvld	xr0, sp, 13*SZREG + 0*SZXREG
>> +	xvld	xr1, sp, 13*SZREG + 1*SZXREG
>> +	xvld	xr2, sp, 13*SZREG + 2*SZXREG
>> +	xvld	xr3, sp, 13*SZREG + 3*SZXREG
>> +	xvld	xr4, sp, 13*SZREG + 4*SZXREG
>> +	xvld	xr5, sp, 13*SZREG + 5*SZXREG
>> +	xvld	xr6, sp, 13*SZREG + 6*SZXREG
>> +	xvld	xr7, sp, 13*SZREG + 7*SZXREG
>> +	xvld	xr8, sp, 13*SZREG + 8*SZXREG
>> +	xvld	xr9, sp, 13*SZREG + 9*SZXREG
>> +	xvld	xr10, sp, 13*SZREG + 10*SZXREG
>> +	xvld	xr11, sp, 13*SZREG + 11*SZXREG
>> +	xvld	xr12, sp, 13*SZREG + 12*SZXREG
>> +	xvld	xr13, sp, 13*SZREG + 13*SZXREG
>> +	xvld	xr14, sp, 13*SZREG + 14*SZXREG
>> +	xvld	xr15, sp, 13*SZREG + 15*SZXREG
>> +	xvld	xr16, sp, 13*SZREG + 16*SZXREG
>> +	xvld	xr17, sp, 13*SZREG + 17*SZXREG
>> +	xvld	xr18, sp, 13*SZREG + 18*SZXREG
>> +	xvld	xr19, sp, 13*SZREG + 19*SZXREG
>> +	xvld	xr20, sp, 13*SZREG + 20*SZXREG
>> +	xvld	xr21, sp, 13*SZREG + 21*SZXREG
>> +	xvld	xr22, sp, 13*SZREG + 22*SZXREG
>> +	xvld	xr23, sp, 13*SZREG + 23*SZXREG
>> +	xvld	xr24, sp, 13*SZREG + 24*SZXREG
>> +	xvld	xr25, sp, 13*SZREG + 25*SZXREG
>> +	xvld	xr26, sp, 13*SZREG + 26*SZXREG
>> +	xvld	xr27, sp, 13*SZREG + 27*SZXREG
>> +	xvld	xr28, sp, 13*SZREG + 28*SZXREG
>> +	xvld	xr29, sp, 13*SZREG + 29*SZXREG
>> +	xvld	xr30, sp, 13*SZREG + 30*SZXREG
>> +	xvld	xr31, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> +	vld	vr0, sp, 13*SZREG + 0*SZVREG
>> +	vld	vr1, sp, 13*SZREG + 1*SZVREG
>> +	vld	vr2, sp, 13*SZREG + 2*SZVREG
>> +	vld	vr3, sp, 13*SZREG + 3*SZVREG
>> +	vld	vr4, sp, 13*SZREG + 4*SZVREG
>> +	vld	vr5, sp, 13*SZREG + 5*SZVREG
>> +	vld	vr6, sp, 13*SZREG + 6*SZVREG
>> +	vld	vr7, sp, 13*SZREG + 7*SZVREG
>> +	vld	vr8, sp, 13*SZREG + 8*SZVREG
>> +	vld	vr9, sp, 13*SZREG + 9*SZVREG
>> +	vld	vr10, sp, 13*SZREG + 10*SZVREG
>> +	vld	vr11, sp, 13*SZREG + 11*SZVREG
>> +	vld	vr12, sp, 13*SZREG + 12*SZVREG
>> +	vld	vr13, sp, 13*SZREG + 13*SZVREG
>> +	vld	vr14, sp, 13*SZREG + 14*SZVREG
>> +	vld	vr15, sp, 13*SZREG + 15*SZVREG
>> +	vld	vr16, sp, 13*SZREG + 16*SZVREG
>> +	vld	vr17, sp, 13*SZREG + 17*SZVREG
>> +	vld	vr18, sp, 13*SZREG + 18*SZVREG
>> +	vld	vr19, sp, 13*SZREG + 19*SZVREG
>> +	vld	vr20, sp, 13*SZREG + 20*SZVREG
>> +	vld	vr21, sp, 13*SZREG + 21*SZVREG
>> +	vld	vr22, sp, 13*SZREG + 22*SZVREG
>> +	vld	vr23, sp, 13*SZREG + 23*SZVREG
>> +	vld	vr24, sp, 13*SZREG + 24*SZVREG
>> +	vld	vr25, sp, 13*SZREG + 25*SZVREG
>> +	vld	vr26, sp, 13*SZREG + 26*SZVREG
>> +	vld	vr27, sp, 13*SZREG + 27*SZVREG
>> +	vld	vr28, sp, 13*SZREG + 28*SZVREG
>> +	vld	vr29, sp, 13*SZREG + 29*SZVREG
>> +	vld	vr30, sp, 13*SZREG + 30*SZVREG
>> +	vld	vr31, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> +	FREG_L	fa0, sp, 13*SZREG + 0*SZFREG
>> +	FREG_L	fa1, sp, 13*SZREG + 1*SZFREG
>> +	FREG_L	fa2, sp, 13*SZREG + 2*SZFREG
>> +	FREG_L	fa3, sp, 13*SZREG + 3*SZFREG
>> +	FREG_L	fa4, sp, 13*SZREG + 4*SZFREG
>> +	FREG_L	fa5, sp, 13*SZREG + 5*SZFREG
>> +	FREG_L	fa6, sp, 13*SZREG + 6*SZFREG
>> +	FREG_L	fa7, sp, 13*SZREG + 7*SZFREG
>> +	FREG_L	ft0, sp, 13*SZREG + 8*SZFREG
>> +	FREG_L	ft1, sp, 13*SZREG + 9*SZFREG
>> +	FREG_L	ft2, sp, 13*SZREG + 10*SZFREG
>> +	FREG_L	ft3, sp, 13*SZREG + 11*SZFREG
>> +	FREG_L	ft4, sp, 13*SZREG + 12*SZFREG
>> +	FREG_L	ft5, sp, 13*SZREG + 13*SZFREG
>> +	FREG_L	ft6, sp, 13*SZREG + 14*SZFREG
>> +	FREG_L	ft7, sp, 13*SZREG + 15*SZFREG
>> +	FREG_L	ft8, sp, 13*SZREG + 16*SZFREG
>> +	FREG_L	ft9, sp, 13*SZREG + 17*SZFREG
>> +	FREG_L	ft10, sp, 13*SZREG + 18*SZFREG
>> +	FREG_L	ft11, sp, 13*SZREG + 19*SZFREG
>> +	FREG_L	ft12, sp, 13*SZREG + 20*SZFREG
>> +	FREG_L	ft13, sp, 13*SZREG + 21*SZFREG
>> +	FREG_L	ft14, sp, 13*SZREG + 22*SZFREG
>> +	FREG_L	ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX  */
>> +
>> +	addi.d	sp, sp, FRAME_SIZE
>> +	b	.Lret
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
>> +#endif /* #ifdef SHARED  */
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
>> new file mode 100644
>> index 0000000000..e1a9365855
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.h
>> @@ -0,0 +1,49 @@
>> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
>> +   LoongArch version.
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _DL_TLSDESC_H
>> +#define _DL_TLSDESC_H
>> +
>> +#include <dl-tls.h>
>> +
>> +/* Type used to represent a TLS descriptor in the GOT.  */
>> +struct tlsdesc
>> +{
>> +  ptrdiff_t (*entry) (struct tlsdesc *);
>> +  void *arg;
>> +};
>> +
>> +/* Type used as the argument in a TLS descriptor for a symbol that
>> +   needs dynamic TLS offsets.  */
>> +struct tlsdesc_dynamic_arg
>> +{
>> +  tls_index tlsinfo;
>> +  size_t gen_count;
>> +};
>> +
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
>> +
>> +# ifdef SHARED
>> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
>> +#endif
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
>> index 2f5bf53421..40416b1ad4 100644
>> --- a/sysdeps/loongarch/linkmap.h
>> +++ b/sysdeps/loongarch/linkmap.h
>> @@ -19,4 +19,5 @@
>> struct link_map_machine
>> {
>>   ElfW (Addr) plt; /* Address of .plt.  */
>> +  void *tlsdesc_table;    /* Address of TLS descriptor hash table.  */
>> };
>> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
>> new file mode 100644
>> index 0000000000..a357e7619f
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.c
>> @@ -0,0 +1,39 @@
>> +/* Manage TLS descriptors.  AArch64 version.
>> +
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <ldsodefs.h>
>> +#include <tls.h>
>> +#include <dl-tlsdesc.h>
>> +#include <dl-unmap-segments.h>
>> +#include <tlsdeschtab.h>
>> +
>> +/* Unmap the dynamic object, but also release its TLS descriptor table
>> +   if there is one.  */
>> +
>> +void
>> +_dl_unmap (struct link_map *map)
>> +{
>> +  _dl_unmap_segments (map);
>> +
>> +#ifdef SHARED
>> +  if (map->l_mach.tlsdesc_table)
>> +    htab_delete (map->l_mach.tlsdesc_table);
>> +#endif
>> +}
>> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
>> new file mode 100644
>> index 0000000000..bcab218631
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.sym
>> @@ -0,0 +1,19 @@
>> +#include <stddef.h>
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include <link.h>
>> +#include <dl-tlsdesc.h>
>> +
>> +--
>> +
>> +-- Abuse tls.h macros to derive offsets relative to the thread register.
>> +
>> +TLSDESC_ARG		offsetof(struct tlsdesc, arg)
>> +TLSDESC_GEN_COUNT	offsetof(struct tlsdesc_dynamic_arg, gen_count)
>> +TLSDESC_MODID		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
>> +TLSDESC_MODOFF		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
>> +TCBHEAD_DTV		offsetof(tcbhead_t, dtv)
>> +DTV_COUNTER		offsetof(dtv_t, counter)
>> +TLS_DTV_UNALLOCATED	TLS_DTV_UNALLOCATED
>> +TLS_DTV_OFFSET		TLS_DTV_OFFSET
>> +SIZE_OF_DTV		sizeof(tcbhead_t)
>> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> index 547b1c1b7f..ec32e6d13f 100644
>> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> @@ -5,3 +5,5 @@ libc.so: calloc
>> libc.so: free
>> libc.so: malloc
>> libc.so: realloc
>> +# The dynamic loader needs __tls_get_addr for TLS.
>> +ld.so: __tls_get_addr
>> -- 
>> 2.36.0
>>
>>
  
mengqinggang Jan. 8, 2024, 2:39 a.m. UTC | #16
在 2024/1/8 上午7:03, Tatsuyuki Ishi 写道:
>> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
>>
>> This is mostly based on AArch64 and RISC-V implementation.
>>
>> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>>
>> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
>> all vector registers.
>> ---
>> elf/elf.h                                     |   2 +
>> sysdeps/loongarch/Makefile                    |   6 +
>> sysdeps/loongarch/dl-link.sym                 |   1 +
>> sysdeps/loongarch/dl-machine.h                |  52 ++-
>> sysdeps/loongarch/dl-tls.h                    |   9 +-
>> sysdeps/loongarch/dl-tlsdesc.S                | 364 ++++++++++++++++++
>> sysdeps/loongarch/dl-tlsdesc.h                |  49 +++
>> sysdeps/loongarch/linkmap.h                   |   1 +
>> sysdeps/loongarch/tlsdesc.c                   |  39 ++
>> sysdeps/loongarch/tlsdesc.sym                 |  19 +
>> .../unix/sysv/linux/loongarch/localplt.data   |   2 +
>> 11 files changed, 541 insertions(+), 3 deletions(-)
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
>> create mode 100644 sysdeps/loongarch/tlsdesc.c
>> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>>
>> diff --git a/elf/elf.h b/elf/elf.h
>> index 5c1c1972d1..72e90aec30 100644
>> --- a/elf/elf.h
>> +++ b/elf/elf.h
>> @@ -4232,6 +4232,8 @@ enum
>> #define R_LARCH_TLS_TPREL32	10
>> #define R_LARCH_TLS_TPREL64	11
>> #define R_LARCH_IRELATIVE	12
>> +#define R_LARCH_TLS_DESC32	13
>> +#define R_LARCH_TLS_DESC64	14
> Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].
>
> [1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626
>
>> /* Reserved for future relocs that the dynamic linker must understand.  */
>>
>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> index 43d2f583cd..181389e787 100644
>> --- a/sysdeps/loongarch/Makefile
>> +++ b/sysdeps/loongarch/Makefile
>> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
>> endif
>>
>> ifeq ($(subdir),elf)
>> +sysdep-dl-routines += tlsdesc dl-tlsdesc
>> gen-as-const-headers += dl-link.sym
>> endif
>>
>> +ifeq ($(subdir),csu)
>> +gen-as-const-headers += tlsdesc.sym
>> +endif
>> +
>> +
>> # LoongArch's assembler also needs to know about PIC as it changes the
>> # definition of some assembler macros.
>> ASFLAGS-.os += $(pic-ccflag)
>> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
>> index b534968e30..fd81ef37d5 100644
>> --- a/sysdeps/loongarch/dl-link.sym
>> +++ b/sysdeps/loongarch/dl-link.sym
>> @@ -1,6 +1,7 @@
>> #include <stddef.h>
>> #include <sysdep.h>
>> #include <link.h>
>> +#include <dl-tlsdesc.h>
>>
>> DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
>> DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
>> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
>> index 0d17fd21e3..0dd252a5e5 100644
>> --- a/sysdeps/loongarch/dl-machine.h
>> +++ b/sysdeps/loongarch/dl-machine.h
>> @@ -25,7 +25,7 @@
>> #include <entry.h>
>> #include <elf/elf.h>
>> #include <sys/asm.h>
>> -#include <dl-tls.h>
>> +#include <dl-tlsdesc.h>
>> #include <dl-static-tls.h>
>> #include <dl-machine-rel.h>
>>
>> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>>       *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>>       break;
>>
>> +    case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
>> +      {
>> +	struct tlsdesc volatile *td =
>> +	    (struct tlsdesc volatile *)addr_field;
>> +	if (! sym)
>> +	  {
>> +	    td->arg = (void*)reloc->r_addend;
>> +	    td->entry = _dl_tlsdesc_undefweak;
>> +	  }
>> +	else
>> +	  {
>> +# ifndef SHARED
>> +	    CHECK_STATIC_TLS (map, sym_map);
>> +# else
>> +	    if (!TRY_STATIC_TLS (map, sym_map))
>> +	      {
>> +		td->arg = _dl_make_tlsdesc_dynamic
>> +		  (sym_map, sym->st_value + reloc->r_addend);
>> +		td->entry = _dl_tlsdesc_dynamic;
>> +	      }
>> +	    else
>> +# endif
>> +	      {
>> +		td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
>> +			    + reloc->r_addend);
>> +		td->entry = _dl_tlsdesc_return;
>> +	      }
>> +	  }
>> +	break;
>> +      }
>> +
>>     case R_LARCH_COPY:
>>       {
>> 	  if (sym == NULL)
>> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>>       else
>> 	*reloc_addr = map->l_mach.plt;
>>     }
>> +  else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
>> +    {
>> +      const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
>> +      const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
>> +      const ElfW (Sym) *sym = &symtab[symndx];
>> +      const struct r_found_version *version = NULL;
>> +
>> +      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
>> +	{
>> +	  const ElfW (Half) *vernum =
>> +	    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
>> +	  version = &map->l_versions[vernum[symndx] & 0x7fff];
>> +	}
>> +
>> +      /* Always initialize TLS descriptors completely, because lazy
>> +	 initialization requires synchronization at every TLS access.  */
>> +      elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
>> +			skip_ifunc);
>> +    }
>>   else
>>     _dl_reloc_bad_type (map, r_type, 1);
>> }
>> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
>> index a551594b64..1ca376484a 100644
>> --- a/sysdeps/loongarch/dl-tls.h
>> +++ b/sysdeps/loongarch/dl-tls.h
>> @@ -16,6 +16,9 @@
>>    License along with the GNU C Library.  If not, see
>>    <https://www.gnu.org/licenses/>.  */
>>
>> +#ifndef _DL_TLS_H
>> +#define _DL_TLS_H
>> +
>> /* Type used for the representation of TLS information in the GOT.  */
>> typedef struct
>> {
>> @@ -23,6 +26,8 @@ typedef struct
>>   unsigned long int ti_offset;
>> } tls_index;
>>
>> +extern void *__tls_get_addr (tls_index *ti);
>> +
>> /* The thread pointer points to the first static TLS block.  */
>> #define TLS_TP_OFFSET 0
>>
>> @@ -37,10 +42,10 @@ typedef struct
>> /* Compute the value for a DTPREL reloc.  */
>> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>>
>> -extern void *__tls_get_addr (tls_index *ti);
>> -
>> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
>> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>>
>> /* Value used for dtv entries for which the allocation is delayed.  */
>> #define TLS_DTV_UNALLOCATED ((void *) -1l)
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
>> new file mode 100644
>> index 0000000000..d2c18ff527
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.S
>> @@ -0,0 +1,364 @@
>> +/* Thread-local storage handling in the ELF dynamic linker.
>> +   LoongArch version.
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include "tlsdesc.h"
>> +
>> +	.text
>> +
>> +	/* Compute the thread pointer offset for symbols in the static
>> +	   TLS block. The offset is the same for all threads.
>> +	   Prototype:
>> +	   _dl_tlsdesc_return (tlsdesc *);  */
>> +	.hidden _dl_tlsdesc_return
>> +	.global	_dl_tlsdesc_return
>> +	.type	_dl_tlsdesc_return,%function
>> +	cfi_startproc
>> +	.align 2
>> +_dl_tlsdesc_return:
>> +	REG_L  a0, a0, 8
>> +	RET
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
>> +
>> +	/* Handler for undefined weak TLS symbols.
>> +	   Prototype:
>> +	   _dl_tlsdesc_undefweak (tlsdesc *);
>> +
>> +	   The second word of the descriptor contains the addend.
>> +	   Return the addend minus the thread pointer. This ensures
>> +	   that when the caller adds on the thread pointer it gets back
>> +	   the addend.  */
>> +	.hidden _dl_tlsdesc_undefweak
>> +	.global	_dl_tlsdesc_undefweak
>> +	.type	_dl_tlsdesc_undefweak,%function
>> +	cfi_startproc
>> +	.align  2
>> +_dl_tlsdesc_undefweak:
>> +	REG_L	a0, a0, 8
>> +	sub.d	a0, a0, tp
>> +	RET
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>> +
>> +#ifdef USE_LASX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
>> +#elif defined USE_LSX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
>> +#elif !defined __loongarch_soft_float
>> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
>> +#else
>> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
>> +#endif
>> +
>> +#ifdef SHARED
>> +	/* Handler for dynamic TLS symbols.
>> +	   Prototype:
>> +	   _dl_tlsdesc_dynamic (tlsdesc *) ;
>> +
>> +	   The second word of the descriptor points to a
>> +	   tlsdesc_dynamic_arg structure.
>> +
>> +	   Returns the offset between the thread pointer and the
>> +	   object referenced by the argument.
>> +
>> +	   ptrdiff_t
>> +	   __attribute__ ((__regparm__ (1)))
>> +	   _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
>> +	   {
>> +	     struct tlsdesc_dynamic_arg *td = tdp->arg;
>> +	     dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
>> +	     if (__builtin_expect (td->gen_count <= dtv[0].counter
>> +		&& (dtv[td->tlsinfo.ti_module].pointer.val
>> +		    != TLS_DTV_UNALLOCATED),
>> +		1))
>> +	       return dtv[td->tlsinfo.ti_module].pointer.val
>> +		+ td->tlsinfo.ti_offset
>> +		- __thread_pointer;
>> +
>> +	     return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
>> +	   }
>> +	 */
>> +	.hidden _dl_tlsdesc_dynamic
>> +	.global	_dl_tlsdesc_dynamic
>> +	.type	_dl_tlsdesc_dynamic,%function
>> +	cfi_startproc
>> +	.align 2
>> +_dl_tlsdesc_dynamic:
>> +	/* Save just enough registers to support fast path, if we fall
>> +	   into slow path we will save additional registers.  */
>> +	addi.d  $r3,$r3,-24
> The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.


I will  fix this in next version patch.


>> +	REG_S	t0, sp, 0
>> +	REG_S	t1, sp, 8
>> +	REG_S	t2, sp, 16
>> +
>> +	REG_L	t0, tp, -SIZE_OF_DTV	  # dtv(t0) = tp + TCBHEAD_DTV dtv start
>> +	REG_L	a0, a0, TLSDESC_ARG	  # td(a0) = tdp->arg
>> +	REG_L	t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
>> +	REG_L	t2, t0, DTV_COUNTER	  # t2 = dtv[0].counter
>> +	bltu	t2, t1, .Lslow
>> +
>> +	REG_L	t1, a0, TLSDESC_MODID	  # t1 = td->tlsinfo.ti_module
>> +	slli.d	t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> 32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.
>
> Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.


Glibc currently only supports LoongArch 64-bit architecture. I will try 
to fix this in next version patch.


>> +	add.d	t1, t1, t0    # t1 = dtv + ti_module * sizeof(dtv_t)
>> +	REG_L	t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
>> +	li.d	t2, TLS_DTV_UNALLOCATED
>> +	beq	t1, t2, .Lslow
>> +	REG_L	t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
>> +	# dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
>> +	add.d	a0, t1, t2
>> +.Lret:
>> +	sub.d	a0, a0, tp
>> +	REG_L	t0, sp, 0
>> +	REG_L	t1, sp, 8
>> +	REG_L	t2, sp, 16
>> +	addi.d	sp, sp, 24
>> +	RET
>> +
>> +.Lslow:
>> +	/* This is the slow path. We need to call __tls_get_addr() which
>> +	   means we need to save and restore all the register that the
>> +	   callee will trash.  */
>> +
>> +	/* Save the remaining registers that we must treat as caller save.  */
>> +	addi.d	sp, sp, -FRAME_SIZE
>> +	REG_S	ra, sp, 0 * SZREG
>> +	REG_S	a1, sp, 1 * SZREG
>> +	REG_S	a2, sp, 2 * SZREG
>> +	REG_S	a3, sp, 3 * SZREG
>> +	REG_S	a4, sp, 4 * SZREG
>> +	REG_S	a5, sp, 5 * SZREG
>> +	REG_S	a6, sp, 6 * SZREG
>> +	REG_S	a7, sp, 7 * SZREG
>> +	REG_S	t4, sp, 8 * SZREG
>> +	REG_S	t5, sp, 9 * SZREG
>> +	REG_S	t6, sp, 10 * SZREG
>> +	REG_S	t7, sp, 11 * SZREG
>> +	REG_S	t8, sp, 12 * SZREG
>> +
>> +#ifdef USE_LASX
>> +	xvst	xr0, sp, 13*SZREG + 0*SZXREG
>> +	xvst	xr1, sp, 13*SZREG + 1*SZXREG
>> +	xvst	xr2, sp, 13*SZREG + 2*SZXREG
>> +	xvst	xr3, sp, 13*SZREG + 3*SZXREG
>> +	xvst	xr4, sp, 13*SZREG + 4*SZXREG
>> +	xvst	xr5, sp, 13*SZREG + 5*SZXREG
>> +	xvst	xr6, sp, 13*SZREG + 6*SZXREG
>> +	xvst	xr7, sp, 13*SZREG + 7*SZXREG
>> +	xvst	xr8, sp, 13*SZREG + 8*SZXREG
>> +	xvst	xr9, sp, 13*SZREG + 9*SZXREG
>> +	xvst	xr10, sp, 13*SZREG + 10*SZXREG
>> +	xvst	xr11, sp, 13*SZREG + 11*SZXREG
>> +	xvst	xr12, sp, 13*SZREG + 12*SZXREG
>> +	xvst	xr13, sp, 13*SZREG + 13*SZXREG
>> +	xvst	xr14, sp, 13*SZREG + 14*SZXREG
>> +	xvst	xr15, sp, 13*SZREG + 15*SZXREG
>> +	xvst	xr16, sp, 13*SZREG + 16*SZXREG
>> +	xvst	xr17, sp, 13*SZREG + 17*SZXREG
>> +	xvst	xr18, sp, 13*SZREG + 18*SZXREG
>> +	xvst	xr19, sp, 13*SZREG + 19*SZXREG
>> +	xvst	xr20, sp, 13*SZREG + 20*SZXREG
>> +	xvst	xr21, sp, 13*SZREG + 21*SZXREG
>> +	xvst	xr22, sp, 13*SZREG + 22*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 23*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 24*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 25*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 26*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 27*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 28*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 29*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 30*SZXREG
>> +	xvst	xr23, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> +	vst	vr0, sp, 13*SZREG + 0*SZVREG
>> +	vst	vr1, sp, 13*SZREG + 1*SZVREG
>> +	vst	vr2, sp, 13*SZREG + 2*SZVREG
>> +	vst	vr3, sp, 13*SZREG + 3*SZVREG
>> +	vst	vr4, sp, 13*SZREG + 4*SZVREG
>> +	vst	vr5, sp, 13*SZREG + 5*SZVREG
>> +	vst	vr6, sp, 13*SZREG + 6*SZVREG
>> +	vst	vr7, sp, 13*SZREG + 7*SZVREG
>> +	vst	vr8, sp, 13*SZREG + 8*SZVREG
>> +	vst	vr9, sp, 13*SZREG + 9*SZVREG
>> +	vst	vr10, sp, 13*SZREG + 10*SZVREG
>> +	vst	vr11, sp, 13*SZREG + 11*SZVREG
>> +	vst	vr12, sp, 13*SZREG + 12*SZVREG
>> +	vst	vr13, sp, 13*SZREG + 13*SZVREG
>> +	vst	vr14, sp, 13*SZREG + 14*SZVREG
>> +	vst	vr15, sp, 13*SZREG + 15*SZVREG
>> +	vst	vr16, sp, 13*SZREG + 16*SZVREG
>> +	vst	vr17, sp, 13*SZREG + 17*SZVREG
>> +	vst	vr18, sp, 13*SZREG + 18*SZVREG
>> +	vst	vr19, sp, 13*SZREG + 19*SZVREG
>> +	vst	vr20, sp, 13*SZREG + 20*SZVREG
>> +	vst	vr21, sp, 13*SZREG + 21*SZVREG
>> +	vst	vr22, sp, 13*SZREG + 22*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 23*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 24*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 25*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 26*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 27*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 28*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 29*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 30*SZVREG
>> +	vst	vr23, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> +	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
>> +	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
>> +	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
>> +	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
>> +	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
>> +	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
>> +	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
>> +	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
>> +	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
>> +	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
>> +	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
>> +	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
>> +	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
>> +	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
>> +	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
>> +	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
>> +	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
>> +	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
>> +	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
>> +	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
>> +	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
>> +	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
>> +	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
>> +	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX  */
>> +
>> +	bl	__tls_get_addr
>> +	addi.d	a0, a0, -TLS_DTV_OFFSET
>> +
>> +	REG_L	ra, sp, 0
>> +	REG_L	a1, sp, 1 * 8
>> +	REG_L	a2, sp, 2 * 8
>> +	REG_L	a3, sp, 3 * 8
>> +	REG_L	a4, sp, 4 * 8
>> +	REG_L	a5, sp, 5 * 8
>> +	REG_L	a6, sp, 6 * 8
>> +	REG_L	a7, sp, 7 * 8
>> +	REG_L	t4, sp, 8 * 8
>> +	REG_L	t5, sp, 9 * 8
>> +	REG_L	t6, sp, 10 * 8
>> +	REG_L	t7, sp, 11 * 8
>> +	REG_L	t8, sp, 12 * 8
>> +
>> +#ifdef USE_LASX
>> +	xvld	xr0, sp, 13*SZREG + 0*SZXREG
>> +	xvld	xr1, sp, 13*SZREG + 1*SZXREG
>> +	xvld	xr2, sp, 13*SZREG + 2*SZXREG
>> +	xvld	xr3, sp, 13*SZREG + 3*SZXREG
>> +	xvld	xr4, sp, 13*SZREG + 4*SZXREG
>> +	xvld	xr5, sp, 13*SZREG + 5*SZXREG
>> +	xvld	xr6, sp, 13*SZREG + 6*SZXREG
>> +	xvld	xr7, sp, 13*SZREG + 7*SZXREG
>> +	xvld	xr8, sp, 13*SZREG + 8*SZXREG
>> +	xvld	xr9, sp, 13*SZREG + 9*SZXREG
>> +	xvld	xr10, sp, 13*SZREG + 10*SZXREG
>> +	xvld	xr11, sp, 13*SZREG + 11*SZXREG
>> +	xvld	xr12, sp, 13*SZREG + 12*SZXREG
>> +	xvld	xr13, sp, 13*SZREG + 13*SZXREG
>> +	xvld	xr14, sp, 13*SZREG + 14*SZXREG
>> +	xvld	xr15, sp, 13*SZREG + 15*SZXREG
>> +	xvld	xr16, sp, 13*SZREG + 16*SZXREG
>> +	xvld	xr17, sp, 13*SZREG + 17*SZXREG
>> +	xvld	xr18, sp, 13*SZREG + 18*SZXREG
>> +	xvld	xr19, sp, 13*SZREG + 19*SZXREG
>> +	xvld	xr20, sp, 13*SZREG + 20*SZXREG
>> +	xvld	xr21, sp, 13*SZREG + 21*SZXREG
>> +	xvld	xr22, sp, 13*SZREG + 22*SZXREG
>> +	xvld	xr23, sp, 13*SZREG + 23*SZXREG
>> +	xvld	xr24, sp, 13*SZREG + 24*SZXREG
>> +	xvld	xr25, sp, 13*SZREG + 25*SZXREG
>> +	xvld	xr26, sp, 13*SZREG + 26*SZXREG
>> +	xvld	xr27, sp, 13*SZREG + 27*SZXREG
>> +	xvld	xr28, sp, 13*SZREG + 28*SZXREG
>> +	xvld	xr29, sp, 13*SZREG + 29*SZXREG
>> +	xvld	xr30, sp, 13*SZREG + 30*SZXREG
>> +	xvld	xr31, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> +	vld	vr0, sp, 13*SZREG + 0*SZVREG
>> +	vld	vr1, sp, 13*SZREG + 1*SZVREG
>> +	vld	vr2, sp, 13*SZREG + 2*SZVREG
>> +	vld	vr3, sp, 13*SZREG + 3*SZVREG
>> +	vld	vr4, sp, 13*SZREG + 4*SZVREG
>> +	vld	vr5, sp, 13*SZREG + 5*SZVREG
>> +	vld	vr6, sp, 13*SZREG + 6*SZVREG
>> +	vld	vr7, sp, 13*SZREG + 7*SZVREG
>> +	vld	vr8, sp, 13*SZREG + 8*SZVREG
>> +	vld	vr9, sp, 13*SZREG + 9*SZVREG
>> +	vld	vr10, sp, 13*SZREG + 10*SZVREG
>> +	vld	vr11, sp, 13*SZREG + 11*SZVREG
>> +	vld	vr12, sp, 13*SZREG + 12*SZVREG
>> +	vld	vr13, sp, 13*SZREG + 13*SZVREG
>> +	vld	vr14, sp, 13*SZREG + 14*SZVREG
>> +	vld	vr15, sp, 13*SZREG + 15*SZVREG
>> +	vld	vr16, sp, 13*SZREG + 16*SZVREG
>> +	vld	vr17, sp, 13*SZREG + 17*SZVREG
>> +	vld	vr18, sp, 13*SZREG + 18*SZVREG
>> +	vld	vr19, sp, 13*SZREG + 19*SZVREG
>> +	vld	vr20, sp, 13*SZREG + 20*SZVREG
>> +	vld	vr21, sp, 13*SZREG + 21*SZVREG
>> +	vld	vr22, sp, 13*SZREG + 22*SZVREG
>> +	vld	vr23, sp, 13*SZREG + 23*SZVREG
>> +	vld	vr24, sp, 13*SZREG + 24*SZVREG
>> +	vld	vr25, sp, 13*SZREG + 25*SZVREG
>> +	vld	vr26, sp, 13*SZREG + 26*SZVREG
>> +	vld	vr27, sp, 13*SZREG + 27*SZVREG
>> +	vld	vr28, sp, 13*SZREG + 28*SZVREG
>> +	vld	vr29, sp, 13*SZREG + 29*SZVREG
>> +	vld	vr30, sp, 13*SZREG + 30*SZVREG
>> +	vld	vr31, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> +	FREG_L	fa0, sp, 13*SZREG + 0*SZFREG
>> +	FREG_L	fa1, sp, 13*SZREG + 1*SZFREG
>> +	FREG_L	fa2, sp, 13*SZREG + 2*SZFREG
>> +	FREG_L	fa3, sp, 13*SZREG + 3*SZFREG
>> +	FREG_L	fa4, sp, 13*SZREG + 4*SZFREG
>> +	FREG_L	fa5, sp, 13*SZREG + 5*SZFREG
>> +	FREG_L	fa6, sp, 13*SZREG + 6*SZFREG
>> +	FREG_L	fa7, sp, 13*SZREG + 7*SZFREG
>> +	FREG_L	ft0, sp, 13*SZREG + 8*SZFREG
>> +	FREG_L	ft1, sp, 13*SZREG + 9*SZFREG
>> +	FREG_L	ft2, sp, 13*SZREG + 10*SZFREG
>> +	FREG_L	ft3, sp, 13*SZREG + 11*SZFREG
>> +	FREG_L	ft4, sp, 13*SZREG + 12*SZFREG
>> +	FREG_L	ft5, sp, 13*SZREG + 13*SZFREG
>> +	FREG_L	ft6, sp, 13*SZREG + 14*SZFREG
>> +	FREG_L	ft7, sp, 13*SZREG + 15*SZFREG
>> +	FREG_L	ft8, sp, 13*SZREG + 16*SZFREG
>> +	FREG_L	ft9, sp, 13*SZREG + 17*SZFREG
>> +	FREG_L	ft10, sp, 13*SZREG + 18*SZFREG
>> +	FREG_L	ft11, sp, 13*SZREG + 19*SZFREG
>> +	FREG_L	ft12, sp, 13*SZREG + 20*SZFREG
>> +	FREG_L	ft13, sp, 13*SZREG + 21*SZFREG
>> +	FREG_L	ft14, sp, 13*SZREG + 22*SZFREG
>> +	FREG_L	ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX  */
>> +
>> +	addi.d	sp, sp, FRAME_SIZE
>> +	b	.Lret
>> +	cfi_endproc
>> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
>> +#endif /* #ifdef SHARED  */
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
>> new file mode 100644
>> index 0000000000..e1a9365855
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.h
>> @@ -0,0 +1,49 @@
>> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
>> +   LoongArch version.
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _DL_TLSDESC_H
>> +#define _DL_TLSDESC_H
>> +
>> +#include <dl-tls.h>
>> +
>> +/* Type used to represent a TLS descriptor in the GOT.  */
>> +struct tlsdesc
>> +{
>> +  ptrdiff_t (*entry) (struct tlsdesc *);
>> +  void *arg;
>> +};
>> +
>> +/* Type used as the argument in a TLS descriptor for a symbol that
>> +   needs dynamic TLS offsets.  */
>> +struct tlsdesc_dynamic_arg
>> +{
>> +  tls_index tlsinfo;
>> +  size_t gen_count;
>> +};
>> +
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
>> +
>> +# ifdef SHARED
>> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
>> +#endif
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
>> index 2f5bf53421..40416b1ad4 100644
>> --- a/sysdeps/loongarch/linkmap.h
>> +++ b/sysdeps/loongarch/linkmap.h
>> @@ -19,4 +19,5 @@
>> struct link_map_machine
>> {
>>   ElfW (Addr) plt; /* Address of .plt.  */
>> +  void *tlsdesc_table;    /* Address of TLS descriptor hash table.  */
>> };
>> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
>> new file mode 100644
>> index 0000000000..a357e7619f
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.c
>> @@ -0,0 +1,39 @@
>> +/* Manage TLS descriptors.  AArch64 version.
>> +
>> +   Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <https://www.gnu.org/licenses/>.  */
>> +
>> +#include <ldsodefs.h>
>> +#include <tls.h>
>> +#include <dl-tlsdesc.h>
>> +#include <dl-unmap-segments.h>
>> +#include <tlsdeschtab.h>
>> +
>> +/* Unmap the dynamic object, but also release its TLS descriptor table
>> +   if there is one.  */
>> +
>> +void
>> +_dl_unmap (struct link_map *map)
>> +{
>> +  _dl_unmap_segments (map);
>> +
>> +#ifdef SHARED
>> +  if (map->l_mach.tlsdesc_table)
>> +    htab_delete (map->l_mach.tlsdesc_table);
>> +#endif
>> +}
>> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
>> new file mode 100644
>> index 0000000000..bcab218631
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.sym
>> @@ -0,0 +1,19 @@
>> +#include <stddef.h>
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include <link.h>
>> +#include <dl-tlsdesc.h>
>> +
>> +--
>> +
>> +-- Abuse tls.h macros to derive offsets relative to the thread register.
>> +
>> +TLSDESC_ARG		offsetof(struct tlsdesc, arg)
>> +TLSDESC_GEN_COUNT	offsetof(struct tlsdesc_dynamic_arg, gen_count)
>> +TLSDESC_MODID		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
>> +TLSDESC_MODOFF		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
>> +TCBHEAD_DTV		offsetof(tcbhead_t, dtv)
>> +DTV_COUNTER		offsetof(dtv_t, counter)
>> +TLS_DTV_UNALLOCATED	TLS_DTV_UNALLOCATED
>> +TLS_DTV_OFFSET		TLS_DTV_OFFSET
>> +SIZE_OF_DTV		sizeof(tcbhead_t)
>> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> index 547b1c1b7f..ec32e6d13f 100644
>> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> @@ -5,3 +5,5 @@ libc.so: calloc
>> libc.so: free
>> libc.so: malloc
>> libc.so: realloc
>> +# The dynamic loader needs __tls_get_addr for TLS.
>> +ld.so: __tls_get_addr
>> -- 
>> 2.36.0
>>
>>
  

Patch

diff --git a/elf/elf.h b/elf/elf.h
index 5c1c1972d1..72e90aec30 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -4232,6 +4232,8 @@  enum
 #define R_LARCH_TLS_TPREL32	10
 #define R_LARCH_TLS_TPREL64	11
 #define R_LARCH_IRELATIVE	12
+#define R_LARCH_TLS_DESC32	13
+#define R_LARCH_TLS_DESC64	14
 
 /* Reserved for future relocs that the dynamic linker must understand.  */
 
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 43d2f583cd..181389e787 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -3,9 +3,15 @@  sysdep_headers += sys/asm.h
 endif
 
 ifeq ($(subdir),elf)
+sysdep-dl-routines += tlsdesc dl-tlsdesc
 gen-as-const-headers += dl-link.sym
 endif
 
+ifeq ($(subdir),csu)
+gen-as-const-headers += tlsdesc.sym
+endif
+
+
 # LoongArch's assembler also needs to know about PIC as it changes the
 # definition of some assembler macros.
 ASFLAGS-.os += $(pic-ccflag)
diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
index b534968e30..fd81ef37d5 100644
--- a/sysdeps/loongarch/dl-link.sym
+++ b/sysdeps/loongarch/dl-link.sym
@@ -1,6 +1,7 @@ 
 #include <stddef.h>
 #include <sysdep.h>
 #include <link.h>
+#include <dl-tlsdesc.h>
 
 DL_SIZEOF_RG            sizeof(struct La_loongarch_regs)
 DL_SIZEOF_RV            sizeof(struct La_loongarch_retval)
diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
index 0d17fd21e3..0dd252a5e5 100644
--- a/sysdeps/loongarch/dl-machine.h
+++ b/sysdeps/loongarch/dl-machine.h
@@ -25,7 +25,7 @@ 
 #include <entry.h>
 #include <elf/elf.h>
 #include <sys/asm.h>
-#include <dl-tls.h>
+#include <dl-tlsdesc.h>
 #include <dl-static-tls.h>
 #include <dl-machine-rel.h>
 
@@ -187,6 +187,37 @@  elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
       *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
       break;
 
+    case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
+      {
+	struct tlsdesc volatile *td =
+	    (struct tlsdesc volatile *)addr_field;
+	if (! sym)
+	  {
+	    td->arg = (void*)reloc->r_addend;
+	    td->entry = _dl_tlsdesc_undefweak;
+	  }
+	else
+	  {
+# ifndef SHARED
+	    CHECK_STATIC_TLS (map, sym_map);
+# else
+	    if (!TRY_STATIC_TLS (map, sym_map))
+	      {
+		td->arg = _dl_make_tlsdesc_dynamic
+		  (sym_map, sym->st_value + reloc->r_addend);
+		td->entry = _dl_tlsdesc_dynamic;
+	      }
+	    else
+# endif
+	      {
+		td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
+			    + reloc->r_addend);
+		td->entry = _dl_tlsdesc_return;
+	      }
+	  }
+	break;
+      }
+
     case R_LARCH_COPY:
       {
 	  if (sym == NULL)
@@ -255,6 +286,25 @@  elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
       else
 	*reloc_addr = map->l_mach.plt;
     }
+  else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
+    {
+      const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
+      const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
+      const ElfW (Sym) *sym = &symtab[symndx];
+      const struct r_found_version *version = NULL;
+
+      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+	{
+	  const ElfW (Half) *vernum =
+	    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+	  version = &map->l_versions[vernum[symndx] & 0x7fff];
+	}
+
+      /* Always initialize TLS descriptors completely, because lazy
+	 initialization requires synchronization at every TLS access.  */
+      elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
+			skip_ifunc);
+    }
   else
     _dl_reloc_bad_type (map, r_type, 1);
 }
diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
index a551594b64..1ca376484a 100644
--- a/sysdeps/loongarch/dl-tls.h
+++ b/sysdeps/loongarch/dl-tls.h
@@ -16,6 +16,9 @@ 
    License along with the GNU C Library.  If not, see
    <https://www.gnu.org/licenses/>.  */
 
+#ifndef _DL_TLS_H
+#define _DL_TLS_H
+
 /* Type used for the representation of TLS information in the GOT.  */
 typedef struct
 {
@@ -23,6 +26,8 @@  typedef struct
   unsigned long int ti_offset;
 } tls_index;
 
+extern void *__tls_get_addr (tls_index *ti);
+
 /* The thread pointer points to the first static TLS block.  */
 #define TLS_TP_OFFSET 0
 
@@ -37,10 +42,10 @@  typedef struct
 /* Compute the value for a DTPREL reloc.  */
 #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
 
-extern void *__tls_get_addr (tls_index *ti);
-
 #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
 #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
 
 /* Value used for dtv entries for which the allocation is delayed.  */
 #define TLS_DTV_UNALLOCATED ((void *) -1l)
+
+#endif
diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
new file mode 100644
index 0000000000..d2c18ff527
--- /dev/null
+++ b/sysdeps/loongarch/dl-tlsdesc.S
@@ -0,0 +1,364 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.
+   LoongArch version.
+   Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <tls.h>
+#include "tlsdesc.h"
+
+	.text
+
+	/* Compute the thread pointer offset for symbols in the static
+	   TLS block. The offset is the same for all threads.
+	   Prototype:
+	   _dl_tlsdesc_return (tlsdesc *);  */
+	.hidden _dl_tlsdesc_return
+	.global	_dl_tlsdesc_return
+	.type	_dl_tlsdesc_return,%function
+	cfi_startproc
+	.align 2
+_dl_tlsdesc_return:
+	REG_L  a0, a0, 8
+	RET
+	cfi_endproc
+	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
+
+	/* Handler for undefined weak TLS symbols.
+	   Prototype:
+	   _dl_tlsdesc_undefweak (tlsdesc *);
+
+	   The second word of the descriptor contains the addend.
+	   Return the addend minus the thread pointer. This ensures
+	   that when the caller adds on the thread pointer it gets back
+	   the addend.  */
+	.hidden _dl_tlsdesc_undefweak
+	.global	_dl_tlsdesc_undefweak
+	.type	_dl_tlsdesc_undefweak,%function
+	cfi_startproc
+	.align  2
+_dl_tlsdesc_undefweak:
+	REG_L	a0, a0, 8
+	sub.d	a0, a0, tp
+	RET
+	cfi_endproc
+	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+
+#ifdef USE_LASX
+# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
+#elif defined USE_LSX
+# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
+#elif !defined __loongarch_soft_float
+# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
+#else
+# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
+#endif
+
+#ifdef SHARED
+	/* Handler for dynamic TLS symbols.
+	   Prototype:
+	   _dl_tlsdesc_dynamic (tlsdesc *) ;
+
+	   The second word of the descriptor points to a
+	   tlsdesc_dynamic_arg structure.
+
+	   Returns the offset between the thread pointer and the
+	   object referenced by the argument.
+
+	   ptrdiff_t
+	   __attribute__ ((__regparm__ (1)))
+	   _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+	   {
+	     struct tlsdesc_dynamic_arg *td = tdp->arg;
+	     dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
+	     if (__builtin_expect (td->gen_count <= dtv[0].counter
+		&& (dtv[td->tlsinfo.ti_module].pointer.val
+		    != TLS_DTV_UNALLOCATED),
+		1))
+	       return dtv[td->tlsinfo.ti_module].pointer.val
+		+ td->tlsinfo.ti_offset
+		- __thread_pointer;
+
+	     return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+	   }
+	 */
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,%function
+	cfi_startproc
+	.align 2
+_dl_tlsdesc_dynamic:
+	/* Save just enough registers to support fast path, if we fall
+	   into slow path we will save additional registers.  */
+	addi.d  $r3,$r3,-24
+	REG_S	t0, sp, 0
+	REG_S	t1, sp, 8
+	REG_S	t2, sp, 16
+
+	REG_L	t0, tp, -SIZE_OF_DTV	  # dtv(t0) = tp + TCBHEAD_DTV dtv start
+	REG_L	a0, a0, TLSDESC_ARG	  # td(a0) = tdp->arg
+	REG_L	t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
+	REG_L	t2, t0, DTV_COUNTER	  # t2 = dtv[0].counter
+	bltu	t2, t1, .Lslow
+
+	REG_L	t1, a0, TLSDESC_MODID	  # t1 = td->tlsinfo.ti_module
+	slli.d	t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
+	add.d	t1, t1, t0    # t1 = dtv + ti_module * sizeof(dtv_t)
+	REG_L	t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
+	li.d	t2, TLS_DTV_UNALLOCATED
+	beq	t1, t2, .Lslow
+	REG_L	t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
+	# dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+	add.d	a0, t1, t2
+.Lret:
+	sub.d	a0, a0, tp
+	REG_L	t0, sp, 0
+	REG_L	t1, sp, 8
+	REG_L	t2, sp, 16
+	addi.d	sp, sp, 24
+	RET
+
+.Lslow:
+	/* This is the slow path. We need to call __tls_get_addr() which
+	   means we need to save and restore all the register that the
+	   callee will trash.  */
+
+	/* Save the remaining registers that we must treat as caller save.  */
+	addi.d	sp, sp, -FRAME_SIZE
+	REG_S	ra, sp, 0 * SZREG
+	REG_S	a1, sp, 1 * SZREG
+	REG_S	a2, sp, 2 * SZREG
+	REG_S	a3, sp, 3 * SZREG
+	REG_S	a4, sp, 4 * SZREG
+	REG_S	a5, sp, 5 * SZREG
+	REG_S	a6, sp, 6 * SZREG
+	REG_S	a7, sp, 7 * SZREG
+	REG_S	t4, sp, 8 * SZREG
+	REG_S	t5, sp, 9 * SZREG
+	REG_S	t6, sp, 10 * SZREG
+	REG_S	t7, sp, 11 * SZREG
+	REG_S	t8, sp, 12 * SZREG
+
+#ifdef USE_LASX
+	xvst	xr0, sp, 13*SZREG + 0*SZXREG
+	xvst	xr1, sp, 13*SZREG + 1*SZXREG
+	xvst	xr2, sp, 13*SZREG + 2*SZXREG
+	xvst	xr3, sp, 13*SZREG + 3*SZXREG
+	xvst	xr4, sp, 13*SZREG + 4*SZXREG
+	xvst	xr5, sp, 13*SZREG + 5*SZXREG
+	xvst	xr6, sp, 13*SZREG + 6*SZXREG
+	xvst	xr7, sp, 13*SZREG + 7*SZXREG
+	xvst	xr8, sp, 13*SZREG + 8*SZXREG
+	xvst	xr9, sp, 13*SZREG + 9*SZXREG
+	xvst	xr10, sp, 13*SZREG + 10*SZXREG
+	xvst	xr11, sp, 13*SZREG + 11*SZXREG
+	xvst	xr12, sp, 13*SZREG + 12*SZXREG
+	xvst	xr13, sp, 13*SZREG + 13*SZXREG
+	xvst	xr14, sp, 13*SZREG + 14*SZXREG
+	xvst	xr15, sp, 13*SZREG + 15*SZXREG
+	xvst	xr16, sp, 13*SZREG + 16*SZXREG
+	xvst	xr17, sp, 13*SZREG + 17*SZXREG
+	xvst	xr18, sp, 13*SZREG + 18*SZXREG
+	xvst	xr19, sp, 13*SZREG + 19*SZXREG
+	xvst	xr20, sp, 13*SZREG + 20*SZXREG
+	xvst	xr21, sp, 13*SZREG + 21*SZXREG
+	xvst	xr22, sp, 13*SZREG + 22*SZXREG
+	xvst	xr23, sp, 13*SZREG + 23*SZXREG
+	xvst	xr23, sp, 13*SZREG + 24*SZXREG
+	xvst	xr23, sp, 13*SZREG + 25*SZXREG
+	xvst	xr23, sp, 13*SZREG + 26*SZXREG
+	xvst	xr23, sp, 13*SZREG + 27*SZXREG
+	xvst	xr23, sp, 13*SZREG + 28*SZXREG
+	xvst	xr23, sp, 13*SZREG + 29*SZXREG
+	xvst	xr23, sp, 13*SZREG + 30*SZXREG
+	xvst	xr23, sp, 13*SZREG + 31*SZXREG
+#elif defined USE_LSX
+	vst	vr0, sp, 13*SZREG + 0*SZVREG
+	vst	vr1, sp, 13*SZREG + 1*SZVREG
+	vst	vr2, sp, 13*SZREG + 2*SZVREG
+	vst	vr3, sp, 13*SZREG + 3*SZVREG
+	vst	vr4, sp, 13*SZREG + 4*SZVREG
+	vst	vr5, sp, 13*SZREG + 5*SZVREG
+	vst	vr6, sp, 13*SZREG + 6*SZVREG
+	vst	vr7, sp, 13*SZREG + 7*SZVREG
+	vst	vr8, sp, 13*SZREG + 8*SZVREG
+	vst	vr9, sp, 13*SZREG + 9*SZVREG
+	vst	vr10, sp, 13*SZREG + 10*SZVREG
+	vst	vr11, sp, 13*SZREG + 11*SZVREG
+	vst	vr12, sp, 13*SZREG + 12*SZVREG
+	vst	vr13, sp, 13*SZREG + 13*SZVREG
+	vst	vr14, sp, 13*SZREG + 14*SZVREG
+	vst	vr15, sp, 13*SZREG + 15*SZVREG
+	vst	vr16, sp, 13*SZREG + 16*SZVREG
+	vst	vr17, sp, 13*SZREG + 17*SZVREG
+	vst	vr18, sp, 13*SZREG + 18*SZVREG
+	vst	vr19, sp, 13*SZREG + 19*SZVREG
+	vst	vr20, sp, 13*SZREG + 20*SZVREG
+	vst	vr21, sp, 13*SZREG + 21*SZVREG
+	vst	vr22, sp, 13*SZREG + 22*SZVREG
+	vst	vr23, sp, 13*SZREG + 23*SZVREG
+	vst	vr23, sp, 13*SZREG + 24*SZVREG
+	vst	vr23, sp, 13*SZREG + 25*SZVREG
+	vst	vr23, sp, 13*SZREG + 26*SZVREG
+	vst	vr23, sp, 13*SZREG + 27*SZVREG
+	vst	vr23, sp, 13*SZREG + 28*SZVREG
+	vst	vr23, sp, 13*SZREG + 29*SZVREG
+	vst	vr23, sp, 13*SZREG + 30*SZVREG
+	vst	vr23, sp, 13*SZREG + 31*SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_S	fa0, sp, 13*SZREG + 0*SZFREG
+	FREG_S	fa1, sp, 13*SZREG + 1*SZFREG
+	FREG_S	fa2, sp, 13*SZREG + 2*SZFREG
+	FREG_S	fa3, sp, 13*SZREG + 3*SZFREG
+	FREG_S	fa4, sp, 13*SZREG + 4*SZFREG
+	FREG_S	fa5, sp, 13*SZREG + 5*SZFREG
+	FREG_S	fa6, sp, 13*SZREG + 6*SZFREG
+	FREG_S	fa7, sp, 13*SZREG + 7*SZFREG
+	FREG_S	ft0, sp, 13*SZREG + 8*SZFREG
+	FREG_S	ft1, sp, 13*SZREG + 9*SZFREG
+	FREG_S	ft2, sp, 13*SZREG + 10*SZFREG
+	FREG_S	ft3, sp, 13*SZREG + 11*SZFREG
+	FREG_S	ft4, sp, 13*SZREG + 12*SZFREG
+	FREG_S	ft5, sp, 13*SZREG + 13*SZFREG
+	FREG_S	ft6, sp, 13*SZREG + 14*SZFREG
+	FREG_S	ft7, sp, 13*SZREG + 15*SZFREG
+	FREG_S	ft8, sp, 13*SZREG + 16*SZFREG
+	FREG_S	ft9, sp, 13*SZREG + 17*SZFREG
+	FREG_S	ft10, sp, 13*SZREG + 18*SZFREG
+	FREG_S	ft11, sp, 13*SZREG + 19*SZFREG
+	FREG_S	ft12, sp, 13*SZREG + 20*SZFREG
+	FREG_S	ft13, sp, 13*SZREG + 21*SZFREG
+	FREG_S	ft14, sp, 13*SZREG + 22*SZFREG
+	FREG_S	ft15, sp, 13*SZREG + 23*SZFREG
+#endif /* #ifdef USE_LASX  */
+
+	bl	__tls_get_addr
+	addi.d	a0, a0, -TLS_DTV_OFFSET
+
+	REG_L	ra, sp, 0
+	REG_L	a1, sp, 1 * 8
+	REG_L	a2, sp, 2 * 8
+	REG_L	a3, sp, 3 * 8
+	REG_L	a4, sp, 4 * 8
+	REG_L	a5, sp, 5 * 8
+	REG_L	a6, sp, 6 * 8
+	REG_L	a7, sp, 7 * 8
+	REG_L	t4, sp, 8 * 8
+	REG_L	t5, sp, 9 * 8
+	REG_L	t6, sp, 10 * 8
+	REG_L	t7, sp, 11 * 8
+	REG_L	t8, sp, 12 * 8
+
+#ifdef USE_LASX
+	xvld	xr0, sp, 13*SZREG + 0*SZXREG
+	xvld	xr1, sp, 13*SZREG + 1*SZXREG
+	xvld	xr2, sp, 13*SZREG + 2*SZXREG
+	xvld	xr3, sp, 13*SZREG + 3*SZXREG
+	xvld	xr4, sp, 13*SZREG + 4*SZXREG
+	xvld	xr5, sp, 13*SZREG + 5*SZXREG
+	xvld	xr6, sp, 13*SZREG + 6*SZXREG
+	xvld	xr7, sp, 13*SZREG + 7*SZXREG
+	xvld	xr8, sp, 13*SZREG + 8*SZXREG
+	xvld	xr9, sp, 13*SZREG + 9*SZXREG
+	xvld	xr10, sp, 13*SZREG + 10*SZXREG
+	xvld	xr11, sp, 13*SZREG + 11*SZXREG
+	xvld	xr12, sp, 13*SZREG + 12*SZXREG
+	xvld	xr13, sp, 13*SZREG + 13*SZXREG
+	xvld	xr14, sp, 13*SZREG + 14*SZXREG
+	xvld	xr15, sp, 13*SZREG + 15*SZXREG
+	xvld	xr16, sp, 13*SZREG + 16*SZXREG
+	xvld	xr17, sp, 13*SZREG + 17*SZXREG
+	xvld	xr18, sp, 13*SZREG + 18*SZXREG
+	xvld	xr19, sp, 13*SZREG + 19*SZXREG
+	xvld	xr20, sp, 13*SZREG + 20*SZXREG
+	xvld	xr21, sp, 13*SZREG + 21*SZXREG
+	xvld	xr22, sp, 13*SZREG + 22*SZXREG
+	xvld	xr23, sp, 13*SZREG + 23*SZXREG
+	xvld	xr24, sp, 13*SZREG + 24*SZXREG
+	xvld	xr25, sp, 13*SZREG + 25*SZXREG
+	xvld	xr26, sp, 13*SZREG + 26*SZXREG
+	xvld	xr27, sp, 13*SZREG + 27*SZXREG
+	xvld	xr28, sp, 13*SZREG + 28*SZXREG
+	xvld	xr29, sp, 13*SZREG + 29*SZXREG
+	xvld	xr30, sp, 13*SZREG + 30*SZXREG
+	xvld	xr31, sp, 13*SZREG + 31*SZXREG
+#elif defined USE_LSX
+	vld	vr0, sp, 13*SZREG + 0*SZVREG
+	vld	vr1, sp, 13*SZREG + 1*SZVREG
+	vld	vr2, sp, 13*SZREG + 2*SZVREG
+	vld	vr3, sp, 13*SZREG + 3*SZVREG
+	vld	vr4, sp, 13*SZREG + 4*SZVREG
+	vld	vr5, sp, 13*SZREG + 5*SZVREG
+	vld	vr6, sp, 13*SZREG + 6*SZVREG
+	vld	vr7, sp, 13*SZREG + 7*SZVREG
+	vld	vr8, sp, 13*SZREG + 8*SZVREG
+	vld	vr9, sp, 13*SZREG + 9*SZVREG
+	vld	vr10, sp, 13*SZREG + 10*SZVREG
+	vld	vr11, sp, 13*SZREG + 11*SZVREG
+	vld	vr12, sp, 13*SZREG + 12*SZVREG
+	vld	vr13, sp, 13*SZREG + 13*SZVREG
+	vld	vr14, sp, 13*SZREG + 14*SZVREG
+	vld	vr15, sp, 13*SZREG + 15*SZVREG
+	vld	vr16, sp, 13*SZREG + 16*SZVREG
+	vld	vr17, sp, 13*SZREG + 17*SZVREG
+	vld	vr18, sp, 13*SZREG + 18*SZVREG
+	vld	vr19, sp, 13*SZREG + 19*SZVREG
+	vld	vr20, sp, 13*SZREG + 20*SZVREG
+	vld	vr21, sp, 13*SZREG + 21*SZVREG
+	vld	vr22, sp, 13*SZREG + 22*SZVREG
+	vld	vr23, sp, 13*SZREG + 23*SZVREG
+	vld	vr24, sp, 13*SZREG + 24*SZVREG
+	vld	vr25, sp, 13*SZREG + 25*SZVREG
+	vld	vr26, sp, 13*SZREG + 26*SZVREG
+	vld	vr27, sp, 13*SZREG + 27*SZVREG
+	vld	vr28, sp, 13*SZREG + 28*SZVREG
+	vld	vr29, sp, 13*SZREG + 29*SZVREG
+	vld	vr30, sp, 13*SZREG + 30*SZVREG
+	vld	vr31, sp, 13*SZREG + 31*SZVREG
+#elif !defined __loongarch_soft_float
+	FREG_L	fa0, sp, 13*SZREG + 0*SZFREG
+	FREG_L	fa1, sp, 13*SZREG + 1*SZFREG
+	FREG_L	fa2, sp, 13*SZREG + 2*SZFREG
+	FREG_L	fa3, sp, 13*SZREG + 3*SZFREG
+	FREG_L	fa4, sp, 13*SZREG + 4*SZFREG
+	FREG_L	fa5, sp, 13*SZREG + 5*SZFREG
+	FREG_L	fa6, sp, 13*SZREG + 6*SZFREG
+	FREG_L	fa7, sp, 13*SZREG + 7*SZFREG
+	FREG_L	ft0, sp, 13*SZREG + 8*SZFREG
+	FREG_L	ft1, sp, 13*SZREG + 9*SZFREG
+	FREG_L	ft2, sp, 13*SZREG + 10*SZFREG
+	FREG_L	ft3, sp, 13*SZREG + 11*SZFREG
+	FREG_L	ft4, sp, 13*SZREG + 12*SZFREG
+	FREG_L	ft5, sp, 13*SZREG + 13*SZFREG
+	FREG_L	ft6, sp, 13*SZREG + 14*SZFREG
+	FREG_L	ft7, sp, 13*SZREG + 15*SZFREG
+	FREG_L	ft8, sp, 13*SZREG + 16*SZFREG
+	FREG_L	ft9, sp, 13*SZREG + 17*SZFREG
+	FREG_L	ft10, sp, 13*SZREG + 18*SZFREG
+	FREG_L	ft11, sp, 13*SZREG + 19*SZFREG
+	FREG_L	ft12, sp, 13*SZREG + 20*SZFREG
+	FREG_L	ft13, sp, 13*SZREG + 21*SZFREG
+	FREG_L	ft14, sp, 13*SZREG + 22*SZFREG
+	FREG_L	ft15, sp, 13*SZREG + 23*SZFREG
+#endif /* #ifdef USE_LASX  */
+
+	addi.d	sp, sp, FRAME_SIZE
+	b	.Lret
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+#endif /* #ifdef SHARED  */
diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
new file mode 100644
index 0000000000..e1a9365855
--- /dev/null
+++ b/sysdeps/loongarch/dl-tlsdesc.h
@@ -0,0 +1,49 @@ 
+/* Thread-local storage descriptor handling in the ELF dynamic linker.
+   LoongArch version.
+   Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_TLSDESC_H
+#define _DL_TLSDESC_H
+
+#include <dl-tls.h>
+
+/* Type used to represent a TLS descriptor in the GOT.  */
+struct tlsdesc
+{
+  ptrdiff_t (*entry) (struct tlsdesc *);
+  void *arg;
+};
+
+/* Type used as the argument in a TLS descriptor for a symbol that
+   needs dynamic TLS offsets.  */
+struct tlsdesc_dynamic_arg
+{
+  tls_index tlsinfo;
+  size_t gen_count;
+};
+
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
+
+# ifdef SHARED
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
+#endif
+
+#endif
diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
index 2f5bf53421..40416b1ad4 100644
--- a/sysdeps/loongarch/linkmap.h
+++ b/sysdeps/loongarch/linkmap.h
@@ -19,4 +19,5 @@ 
 struct link_map_machine
 {
   ElfW (Addr) plt; /* Address of .plt.  */
+  void *tlsdesc_table;    /* Address of TLS descriptor hash table.  */
 };
diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
new file mode 100644
index 0000000000..a357e7619f
--- /dev/null
+++ b/sysdeps/loongarch/tlsdesc.c
@@ -0,0 +1,39 @@ 
+/* Manage TLS descriptors.  AArch64 version.
+
+   Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <dl-unmap-segments.h>
+#include <tlsdeschtab.h>
+
+/* Unmap the dynamic object, but also release its TLS descriptor table
+   if there is one.  */
+
+void
+_dl_unmap (struct link_map *map)
+{
+  _dl_unmap_segments (map);
+
+#ifdef SHARED
+  if (map->l_mach.tlsdesc_table)
+    htab_delete (map->l_mach.tlsdesc_table);
+#endif
+}
diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
new file mode 100644
index 0000000000..bcab218631
--- /dev/null
+++ b/sysdeps/loongarch/tlsdesc.sym
@@ -0,0 +1,19 @@ 
+#include <stddef.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <link.h>
+#include <dl-tlsdesc.h>
+
+--
+
+-- Abuse tls.h macros to derive offsets relative to the thread register.
+
+TLSDESC_ARG		offsetof(struct tlsdesc, arg)
+TLSDESC_GEN_COUNT	offsetof(struct tlsdesc_dynamic_arg, gen_count)
+TLSDESC_MODID		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
+TLSDESC_MODOFF		offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+TCBHEAD_DTV		offsetof(tcbhead_t, dtv)
+DTV_COUNTER		offsetof(dtv_t, counter)
+TLS_DTV_UNALLOCATED	TLS_DTV_UNALLOCATED
+TLS_DTV_OFFSET		TLS_DTV_OFFSET
+SIZE_OF_DTV		sizeof(tcbhead_t)
diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
index 547b1c1b7f..ec32e6d13f 100644
--- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
+++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
@@ -5,3 +5,5 @@  libc.so: calloc
 libc.so: free
 libc.so: malloc
 libc.so: realloc
+# The dynamic loader needs __tls_get_addr for TLS.
+ld.so: __tls_get_addr