LoongArch: Add support for TLS Descriptors
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Testing passed
|
Commit Message
This is mostly based on AArch64 and RISC-V implementation.
Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
all vector registers.
---
elf/elf.h | 2 +
sysdeps/loongarch/Makefile | 6 +
sysdeps/loongarch/dl-link.sym | 1 +
sysdeps/loongarch/dl-machine.h | 52 ++-
sysdeps/loongarch/dl-tls.h | 9 +-
sysdeps/loongarch/dl-tlsdesc.S | 364 ++++++++++++++++++
sysdeps/loongarch/dl-tlsdesc.h | 49 +++
sysdeps/loongarch/linkmap.h | 1 +
sysdeps/loongarch/tlsdesc.c | 39 ++
sysdeps/loongarch/tlsdesc.sym | 19 +
.../unix/sysv/linux/loongarch/localplt.data | 2 +
11 files changed, 541 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
create mode 100644 sysdeps/loongarch/tlsdesc.c
create mode 100644 sysdeps/loongarch/tlsdesc.sym
Comments
在 2023/12/1 下午5:57, mengqinggang 写道:
> This is mostly based on AArch64 and RISC-V implementation.
>
> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>
> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
> all vector registers.
> ---
> elf/elf.h | 2 +
> sysdeps/loongarch/Makefile | 6 +
> sysdeps/loongarch/dl-link.sym | 1 +
> sysdeps/loongarch/dl-machine.h | 52 ++-
> sysdeps/loongarch/dl-tls.h | 9 +-
> sysdeps/loongarch/dl-tlsdesc.S | 364 ++++++++++++++++++
> sysdeps/loongarch/dl-tlsdesc.h | 49 +++
> sysdeps/loongarch/linkmap.h | 1 +
> sysdeps/loongarch/tlsdesc.c | 39 ++
> sysdeps/loongarch/tlsdesc.sym | 19 +
> .../unix/sysv/linux/loongarch/localplt.data | 2 +
> 11 files changed, 541 insertions(+), 3 deletions(-)
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
> create mode 100644 sysdeps/loongarch/tlsdesc.c
> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>
> diff --git a/elf/elf.h b/elf/elf.h
> index 5c1c1972d1..72e90aec30 100644
> --- a/elf/elf.h
> +++ b/elf/elf.h
> @@ -4232,6 +4232,8 @@ enum
> #define R_LARCH_TLS_TPREL32 10
> #define R_LARCH_TLS_TPREL64 11
> #define R_LARCH_IRELATIVE 12
> +#define R_LARCH_TLS_DESC32 13
> +#define R_LARCH_TLS_DESC64 14
>
> /* Reserved for future relocs that the dynamic linker must understand. */
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..181389e787 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
> endif
>
> ifeq ($(subdir),elf)
> +sysdep-dl-routines += tlsdesc dl-tlsdesc
> gen-as-const-headers += dl-link.sym
> endif
>
> +ifeq ($(subdir),csu)
> +gen-as-const-headers += tlsdesc.sym
> +endif
> +
> +
> # LoongArch's assembler also needs to know about PIC as it changes the
> # definition of some assembler macros.
> ASFLAGS-.os += $(pic-ccflag)
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> index b534968e30..fd81ef37d5 100644
> --- a/sysdeps/loongarch/dl-link.sym
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -1,6 +1,7 @@
> #include <stddef.h>
> #include <sysdep.h>
> #include <link.h>
> +#include <dl-tlsdesc.h>
>
> DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
> DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 0d17fd21e3..0dd252a5e5 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -25,7 +25,7 @@
> #include <entry.h>
> #include <elf/elf.h>
> #include <sys/asm.h>
> -#include <dl-tls.h>
> +#include <dl-tlsdesc.h>
> #include <dl-static-tls.h>
> #include <dl-machine-rel.h>
>
> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
> *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
> break;
>
> + case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
> + {
> + struct tlsdesc volatile *td =
> + (struct tlsdesc volatile *)addr_field;
> + if (! sym)
> + {
> + td->arg = (void*)reloc->r_addend;
> + td->entry = _dl_tlsdesc_undefweak;
> + }
> + else
> + {
> +# ifndef SHARED
> + CHECK_STATIC_TLS (map, sym_map);
> +# else
> + if (!TRY_STATIC_TLS (map, sym_map))
> + {
> + td->arg = _dl_make_tlsdesc_dynamic
> + (sym_map, sym->st_value + reloc->r_addend);
> + td->entry = _dl_tlsdesc_dynamic;
> + }
> + else
> +# endif
> + {
> + td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
> + + reloc->r_addend);
> + td->entry = _dl_tlsdesc_return;
> + }
> + }
> + break;
> + }
> +
> case R_LARCH_COPY:
> {
> if (sym == NULL)
> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
> else
> *reloc_addr = map->l_mach.plt;
> }
> + else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
> + {
> + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
> + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
> + const ElfW (Sym) *sym = &symtab[symndx];
> + const struct r_found_version *version = NULL;
> +
> + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
> + {
> + const ElfW (Half) *vernum =
> + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
> + version = &map->l_versions[vernum[symndx] & 0x7fff];
> + }
> +
> + /* Always initialize TLS descriptors completely, because lazy
> + initialization requires synchronization at every TLS access. */
> + elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
> + skip_ifunc);
> + }
> else
> _dl_reloc_bad_type (map, r_type, 1);
> }
> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
> index a551594b64..1ca376484a 100644
> --- a/sysdeps/loongarch/dl-tls.h
> +++ b/sysdeps/loongarch/dl-tls.h
> @@ -16,6 +16,9 @@
> License along with the GNU C Library. If not, see
> <https://www.gnu.org/licenses/>. */
>
> +#ifndef _DL_TLS_H
> +#define _DL_TLS_H
> +
> /* Type used for the representation of TLS information in the GOT. */
> typedef struct
> {
> @@ -23,6 +26,8 @@ typedef struct
> unsigned long int ti_offset;
> } tls_index;
>
> +extern void *__tls_get_addr (tls_index *ti);
> +
> /* The thread pointer points to the first static TLS block. */
> #define TLS_TP_OFFSET 0
>
> @@ -37,10 +42,10 @@ typedef struct
> /* Compute the value for a DTPREL reloc. */
> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>
> -extern void *__tls_get_addr (tls_index *ti);
> -
> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>
> /* Value used for dtv entries for which the allocation is delayed. */
> #define TLS_DTV_UNALLOCATED ((void *) -1l)
> +
> +#endif
> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
> new file mode 100644
> index 0000000000..d2c18ff527
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.S
> @@ -0,0 +1,364 @@
> +/* Thread-local storage handling in the ELF dynamic linker.
> + LoongArch version.
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include <tls.h>
> +#include "tlsdesc.h"
> +
> + .text
> +
> + /* Compute the thread pointer offset for symbols in the static
> + TLS block. The offset is the same for all threads.
> + Prototype:
> + _dl_tlsdesc_return (tlsdesc *); */
> + .hidden _dl_tlsdesc_return
> + .global _dl_tlsdesc_return
> + .type _dl_tlsdesc_return,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_return:
> + REG_L a0, a0, 8
> + RET
> + cfi_endproc
> + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
> +
> + /* Handler for undefined weak TLS symbols.
> + Prototype:
> + _dl_tlsdesc_undefweak (tlsdesc *);
> +
> + The second word of the descriptor contains the addend.
> + Return the addend minus the thread pointer. This ensures
> + that when the caller adds on the thread pointer it gets back
> + the addend. */
> + .hidden _dl_tlsdesc_undefweak
> + .global _dl_tlsdesc_undefweak
> + .type _dl_tlsdesc_undefweak,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_undefweak:
> + REG_L a0, a0, 8
> + sub.d a0, a0, tp
> + RET
> + cfi_endproc
> + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> +
the macros USE_LASX/LSX do not work here.
See :
commit 7f079fdc16e88ebb8020e17b2fd900e8924da29a
Author: caiyinyu <caiyinyu@loongson.cn>
Date: Wed Jul 5 16:38:05 2023 +0800
LoongArch: Add vector implementation for _dl_runtime_resolve.
> +#ifdef USE_LASX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
> +#elif defined USE_LSX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
> +#elif !defined __loongarch_soft_float
> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
> +#else
> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
> +#endif
> +
> +#ifdef SHARED
> + /* Handler for dynamic TLS symbols.
> + Prototype:
> + _dl_tlsdesc_dynamic (tlsdesc *) ;
> +
> + The second word of the descriptor points to a
> + tlsdesc_dynamic_arg structure.
> +
> + Returns the offset between the thread pointer and the
> + object referenced by the argument.
> +
> + ptrdiff_t
> + __attribute__ ((__regparm__ (1)))
> + _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> + {
> + struct tlsdesc_dynamic_arg *td = tdp->arg;
> + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
> + if (__builtin_expect (td->gen_count <= dtv[0].counter
> + && (dtv[td->tlsinfo.ti_module].pointer.val
> + != TLS_DTV_UNALLOCATED),
> + 1))
> + return dtv[td->tlsinfo.ti_module].pointer.val
> + + td->tlsinfo.ti_offset
> + - __thread_pointer;
> +
> + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> + }
> + */
> + .hidden _dl_tlsdesc_dynamic
> + .global _dl_tlsdesc_dynamic
> + .type _dl_tlsdesc_dynamic,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_dynamic:
> + /* Save just enough registers to support fast path, if we fall
> + into slow path we will save additional registers. */
> + addi.d $r3,$r3,-24
> + REG_S t0, sp, 0
> + REG_S t1, sp, 8
> + REG_S t2, sp, 16
> +
> + REG_L t0, tp, -SIZE_OF_DTV # dtv(t0) = tp + TCBHEAD_DTV dtv start
> + REG_L a0, a0, TLSDESC_ARG # td(a0) = tdp->arg
> + REG_L t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
> + REG_L t2, t0, DTV_COUNTER # t2 = dtv[0].counter
> + bltu t2, t1, .Lslow
> +
> + REG_L t1, a0, TLSDESC_MODID # t1 = td->tlsinfo.ti_module
> + slli.d t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> + add.d t1, t1, t0 # t1 = dtv + ti_module * sizeof(dtv_t)
> + REG_L t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
> + li.d t2, TLS_DTV_UNALLOCATED
> + beq t1, t2, .Lslow
> + REG_L t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
> + # dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> + add.d a0, t1, t2
> +.Lret:
> + sub.d a0, a0, tp
> + REG_L t0, sp, 0
> + REG_L t1, sp, 8
> + REG_L t2, sp, 16
> + addi.d sp, sp, 24
> + RET
> +
> +.Lslow:
> + /* This is the slow path. We need to call __tls_get_addr() which
> + means we need to save and restore all the register that the
> + callee will trash. */
> +
> + /* Save the remaining registers that we must treat as caller save. */
> + addi.d sp, sp, -FRAME_SIZE
> + REG_S ra, sp, 0 * SZREG
> + REG_S a1, sp, 1 * SZREG
> + REG_S a2, sp, 2 * SZREG
> + REG_S a3, sp, 3 * SZREG
> + REG_S a4, sp, 4 * SZREG
> + REG_S a5, sp, 5 * SZREG
> + REG_S a6, sp, 6 * SZREG
> + REG_S a7, sp, 7 * SZREG
> + REG_S t4, sp, 8 * SZREG
> + REG_S t5, sp, 9 * SZREG
> + REG_S t6, sp, 10 * SZREG
> + REG_S t7, sp, 11 * SZREG
> + REG_S t8, sp, 12 * SZREG
> +
> +#ifdef USE_LASX
> + xvst xr0, sp, 13*SZREG + 0*SZXREG
> + xvst xr1, sp, 13*SZREG + 1*SZXREG
> + xvst xr2, sp, 13*SZREG + 2*SZXREG
> + xvst xr3, sp, 13*SZREG + 3*SZXREG
> + xvst xr4, sp, 13*SZREG + 4*SZXREG
> + xvst xr5, sp, 13*SZREG + 5*SZXREG
> + xvst xr6, sp, 13*SZREG + 6*SZXREG
> + xvst xr7, sp, 13*SZREG + 7*SZXREG
> + xvst xr8, sp, 13*SZREG + 8*SZXREG
> + xvst xr9, sp, 13*SZREG + 9*SZXREG
> + xvst xr10, sp, 13*SZREG + 10*SZXREG
> + xvst xr11, sp, 13*SZREG + 11*SZXREG
> + xvst xr12, sp, 13*SZREG + 12*SZXREG
> + xvst xr13, sp, 13*SZREG + 13*SZXREG
> + xvst xr14, sp, 13*SZREG + 14*SZXREG
> + xvst xr15, sp, 13*SZREG + 15*SZXREG
> + xvst xr16, sp, 13*SZREG + 16*SZXREG
> + xvst xr17, sp, 13*SZREG + 17*SZXREG
> + xvst xr18, sp, 13*SZREG + 18*SZXREG
> + xvst xr19, sp, 13*SZREG + 19*SZXREG
> + xvst xr20, sp, 13*SZREG + 20*SZXREG
> + xvst xr21, sp, 13*SZREG + 21*SZXREG
> + xvst xr22, sp, 13*SZREG + 22*SZXREG
> + xvst xr23, sp, 13*SZREG + 23*SZXREG
> + xvst xr23, sp, 13*SZREG + 24*SZXREG
> + xvst xr23, sp, 13*SZREG + 25*SZXREG
> + xvst xr23, sp, 13*SZREG + 26*SZXREG
> + xvst xr23, sp, 13*SZREG + 27*SZXREG
> + xvst xr23, sp, 13*SZREG + 28*SZXREG
> + xvst xr23, sp, 13*SZREG + 29*SZXREG
> + xvst xr23, sp, 13*SZREG + 30*SZXREG
> + xvst xr23, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> + vst vr0, sp, 13*SZREG + 0*SZVREG
> + vst vr1, sp, 13*SZREG + 1*SZVREG
> + vst vr2, sp, 13*SZREG + 2*SZVREG
> + vst vr3, sp, 13*SZREG + 3*SZVREG
> + vst vr4, sp, 13*SZREG + 4*SZVREG
> + vst vr5, sp, 13*SZREG + 5*SZVREG
> + vst vr6, sp, 13*SZREG + 6*SZVREG
> + vst vr7, sp, 13*SZREG + 7*SZVREG
> + vst vr8, sp, 13*SZREG + 8*SZVREG
> + vst vr9, sp, 13*SZREG + 9*SZVREG
> + vst vr10, sp, 13*SZREG + 10*SZVREG
> + vst vr11, sp, 13*SZREG + 11*SZVREG
> + vst vr12, sp, 13*SZREG + 12*SZVREG
> + vst vr13, sp, 13*SZREG + 13*SZVREG
> + vst vr14, sp, 13*SZREG + 14*SZVREG
> + vst vr15, sp, 13*SZREG + 15*SZVREG
> + vst vr16, sp, 13*SZREG + 16*SZVREG
> + vst vr17, sp, 13*SZREG + 17*SZVREG
> + vst vr18, sp, 13*SZREG + 18*SZVREG
> + vst vr19, sp, 13*SZREG + 19*SZVREG
> + vst vr20, sp, 13*SZREG + 20*SZVREG
> + vst vr21, sp, 13*SZREG + 21*SZVREG
> + vst vr22, sp, 13*SZREG + 22*SZVREG
> + vst vr23, sp, 13*SZREG + 23*SZVREG
> + vst vr23, sp, 13*SZREG + 24*SZVREG
> + vst vr23, sp, 13*SZREG + 25*SZVREG
> + vst vr23, sp, 13*SZREG + 26*SZVREG
> + vst vr23, sp, 13*SZREG + 27*SZVREG
> + vst vr23, sp, 13*SZREG + 28*SZVREG
> + vst vr23, sp, 13*SZREG + 29*SZVREG
> + vst vr23, sp, 13*SZREG + 30*SZVREG
> + vst vr23, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> + FREG_S fa0, sp, 13*SZREG + 0*SZFREG
> + FREG_S fa1, sp, 13*SZREG + 1*SZFREG
> + FREG_S fa2, sp, 13*SZREG + 2*SZFREG
> + FREG_S fa3, sp, 13*SZREG + 3*SZFREG
> + FREG_S fa4, sp, 13*SZREG + 4*SZFREG
> + FREG_S fa5, sp, 13*SZREG + 5*SZFREG
> + FREG_S fa6, sp, 13*SZREG + 6*SZFREG
> + FREG_S fa7, sp, 13*SZREG + 7*SZFREG
> + FREG_S ft0, sp, 13*SZREG + 8*SZFREG
> + FREG_S ft1, sp, 13*SZREG + 9*SZFREG
> + FREG_S ft2, sp, 13*SZREG + 10*SZFREG
> + FREG_S ft3, sp, 13*SZREG + 11*SZFREG
> + FREG_S ft4, sp, 13*SZREG + 12*SZFREG
> + FREG_S ft5, sp, 13*SZREG + 13*SZFREG
> + FREG_S ft6, sp, 13*SZREG + 14*SZFREG
> + FREG_S ft7, sp, 13*SZREG + 15*SZFREG
> + FREG_S ft8, sp, 13*SZREG + 16*SZFREG
> + FREG_S ft9, sp, 13*SZREG + 17*SZFREG
> + FREG_S ft10, sp, 13*SZREG + 18*SZFREG
> + FREG_S ft11, sp, 13*SZREG + 19*SZFREG
> + FREG_S ft12, sp, 13*SZREG + 20*SZFREG
> + FREG_S ft13, sp, 13*SZREG + 21*SZFREG
> + FREG_S ft14, sp, 13*SZREG + 22*SZFREG
> + FREG_S ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX */
> +
> + bl __tls_get_addr
> + addi.d a0, a0, -TLS_DTV_OFFSET
> +
> + REG_L ra, sp, 0
> + REG_L a1, sp, 1 * 8
> + REG_L a2, sp, 2 * 8
> + REG_L a3, sp, 3 * 8
> + REG_L a4, sp, 4 * 8
> + REG_L a5, sp, 5 * 8
> + REG_L a6, sp, 6 * 8
> + REG_L a7, sp, 7 * 8
> + REG_L t4, sp, 8 * 8
> + REG_L t5, sp, 9 * 8
> + REG_L t6, sp, 10 * 8
> + REG_L t7, sp, 11 * 8
> + REG_L t8, sp, 12 * 8
> +
> +#ifdef USE_LASX
> + xvld xr0, sp, 13*SZREG + 0*SZXREG
> + xvld xr1, sp, 13*SZREG + 1*SZXREG
> + xvld xr2, sp, 13*SZREG + 2*SZXREG
> + xvld xr3, sp, 13*SZREG + 3*SZXREG
> + xvld xr4, sp, 13*SZREG + 4*SZXREG
> + xvld xr5, sp, 13*SZREG + 5*SZXREG
> + xvld xr6, sp, 13*SZREG + 6*SZXREG
> + xvld xr7, sp, 13*SZREG + 7*SZXREG
> + xvld xr8, sp, 13*SZREG + 8*SZXREG
> + xvld xr9, sp, 13*SZREG + 9*SZXREG
> + xvld xr10, sp, 13*SZREG + 10*SZXREG
> + xvld xr11, sp, 13*SZREG + 11*SZXREG
> + xvld xr12, sp, 13*SZREG + 12*SZXREG
> + xvld xr13, sp, 13*SZREG + 13*SZXREG
> + xvld xr14, sp, 13*SZREG + 14*SZXREG
> + xvld xr15, sp, 13*SZREG + 15*SZXREG
> + xvld xr16, sp, 13*SZREG + 16*SZXREG
> + xvld xr17, sp, 13*SZREG + 17*SZXREG
> + xvld xr18, sp, 13*SZREG + 18*SZXREG
> + xvld xr19, sp, 13*SZREG + 19*SZXREG
> + xvld xr20, sp, 13*SZREG + 20*SZXREG
> + xvld xr21, sp, 13*SZREG + 21*SZXREG
> + xvld xr22, sp, 13*SZREG + 22*SZXREG
> + xvld xr23, sp, 13*SZREG + 23*SZXREG
> + xvld xr24, sp, 13*SZREG + 24*SZXREG
> + xvld xr25, sp, 13*SZREG + 25*SZXREG
> + xvld xr26, sp, 13*SZREG + 26*SZXREG
> + xvld xr27, sp, 13*SZREG + 27*SZXREG
> + xvld xr28, sp, 13*SZREG + 28*SZXREG
> + xvld xr29, sp, 13*SZREG + 29*SZXREG
> + xvld xr30, sp, 13*SZREG + 30*SZXREG
> + xvld xr31, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> + vld vr0, sp, 13*SZREG + 0*SZVREG
> + vld vr1, sp, 13*SZREG + 1*SZVREG
> + vld vr2, sp, 13*SZREG + 2*SZVREG
> + vld vr3, sp, 13*SZREG + 3*SZVREG
> + vld vr4, sp, 13*SZREG + 4*SZVREG
> + vld vr5, sp, 13*SZREG + 5*SZVREG
> + vld vr6, sp, 13*SZREG + 6*SZVREG
> + vld vr7, sp, 13*SZREG + 7*SZVREG
> + vld vr8, sp, 13*SZREG + 8*SZVREG
> + vld vr9, sp, 13*SZREG + 9*SZVREG
> + vld vr10, sp, 13*SZREG + 10*SZVREG
> + vld vr11, sp, 13*SZREG + 11*SZVREG
> + vld vr12, sp, 13*SZREG + 12*SZVREG
> + vld vr13, sp, 13*SZREG + 13*SZVREG
> + vld vr14, sp, 13*SZREG + 14*SZVREG
> + vld vr15, sp, 13*SZREG + 15*SZVREG
> + vld vr16, sp, 13*SZREG + 16*SZVREG
> + vld vr17, sp, 13*SZREG + 17*SZVREG
> + vld vr18, sp, 13*SZREG + 18*SZVREG
> + vld vr19, sp, 13*SZREG + 19*SZVREG
> + vld vr20, sp, 13*SZREG + 20*SZVREG
> + vld vr21, sp, 13*SZREG + 21*SZVREG
> + vld vr22, sp, 13*SZREG + 22*SZVREG
> + vld vr23, sp, 13*SZREG + 23*SZVREG
> + vld vr24, sp, 13*SZREG + 24*SZVREG
> + vld vr25, sp, 13*SZREG + 25*SZVREG
> + vld vr26, sp, 13*SZREG + 26*SZVREG
> + vld vr27, sp, 13*SZREG + 27*SZVREG
> + vld vr28, sp, 13*SZREG + 28*SZVREG
> + vld vr29, sp, 13*SZREG + 29*SZVREG
> + vld vr30, sp, 13*SZREG + 30*SZVREG
> + vld vr31, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> + FREG_L fa0, sp, 13*SZREG + 0*SZFREG
> + FREG_L fa1, sp, 13*SZREG + 1*SZFREG
> + FREG_L fa2, sp, 13*SZREG + 2*SZFREG
> + FREG_L fa3, sp, 13*SZREG + 3*SZFREG
> + FREG_L fa4, sp, 13*SZREG + 4*SZFREG
> + FREG_L fa5, sp, 13*SZREG + 5*SZFREG
> + FREG_L fa6, sp, 13*SZREG + 6*SZFREG
> + FREG_L fa7, sp, 13*SZREG + 7*SZFREG
> + FREG_L ft0, sp, 13*SZREG + 8*SZFREG
> + FREG_L ft1, sp, 13*SZREG + 9*SZFREG
> + FREG_L ft2, sp, 13*SZREG + 10*SZFREG
> + FREG_L ft3, sp, 13*SZREG + 11*SZFREG
> + FREG_L ft4, sp, 13*SZREG + 12*SZFREG
> + FREG_L ft5, sp, 13*SZREG + 13*SZFREG
> + FREG_L ft6, sp, 13*SZREG + 14*SZFREG
> + FREG_L ft7, sp, 13*SZREG + 15*SZFREG
> + FREG_L ft8, sp, 13*SZREG + 16*SZFREG
> + FREG_L ft9, sp, 13*SZREG + 17*SZFREG
> + FREG_L ft10, sp, 13*SZREG + 18*SZFREG
> + FREG_L ft11, sp, 13*SZREG + 19*SZFREG
> + FREG_L ft12, sp, 13*SZREG + 20*SZFREG
> + FREG_L ft13, sp, 13*SZREG + 21*SZFREG
> + FREG_L ft14, sp, 13*SZREG + 22*SZFREG
> + FREG_L ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX */
> +
> + addi.d sp, sp, FRAME_SIZE
> + b .Lret
> + cfi_endproc
> + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +#endif /* #ifdef SHARED */
> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
> new file mode 100644
> index 0000000000..e1a9365855
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.h
> @@ -0,0 +1,49 @@
> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
> + LoongArch version.
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef _DL_TLSDESC_H
> +#define _DL_TLSDESC_H
> +
> +#include <dl-tls.h>
> +
> +/* Type used to represent a TLS descriptor in the GOT. */
> +struct tlsdesc
> +{
> + ptrdiff_t (*entry) (struct tlsdesc *);
> + void *arg;
> +};
> +
> +/* Type used as the argument in a TLS descriptor for a symbol that
> + needs dynamic TLS offsets. */
> +struct tlsdesc_dynamic_arg
> +{
> + tls_index tlsinfo;
> + size_t gen_count;
> +};
> +
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
> +
> +# ifdef SHARED
> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
> +#endif
> +
> +#endif
> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
> index 2f5bf53421..40416b1ad4 100644
> --- a/sysdeps/loongarch/linkmap.h
> +++ b/sysdeps/loongarch/linkmap.h
> @@ -19,4 +19,5 @@
> struct link_map_machine
> {
> ElfW (Addr) plt; /* Address of .plt. */
> + void *tlsdesc_table; /* Address of TLS descriptor hash table. */
> };
> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
> new file mode 100644
> index 0000000000..a357e7619f
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.c
> @@ -0,0 +1,39 @@
> +/* Manage TLS descriptors. AArch64 version.
> +
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <ldsodefs.h>
> +#include <tls.h>
> +#include <dl-tlsdesc.h>
> +#include <dl-unmap-segments.h>
> +#include <tlsdeschtab.h>
> +
> +/* Unmap the dynamic object, but also release its TLS descriptor table
> + if there is one. */
> +
> +void
> +_dl_unmap (struct link_map *map)
> +{
> + _dl_unmap_segments (map);
> +
> +#ifdef SHARED
> + if (map->l_mach.tlsdesc_table)
> + htab_delete (map->l_mach.tlsdesc_table);
> +#endif
> +}
> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
> new file mode 100644
> index 0000000000..bcab218631
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.sym
> @@ -0,0 +1,19 @@
> +#include <stddef.h>
> +#include <sysdep.h>
> +#include <tls.h>
> +#include <link.h>
> +#include <dl-tlsdesc.h>
> +
> +--
> +
> +-- Abuse tls.h macros to derive offsets relative to the thread register.
> +
> +TLSDESC_ARG offsetof(struct tlsdesc, arg)
> +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
> +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
> +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
> +TCBHEAD_DTV offsetof(tcbhead_t, dtv)
> +DTV_COUNTER offsetof(dtv_t, counter)
> +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
> +TLS_DTV_OFFSET TLS_DTV_OFFSET
> +SIZE_OF_DTV sizeof(tcbhead_t)
> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> index 547b1c1b7f..ec32e6d13f 100644
> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> @@ -5,3 +5,5 @@ libc.so: calloc
> libc.so: free
> libc.so: malloc
> libc.so: realloc
> +# The dynamic loader needs __tls_get_addr for TLS.
> +ld.so: __tls_get_addr
On Fri, 2023-12-01 at 17:57 +0800, mengqinggang wrote:
> +.Lslow:
/* snip */
> +#ifdef USE_LASX
> + xvst xr0, sp, 13*SZREG + 0*SZXREG
> + xvst xr1, sp, 13*SZREG + 1*SZXREG
> + xvst xr2, sp, 13*SZREG + 2*SZXREG
> + xvst xr3, sp, 13*SZREG + 3*SZXREG
> + xvst xr4, sp, 13*SZREG + 4*SZXREG
> + xvst xr5, sp, 13*SZREG + 5*SZXREG
> + xvst xr6, sp, 13*SZREG + 6*SZXREG
> + xvst xr7, sp, 13*SZREG + 7*SZXREG
> + xvst xr8, sp, 13*SZREG + 8*SZXREG
> + xvst xr9, sp, 13*SZREG + 9*SZXREG
> + xvst xr10, sp, 13*SZREG + 10*SZXREG
> + xvst xr11, sp, 13*SZREG + 11*SZXREG
> + xvst xr12, sp, 13*SZREG + 12*SZXREG
> + xvst xr13, sp, 13*SZREG + 13*SZXREG
> + xvst xr14, sp, 13*SZREG + 14*SZXREG
> + xvst xr15, sp, 13*SZREG + 15*SZXREG
> + xvst xr16, sp, 13*SZREG + 16*SZXREG
> + xvst xr17, sp, 13*SZREG + 17*SZXREG
> + xvst xr18, sp, 13*SZREG + 18*SZXREG
> + xvst xr19, sp, 13*SZREG + 19*SZXREG
> + xvst xr20, sp, 13*SZREG + 20*SZXREG
> + xvst xr21, sp, 13*SZREG + 21*SZXREG
> + xvst xr22, sp, 13*SZREG + 22*SZXREG
> + xvst xr23, sp, 13*SZREG + 23*SZXREG
> + xvst xr23, sp, 13*SZREG + 24*SZXREG
> + xvst xr23, sp, 13*SZREG + 25*SZXREG
> + xvst xr23, sp, 13*SZREG + 26*SZXREG
> + xvst xr23, sp, 13*SZREG + 27*SZXREG
> + xvst xr23, sp, 13*SZREG + 28*SZXREG
> + xvst xr23, sp, 13*SZREG + 29*SZXREG
> + xvst xr23, sp, 13*SZREG + 30*SZXREG
> + xvst xr23, sp, 13*SZREG + 31*SZXREG
The typo here should be obvious.
> +#elif defined USE_LSX
> + vst vr0, sp, 13*SZREG + 0*SZVREG
> + vst vr1, sp, 13*SZREG + 1*SZVREG
> + vst vr2, sp, 13*SZREG + 2*SZVREG
> + vst vr3, sp, 13*SZREG + 3*SZVREG
> + vst vr4, sp, 13*SZREG + 4*SZVREG
> + vst vr5, sp, 13*SZREG + 5*SZVREG
> + vst vr6, sp, 13*SZREG + 6*SZVREG
> + vst vr7, sp, 13*SZREG + 7*SZVREG
> + vst vr8, sp, 13*SZREG + 8*SZVREG
> + vst vr9, sp, 13*SZREG + 9*SZVREG
> + vst vr10, sp, 13*SZREG + 10*SZVREG
> + vst vr11, sp, 13*SZREG + 11*SZVREG
> + vst vr12, sp, 13*SZREG + 12*SZVREG
> + vst vr13, sp, 13*SZREG + 13*SZVREG
> + vst vr14, sp, 13*SZREG + 14*SZVREG
> + vst vr15, sp, 13*SZREG + 15*SZVREG
> + vst vr16, sp, 13*SZREG + 16*SZVREG
> + vst vr17, sp, 13*SZREG + 17*SZVREG
> + vst vr18, sp, 13*SZREG + 18*SZVREG
> + vst vr19, sp, 13*SZREG + 19*SZVREG
> + vst vr20, sp, 13*SZREG + 20*SZVREG
> + vst vr21, sp, 13*SZREG + 21*SZVREG
> + vst vr22, sp, 13*SZREG + 22*SZVREG
> + vst vr23, sp, 13*SZREG + 23*SZVREG
> + vst vr23, sp, 13*SZREG + 24*SZVREG
> + vst vr23, sp, 13*SZREG + 25*SZVREG
> + vst vr23, sp, 13*SZREG + 26*SZVREG
> + vst vr23, sp, 13*SZREG + 27*SZVREG
> + vst vr23, sp, 13*SZREG + 28*SZVREG
> + vst vr23, sp, 13*SZREG + 29*SZVREG
> + vst vr23, sp, 13*SZREG + 30*SZVREG
> + vst vr23, sp, 13*SZREG + 31*SZVREG
Likewise.
> +#elif !defined __loongarch_soft_float
> + FREG_S fa0, sp, 13*SZREG + 0*SZFREG
> + FREG_S fa1, sp, 13*SZREG + 1*SZFREG
> + FREG_S fa2, sp, 13*SZREG + 2*SZFREG
> + FREG_S fa3, sp, 13*SZREG + 3*SZFREG
> + FREG_S fa4, sp, 13*SZREG + 4*SZFREG
> + FREG_S fa5, sp, 13*SZREG + 5*SZFREG
> + FREG_S fa6, sp, 13*SZREG + 6*SZFREG
> + FREG_S fa7, sp, 13*SZREG + 7*SZFREG
> + FREG_S ft0, sp, 13*SZREG + 8*SZFREG
> + FREG_S ft1, sp, 13*SZREG + 9*SZFREG
> + FREG_S ft2, sp, 13*SZREG + 10*SZFREG
> + FREG_S ft3, sp, 13*SZREG + 11*SZFREG
> + FREG_S ft4, sp, 13*SZREG + 12*SZFREG
> + FREG_S ft5, sp, 13*SZREG + 13*SZFREG
> + FREG_S ft6, sp, 13*SZREG + 14*SZFREG
> + FREG_S ft7, sp, 13*SZREG + 15*SZFREG
> + FREG_S ft8, sp, 13*SZREG + 16*SZFREG
> + FREG_S ft9, sp, 13*SZREG + 17*SZFREG
> + FREG_S ft10, sp, 13*SZREG + 18*SZFREG
> + FREG_S ft11, sp, 13*SZREG + 19*SZFREG
> + FREG_S ft12, sp, 13*SZREG + 20*SZFREG
> + FREG_S ft13, sp, 13*SZREG + 21*SZFREG
> + FREG_S ft14, sp, 13*SZREG + 22*SZFREG
> + FREG_S ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX */
And generally this seems too expensive. Would it be better to compile
libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
restoring FPR/VRs?
On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
> And generally this seems too expensive. Would it be better to compile
> libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
> restoring FPR/VRs?
Note that -fcall-saved-f${x} is not enough for us because it only saves
the FPRs, not VRs. Even if we change it to -fcall-saved-xr${x} it won't
work.
-ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
a VR or FPR, but if I read the code correctly libc-tls.c just should not
perform any floating-point operation, and we can use -mno-lsx to prevent
using vector registers.
On Mon, 2023-12-04 at 11:45 +0800, Xi Ruoyao wrote:
> On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
> > And generally this seems too expensive. Would it be better to compile
> > libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
> > restoring FPR/VRs?
>
> Note that -fcall-saved-f${x} is not enough for us because it only saves
> the FPRs, not VRs. Even if we change it to -fcall-saved-xr${x} it won't
> work.
>
> -ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
> a VR or FPR, but if I read the code correctly libc-tls.c just should not
> perform any floating-point operation, and we can use -mno-lsx to prevent
> using vector registers.
I made up this:
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
index 43d2f583cd..64c1ea1294 100644
--- a/sysdeps/loongarch/Makefile
+++ b/sysdeps/loongarch/Makefile
@@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
ifeq (yes,$(have-cmodel-medium))
CFLAGS-.oS += -mcmodel=medium
endif
+
+# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
+# from using any FPR.
+#
+# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
+# something wrong: why should it do floating-point operations anyway?!
+# Please fix it instead of complain to GCC maintainers.
+ifeq (yes,$(have-mno-lsx))
+CFLAGS-libc-tls.c += -mno-lsx
+endif
+CFLAGS-libc-tls.c += $(foreach n,30 31 \
+ $(foreach m,0 1 2 3 4 5 6 7 8 9,$m 1$m 2$m), \
+ -ffixed-f$n)
# diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
# (Regenerated)
diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
index 28a8ae5486..bacf75808c 100644
--- a/sysdeps/loongarch/configure.ac
+++ b/sysdeps/loongarch/configure.ac
@@ -65,3 +65,16 @@ rm -f conftest*])
if test $libc_cv_loongarch_vec_asm = no; then
AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
fi
+
+# Check if compiler supports -mno-lsx.
+AC_CACHE_CHECK(whether $CC supports -mno-lsx, libc_cv_loongarch_cc_mno_lsx, [dnl
+cat > conftest.c <<\EOF
+ int dummy;
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS -mno-lsx -o conftest 1>&AS_MESSAGE_LOG_FD); then
+ libc_cv_loongarch_cc_mno_lsx=yes
+else
+ libc_cv_loongarch_cc_mno_lsx=no
+fi
+rm -f conftest*])
+LIBC_CONFIG_VAR([have-mno-lsx], [$libc_cv_loongarch_cc_mno_lsx])
But then I found the compiled __tls_get_addr is very simple:
000000000000027c <__tls_get_addr>:
27c: 28ffc04d ld.d $t1, $tp, -16
0000000000000280 <L0^A>:
280: 28c0208c ld.d $t0, $a0, 8
284: 28c041a4 ld.d $a0, $t1, 16
0000000000000288 <L0^A>:
288: 0010b084 add.d $a0, $a0, $t0
28c: 4c000020 ret
So I think writing __tls_get_addr in assembly should be easier. There
are just five instructions, so this is definitely better than messing
around compiler flags or writing 90+ instructions to save/load FPRs/VRs
(and slowing down the execution).
* Xi Ruoyao:
> I made up this:
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..64c1ea1294 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> ifeq (yes,$(have-cmodel-medium))
> CFLAGS-.oS += -mcmodel=medium
> endif
> +
> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> +# from using any FPR.
> +#
> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> +# something wrong: why should it do floating-point operations anyway?!
> +# Please fix it instead of complain to GCC maintainers.
> +ifeq (yes,$(have-mno-lsx))
> +CFLAGS-libc-tls.c += -mno-lsx
> +endif
This is not correct: __tls_get_addr may call malloc, and an interposed
malloc is free to use the full register file. You need to perform a
context switch here, similar to what the lazy binding trampoline does.
Thanks,
Florian
在 2023/12/4 下午12:13, Xi Ruoyao 写道:
> On Mon, 2023-12-04 at 11:45 +0800, Xi Ruoyao wrote:
>> On Mon, 2023-12-04 at 11:28 +0800, Xi Ruoyao wrote:
>>> And generally this seems too expensive. Would it be better to compile
>>> libc-tls.c with -ffixed-{f0,f1,...,f31} so we can avoid saving and
>>> restoring FPR/VRs?
>> Note that -fcall-saved-f${x} is not enough for us because it only saves
>> the FPRs, not VRs. Even if we change it to -fcall-saved-xr${x} it won't
>> work.
>>
>> -ffixed-{f0,f1,...,f31} may cause an ICE if the compiler attempts to use
>> a VR or FPR, but if I read the code correctly libc-tls.c just should not
>> perform any floating-point operation, and we can use -mno-lsx to prevent
>> using vector registers.
> I made up this:
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..64c1ea1294 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> ifeq (yes,$(have-cmodel-medium))
> CFLAGS-.oS += -mcmodel=medium
> endif
> +
> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> +# from using any FPR.
> +#
> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> +# something wrong: why should it do floating-point operations anyway?!
> +# Please fix it instead of complain to GCC maintainers.
> +ifeq (yes,$(have-mno-lsx))
> +CFLAGS-libc-tls.c += -mno-lsx
> +endif
> +CFLAGS-libc-tls.c += $(foreach n,30 31 \
> + $(foreach m,0 1 2 3 4 5 6 7 8 9,$m 1$m 2$m), \
> + -ffixed-f$n)
>
> # diff --git a/sysdeps/loongarch/configure b/sysdeps/loongarch/configure
> # (Regenerated)
>
> diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac
> index 28a8ae5486..bacf75808c 100644
> --- a/sysdeps/loongarch/configure.ac
> +++ b/sysdeps/loongarch/configure.ac
> @@ -65,3 +65,16 @@ rm -f conftest*])
> if test $libc_cv_loongarch_vec_asm = no; then
> AC_MSG_ERROR([binutils version is too old, use 2.41 or newer version])
> fi
> +
> +# Check if compiler supports -mno-lsx.
> +AC_CACHE_CHECK(whether $CC supports -mno-lsx, libc_cv_loongarch_cc_mno_lsx, [dnl
> +cat > conftest.c <<\EOF
> + int dummy;
> +EOF
> +if AC_TRY_COMMAND(${CC-cc} -c $CFLAGS -mno-lsx -o conftest 1>&AS_MESSAGE_LOG_FD); then
> + libc_cv_loongarch_cc_mno_lsx=yes
> +else
> + libc_cv_loongarch_cc_mno_lsx=no
> +fi
> +rm -f conftest*])
> +LIBC_CONFIG_VAR([have-mno-lsx], [$libc_cv_loongarch_cc_mno_lsx])
>
> But then I found the compiled __tls_get_addr is very simple:
This _tls_get_addr can only be used in static linking.
>
> 000000000000027c <__tls_get_addr>:
> 27c: 28ffc04d ld.d $t1, $tp, -16
>
> 0000000000000280 <L0^A>:
> 280: 28c0208c ld.d $t0, $a0, 8
> 284: 28c041a4 ld.d $a0, $t1, 16
>
> 0000000000000288 <L0^A>:
> 288: 0010b084 add.d $a0, $a0, $t0
> 28c: 4c000020 ret
>
> So I think writing __tls_get_addr in assembly should be easier. There
> are just five instructions, so this is definitely better than messing
> around compiler flags or writing 90+ instructions to save/load FPRs/VRs
> (and slowing down the execution).
>
On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
> * Xi Ruoyao:
>
> > I made up this:
> >
> > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> > index 43d2f583cd..64c1ea1294 100644
> > --- a/sysdeps/loongarch/Makefile
> > +++ b/sysdeps/loongarch/Makefile
> > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> > ifeq (yes,$(have-cmodel-medium))
> > CFLAGS-.oS += -mcmodel=medium
> > endif
> > +
> > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> > +# from using any FPR.
> > +#
> > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> > +# something wrong: why should it do floating-point operations anyway?!
> > +# Please fix it instead of complain to GCC maintainers.
> > +ifeq (yes,$(have-mno-lsx))
> > +CFLAGS-libc-tls.c += -mno-lsx
> > +endif
>
> This is not correct: __tls_get_addr may call malloc, and an interposed
> malloc is free to use the full register file. You need to perform a
> context switch here, similar to what the lazy binding trampoline does.
Alright, but then do we need to save and restore fcsr and fcc as well?
AFAIK they should be saved during a context switch, and AFAIK there are
no rules saying "interposed malloc cannot alter floating-point execution
environment".
* Xi Ruoyao:
> On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
>> * Xi Ruoyao:
>>
>> > I made up this:
>> >
>> > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> > index 43d2f583cd..64c1ea1294 100644
>> > --- a/sysdeps/loongarch/Makefile
>> > +++ b/sysdeps/loongarch/Makefile
>> > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>> > ifeq (yes,$(have-cmodel-medium))
>> > CFLAGS-.oS += -mcmodel=medium
>> > endif
>> > +
>> > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
>> > +# from using any FPR.
>> > +#
>> > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
>> > +# something wrong: why should it do floating-point operations anyway?!
>> > +# Please fix it instead of complain to GCC maintainers.
>> > +ifeq (yes,$(have-mno-lsx))
>> > +CFLAGS-libc-tls.c += -mno-lsx
>> > +endif
>>
>> This is not correct: __tls_get_addr may call malloc, and an interposed
>> malloc is free to use the full register file. You need to perform a
>> context switch here, similar to what the lazy binding trampoline does.
>
> Alright, but then do we need to save and restore fcsr and fcc as well?
> AFAIK they should be saved during a context switch, and AFAIK there are
> no rules saying "interposed malloc cannot alter floating-point execution
> environment".
Sorry, I'm not familiar with those register names and floating point
matters.
That being said, I don't think malloc may change the rounding mode and
other floating point environment aspects. Not sure about raising
exceptions, though.
Thanks,
Florian
On Mon, 2023-12-04 at 09:49 +0100, Florian Weimer wrote:
> * Xi Ruoyao:
>
> > On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
> > > * Xi Ruoyao:
> > >
> > > > I made up this:
> > > >
> > > > diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> > > > index 43d2f583cd..64c1ea1294 100644
> > > > --- a/sysdeps/loongarch/Makefile
> > > > +++ b/sysdeps/loongarch/Makefile
> > > > @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
> > > > ifeq (yes,$(have-cmodel-medium))
> > > > CFLAGS-.oS += -mcmodel=medium
> > > > endif
> > > > +
> > > > +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
> > > > +# from using any FPR.
> > > > +#
> > > > +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
> > > > +# something wrong: why should it do floating-point operations anyway?!
> > > > +# Please fix it instead of complain to GCC maintainers.
> > > > +ifeq (yes,$(have-mno-lsx))
> > > > +CFLAGS-libc-tls.c += -mno-lsx
> > > > +endif
> > >
> > > This is not correct: __tls_get_addr may call malloc, and an interposed
> > > malloc is free to use the full register file. You need to perform a
> > > context switch here, similar to what the lazy binding trampoline does.
> >
> > Alright, but then do we need to save and restore fcsr and fcc as well?
> > AFAIK they should be saved during a context switch, and AFAIK there are
> > no rules saying "interposed malloc cannot alter floating-point execution
> > environment".
>
> Sorry, I'm not familiar with those register names and floating point
> matters.
>
> That being said, I don't think malloc may change the rounding mode and
> other floating point environment aspects. Not sure about raising
> exceptions, though.
For example, for comparing some floating-point values and storing the
result into a TLS variable, the compiler may generate something like
fcmp.clt.d $fcc0, $f0, $f1
pcalau12i $a0,%desc_pc_hi20(tls_sym)
ld.d $a1,$a0,%desc_ld_pc_lo12(tls_sym)
addi.d $a0,$a0,%desc_add_pc_lo12(tls_sym)
jirl $ra,$a1,%desc_call(tls_sym)
movcf2gr $t0,$fcc0
st.d $t0,$a0,0
Now if the interposed malloc clobbers fcc0 (well, it's simple: just
compile the TU containing malloc with -fzero-call-used-regs=all), this
sequence will blow up.
So we need to either save and restore fcc registers here, or in the
compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
got_load_tls_desc. Currently the draft GCC patch does not have these
clobbers.
(Note that this is not a problem with RISC-V because they use FPRs for
outputs of floating-point comparisons and they don't have dedicated FCC
registers).
* Xi Ruoyao:
> So we need to either save and restore fcc registers here, or in the
> compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
> got_load_tls_desc. Currently the draft GCC patch does not have these
> clobbers.
The problem with not having clobbers is that every ISA extension
requires patching the trampoline code. The downside of clobbers is that
they apply unconditionally. The trampoline, on the other hand, can save
the registers not preserved by the standard calling convention on the
slow path only, around the call into C code.
Thanks,
Florian
在 2023/12/4 下午5:01, Xi Ruoyao 写道:
> On Mon, 2023-12-04 at 09:49 +0100, Florian Weimer wrote:
>> * Xi Ruoyao:
>>
>>> On Mon, 2023-12-04 at 09:13 +0100, Florian Weimer wrote:
>>>> * Xi Ruoyao:
>>>>
>>>>> I made up this:
>>>>>
>>>>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>>>>> index 43d2f583cd..64c1ea1294 100644
>>>>> --- a/sysdeps/loongarch/Makefile
>>>>> +++ b/sysdeps/loongarch/Makefile
>>>>> @@ -15,3 +15,16 @@ ASFLAGS-.os += $(pic-ccflag)
>>>>> ifeq (yes,$(have-cmodel-medium))
>>>>> CFLAGS-.oS += -mcmodel=medium
>>>>> endif
>>>>> +
>>>>> +# Make _dl_tlsdesc_dynamic slow path less expensive by denying __tls_get_addr
>>>>> +# from using any FPR.
>>>>> +#
>>>>> +# Attention: if you see an ICE here, it's likely __tls_get_addr is doing
>>>>> +# something wrong: why should it do floating-point operations anyway?!
>>>>> +# Please fix it instead of complain to GCC maintainers.
>>>>> +ifeq (yes,$(have-mno-lsx))
>>>>> +CFLAGS-libc-tls.c += -mno-lsx
>>>>> +endif
>>>> This is not correct: __tls_get_addr may call malloc, and an interposed
>>>> malloc is free to use the full register file. You need to perform a
>>>> context switch here, similar to what the lazy binding trampoline does.
>>> Alright, but then do we need to save and restore fcsr and fcc as well?
>>> AFAIK they should be saved during a context switch, and AFAIK there are
>>> no rules saying "interposed malloc cannot alter floating-point execution
>>> environment".
>> Sorry, I'm not familiar with those register names and floating point
>> matters.
>>
>> That being said, I don't think malloc may change the rounding mode and
>> other floating point environment aspects. Not sure about raising
>> exceptions, though.
> For example, for comparing some floating-point values and storing the
> result into a TLS variable, the compiler may generate something like
>
> fcmp.clt.d $fcc0, $f0, $f1
> pcalau12i $a0,%desc_pc_hi20(tls_sym)
> ld.d $a1,$a0,%desc_ld_pc_lo12(tls_sym)
> addi.d $a0,$a0,%desc_add_pc_lo12(tls_sym)
> jirl $ra,$a1,%desc_call(tls_sym)
> movcf2gr $t0,$fcc0
> st.d $t0,$a0,0
For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
ADD, STR and LDR can be inserted between FCMP and CSET.
But it seems that there is no BLR (similar to JIRL) inserted between
FCMP and CSET.
AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic
function.
>
> Now if the interposed malloc clobbers fcc0 (well, it's simple: just
> compile the TU containing malloc with -fzero-call-used-regs=all), this
> sequence will blow up.
>
> So we need to either save and restore fcc registers here, or in the
> compiler add (clobber fcc0) (clobber fcc1) ... (clobber fcc7) for
> got_load_tls_desc. Currently the draft GCC patch does not have these
> clobbers.
>
> (Note that this is not a problem with RISC-V because they use FPRs for
> outputs of floating-point comparisons and they don't have dedicated FCC
> registers).
>
On Wed, 2023-12-06 at 15:46 +0800, mengqinggang wrote:
> For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
> ADD, STR and LDR can be inserted between FCMP and CSET.
> But it seems that there is no BLR (similar to JIRL) inserted between
> FCMP and CSET.
> AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic
> function.
AArch64 tlsdesc call pattern clobber CC registers:
(define_insn "tlsdesc_small_advsimd_<mode>"
[(set (reg:PTR R0_REGNUM)
(unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
UNSPEC_TLSDESC))
(clobber (reg:DI LR_REGNUM))
(clobber (reg:CC CC_REGNUM)) # <=============== !!!!!!!!!!!!!
(clobber (match_scratch:DI 1 "=r"))
(use (reg:DI FP_REGNUM))]
"TARGET_TLS_DESC && !TARGET_SVE"
"adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
[(set_attr "type" "call")
(set_attr "length" "16")])
So GCC won't insert it between FCMP and CSET. If you want the same
effect you need (clobber (reg:CC FCC_REG_FIRST)) (clobber (reg:CC
FCC_REG_FIRST + 1)) ... ... (clobber (reg:CC FCC_REG_LAST)) as I've
already suggested.
On Wed, 2023-12-06 at 16:05 +0800, Xi Ruoyao wrote:
> On Wed, 2023-12-06 at 15:46 +0800, mengqinggang wrote:
> > For AArch64, FCMP and CSET similar to FCMP.CLT.D and MOVCF2GR.
> > ADD, STR and LDR can be inserted between FCMP and CSET.
> > But it seems that there is no BLR (similar to JIRL) inserted between
> > FCMP and CSET.
> > AARCH64 has no save and restore FPSR and FPCR in _dl_tlsdesc_dynamic
> > function.
>
> AArch64 tlsdesc call pattern clobber CC registers:
>
> (define_insn "tlsdesc_small_advsimd_<mode>"
> [(set (reg:PTR R0_REGNUM)
> (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")]
> UNSPEC_TLSDESC))
> (clobber (reg:DI LR_REGNUM))
> (clobber (reg:CC CC_REGNUM)) # <=============== !!!!!!!!!!!!!
> (clobber (match_scratch:DI 1 "=r"))
> (use (reg:DI FP_REGNUM))]
> "TARGET_TLS_DESC && !TARGET_SVE"
> "adrp\\tx0, %A0\;ldr\\t%<w>1, [x0, #%L0]\;add\\t<w>0, <w>0, %L0\;.tlsdesccall\\t%0\;blr\\t%1"
> [(set_attr "type" "call")
> (set_attr "length" "16")])
>
> So GCC won't insert it between FCMP and CSET. If you want the same
> effect you need (clobber (reg:CC FCC_REG_FIRST)) (clobber (reg:CC
> FCC_REG_FIRST + 1)) ... ... (clobber (reg:CC FCC_REG_LAST)) as I've
> already suggested.
Correction: it should be reg:FCC instead of reg:CC, as we are using
FCCmode instead of CCmode of AArch64.
> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
>
> This is mostly based on AArch64 and RISC-V implementation.
>
> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>
> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
> all vector registers.
> ---
> elf/elf.h | 2 +
> sysdeps/loongarch/Makefile | 6 +
> sysdeps/loongarch/dl-link.sym | 1 +
> sysdeps/loongarch/dl-machine.h | 52 ++-
> sysdeps/loongarch/dl-tls.h | 9 +-
> sysdeps/loongarch/dl-tlsdesc.S | 364 ++++++++++++++++++
> sysdeps/loongarch/dl-tlsdesc.h | 49 +++
> sysdeps/loongarch/linkmap.h | 1 +
> sysdeps/loongarch/tlsdesc.c | 39 ++
> sysdeps/loongarch/tlsdesc.sym | 19 +
> .../unix/sysv/linux/loongarch/localplt.data | 2 +
> 11 files changed, 541 insertions(+), 3 deletions(-)
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
> create mode 100644 sysdeps/loongarch/tlsdesc.c
> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>
> diff --git a/elf/elf.h b/elf/elf.h
> index 5c1c1972d1..72e90aec30 100644
> --- a/elf/elf.h
> +++ b/elf/elf.h
> @@ -4232,6 +4232,8 @@ enum
> #define R_LARCH_TLS_TPREL32 10
> #define R_LARCH_TLS_TPREL64 11
> #define R_LARCH_IRELATIVE 12
> +#define R_LARCH_TLS_DESC32 13
> +#define R_LARCH_TLS_DESC64 14
Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].
[1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626
>
> /* Reserved for future relocs that the dynamic linker must understand. */
>
> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
> index 43d2f583cd..181389e787 100644
> --- a/sysdeps/loongarch/Makefile
> +++ b/sysdeps/loongarch/Makefile
> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
> endif
>
> ifeq ($(subdir),elf)
> +sysdep-dl-routines += tlsdesc dl-tlsdesc
> gen-as-const-headers += dl-link.sym
> endif
>
> +ifeq ($(subdir),csu)
> +gen-as-const-headers += tlsdesc.sym
> +endif
> +
> +
> # LoongArch's assembler also needs to know about PIC as it changes the
> # definition of some assembler macros.
> ASFLAGS-.os += $(pic-ccflag)
> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
> index b534968e30..fd81ef37d5 100644
> --- a/sysdeps/loongarch/dl-link.sym
> +++ b/sysdeps/loongarch/dl-link.sym
> @@ -1,6 +1,7 @@
> #include <stddef.h>
> #include <sysdep.h>
> #include <link.h>
> +#include <dl-tlsdesc.h>
>
> DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
> DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
> index 0d17fd21e3..0dd252a5e5 100644
> --- a/sysdeps/loongarch/dl-machine.h
> +++ b/sysdeps/loongarch/dl-machine.h
> @@ -25,7 +25,7 @@
> #include <entry.h>
> #include <elf/elf.h>
> #include <sys/asm.h>
> -#include <dl-tls.h>
> +#include <dl-tlsdesc.h>
> #include <dl-static-tls.h>
> #include <dl-machine-rel.h>
>
> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
> *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
> break;
>
> + case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
> + {
> + struct tlsdesc volatile *td =
> + (struct tlsdesc volatile *)addr_field;
> + if (! sym)
> + {
> + td->arg = (void*)reloc->r_addend;
> + td->entry = _dl_tlsdesc_undefweak;
> + }
> + else
> + {
> +# ifndef SHARED
> + CHECK_STATIC_TLS (map, sym_map);
> +# else
> + if (!TRY_STATIC_TLS (map, sym_map))
> + {
> + td->arg = _dl_make_tlsdesc_dynamic
> + (sym_map, sym->st_value + reloc->r_addend);
> + td->entry = _dl_tlsdesc_dynamic;
> + }
> + else
> +# endif
> + {
> + td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
> + + reloc->r_addend);
> + td->entry = _dl_tlsdesc_return;
> + }
> + }
> + break;
> + }
> +
> case R_LARCH_COPY:
> {
> if (sym == NULL)
> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
> else
> *reloc_addr = map->l_mach.plt;
> }
> + else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
> + {
> + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
> + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
> + const ElfW (Sym) *sym = &symtab[symndx];
> + const struct r_found_version *version = NULL;
> +
> + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
> + {
> + const ElfW (Half) *vernum =
> + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
> + version = &map->l_versions[vernum[symndx] & 0x7fff];
> + }
> +
> + /* Always initialize TLS descriptors completely, because lazy
> + initialization requires synchronization at every TLS access. */
> + elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
> + skip_ifunc);
> + }
> else
> _dl_reloc_bad_type (map, r_type, 1);
> }
> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
> index a551594b64..1ca376484a 100644
> --- a/sysdeps/loongarch/dl-tls.h
> +++ b/sysdeps/loongarch/dl-tls.h
> @@ -16,6 +16,9 @@
> License along with the GNU C Library. If not, see
> <https://www.gnu.org/licenses/>. */
>
> +#ifndef _DL_TLS_H
> +#define _DL_TLS_H
> +
> /* Type used for the representation of TLS information in the GOT. */
> typedef struct
> {
> @@ -23,6 +26,8 @@ typedef struct
> unsigned long int ti_offset;
> } tls_index;
>
> +extern void *__tls_get_addr (tls_index *ti);
> +
> /* The thread pointer points to the first static TLS block. */
> #define TLS_TP_OFFSET 0
>
> @@ -37,10 +42,10 @@ typedef struct
> /* Compute the value for a DTPREL reloc. */
> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>
> -extern void *__tls_get_addr (tls_index *ti);
> -
> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>
> /* Value used for dtv entries for which the allocation is delayed. */
> #define TLS_DTV_UNALLOCATED ((void *) -1l)
> +
> +#endif
> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
> new file mode 100644
> index 0000000000..d2c18ff527
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.S
> @@ -0,0 +1,364 @@
> +/* Thread-local storage handling in the ELF dynamic linker.
> + LoongArch version.
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include <tls.h>
> +#include "tlsdesc.h"
> +
> + .text
> +
> + /* Compute the thread pointer offset for symbols in the static
> + TLS block. The offset is the same for all threads.
> + Prototype:
> + _dl_tlsdesc_return (tlsdesc *); */
> + .hidden _dl_tlsdesc_return
> + .global _dl_tlsdesc_return
> + .type _dl_tlsdesc_return,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_return:
> + REG_L a0, a0, 8
> + RET
> + cfi_endproc
> + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
> +
> + /* Handler for undefined weak TLS symbols.
> + Prototype:
> + _dl_tlsdesc_undefweak (tlsdesc *);
> +
> + The second word of the descriptor contains the addend.
> + Return the addend minus the thread pointer. This ensures
> + that when the caller adds on the thread pointer it gets back
> + the addend. */
> + .hidden _dl_tlsdesc_undefweak
> + .global _dl_tlsdesc_undefweak
> + .type _dl_tlsdesc_undefweak,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_undefweak:
> + REG_L a0, a0, 8
> + sub.d a0, a0, tp
> + RET
> + cfi_endproc
> + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> +
> +#ifdef USE_LASX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
> +#elif defined USE_LSX
> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
> +#elif !defined __loongarch_soft_float
> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
> +#else
> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
> +#endif
> +
> +#ifdef SHARED
> + /* Handler for dynamic TLS symbols.
> + Prototype:
> + _dl_tlsdesc_dynamic (tlsdesc *) ;
> +
> + The second word of the descriptor points to a
> + tlsdesc_dynamic_arg structure.
> +
> + Returns the offset between the thread pointer and the
> + object referenced by the argument.
> +
> + ptrdiff_t
> + __attribute__ ((__regparm__ (1)))
> + _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> + {
> + struct tlsdesc_dynamic_arg *td = tdp->arg;
> + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
> + if (__builtin_expect (td->gen_count <= dtv[0].counter
> + && (dtv[td->tlsinfo.ti_module].pointer.val
> + != TLS_DTV_UNALLOCATED),
> + 1))
> + return dtv[td->tlsinfo.ti_module].pointer.val
> + + td->tlsinfo.ti_offset
> + - __thread_pointer;
> +
> + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> + }
> + */
> + .hidden _dl_tlsdesc_dynamic
> + .global _dl_tlsdesc_dynamic
> + .type _dl_tlsdesc_dynamic,%function
> + cfi_startproc
> + .align 2
> +_dl_tlsdesc_dynamic:
> + /* Save just enough registers to support fast path, if we fall
> + into slow path we will save additional registers. */
> + addi.d $r3,$r3,-24
The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.
> + REG_S t0, sp, 0
> + REG_S t1, sp, 8
> + REG_S t2, sp, 16
> +
> + REG_L t0, tp, -SIZE_OF_DTV # dtv(t0) = tp + TCBHEAD_DTV dtv start
> + REG_L a0, a0, TLSDESC_ARG # td(a0) = tdp->arg
> + REG_L t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
> + REG_L t2, t0, DTV_COUNTER # t2 = dtv[0].counter
> + bltu t2, t1, .Lslow
> +
> + REG_L t1, a0, TLSDESC_MODID # t1 = td->tlsinfo.ti_module
> + slli.d t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.
Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.
> + add.d t1, t1, t0 # t1 = dtv + ti_module * sizeof(dtv_t)
> + REG_L t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
> + li.d t2, TLS_DTV_UNALLOCATED
> + beq t1, t2, .Lslow
> + REG_L t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
> + # dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> + add.d a0, t1, t2
> +.Lret:
> + sub.d a0, a0, tp
> + REG_L t0, sp, 0
> + REG_L t1, sp, 8
> + REG_L t2, sp, 16
> + addi.d sp, sp, 24
> + RET
> +
> +.Lslow:
> + /* This is the slow path. We need to call __tls_get_addr() which
> + means we need to save and restore all the register that the
> + callee will trash. */
> +
> + /* Save the remaining registers that we must treat as caller save. */
> + addi.d sp, sp, -FRAME_SIZE
> + REG_S ra, sp, 0 * SZREG
> + REG_S a1, sp, 1 * SZREG
> + REG_S a2, sp, 2 * SZREG
> + REG_S a3, sp, 3 * SZREG
> + REG_S a4, sp, 4 * SZREG
> + REG_S a5, sp, 5 * SZREG
> + REG_S a6, sp, 6 * SZREG
> + REG_S a7, sp, 7 * SZREG
> + REG_S t4, sp, 8 * SZREG
> + REG_S t5, sp, 9 * SZREG
> + REG_S t6, sp, 10 * SZREG
> + REG_S t7, sp, 11 * SZREG
> + REG_S t8, sp, 12 * SZREG
> +
> +#ifdef USE_LASX
> + xvst xr0, sp, 13*SZREG + 0*SZXREG
> + xvst xr1, sp, 13*SZREG + 1*SZXREG
> + xvst xr2, sp, 13*SZREG + 2*SZXREG
> + xvst xr3, sp, 13*SZREG + 3*SZXREG
> + xvst xr4, sp, 13*SZREG + 4*SZXREG
> + xvst xr5, sp, 13*SZREG + 5*SZXREG
> + xvst xr6, sp, 13*SZREG + 6*SZXREG
> + xvst xr7, sp, 13*SZREG + 7*SZXREG
> + xvst xr8, sp, 13*SZREG + 8*SZXREG
> + xvst xr9, sp, 13*SZREG + 9*SZXREG
> + xvst xr10, sp, 13*SZREG + 10*SZXREG
> + xvst xr11, sp, 13*SZREG + 11*SZXREG
> + xvst xr12, sp, 13*SZREG + 12*SZXREG
> + xvst xr13, sp, 13*SZREG + 13*SZXREG
> + xvst xr14, sp, 13*SZREG + 14*SZXREG
> + xvst xr15, sp, 13*SZREG + 15*SZXREG
> + xvst xr16, sp, 13*SZREG + 16*SZXREG
> + xvst xr17, sp, 13*SZREG + 17*SZXREG
> + xvst xr18, sp, 13*SZREG + 18*SZXREG
> + xvst xr19, sp, 13*SZREG + 19*SZXREG
> + xvst xr20, sp, 13*SZREG + 20*SZXREG
> + xvst xr21, sp, 13*SZREG + 21*SZXREG
> + xvst xr22, sp, 13*SZREG + 22*SZXREG
> + xvst xr23, sp, 13*SZREG + 23*SZXREG
> + xvst xr23, sp, 13*SZREG + 24*SZXREG
> + xvst xr23, sp, 13*SZREG + 25*SZXREG
> + xvst xr23, sp, 13*SZREG + 26*SZXREG
> + xvst xr23, sp, 13*SZREG + 27*SZXREG
> + xvst xr23, sp, 13*SZREG + 28*SZXREG
> + xvst xr23, sp, 13*SZREG + 29*SZXREG
> + xvst xr23, sp, 13*SZREG + 30*SZXREG
> + xvst xr23, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> + vst vr0, sp, 13*SZREG + 0*SZVREG
> + vst vr1, sp, 13*SZREG + 1*SZVREG
> + vst vr2, sp, 13*SZREG + 2*SZVREG
> + vst vr3, sp, 13*SZREG + 3*SZVREG
> + vst vr4, sp, 13*SZREG + 4*SZVREG
> + vst vr5, sp, 13*SZREG + 5*SZVREG
> + vst vr6, sp, 13*SZREG + 6*SZVREG
> + vst vr7, sp, 13*SZREG + 7*SZVREG
> + vst vr8, sp, 13*SZREG + 8*SZVREG
> + vst vr9, sp, 13*SZREG + 9*SZVREG
> + vst vr10, sp, 13*SZREG + 10*SZVREG
> + vst vr11, sp, 13*SZREG + 11*SZVREG
> + vst vr12, sp, 13*SZREG + 12*SZVREG
> + vst vr13, sp, 13*SZREG + 13*SZVREG
> + vst vr14, sp, 13*SZREG + 14*SZVREG
> + vst vr15, sp, 13*SZREG + 15*SZVREG
> + vst vr16, sp, 13*SZREG + 16*SZVREG
> + vst vr17, sp, 13*SZREG + 17*SZVREG
> + vst vr18, sp, 13*SZREG + 18*SZVREG
> + vst vr19, sp, 13*SZREG + 19*SZVREG
> + vst vr20, sp, 13*SZREG + 20*SZVREG
> + vst vr21, sp, 13*SZREG + 21*SZVREG
> + vst vr22, sp, 13*SZREG + 22*SZVREG
> + vst vr23, sp, 13*SZREG + 23*SZVREG
> + vst vr23, sp, 13*SZREG + 24*SZVREG
> + vst vr23, sp, 13*SZREG + 25*SZVREG
> + vst vr23, sp, 13*SZREG + 26*SZVREG
> + vst vr23, sp, 13*SZREG + 27*SZVREG
> + vst vr23, sp, 13*SZREG + 28*SZVREG
> + vst vr23, sp, 13*SZREG + 29*SZVREG
> + vst vr23, sp, 13*SZREG + 30*SZVREG
> + vst vr23, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> + FREG_S fa0, sp, 13*SZREG + 0*SZFREG
> + FREG_S fa1, sp, 13*SZREG + 1*SZFREG
> + FREG_S fa2, sp, 13*SZREG + 2*SZFREG
> + FREG_S fa3, sp, 13*SZREG + 3*SZFREG
> + FREG_S fa4, sp, 13*SZREG + 4*SZFREG
> + FREG_S fa5, sp, 13*SZREG + 5*SZFREG
> + FREG_S fa6, sp, 13*SZREG + 6*SZFREG
> + FREG_S fa7, sp, 13*SZREG + 7*SZFREG
> + FREG_S ft0, sp, 13*SZREG + 8*SZFREG
> + FREG_S ft1, sp, 13*SZREG + 9*SZFREG
> + FREG_S ft2, sp, 13*SZREG + 10*SZFREG
> + FREG_S ft3, sp, 13*SZREG + 11*SZFREG
> + FREG_S ft4, sp, 13*SZREG + 12*SZFREG
> + FREG_S ft5, sp, 13*SZREG + 13*SZFREG
> + FREG_S ft6, sp, 13*SZREG + 14*SZFREG
> + FREG_S ft7, sp, 13*SZREG + 15*SZFREG
> + FREG_S ft8, sp, 13*SZREG + 16*SZFREG
> + FREG_S ft9, sp, 13*SZREG + 17*SZFREG
> + FREG_S ft10, sp, 13*SZREG + 18*SZFREG
> + FREG_S ft11, sp, 13*SZREG + 19*SZFREG
> + FREG_S ft12, sp, 13*SZREG + 20*SZFREG
> + FREG_S ft13, sp, 13*SZREG + 21*SZFREG
> + FREG_S ft14, sp, 13*SZREG + 22*SZFREG
> + FREG_S ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX */
> +
> + bl __tls_get_addr
> + addi.d a0, a0, -TLS_DTV_OFFSET
> +
> + REG_L ra, sp, 0
> + REG_L a1, sp, 1 * 8
> + REG_L a2, sp, 2 * 8
> + REG_L a3, sp, 3 * 8
> + REG_L a4, sp, 4 * 8
> + REG_L a5, sp, 5 * 8
> + REG_L a6, sp, 6 * 8
> + REG_L a7, sp, 7 * 8
> + REG_L t4, sp, 8 * 8
> + REG_L t5, sp, 9 * 8
> + REG_L t6, sp, 10 * 8
> + REG_L t7, sp, 11 * 8
> + REG_L t8, sp, 12 * 8
> +
> +#ifdef USE_LASX
> + xvld xr0, sp, 13*SZREG + 0*SZXREG
> + xvld xr1, sp, 13*SZREG + 1*SZXREG
> + xvld xr2, sp, 13*SZREG + 2*SZXREG
> + xvld xr3, sp, 13*SZREG + 3*SZXREG
> + xvld xr4, sp, 13*SZREG + 4*SZXREG
> + xvld xr5, sp, 13*SZREG + 5*SZXREG
> + xvld xr6, sp, 13*SZREG + 6*SZXREG
> + xvld xr7, sp, 13*SZREG + 7*SZXREG
> + xvld xr8, sp, 13*SZREG + 8*SZXREG
> + xvld xr9, sp, 13*SZREG + 9*SZXREG
> + xvld xr10, sp, 13*SZREG + 10*SZXREG
> + xvld xr11, sp, 13*SZREG + 11*SZXREG
> + xvld xr12, sp, 13*SZREG + 12*SZXREG
> + xvld xr13, sp, 13*SZREG + 13*SZXREG
> + xvld xr14, sp, 13*SZREG + 14*SZXREG
> + xvld xr15, sp, 13*SZREG + 15*SZXREG
> + xvld xr16, sp, 13*SZREG + 16*SZXREG
> + xvld xr17, sp, 13*SZREG + 17*SZXREG
> + xvld xr18, sp, 13*SZREG + 18*SZXREG
> + xvld xr19, sp, 13*SZREG + 19*SZXREG
> + xvld xr20, sp, 13*SZREG + 20*SZXREG
> + xvld xr21, sp, 13*SZREG + 21*SZXREG
> + xvld xr22, sp, 13*SZREG + 22*SZXREG
> + xvld xr23, sp, 13*SZREG + 23*SZXREG
> + xvld xr24, sp, 13*SZREG + 24*SZXREG
> + xvld xr25, sp, 13*SZREG + 25*SZXREG
> + xvld xr26, sp, 13*SZREG + 26*SZXREG
> + xvld xr27, sp, 13*SZREG + 27*SZXREG
> + xvld xr28, sp, 13*SZREG + 28*SZXREG
> + xvld xr29, sp, 13*SZREG + 29*SZXREG
> + xvld xr30, sp, 13*SZREG + 30*SZXREG
> + xvld xr31, sp, 13*SZREG + 31*SZXREG
> +#elif defined USE_LSX
> + vld vr0, sp, 13*SZREG + 0*SZVREG
> + vld vr1, sp, 13*SZREG + 1*SZVREG
> + vld vr2, sp, 13*SZREG + 2*SZVREG
> + vld vr3, sp, 13*SZREG + 3*SZVREG
> + vld vr4, sp, 13*SZREG + 4*SZVREG
> + vld vr5, sp, 13*SZREG + 5*SZVREG
> + vld vr6, sp, 13*SZREG + 6*SZVREG
> + vld vr7, sp, 13*SZREG + 7*SZVREG
> + vld vr8, sp, 13*SZREG + 8*SZVREG
> + vld vr9, sp, 13*SZREG + 9*SZVREG
> + vld vr10, sp, 13*SZREG + 10*SZVREG
> + vld vr11, sp, 13*SZREG + 11*SZVREG
> + vld vr12, sp, 13*SZREG + 12*SZVREG
> + vld vr13, sp, 13*SZREG + 13*SZVREG
> + vld vr14, sp, 13*SZREG + 14*SZVREG
> + vld vr15, sp, 13*SZREG + 15*SZVREG
> + vld vr16, sp, 13*SZREG + 16*SZVREG
> + vld vr17, sp, 13*SZREG + 17*SZVREG
> + vld vr18, sp, 13*SZREG + 18*SZVREG
> + vld vr19, sp, 13*SZREG + 19*SZVREG
> + vld vr20, sp, 13*SZREG + 20*SZVREG
> + vld vr21, sp, 13*SZREG + 21*SZVREG
> + vld vr22, sp, 13*SZREG + 22*SZVREG
> + vld vr23, sp, 13*SZREG + 23*SZVREG
> + vld vr24, sp, 13*SZREG + 24*SZVREG
> + vld vr25, sp, 13*SZREG + 25*SZVREG
> + vld vr26, sp, 13*SZREG + 26*SZVREG
> + vld vr27, sp, 13*SZREG + 27*SZVREG
> + vld vr28, sp, 13*SZREG + 28*SZVREG
> + vld vr29, sp, 13*SZREG + 29*SZVREG
> + vld vr30, sp, 13*SZREG + 30*SZVREG
> + vld vr31, sp, 13*SZREG + 31*SZVREG
> +#elif !defined __loongarch_soft_float
> + FREG_L fa0, sp, 13*SZREG + 0*SZFREG
> + FREG_L fa1, sp, 13*SZREG + 1*SZFREG
> + FREG_L fa2, sp, 13*SZREG + 2*SZFREG
> + FREG_L fa3, sp, 13*SZREG + 3*SZFREG
> + FREG_L fa4, sp, 13*SZREG + 4*SZFREG
> + FREG_L fa5, sp, 13*SZREG + 5*SZFREG
> + FREG_L fa6, sp, 13*SZREG + 6*SZFREG
> + FREG_L fa7, sp, 13*SZREG + 7*SZFREG
> + FREG_L ft0, sp, 13*SZREG + 8*SZFREG
> + FREG_L ft1, sp, 13*SZREG + 9*SZFREG
> + FREG_L ft2, sp, 13*SZREG + 10*SZFREG
> + FREG_L ft3, sp, 13*SZREG + 11*SZFREG
> + FREG_L ft4, sp, 13*SZREG + 12*SZFREG
> + FREG_L ft5, sp, 13*SZREG + 13*SZFREG
> + FREG_L ft6, sp, 13*SZREG + 14*SZFREG
> + FREG_L ft7, sp, 13*SZREG + 15*SZFREG
> + FREG_L ft8, sp, 13*SZREG + 16*SZFREG
> + FREG_L ft9, sp, 13*SZREG + 17*SZFREG
> + FREG_L ft10, sp, 13*SZREG + 18*SZFREG
> + FREG_L ft11, sp, 13*SZREG + 19*SZFREG
> + FREG_L ft12, sp, 13*SZREG + 20*SZFREG
> + FREG_L ft13, sp, 13*SZREG + 21*SZFREG
> + FREG_L ft14, sp, 13*SZREG + 22*SZFREG
> + FREG_L ft15, sp, 13*SZREG + 23*SZFREG
> +#endif /* #ifdef USE_LASX */
> +
> + addi.d sp, sp, FRAME_SIZE
> + b .Lret
> + cfi_endproc
> + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +#endif /* #ifdef SHARED */
> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
> new file mode 100644
> index 0000000000..e1a9365855
> --- /dev/null
> +++ b/sysdeps/loongarch/dl-tlsdesc.h
> @@ -0,0 +1,49 @@
> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
> + LoongArch version.
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef _DL_TLSDESC_H
> +#define _DL_TLSDESC_H
> +
> +#include <dl-tls.h>
> +
> +/* Type used to represent a TLS descriptor in the GOT. */
> +struct tlsdesc
> +{
> + ptrdiff_t (*entry) (struct tlsdesc *);
> + void *arg;
> +};
> +
> +/* Type used as the argument in a TLS descriptor for a symbol that
> + needs dynamic TLS offsets. */
> +struct tlsdesc_dynamic_arg
> +{
> + tls_index tlsinfo;
> + size_t gen_count;
> +};
> +
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
> +
> +# ifdef SHARED
> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
> +#endif
> +
> +#endif
> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
> index 2f5bf53421..40416b1ad4 100644
> --- a/sysdeps/loongarch/linkmap.h
> +++ b/sysdeps/loongarch/linkmap.h
> @@ -19,4 +19,5 @@
> struct link_map_machine
> {
> ElfW (Addr) plt; /* Address of .plt. */
> + void *tlsdesc_table; /* Address of TLS descriptor hash table. */
> };
> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
> new file mode 100644
> index 0000000000..a357e7619f
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.c
> @@ -0,0 +1,39 @@
> +/* Manage TLS descriptors. AArch64 version.
> +
> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <ldsodefs.h>
> +#include <tls.h>
> +#include <dl-tlsdesc.h>
> +#include <dl-unmap-segments.h>
> +#include <tlsdeschtab.h>
> +
> +/* Unmap the dynamic object, but also release its TLS descriptor table
> + if there is one. */
> +
> +void
> +_dl_unmap (struct link_map *map)
> +{
> + _dl_unmap_segments (map);
> +
> +#ifdef SHARED
> + if (map->l_mach.tlsdesc_table)
> + htab_delete (map->l_mach.tlsdesc_table);
> +#endif
> +}
> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
> new file mode 100644
> index 0000000000..bcab218631
> --- /dev/null
> +++ b/sysdeps/loongarch/tlsdesc.sym
> @@ -0,0 +1,19 @@
> +#include <stddef.h>
> +#include <sysdep.h>
> +#include <tls.h>
> +#include <link.h>
> +#include <dl-tlsdesc.h>
> +
> +--
> +
> +-- Abuse tls.h macros to derive offsets relative to the thread register.
> +
> +TLSDESC_ARG offsetof(struct tlsdesc, arg)
> +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
> +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
> +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
> +TCBHEAD_DTV offsetof(tcbhead_t, dtv)
> +DTV_COUNTER offsetof(dtv_t, counter)
> +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
> +TLS_DTV_OFFSET TLS_DTV_OFFSET
> +SIZE_OF_DTV sizeof(tcbhead_t)
> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> index 547b1c1b7f..ec32e6d13f 100644
> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
> @@ -5,3 +5,5 @@ libc.so: calloc
> libc.so: free
> libc.so: malloc
> libc.so: realloc
> +# The dynamic loader needs __tls_get_addr for TLS.
> +ld.so: __tls_get_addr
> --
> 2.36.0
>
>
在 2024/1/8 上午7:03, Tatsuyuki Ishi 写道:
>> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
>>
>> This is mostly based on AArch64 and RISC-V implementation.
>>
>> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>>
>> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
>> all vector registers.
>> ---
>> elf/elf.h | 2 +
>> sysdeps/loongarch/Makefile | 6 +
>> sysdeps/loongarch/dl-link.sym | 1 +
>> sysdeps/loongarch/dl-machine.h | 52 ++-
>> sysdeps/loongarch/dl-tls.h | 9 +-
>> sysdeps/loongarch/dl-tlsdesc.S | 364 ++++++++++++++++++
>> sysdeps/loongarch/dl-tlsdesc.h | 49 +++
>> sysdeps/loongarch/linkmap.h | 1 +
>> sysdeps/loongarch/tlsdesc.c | 39 ++
>> sysdeps/loongarch/tlsdesc.sym | 19 +
>> .../unix/sysv/linux/loongarch/localplt.data | 2 +
>> 11 files changed, 541 insertions(+), 3 deletions(-)
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
>> create mode 100644 sysdeps/loongarch/tlsdesc.c
>> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>>
>> diff --git a/elf/elf.h b/elf/elf.h
>> index 5c1c1972d1..72e90aec30 100644
>> --- a/elf/elf.h
>> +++ b/elf/elf.h
>> @@ -4232,6 +4232,8 @@ enum
>> #define R_LARCH_TLS_TPREL32 10
>> #define R_LARCH_TLS_TPREL64 11
>> #define R_LARCH_IRELATIVE 12
>> +#define R_LARCH_TLS_DESC32 13
>> +#define R_LARCH_TLS_DESC64 14
> Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].
>
> [1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626
Actually there is no need to separate 32- and 64-bit. Add 32- and 64-bit
relocations
is mainly to be consistent with other TLS type dynamic relocations.
>> /* Reserved for future relocs that the dynamic linker must understand. */
>>
>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> index 43d2f583cd..181389e787 100644
>> --- a/sysdeps/loongarch/Makefile
>> +++ b/sysdeps/loongarch/Makefile
>> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
>> endif
>>
>> ifeq ($(subdir),elf)
>> +sysdep-dl-routines += tlsdesc dl-tlsdesc
>> gen-as-const-headers += dl-link.sym
>> endif
>>
>> +ifeq ($(subdir),csu)
>> +gen-as-const-headers += tlsdesc.sym
>> +endif
>> +
>> +
>> # LoongArch's assembler also needs to know about PIC as it changes the
>> # definition of some assembler macros.
>> ASFLAGS-.os += $(pic-ccflag)
>> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
>> index b534968e30..fd81ef37d5 100644
>> --- a/sysdeps/loongarch/dl-link.sym
>> +++ b/sysdeps/loongarch/dl-link.sym
>> @@ -1,6 +1,7 @@
>> #include <stddef.h>
>> #include <sysdep.h>
>> #include <link.h>
>> +#include <dl-tlsdesc.h>
>>
>> DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
>> DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
>> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
>> index 0d17fd21e3..0dd252a5e5 100644
>> --- a/sysdeps/loongarch/dl-machine.h
>> +++ b/sysdeps/loongarch/dl-machine.h
>> @@ -25,7 +25,7 @@
>> #include <entry.h>
>> #include <elf/elf.h>
>> #include <sys/asm.h>
>> -#include <dl-tls.h>
>> +#include <dl-tlsdesc.h>
>> #include <dl-static-tls.h>
>> #include <dl-machine-rel.h>
>>
>> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>> *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>> break;
>>
>> + case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
>> + {
>> + struct tlsdesc volatile *td =
>> + (struct tlsdesc volatile *)addr_field;
>> + if (! sym)
>> + {
>> + td->arg = (void*)reloc->r_addend;
>> + td->entry = _dl_tlsdesc_undefweak;
>> + }
>> + else
>> + {
>> +# ifndef SHARED
>> + CHECK_STATIC_TLS (map, sym_map);
>> +# else
>> + if (!TRY_STATIC_TLS (map, sym_map))
>> + {
>> + td->arg = _dl_make_tlsdesc_dynamic
>> + (sym_map, sym->st_value + reloc->r_addend);
>> + td->entry = _dl_tlsdesc_dynamic;
>> + }
>> + else
>> +# endif
>> + {
>> + td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
>> + + reloc->r_addend);
>> + td->entry = _dl_tlsdesc_return;
>> + }
>> + }
>> + break;
>> + }
>> +
>> case R_LARCH_COPY:
>> {
>> if (sym == NULL)
>> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>> else
>> *reloc_addr = map->l_mach.plt;
>> }
>> + else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
>> + {
>> + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
>> + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
>> + const ElfW (Sym) *sym = &symtab[symndx];
>> + const struct r_found_version *version = NULL;
>> +
>> + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
>> + {
>> + const ElfW (Half) *vernum =
>> + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
>> + version = &map->l_versions[vernum[symndx] & 0x7fff];
>> + }
>> +
>> + /* Always initialize TLS descriptors completely, because lazy
>> + initialization requires synchronization at every TLS access. */
>> + elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
>> + skip_ifunc);
>> + }
>> else
>> _dl_reloc_bad_type (map, r_type, 1);
>> }
>> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
>> index a551594b64..1ca376484a 100644
>> --- a/sysdeps/loongarch/dl-tls.h
>> +++ b/sysdeps/loongarch/dl-tls.h
>> @@ -16,6 +16,9 @@
>> License along with the GNU C Library. If not, see
>> <https://www.gnu.org/licenses/>. */
>>
>> +#ifndef _DL_TLS_H
>> +#define _DL_TLS_H
>> +
>> /* Type used for the representation of TLS information in the GOT. */
>> typedef struct
>> {
>> @@ -23,6 +26,8 @@ typedef struct
>> unsigned long int ti_offset;
>> } tls_index;
>>
>> +extern void *__tls_get_addr (tls_index *ti);
>> +
>> /* The thread pointer points to the first static TLS block. */
>> #define TLS_TP_OFFSET 0
>>
>> @@ -37,10 +42,10 @@ typedef struct
>> /* Compute the value for a DTPREL reloc. */
>> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>>
>> -extern void *__tls_get_addr (tls_index *ti);
>> -
>> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
>> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>>
>> /* Value used for dtv entries for which the allocation is delayed. */
>> #define TLS_DTV_UNALLOCATED ((void *) -1l)
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
>> new file mode 100644
>> index 0000000000..d2c18ff527
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.S
>> @@ -0,0 +1,364 @@
>> +/* Thread-local storage handling in the ELF dynamic linker.
>> + LoongArch version.
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include "tlsdesc.h"
>> +
>> + .text
>> +
>> + /* Compute the thread pointer offset for symbols in the static
>> + TLS block. The offset is the same for all threads.
>> + Prototype:
>> + _dl_tlsdesc_return (tlsdesc *); */
>> + .hidden _dl_tlsdesc_return
>> + .global _dl_tlsdesc_return
>> + .type _dl_tlsdesc_return,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_return:
>> + REG_L a0, a0, 8
>> + RET
>> + cfi_endproc
>> + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
>> +
>> + /* Handler for undefined weak TLS symbols.
>> + Prototype:
>> + _dl_tlsdesc_undefweak (tlsdesc *);
>> +
>> + The second word of the descriptor contains the addend.
>> + Return the addend minus the thread pointer. This ensures
>> + that when the caller adds on the thread pointer it gets back
>> + the addend. */
>> + .hidden _dl_tlsdesc_undefweak
>> + .global _dl_tlsdesc_undefweak
>> + .type _dl_tlsdesc_undefweak,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_undefweak:
>> + REG_L a0, a0, 8
>> + sub.d a0, a0, tp
>> + RET
>> + cfi_endproc
>> + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>> +
>> +#ifdef USE_LASX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
>> +#elif defined USE_LSX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
>> +#elif !defined __loongarch_soft_float
>> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
>> +#else
>> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
>> +#endif
>> +
>> +#ifdef SHARED
>> + /* Handler for dynamic TLS symbols.
>> + Prototype:
>> + _dl_tlsdesc_dynamic (tlsdesc *) ;
>> +
>> + The second word of the descriptor points to a
>> + tlsdesc_dynamic_arg structure.
>> +
>> + Returns the offset between the thread pointer and the
>> + object referenced by the argument.
>> +
>> + ptrdiff_t
>> + __attribute__ ((__regparm__ (1)))
>> + _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
>> + {
>> + struct tlsdesc_dynamic_arg *td = tdp->arg;
>> + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
>> + if (__builtin_expect (td->gen_count <= dtv[0].counter
>> + && (dtv[td->tlsinfo.ti_module].pointer.val
>> + != TLS_DTV_UNALLOCATED),
>> + 1))
>> + return dtv[td->tlsinfo.ti_module].pointer.val
>> + + td->tlsinfo.ti_offset
>> + - __thread_pointer;
>> +
>> + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
>> + }
>> + */
>> + .hidden _dl_tlsdesc_dynamic
>> + .global _dl_tlsdesc_dynamic
>> + .type _dl_tlsdesc_dynamic,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_dynamic:
>> + /* Save just enough registers to support fast path, if we fall
>> + into slow path we will save additional registers. */
>> + addi.d $r3,$r3,-24
> The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.
>
>> + REG_S t0, sp, 0
>> + REG_S t1, sp, 8
>> + REG_S t2, sp, 16
>> +
>> + REG_L t0, tp, -SIZE_OF_DTV # dtv(t0) = tp + TCBHEAD_DTV dtv start
>> + REG_L a0, a0, TLSDESC_ARG # td(a0) = tdp->arg
>> + REG_L t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
>> + REG_L t2, t0, DTV_COUNTER # t2 = dtv[0].counter
>> + bltu t2, t1, .Lslow
>> +
>> + REG_L t1, a0, TLSDESC_MODID # t1 = td->tlsinfo.ti_module
>> + slli.d t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> 32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.
>
> Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.
>
>> + add.d t1, t1, t0 # t1 = dtv + ti_module * sizeof(dtv_t)
>> + REG_L t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
>> + li.d t2, TLS_DTV_UNALLOCATED
>> + beq t1, t2, .Lslow
>> + REG_L t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
>> + # dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
>> + add.d a0, t1, t2
>> +.Lret:
>> + sub.d a0, a0, tp
>> + REG_L t0, sp, 0
>> + REG_L t1, sp, 8
>> + REG_L t2, sp, 16
>> + addi.d sp, sp, 24
>> + RET
>> +
>> +.Lslow:
>> + /* This is the slow path. We need to call __tls_get_addr() which
>> + means we need to save and restore all the register that the
>> + callee will trash. */
>> +
>> + /* Save the remaining registers that we must treat as caller save. */
>> + addi.d sp, sp, -FRAME_SIZE
>> + REG_S ra, sp, 0 * SZREG
>> + REG_S a1, sp, 1 * SZREG
>> + REG_S a2, sp, 2 * SZREG
>> + REG_S a3, sp, 3 * SZREG
>> + REG_S a4, sp, 4 * SZREG
>> + REG_S a5, sp, 5 * SZREG
>> + REG_S a6, sp, 6 * SZREG
>> + REG_S a7, sp, 7 * SZREG
>> + REG_S t4, sp, 8 * SZREG
>> + REG_S t5, sp, 9 * SZREG
>> + REG_S t6, sp, 10 * SZREG
>> + REG_S t7, sp, 11 * SZREG
>> + REG_S t8, sp, 12 * SZREG
>> +
>> +#ifdef USE_LASX
>> + xvst xr0, sp, 13*SZREG + 0*SZXREG
>> + xvst xr1, sp, 13*SZREG + 1*SZXREG
>> + xvst xr2, sp, 13*SZREG + 2*SZXREG
>> + xvst xr3, sp, 13*SZREG + 3*SZXREG
>> + xvst xr4, sp, 13*SZREG + 4*SZXREG
>> + xvst xr5, sp, 13*SZREG + 5*SZXREG
>> + xvst xr6, sp, 13*SZREG + 6*SZXREG
>> + xvst xr7, sp, 13*SZREG + 7*SZXREG
>> + xvst xr8, sp, 13*SZREG + 8*SZXREG
>> + xvst xr9, sp, 13*SZREG + 9*SZXREG
>> + xvst xr10, sp, 13*SZREG + 10*SZXREG
>> + xvst xr11, sp, 13*SZREG + 11*SZXREG
>> + xvst xr12, sp, 13*SZREG + 12*SZXREG
>> + xvst xr13, sp, 13*SZREG + 13*SZXREG
>> + xvst xr14, sp, 13*SZREG + 14*SZXREG
>> + xvst xr15, sp, 13*SZREG + 15*SZXREG
>> + xvst xr16, sp, 13*SZREG + 16*SZXREG
>> + xvst xr17, sp, 13*SZREG + 17*SZXREG
>> + xvst xr18, sp, 13*SZREG + 18*SZXREG
>> + xvst xr19, sp, 13*SZREG + 19*SZXREG
>> + xvst xr20, sp, 13*SZREG + 20*SZXREG
>> + xvst xr21, sp, 13*SZREG + 21*SZXREG
>> + xvst xr22, sp, 13*SZREG + 22*SZXREG
>> + xvst xr23, sp, 13*SZREG + 23*SZXREG
>> + xvst xr23, sp, 13*SZREG + 24*SZXREG
>> + xvst xr23, sp, 13*SZREG + 25*SZXREG
>> + xvst xr23, sp, 13*SZREG + 26*SZXREG
>> + xvst xr23, sp, 13*SZREG + 27*SZXREG
>> + xvst xr23, sp, 13*SZREG + 28*SZXREG
>> + xvst xr23, sp, 13*SZREG + 29*SZXREG
>> + xvst xr23, sp, 13*SZREG + 30*SZXREG
>> + xvst xr23, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> + vst vr0, sp, 13*SZREG + 0*SZVREG
>> + vst vr1, sp, 13*SZREG + 1*SZVREG
>> + vst vr2, sp, 13*SZREG + 2*SZVREG
>> + vst vr3, sp, 13*SZREG + 3*SZVREG
>> + vst vr4, sp, 13*SZREG + 4*SZVREG
>> + vst vr5, sp, 13*SZREG + 5*SZVREG
>> + vst vr6, sp, 13*SZREG + 6*SZVREG
>> + vst vr7, sp, 13*SZREG + 7*SZVREG
>> + vst vr8, sp, 13*SZREG + 8*SZVREG
>> + vst vr9, sp, 13*SZREG + 9*SZVREG
>> + vst vr10, sp, 13*SZREG + 10*SZVREG
>> + vst vr11, sp, 13*SZREG + 11*SZVREG
>> + vst vr12, sp, 13*SZREG + 12*SZVREG
>> + vst vr13, sp, 13*SZREG + 13*SZVREG
>> + vst vr14, sp, 13*SZREG + 14*SZVREG
>> + vst vr15, sp, 13*SZREG + 15*SZVREG
>> + vst vr16, sp, 13*SZREG + 16*SZVREG
>> + vst vr17, sp, 13*SZREG + 17*SZVREG
>> + vst vr18, sp, 13*SZREG + 18*SZVREG
>> + vst vr19, sp, 13*SZREG + 19*SZVREG
>> + vst vr20, sp, 13*SZREG + 20*SZVREG
>> + vst vr21, sp, 13*SZREG + 21*SZVREG
>> + vst vr22, sp, 13*SZREG + 22*SZVREG
>> + vst vr23, sp, 13*SZREG + 23*SZVREG
>> + vst vr23, sp, 13*SZREG + 24*SZVREG
>> + vst vr23, sp, 13*SZREG + 25*SZVREG
>> + vst vr23, sp, 13*SZREG + 26*SZVREG
>> + vst vr23, sp, 13*SZREG + 27*SZVREG
>> + vst vr23, sp, 13*SZREG + 28*SZVREG
>> + vst vr23, sp, 13*SZREG + 29*SZVREG
>> + vst vr23, sp, 13*SZREG + 30*SZVREG
>> + vst vr23, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> + FREG_S fa0, sp, 13*SZREG + 0*SZFREG
>> + FREG_S fa1, sp, 13*SZREG + 1*SZFREG
>> + FREG_S fa2, sp, 13*SZREG + 2*SZFREG
>> + FREG_S fa3, sp, 13*SZREG + 3*SZFREG
>> + FREG_S fa4, sp, 13*SZREG + 4*SZFREG
>> + FREG_S fa5, sp, 13*SZREG + 5*SZFREG
>> + FREG_S fa6, sp, 13*SZREG + 6*SZFREG
>> + FREG_S fa7, sp, 13*SZREG + 7*SZFREG
>> + FREG_S ft0, sp, 13*SZREG + 8*SZFREG
>> + FREG_S ft1, sp, 13*SZREG + 9*SZFREG
>> + FREG_S ft2, sp, 13*SZREG + 10*SZFREG
>> + FREG_S ft3, sp, 13*SZREG + 11*SZFREG
>> + FREG_S ft4, sp, 13*SZREG + 12*SZFREG
>> + FREG_S ft5, sp, 13*SZREG + 13*SZFREG
>> + FREG_S ft6, sp, 13*SZREG + 14*SZFREG
>> + FREG_S ft7, sp, 13*SZREG + 15*SZFREG
>> + FREG_S ft8, sp, 13*SZREG + 16*SZFREG
>> + FREG_S ft9, sp, 13*SZREG + 17*SZFREG
>> + FREG_S ft10, sp, 13*SZREG + 18*SZFREG
>> + FREG_S ft11, sp, 13*SZREG + 19*SZFREG
>> + FREG_S ft12, sp, 13*SZREG + 20*SZFREG
>> + FREG_S ft13, sp, 13*SZREG + 21*SZFREG
>> + FREG_S ft14, sp, 13*SZREG + 22*SZFREG
>> + FREG_S ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX */
>> +
>> + bl __tls_get_addr
>> + addi.d a0, a0, -TLS_DTV_OFFSET
>> +
>> + REG_L ra, sp, 0
>> + REG_L a1, sp, 1 * 8
>> + REG_L a2, sp, 2 * 8
>> + REG_L a3, sp, 3 * 8
>> + REG_L a4, sp, 4 * 8
>> + REG_L a5, sp, 5 * 8
>> + REG_L a6, sp, 6 * 8
>> + REG_L a7, sp, 7 * 8
>> + REG_L t4, sp, 8 * 8
>> + REG_L t5, sp, 9 * 8
>> + REG_L t6, sp, 10 * 8
>> + REG_L t7, sp, 11 * 8
>> + REG_L t8, sp, 12 * 8
>> +
>> +#ifdef USE_LASX
>> + xvld xr0, sp, 13*SZREG + 0*SZXREG
>> + xvld xr1, sp, 13*SZREG + 1*SZXREG
>> + xvld xr2, sp, 13*SZREG + 2*SZXREG
>> + xvld xr3, sp, 13*SZREG + 3*SZXREG
>> + xvld xr4, sp, 13*SZREG + 4*SZXREG
>> + xvld xr5, sp, 13*SZREG + 5*SZXREG
>> + xvld xr6, sp, 13*SZREG + 6*SZXREG
>> + xvld xr7, sp, 13*SZREG + 7*SZXREG
>> + xvld xr8, sp, 13*SZREG + 8*SZXREG
>> + xvld xr9, sp, 13*SZREG + 9*SZXREG
>> + xvld xr10, sp, 13*SZREG + 10*SZXREG
>> + xvld xr11, sp, 13*SZREG + 11*SZXREG
>> + xvld xr12, sp, 13*SZREG + 12*SZXREG
>> + xvld xr13, sp, 13*SZREG + 13*SZXREG
>> + xvld xr14, sp, 13*SZREG + 14*SZXREG
>> + xvld xr15, sp, 13*SZREG + 15*SZXREG
>> + xvld xr16, sp, 13*SZREG + 16*SZXREG
>> + xvld xr17, sp, 13*SZREG + 17*SZXREG
>> + xvld xr18, sp, 13*SZREG + 18*SZXREG
>> + xvld xr19, sp, 13*SZREG + 19*SZXREG
>> + xvld xr20, sp, 13*SZREG + 20*SZXREG
>> + xvld xr21, sp, 13*SZREG + 21*SZXREG
>> + xvld xr22, sp, 13*SZREG + 22*SZXREG
>> + xvld xr23, sp, 13*SZREG + 23*SZXREG
>> + xvld xr24, sp, 13*SZREG + 24*SZXREG
>> + xvld xr25, sp, 13*SZREG + 25*SZXREG
>> + xvld xr26, sp, 13*SZREG + 26*SZXREG
>> + xvld xr27, sp, 13*SZREG + 27*SZXREG
>> + xvld xr28, sp, 13*SZREG + 28*SZXREG
>> + xvld xr29, sp, 13*SZREG + 29*SZXREG
>> + xvld xr30, sp, 13*SZREG + 30*SZXREG
>> + xvld xr31, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> + vld vr0, sp, 13*SZREG + 0*SZVREG
>> + vld vr1, sp, 13*SZREG + 1*SZVREG
>> + vld vr2, sp, 13*SZREG + 2*SZVREG
>> + vld vr3, sp, 13*SZREG + 3*SZVREG
>> + vld vr4, sp, 13*SZREG + 4*SZVREG
>> + vld vr5, sp, 13*SZREG + 5*SZVREG
>> + vld vr6, sp, 13*SZREG + 6*SZVREG
>> + vld vr7, sp, 13*SZREG + 7*SZVREG
>> + vld vr8, sp, 13*SZREG + 8*SZVREG
>> + vld vr9, sp, 13*SZREG + 9*SZVREG
>> + vld vr10, sp, 13*SZREG + 10*SZVREG
>> + vld vr11, sp, 13*SZREG + 11*SZVREG
>> + vld vr12, sp, 13*SZREG + 12*SZVREG
>> + vld vr13, sp, 13*SZREG + 13*SZVREG
>> + vld vr14, sp, 13*SZREG + 14*SZVREG
>> + vld vr15, sp, 13*SZREG + 15*SZVREG
>> + vld vr16, sp, 13*SZREG + 16*SZVREG
>> + vld vr17, sp, 13*SZREG + 17*SZVREG
>> + vld vr18, sp, 13*SZREG + 18*SZVREG
>> + vld vr19, sp, 13*SZREG + 19*SZVREG
>> + vld vr20, sp, 13*SZREG + 20*SZVREG
>> + vld vr21, sp, 13*SZREG + 21*SZVREG
>> + vld vr22, sp, 13*SZREG + 22*SZVREG
>> + vld vr23, sp, 13*SZREG + 23*SZVREG
>> + vld vr24, sp, 13*SZREG + 24*SZVREG
>> + vld vr25, sp, 13*SZREG + 25*SZVREG
>> + vld vr26, sp, 13*SZREG + 26*SZVREG
>> + vld vr27, sp, 13*SZREG + 27*SZVREG
>> + vld vr28, sp, 13*SZREG + 28*SZVREG
>> + vld vr29, sp, 13*SZREG + 29*SZVREG
>> + vld vr30, sp, 13*SZREG + 30*SZVREG
>> + vld vr31, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> + FREG_L fa0, sp, 13*SZREG + 0*SZFREG
>> + FREG_L fa1, sp, 13*SZREG + 1*SZFREG
>> + FREG_L fa2, sp, 13*SZREG + 2*SZFREG
>> + FREG_L fa3, sp, 13*SZREG + 3*SZFREG
>> + FREG_L fa4, sp, 13*SZREG + 4*SZFREG
>> + FREG_L fa5, sp, 13*SZREG + 5*SZFREG
>> + FREG_L fa6, sp, 13*SZREG + 6*SZFREG
>> + FREG_L fa7, sp, 13*SZREG + 7*SZFREG
>> + FREG_L ft0, sp, 13*SZREG + 8*SZFREG
>> + FREG_L ft1, sp, 13*SZREG + 9*SZFREG
>> + FREG_L ft2, sp, 13*SZREG + 10*SZFREG
>> + FREG_L ft3, sp, 13*SZREG + 11*SZFREG
>> + FREG_L ft4, sp, 13*SZREG + 12*SZFREG
>> + FREG_L ft5, sp, 13*SZREG + 13*SZFREG
>> + FREG_L ft6, sp, 13*SZREG + 14*SZFREG
>> + FREG_L ft7, sp, 13*SZREG + 15*SZFREG
>> + FREG_L ft8, sp, 13*SZREG + 16*SZFREG
>> + FREG_L ft9, sp, 13*SZREG + 17*SZFREG
>> + FREG_L ft10, sp, 13*SZREG + 18*SZFREG
>> + FREG_L ft11, sp, 13*SZREG + 19*SZFREG
>> + FREG_L ft12, sp, 13*SZREG + 20*SZFREG
>> + FREG_L ft13, sp, 13*SZREG + 21*SZFREG
>> + FREG_L ft14, sp, 13*SZREG + 22*SZFREG
>> + FREG_L ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX */
>> +
>> + addi.d sp, sp, FRAME_SIZE
>> + b .Lret
>> + cfi_endproc
>> + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
>> +#endif /* #ifdef SHARED */
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
>> new file mode 100644
>> index 0000000000..e1a9365855
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.h
>> @@ -0,0 +1,49 @@
>> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
>> + LoongArch version.
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#ifndef _DL_TLSDESC_H
>> +#define _DL_TLSDESC_H
>> +
>> +#include <dl-tls.h>
>> +
>> +/* Type used to represent a TLS descriptor in the GOT. */
>> +struct tlsdesc
>> +{
>> + ptrdiff_t (*entry) (struct tlsdesc *);
>> + void *arg;
>> +};
>> +
>> +/* Type used as the argument in a TLS descriptor for a symbol that
>> + needs dynamic TLS offsets. */
>> +struct tlsdesc_dynamic_arg
>> +{
>> + tls_index tlsinfo;
>> + size_t gen_count;
>> +};
>> +
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
>> +
>> +# ifdef SHARED
>> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
>> +#endif
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
>> index 2f5bf53421..40416b1ad4 100644
>> --- a/sysdeps/loongarch/linkmap.h
>> +++ b/sysdeps/loongarch/linkmap.h
>> @@ -19,4 +19,5 @@
>> struct link_map_machine
>> {
>> ElfW (Addr) plt; /* Address of .plt. */
>> + void *tlsdesc_table; /* Address of TLS descriptor hash table. */
>> };
>> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
>> new file mode 100644
>> index 0000000000..a357e7619f
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.c
>> @@ -0,0 +1,39 @@
>> +/* Manage TLS descriptors. AArch64 version.
>> +
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#include <ldsodefs.h>
>> +#include <tls.h>
>> +#include <dl-tlsdesc.h>
>> +#include <dl-unmap-segments.h>
>> +#include <tlsdeschtab.h>
>> +
>> +/* Unmap the dynamic object, but also release its TLS descriptor table
>> + if there is one. */
>> +
>> +void
>> +_dl_unmap (struct link_map *map)
>> +{
>> + _dl_unmap_segments (map);
>> +
>> +#ifdef SHARED
>> + if (map->l_mach.tlsdesc_table)
>> + htab_delete (map->l_mach.tlsdesc_table);
>> +#endif
>> +}
>> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
>> new file mode 100644
>> index 0000000000..bcab218631
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.sym
>> @@ -0,0 +1,19 @@
>> +#include <stddef.h>
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include <link.h>
>> +#include <dl-tlsdesc.h>
>> +
>> +--
>> +
>> +-- Abuse tls.h macros to derive offsets relative to the thread register.
>> +
>> +TLSDESC_ARG offsetof(struct tlsdesc, arg)
>> +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
>> +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
>> +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
>> +TCBHEAD_DTV offsetof(tcbhead_t, dtv)
>> +DTV_COUNTER offsetof(dtv_t, counter)
>> +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
>> +TLS_DTV_OFFSET TLS_DTV_OFFSET
>> +SIZE_OF_DTV sizeof(tcbhead_t)
>> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> index 547b1c1b7f..ec32e6d13f 100644
>> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> @@ -5,3 +5,5 @@ libc.so: calloc
>> libc.so: free
>> libc.so: malloc
>> libc.so: realloc
>> +# The dynamic loader needs __tls_get_addr for TLS.
>> +ld.so: __tls_get_addr
>> --
>> 2.36.0
>>
>>
在 2024/1/8 上午7:03, Tatsuyuki Ishi 写道:
>> On Dec 1, 2023, at 18:57, mengqinggang <mengqinggang@loongson.cn> wrote:
>>
>> This is mostly based on AArch64 and RISC-V implementation.
>>
>> Add R_LARCH_TLS_DESC32 and R_LARCH_TLS_DESC64 relocations.
>>
>> For _dl_tlsdesc_dynamic function slow path, temporarily save and restore
>> all vector registers.
>> ---
>> elf/elf.h | 2 +
>> sysdeps/loongarch/Makefile | 6 +
>> sysdeps/loongarch/dl-link.sym | 1 +
>> sysdeps/loongarch/dl-machine.h | 52 ++-
>> sysdeps/loongarch/dl-tls.h | 9 +-
>> sysdeps/loongarch/dl-tlsdesc.S | 364 ++++++++++++++++++
>> sysdeps/loongarch/dl-tlsdesc.h | 49 +++
>> sysdeps/loongarch/linkmap.h | 1 +
>> sysdeps/loongarch/tlsdesc.c | 39 ++
>> sysdeps/loongarch/tlsdesc.sym | 19 +
>> .../unix/sysv/linux/loongarch/localplt.data | 2 +
>> 11 files changed, 541 insertions(+), 3 deletions(-)
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.S
>> create mode 100644 sysdeps/loongarch/dl-tlsdesc.h
>> create mode 100644 sysdeps/loongarch/tlsdesc.c
>> create mode 100644 sysdeps/loongarch/tlsdesc.sym
>>
>> diff --git a/elf/elf.h b/elf/elf.h
>> index 5c1c1972d1..72e90aec30 100644
>> --- a/elf/elf.h
>> +++ b/elf/elf.h
>> @@ -4232,6 +4232,8 @@ enum
>> #define R_LARCH_TLS_TPREL32 10
>> #define R_LARCH_TLS_TPREL64 11
>> #define R_LARCH_IRELATIVE 12
>> +#define R_LARCH_TLS_DESC32 13
>> +#define R_LARCH_TLS_DESC64 14
> Does there need to be separate relocations for 32- and 64-bit? For RISC-V this was determinable from the bitness of the ELF binary, and a lot of old relocations had meaningless 32 and 64 suffixes by accident [1].
>
> [1]: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373#discussion_r1153477626
>
>> /* Reserved for future relocs that the dynamic linker must understand. */
>>
>> diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile
>> index 43d2f583cd..181389e787 100644
>> --- a/sysdeps/loongarch/Makefile
>> +++ b/sysdeps/loongarch/Makefile
>> @@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
>> endif
>>
>> ifeq ($(subdir),elf)
>> +sysdep-dl-routines += tlsdesc dl-tlsdesc
>> gen-as-const-headers += dl-link.sym
>> endif
>>
>> +ifeq ($(subdir),csu)
>> +gen-as-const-headers += tlsdesc.sym
>> +endif
>> +
>> +
>> # LoongArch's assembler also needs to know about PIC as it changes the
>> # definition of some assembler macros.
>> ASFLAGS-.os += $(pic-ccflag)
>> diff --git a/sysdeps/loongarch/dl-link.sym b/sysdeps/loongarch/dl-link.sym
>> index b534968e30..fd81ef37d5 100644
>> --- a/sysdeps/loongarch/dl-link.sym
>> +++ b/sysdeps/loongarch/dl-link.sym
>> @@ -1,6 +1,7 @@
>> #include <stddef.h>
>> #include <sysdep.h>
>> #include <link.h>
>> +#include <dl-tlsdesc.h>
>>
>> DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
>> DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
>> diff --git a/sysdeps/loongarch/dl-machine.h b/sysdeps/loongarch/dl-machine.h
>> index 0d17fd21e3..0dd252a5e5 100644
>> --- a/sysdeps/loongarch/dl-machine.h
>> +++ b/sysdeps/loongarch/dl-machine.h
>> @@ -25,7 +25,7 @@
>> #include <entry.h>
>> #include <elf/elf.h>
>> #include <sys/asm.h>
>> -#include <dl-tls.h>
>> +#include <dl-tlsdesc.h>
>> #include <dl-static-tls.h>
>> #include <dl-machine-rel.h>
>>
>> @@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
>> *addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
>> break;
>>
>> + case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
>> + {
>> + struct tlsdesc volatile *td =
>> + (struct tlsdesc volatile *)addr_field;
>> + if (! sym)
>> + {
>> + td->arg = (void*)reloc->r_addend;
>> + td->entry = _dl_tlsdesc_undefweak;
>> + }
>> + else
>> + {
>> +# ifndef SHARED
>> + CHECK_STATIC_TLS (map, sym_map);
>> +# else
>> + if (!TRY_STATIC_TLS (map, sym_map))
>> + {
>> + td->arg = _dl_make_tlsdesc_dynamic
>> + (sym_map, sym->st_value + reloc->r_addend);
>> + td->entry = _dl_tlsdesc_dynamic;
>> + }
>> + else
>> +# endif
>> + {
>> + td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
>> + + reloc->r_addend);
>> + td->entry = _dl_tlsdesc_return;
>> + }
>> + }
>> + break;
>> + }
>> +
>> case R_LARCH_COPY:
>> {
>> if (sym == NULL)
>> @@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
>> else
>> *reloc_addr = map->l_mach.plt;
>> }
>> + else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
>> + {
>> + const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
>> + const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
>> + const ElfW (Sym) *sym = &symtab[symndx];
>> + const struct r_found_version *version = NULL;
>> +
>> + if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
>> + {
>> + const ElfW (Half) *vernum =
>> + (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
>> + version = &map->l_versions[vernum[symndx] & 0x7fff];
>> + }
>> +
>> + /* Always initialize TLS descriptors completely, because lazy
>> + initialization requires synchronization at every TLS access. */
>> + elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
>> + skip_ifunc);
>> + }
>> else
>> _dl_reloc_bad_type (map, r_type, 1);
>> }
>> diff --git a/sysdeps/loongarch/dl-tls.h b/sysdeps/loongarch/dl-tls.h
>> index a551594b64..1ca376484a 100644
>> --- a/sysdeps/loongarch/dl-tls.h
>> +++ b/sysdeps/loongarch/dl-tls.h
>> @@ -16,6 +16,9 @@
>> License along with the GNU C Library. If not, see
>> <https://www.gnu.org/licenses/>. */
>>
>> +#ifndef _DL_TLS_H
>> +#define _DL_TLS_H
>> +
>> /* Type used for the representation of TLS information in the GOT. */
>> typedef struct
>> {
>> @@ -23,6 +26,8 @@ typedef struct
>> unsigned long int ti_offset;
>> } tls_index;
>>
>> +extern void *__tls_get_addr (tls_index *ti);
>> +
>> /* The thread pointer points to the first static TLS block. */
>> #define TLS_TP_OFFSET 0
>>
>> @@ -37,10 +42,10 @@ typedef struct
>> /* Compute the value for a DTPREL reloc. */
>> #define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
>>
>> -extern void *__tls_get_addr (tls_index *ti);
>> -
>> #define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
>> #define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
>>
>> /* Value used for dtv entries for which the allocation is delayed. */
>> #define TLS_DTV_UNALLOCATED ((void *) -1l)
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.S b/sysdeps/loongarch/dl-tlsdesc.S
>> new file mode 100644
>> index 0000000000..d2c18ff527
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.S
>> @@ -0,0 +1,364 @@
>> +/* Thread-local storage handling in the ELF dynamic linker.
>> + LoongArch version.
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include "tlsdesc.h"
>> +
>> + .text
>> +
>> + /* Compute the thread pointer offset for symbols in the static
>> + TLS block. The offset is the same for all threads.
>> + Prototype:
>> + _dl_tlsdesc_return (tlsdesc *); */
>> + .hidden _dl_tlsdesc_return
>> + .global _dl_tlsdesc_return
>> + .type _dl_tlsdesc_return,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_return:
>> + REG_L a0, a0, 8
>> + RET
>> + cfi_endproc
>> + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
>> +
>> + /* Handler for undefined weak TLS symbols.
>> + Prototype:
>> + _dl_tlsdesc_undefweak (tlsdesc *);
>> +
>> + The second word of the descriptor contains the addend.
>> + Return the addend minus the thread pointer. This ensures
>> + that when the caller adds on the thread pointer it gets back
>> + the addend. */
>> + .hidden _dl_tlsdesc_undefweak
>> + .global _dl_tlsdesc_undefweak
>> + .type _dl_tlsdesc_undefweak,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_undefweak:
>> + REG_L a0, a0, 8
>> + sub.d a0, a0, tp
>> + RET
>> + cfi_endproc
>> + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>> +
>> +#ifdef USE_LASX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
>> +#elif defined USE_LSX
>> +# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
>> +#elif !defined __loongarch_soft_float
>> +# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
>> +#else
>> +# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
>> +#endif
>> +
>> +#ifdef SHARED
>> + /* Handler for dynamic TLS symbols.
>> + Prototype:
>> + _dl_tlsdesc_dynamic (tlsdesc *) ;
>> +
>> + The second word of the descriptor points to a
>> + tlsdesc_dynamic_arg structure.
>> +
>> + Returns the offset between the thread pointer and the
>> + object referenced by the argument.
>> +
>> + ptrdiff_t
>> + __attribute__ ((__regparm__ (1)))
>> + _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
>> + {
>> + struct tlsdesc_dynamic_arg *td = tdp->arg;
>> + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
>> + if (__builtin_expect (td->gen_count <= dtv[0].counter
>> + && (dtv[td->tlsinfo.ti_module].pointer.val
>> + != TLS_DTV_UNALLOCATED),
>> + 1))
>> + return dtv[td->tlsinfo.ti_module].pointer.val
>> + + td->tlsinfo.ti_offset
>> + - __thread_pointer;
>> +
>> + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
>> + }
>> + */
>> + .hidden _dl_tlsdesc_dynamic
>> + .global _dl_tlsdesc_dynamic
>> + .type _dl_tlsdesc_dynamic,%function
>> + cfi_startproc
>> + .align 2
>> +_dl_tlsdesc_dynamic:
>> + /* Save just enough registers to support fast path, if we fall
>> + into slow path we will save additional registers. */
>> + addi.d $r3,$r3,-24
> The stack alignment is broken here. The fast path doesn’t need stack alignment, but it breaks the assumption in the slow path that sp is already aligned. You need to either keep the fast path offset aligned (simpler, but some wasted stack space), or calculate the correct offset to re-align in the slow path.
I will fix this in next version patch.
>> + REG_S t0, sp, 0
>> + REG_S t1, sp, 8
>> + REG_S t2, sp, 16
>> +
>> + REG_L t0, tp, -SIZE_OF_DTV # dtv(t0) = tp + TCBHEAD_DTV dtv start
>> + REG_L a0, a0, TLSDESC_ARG # td(a0) = tdp->arg
>> + REG_L t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
>> + REG_L t2, t0, DTV_COUNTER # t2 = dtv[0].counter
>> + bltu t2, t1, .Lslow
>> +
>> + REG_L t1, a0, TLSDESC_MODID # t1 = td->tlsinfo.ti_module
>> + slli.d t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
> 32-bit handling in this patch, in particular here, does not seem correct. On 32-bit the field of the related structures (void* and unsigned long) should also be 32-bit, but the offsets in the stub are hardcoded to those of 64-bit.
>
> Also note that REG_L is supposed to be a macro that switches the instruction based on 32- or 64-bit. I’m not sure why it’s hardcoded to the 64-bit version in LoongArch, but I think one should either stick to writing all instructions with .d suffix typed out (instead of the bit-agnostic versions), or define more bit-agnostic macros like REG_ADD so that everything that depends on pointer size can be portable across 32- and 64-bit.
Glibc currently only supports LoongArch 64-bit architecture. I will try
to fix this in next version patch.
>> + add.d t1, t1, t0 # t1 = dtv + ti_module * sizeof(dtv_t)
>> + REG_L t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
>> + li.d t2, TLS_DTV_UNALLOCATED
>> + beq t1, t2, .Lslow
>> + REG_L t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
>> + # dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
>> + add.d a0, t1, t2
>> +.Lret:
>> + sub.d a0, a0, tp
>> + REG_L t0, sp, 0
>> + REG_L t1, sp, 8
>> + REG_L t2, sp, 16
>> + addi.d sp, sp, 24
>> + RET
>> +
>> +.Lslow:
>> + /* This is the slow path. We need to call __tls_get_addr() which
>> + means we need to save and restore all the register that the
>> + callee will trash. */
>> +
>> + /* Save the remaining registers that we must treat as caller save. */
>> + addi.d sp, sp, -FRAME_SIZE
>> + REG_S ra, sp, 0 * SZREG
>> + REG_S a1, sp, 1 * SZREG
>> + REG_S a2, sp, 2 * SZREG
>> + REG_S a3, sp, 3 * SZREG
>> + REG_S a4, sp, 4 * SZREG
>> + REG_S a5, sp, 5 * SZREG
>> + REG_S a6, sp, 6 * SZREG
>> + REG_S a7, sp, 7 * SZREG
>> + REG_S t4, sp, 8 * SZREG
>> + REG_S t5, sp, 9 * SZREG
>> + REG_S t6, sp, 10 * SZREG
>> + REG_S t7, sp, 11 * SZREG
>> + REG_S t8, sp, 12 * SZREG
>> +
>> +#ifdef USE_LASX
>> + xvst xr0, sp, 13*SZREG + 0*SZXREG
>> + xvst xr1, sp, 13*SZREG + 1*SZXREG
>> + xvst xr2, sp, 13*SZREG + 2*SZXREG
>> + xvst xr3, sp, 13*SZREG + 3*SZXREG
>> + xvst xr4, sp, 13*SZREG + 4*SZXREG
>> + xvst xr5, sp, 13*SZREG + 5*SZXREG
>> + xvst xr6, sp, 13*SZREG + 6*SZXREG
>> + xvst xr7, sp, 13*SZREG + 7*SZXREG
>> + xvst xr8, sp, 13*SZREG + 8*SZXREG
>> + xvst xr9, sp, 13*SZREG + 9*SZXREG
>> + xvst xr10, sp, 13*SZREG + 10*SZXREG
>> + xvst xr11, sp, 13*SZREG + 11*SZXREG
>> + xvst xr12, sp, 13*SZREG + 12*SZXREG
>> + xvst xr13, sp, 13*SZREG + 13*SZXREG
>> + xvst xr14, sp, 13*SZREG + 14*SZXREG
>> + xvst xr15, sp, 13*SZREG + 15*SZXREG
>> + xvst xr16, sp, 13*SZREG + 16*SZXREG
>> + xvst xr17, sp, 13*SZREG + 17*SZXREG
>> + xvst xr18, sp, 13*SZREG + 18*SZXREG
>> + xvst xr19, sp, 13*SZREG + 19*SZXREG
>> + xvst xr20, sp, 13*SZREG + 20*SZXREG
>> + xvst xr21, sp, 13*SZREG + 21*SZXREG
>> + xvst xr22, sp, 13*SZREG + 22*SZXREG
>> + xvst xr23, sp, 13*SZREG + 23*SZXREG
>> + xvst xr23, sp, 13*SZREG + 24*SZXREG
>> + xvst xr23, sp, 13*SZREG + 25*SZXREG
>> + xvst xr23, sp, 13*SZREG + 26*SZXREG
>> + xvst xr23, sp, 13*SZREG + 27*SZXREG
>> + xvst xr23, sp, 13*SZREG + 28*SZXREG
>> + xvst xr23, sp, 13*SZREG + 29*SZXREG
>> + xvst xr23, sp, 13*SZREG + 30*SZXREG
>> + xvst xr23, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> + vst vr0, sp, 13*SZREG + 0*SZVREG
>> + vst vr1, sp, 13*SZREG + 1*SZVREG
>> + vst vr2, sp, 13*SZREG + 2*SZVREG
>> + vst vr3, sp, 13*SZREG + 3*SZVREG
>> + vst vr4, sp, 13*SZREG + 4*SZVREG
>> + vst vr5, sp, 13*SZREG + 5*SZVREG
>> + vst vr6, sp, 13*SZREG + 6*SZVREG
>> + vst vr7, sp, 13*SZREG + 7*SZVREG
>> + vst vr8, sp, 13*SZREG + 8*SZVREG
>> + vst vr9, sp, 13*SZREG + 9*SZVREG
>> + vst vr10, sp, 13*SZREG + 10*SZVREG
>> + vst vr11, sp, 13*SZREG + 11*SZVREG
>> + vst vr12, sp, 13*SZREG + 12*SZVREG
>> + vst vr13, sp, 13*SZREG + 13*SZVREG
>> + vst vr14, sp, 13*SZREG + 14*SZVREG
>> + vst vr15, sp, 13*SZREG + 15*SZVREG
>> + vst vr16, sp, 13*SZREG + 16*SZVREG
>> + vst vr17, sp, 13*SZREG + 17*SZVREG
>> + vst vr18, sp, 13*SZREG + 18*SZVREG
>> + vst vr19, sp, 13*SZREG + 19*SZVREG
>> + vst vr20, sp, 13*SZREG + 20*SZVREG
>> + vst vr21, sp, 13*SZREG + 21*SZVREG
>> + vst vr22, sp, 13*SZREG + 22*SZVREG
>> + vst vr23, sp, 13*SZREG + 23*SZVREG
>> + vst vr23, sp, 13*SZREG + 24*SZVREG
>> + vst vr23, sp, 13*SZREG + 25*SZVREG
>> + vst vr23, sp, 13*SZREG + 26*SZVREG
>> + vst vr23, sp, 13*SZREG + 27*SZVREG
>> + vst vr23, sp, 13*SZREG + 28*SZVREG
>> + vst vr23, sp, 13*SZREG + 29*SZVREG
>> + vst vr23, sp, 13*SZREG + 30*SZVREG
>> + vst vr23, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> + FREG_S fa0, sp, 13*SZREG + 0*SZFREG
>> + FREG_S fa1, sp, 13*SZREG + 1*SZFREG
>> + FREG_S fa2, sp, 13*SZREG + 2*SZFREG
>> + FREG_S fa3, sp, 13*SZREG + 3*SZFREG
>> + FREG_S fa4, sp, 13*SZREG + 4*SZFREG
>> + FREG_S fa5, sp, 13*SZREG + 5*SZFREG
>> + FREG_S fa6, sp, 13*SZREG + 6*SZFREG
>> + FREG_S fa7, sp, 13*SZREG + 7*SZFREG
>> + FREG_S ft0, sp, 13*SZREG + 8*SZFREG
>> + FREG_S ft1, sp, 13*SZREG + 9*SZFREG
>> + FREG_S ft2, sp, 13*SZREG + 10*SZFREG
>> + FREG_S ft3, sp, 13*SZREG + 11*SZFREG
>> + FREG_S ft4, sp, 13*SZREG + 12*SZFREG
>> + FREG_S ft5, sp, 13*SZREG + 13*SZFREG
>> + FREG_S ft6, sp, 13*SZREG + 14*SZFREG
>> + FREG_S ft7, sp, 13*SZREG + 15*SZFREG
>> + FREG_S ft8, sp, 13*SZREG + 16*SZFREG
>> + FREG_S ft9, sp, 13*SZREG + 17*SZFREG
>> + FREG_S ft10, sp, 13*SZREG + 18*SZFREG
>> + FREG_S ft11, sp, 13*SZREG + 19*SZFREG
>> + FREG_S ft12, sp, 13*SZREG + 20*SZFREG
>> + FREG_S ft13, sp, 13*SZREG + 21*SZFREG
>> + FREG_S ft14, sp, 13*SZREG + 22*SZFREG
>> + FREG_S ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX */
>> +
>> + bl __tls_get_addr
>> + addi.d a0, a0, -TLS_DTV_OFFSET
>> +
>> + REG_L ra, sp, 0
>> + REG_L a1, sp, 1 * 8
>> + REG_L a2, sp, 2 * 8
>> + REG_L a3, sp, 3 * 8
>> + REG_L a4, sp, 4 * 8
>> + REG_L a5, sp, 5 * 8
>> + REG_L a6, sp, 6 * 8
>> + REG_L a7, sp, 7 * 8
>> + REG_L t4, sp, 8 * 8
>> + REG_L t5, sp, 9 * 8
>> + REG_L t6, sp, 10 * 8
>> + REG_L t7, sp, 11 * 8
>> + REG_L t8, sp, 12 * 8
>> +
>> +#ifdef USE_LASX
>> + xvld xr0, sp, 13*SZREG + 0*SZXREG
>> + xvld xr1, sp, 13*SZREG + 1*SZXREG
>> + xvld xr2, sp, 13*SZREG + 2*SZXREG
>> + xvld xr3, sp, 13*SZREG + 3*SZXREG
>> + xvld xr4, sp, 13*SZREG + 4*SZXREG
>> + xvld xr5, sp, 13*SZREG + 5*SZXREG
>> + xvld xr6, sp, 13*SZREG + 6*SZXREG
>> + xvld xr7, sp, 13*SZREG + 7*SZXREG
>> + xvld xr8, sp, 13*SZREG + 8*SZXREG
>> + xvld xr9, sp, 13*SZREG + 9*SZXREG
>> + xvld xr10, sp, 13*SZREG + 10*SZXREG
>> + xvld xr11, sp, 13*SZREG + 11*SZXREG
>> + xvld xr12, sp, 13*SZREG + 12*SZXREG
>> + xvld xr13, sp, 13*SZREG + 13*SZXREG
>> + xvld xr14, sp, 13*SZREG + 14*SZXREG
>> + xvld xr15, sp, 13*SZREG + 15*SZXREG
>> + xvld xr16, sp, 13*SZREG + 16*SZXREG
>> + xvld xr17, sp, 13*SZREG + 17*SZXREG
>> + xvld xr18, sp, 13*SZREG + 18*SZXREG
>> + xvld xr19, sp, 13*SZREG + 19*SZXREG
>> + xvld xr20, sp, 13*SZREG + 20*SZXREG
>> + xvld xr21, sp, 13*SZREG + 21*SZXREG
>> + xvld xr22, sp, 13*SZREG + 22*SZXREG
>> + xvld xr23, sp, 13*SZREG + 23*SZXREG
>> + xvld xr24, sp, 13*SZREG + 24*SZXREG
>> + xvld xr25, sp, 13*SZREG + 25*SZXREG
>> + xvld xr26, sp, 13*SZREG + 26*SZXREG
>> + xvld xr27, sp, 13*SZREG + 27*SZXREG
>> + xvld xr28, sp, 13*SZREG + 28*SZXREG
>> + xvld xr29, sp, 13*SZREG + 29*SZXREG
>> + xvld xr30, sp, 13*SZREG + 30*SZXREG
>> + xvld xr31, sp, 13*SZREG + 31*SZXREG
>> +#elif defined USE_LSX
>> + vld vr0, sp, 13*SZREG + 0*SZVREG
>> + vld vr1, sp, 13*SZREG + 1*SZVREG
>> + vld vr2, sp, 13*SZREG + 2*SZVREG
>> + vld vr3, sp, 13*SZREG + 3*SZVREG
>> + vld vr4, sp, 13*SZREG + 4*SZVREG
>> + vld vr5, sp, 13*SZREG + 5*SZVREG
>> + vld vr6, sp, 13*SZREG + 6*SZVREG
>> + vld vr7, sp, 13*SZREG + 7*SZVREG
>> + vld vr8, sp, 13*SZREG + 8*SZVREG
>> + vld vr9, sp, 13*SZREG + 9*SZVREG
>> + vld vr10, sp, 13*SZREG + 10*SZVREG
>> + vld vr11, sp, 13*SZREG + 11*SZVREG
>> + vld vr12, sp, 13*SZREG + 12*SZVREG
>> + vld vr13, sp, 13*SZREG + 13*SZVREG
>> + vld vr14, sp, 13*SZREG + 14*SZVREG
>> + vld vr15, sp, 13*SZREG + 15*SZVREG
>> + vld vr16, sp, 13*SZREG + 16*SZVREG
>> + vld vr17, sp, 13*SZREG + 17*SZVREG
>> + vld vr18, sp, 13*SZREG + 18*SZVREG
>> + vld vr19, sp, 13*SZREG + 19*SZVREG
>> + vld vr20, sp, 13*SZREG + 20*SZVREG
>> + vld vr21, sp, 13*SZREG + 21*SZVREG
>> + vld vr22, sp, 13*SZREG + 22*SZVREG
>> + vld vr23, sp, 13*SZREG + 23*SZVREG
>> + vld vr24, sp, 13*SZREG + 24*SZVREG
>> + vld vr25, sp, 13*SZREG + 25*SZVREG
>> + vld vr26, sp, 13*SZREG + 26*SZVREG
>> + vld vr27, sp, 13*SZREG + 27*SZVREG
>> + vld vr28, sp, 13*SZREG + 28*SZVREG
>> + vld vr29, sp, 13*SZREG + 29*SZVREG
>> + vld vr30, sp, 13*SZREG + 30*SZVREG
>> + vld vr31, sp, 13*SZREG + 31*SZVREG
>> +#elif !defined __loongarch_soft_float
>> + FREG_L fa0, sp, 13*SZREG + 0*SZFREG
>> + FREG_L fa1, sp, 13*SZREG + 1*SZFREG
>> + FREG_L fa2, sp, 13*SZREG + 2*SZFREG
>> + FREG_L fa3, sp, 13*SZREG + 3*SZFREG
>> + FREG_L fa4, sp, 13*SZREG + 4*SZFREG
>> + FREG_L fa5, sp, 13*SZREG + 5*SZFREG
>> + FREG_L fa6, sp, 13*SZREG + 6*SZFREG
>> + FREG_L fa7, sp, 13*SZREG + 7*SZFREG
>> + FREG_L ft0, sp, 13*SZREG + 8*SZFREG
>> + FREG_L ft1, sp, 13*SZREG + 9*SZFREG
>> + FREG_L ft2, sp, 13*SZREG + 10*SZFREG
>> + FREG_L ft3, sp, 13*SZREG + 11*SZFREG
>> + FREG_L ft4, sp, 13*SZREG + 12*SZFREG
>> + FREG_L ft5, sp, 13*SZREG + 13*SZFREG
>> + FREG_L ft6, sp, 13*SZREG + 14*SZFREG
>> + FREG_L ft7, sp, 13*SZREG + 15*SZFREG
>> + FREG_L ft8, sp, 13*SZREG + 16*SZFREG
>> + FREG_L ft9, sp, 13*SZREG + 17*SZFREG
>> + FREG_L ft10, sp, 13*SZREG + 18*SZFREG
>> + FREG_L ft11, sp, 13*SZREG + 19*SZFREG
>> + FREG_L ft12, sp, 13*SZREG + 20*SZFREG
>> + FREG_L ft13, sp, 13*SZREG + 21*SZFREG
>> + FREG_L ft14, sp, 13*SZREG + 22*SZFREG
>> + FREG_L ft15, sp, 13*SZREG + 23*SZFREG
>> +#endif /* #ifdef USE_LASX */
>> +
>> + addi.d sp, sp, FRAME_SIZE
>> + b .Lret
>> + cfi_endproc
>> + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
>> +#endif /* #ifdef SHARED */
>> diff --git a/sysdeps/loongarch/dl-tlsdesc.h b/sysdeps/loongarch/dl-tlsdesc.h
>> new file mode 100644
>> index 0000000000..e1a9365855
>> --- /dev/null
>> +++ b/sysdeps/loongarch/dl-tlsdesc.h
>> @@ -0,0 +1,49 @@
>> +/* Thread-local storage descriptor handling in the ELF dynamic linker.
>> + LoongArch version.
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#ifndef _DL_TLSDESC_H
>> +#define _DL_TLSDESC_H
>> +
>> +#include <dl-tls.h>
>> +
>> +/* Type used to represent a TLS descriptor in the GOT. */
>> +struct tlsdesc
>> +{
>> + ptrdiff_t (*entry) (struct tlsdesc *);
>> + void *arg;
>> +};
>> +
>> +/* Type used as the argument in a TLS descriptor for a symbol that
>> + needs dynamic TLS offsets. */
>> +struct tlsdesc_dynamic_arg
>> +{
>> + tls_index tlsinfo;
>> + size_t gen_count;
>> +};
>> +
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
>> +
>> +# ifdef SHARED
>> +extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
>> +extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
>> +#endif
>> +
>> +#endif
>> diff --git a/sysdeps/loongarch/linkmap.h b/sysdeps/loongarch/linkmap.h
>> index 2f5bf53421..40416b1ad4 100644
>> --- a/sysdeps/loongarch/linkmap.h
>> +++ b/sysdeps/loongarch/linkmap.h
>> @@ -19,4 +19,5 @@
>> struct link_map_machine
>> {
>> ElfW (Addr) plt; /* Address of .plt. */
>> + void *tlsdesc_table; /* Address of TLS descriptor hash table. */
>> };
>> diff --git a/sysdeps/loongarch/tlsdesc.c b/sysdeps/loongarch/tlsdesc.c
>> new file mode 100644
>> index 0000000000..a357e7619f
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.c
>> @@ -0,0 +1,39 @@
>> +/* Manage TLS descriptors. AArch64 version.
>> +
>> + Copyright (C) 2011-2023 Free Software Foundation, Inc.
>> +
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <https://www.gnu.org/licenses/>. */
>> +
>> +#include <ldsodefs.h>
>> +#include <tls.h>
>> +#include <dl-tlsdesc.h>
>> +#include <dl-unmap-segments.h>
>> +#include <tlsdeschtab.h>
>> +
>> +/* Unmap the dynamic object, but also release its TLS descriptor table
>> + if there is one. */
>> +
>> +void
>> +_dl_unmap (struct link_map *map)
>> +{
>> + _dl_unmap_segments (map);
>> +
>> +#ifdef SHARED
>> + if (map->l_mach.tlsdesc_table)
>> + htab_delete (map->l_mach.tlsdesc_table);
>> +#endif
>> +}
>> diff --git a/sysdeps/loongarch/tlsdesc.sym b/sysdeps/loongarch/tlsdesc.sym
>> new file mode 100644
>> index 0000000000..bcab218631
>> --- /dev/null
>> +++ b/sysdeps/loongarch/tlsdesc.sym
>> @@ -0,0 +1,19 @@
>> +#include <stddef.h>
>> +#include <sysdep.h>
>> +#include <tls.h>
>> +#include <link.h>
>> +#include <dl-tlsdesc.h>
>> +
>> +--
>> +
>> +-- Abuse tls.h macros to derive offsets relative to the thread register.
>> +
>> +TLSDESC_ARG offsetof(struct tlsdesc, arg)
>> +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
>> +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
>> +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
>> +TCBHEAD_DTV offsetof(tcbhead_t, dtv)
>> +DTV_COUNTER offsetof(dtv_t, counter)
>> +TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
>> +TLS_DTV_OFFSET TLS_DTV_OFFSET
>> +SIZE_OF_DTV sizeof(tcbhead_t)
>> diff --git a/sysdeps/unix/sysv/linux/loongarch/localplt.data b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> index 547b1c1b7f..ec32e6d13f 100644
>> --- a/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> +++ b/sysdeps/unix/sysv/linux/loongarch/localplt.data
>> @@ -5,3 +5,5 @@ libc.so: calloc
>> libc.so: free
>> libc.so: malloc
>> libc.so: realloc
>> +# The dynamic loader needs __tls_get_addr for TLS.
>> +ld.so: __tls_get_addr
>> --
>> 2.36.0
>>
>>
@@ -4232,6 +4232,8 @@ enum
#define R_LARCH_TLS_TPREL32 10
#define R_LARCH_TLS_TPREL64 11
#define R_LARCH_IRELATIVE 12
+#define R_LARCH_TLS_DESC32 13
+#define R_LARCH_TLS_DESC64 14
/* Reserved for future relocs that the dynamic linker must understand. */
@@ -3,9 +3,15 @@ sysdep_headers += sys/asm.h
endif
ifeq ($(subdir),elf)
+sysdep-dl-routines += tlsdesc dl-tlsdesc
gen-as-const-headers += dl-link.sym
endif
+ifeq ($(subdir),csu)
+gen-as-const-headers += tlsdesc.sym
+endif
+
+
# LoongArch's assembler also needs to know about PIC as it changes the
# definition of some assembler macros.
ASFLAGS-.os += $(pic-ccflag)
@@ -1,6 +1,7 @@
#include <stddef.h>
#include <sysdep.h>
#include <link.h>
+#include <dl-tlsdesc.h>
DL_SIZEOF_RG sizeof(struct La_loongarch_regs)
DL_SIZEOF_RV sizeof(struct La_loongarch_retval)
@@ -25,7 +25,7 @@
#include <entry.h>
#include <elf/elf.h>
#include <sys/asm.h>
-#include <dl-tls.h>
+#include <dl-tlsdesc.h>
#include <dl-static-tls.h>
#include <dl-machine-rel.h>
@@ -187,6 +187,37 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
*addr_field = TLS_TPREL_VALUE (sym_map, sym) + reloc->r_addend;
break;
+ case __WORDSIZE == 64 ? R_LARCH_TLS_DESC64 : R_LARCH_TLS_DESC32:
+ {
+ struct tlsdesc volatile *td =
+ (struct tlsdesc volatile *)addr_field;
+ if (! sym)
+ {
+ td->arg = (void*)reloc->r_addend;
+ td->entry = _dl_tlsdesc_undefweak;
+ }
+ else
+ {
+# ifndef SHARED
+ CHECK_STATIC_TLS (map, sym_map);
+# else
+ if (!TRY_STATIC_TLS (map, sym_map))
+ {
+ td->arg = _dl_make_tlsdesc_dynamic
+ (sym_map, sym->st_value + reloc->r_addend);
+ td->entry = _dl_tlsdesc_dynamic;
+ }
+ else
+# endif
+ {
+ td->arg = (void *)(TLS_TPREL_VALUE (sym_map, sym)
+ + reloc->r_addend);
+ td->entry = _dl_tlsdesc_return;
+ }
+ }
+ break;
+ }
+
case R_LARCH_COPY:
{
if (sym == NULL)
@@ -255,6 +286,25 @@ elf_machine_lazy_rel (struct link_map *map, struct r_scope_elem *scope[],
else
*reloc_addr = map->l_mach.plt;
}
+ else if (__builtin_expect (r_type == R_LARCH_TLS_DESC64, 1))
+ {
+ const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
+ const ElfW (Sym) *symtab = (const void *)D_PTR (map, l_info[DT_SYMTAB]);
+ const ElfW (Sym) *sym = &symtab[symndx];
+ const struct r_found_version *version = NULL;
+
+ if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
+ {
+ const ElfW (Half) *vernum =
+ (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
+ version = &map->l_versions[vernum[symndx] & 0x7fff];
+ }
+
+ /* Always initialize TLS descriptors completely, because lazy
+ initialization requires synchronization at every TLS access. */
+ elf_machine_rela (map, scope, reloc, sym, version, reloc_addr,
+ skip_ifunc);
+ }
else
_dl_reloc_bad_type (map, r_type, 1);
}
@@ -16,6 +16,9 @@
License along with the GNU C Library. If not, see
<https://www.gnu.org/licenses/>. */
+#ifndef _DL_TLS_H
+#define _DL_TLS_H
+
/* Type used for the representation of TLS information in the GOT. */
typedef struct
{
@@ -23,6 +26,8 @@ typedef struct
unsigned long int ti_offset;
} tls_index;
+extern void *__tls_get_addr (tls_index *ti);
+
/* The thread pointer points to the first static TLS block. */
#define TLS_TP_OFFSET 0
@@ -37,10 +42,10 @@ typedef struct
/* Compute the value for a DTPREL reloc. */
#define TLS_DTPREL_VALUE(sym) ((sym)->st_value - TLS_DTV_OFFSET)
-extern void *__tls_get_addr (tls_index *ti);
-
#define GET_ADDR_OFFSET (ti->ti_offset + TLS_DTV_OFFSET)
#define __TLS_GET_ADDR(__ti) (__tls_get_addr (__ti) - TLS_DTV_OFFSET)
/* Value used for dtv entries for which the allocation is delayed. */
#define TLS_DTV_UNALLOCATED ((void *) -1l)
+
+#endif
new file mode 100644
@@ -0,0 +1,364 @@
+/* Thread-local storage handling in the ELF dynamic linker.
+ LoongArch version.
+ Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <tls.h>
+#include "tlsdesc.h"
+
+ .text
+
+ /* Compute the thread pointer offset for symbols in the static
+ TLS block. The offset is the same for all threads.
+ Prototype:
+ _dl_tlsdesc_return (tlsdesc *); */
+ .hidden _dl_tlsdesc_return
+ .global _dl_tlsdesc_return
+ .type _dl_tlsdesc_return,%function
+ cfi_startproc
+ .align 2
+_dl_tlsdesc_return:
+ REG_L a0, a0, 8
+ RET
+ cfi_endproc
+ .size _dl_tlsdesc_return, .-_dl_tlsdesc_return
+
+ /* Handler for undefined weak TLS symbols.
+ Prototype:
+ _dl_tlsdesc_undefweak (tlsdesc *);
+
+ The second word of the descriptor contains the addend.
+ Return the addend minus the thread pointer. This ensures
+ that when the caller adds on the thread pointer it gets back
+ the addend. */
+ .hidden _dl_tlsdesc_undefweak
+ .global _dl_tlsdesc_undefweak
+ .type _dl_tlsdesc_undefweak,%function
+ cfi_startproc
+ .align 2
+_dl_tlsdesc_undefweak:
+ REG_L a0, a0, 8
+ sub.d a0, a0, tp
+ RET
+ cfi_endproc
+ .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
+
+#ifdef USE_LASX
+# define FRAME_SIZE (-((-13 * SZREG - 32 * SZXREG) & ALMASK))
+#elif defined USE_LSX
+# define FRAME_SIZE (-((-13 * SZREG - 32 * SZVREG) & ALMASK))
+#elif !defined __loongarch_soft_float
+# define FRAME_SIZE (-((-13 * SZREG - 24 * SZFREG) & ALMASK))
+#else
+# define FRAME_SIZE (-((-13 * SZREG) & ALMASK))
+#endif
+
+#ifdef SHARED
+ /* Handler for dynamic TLS symbols.
+ Prototype:
+ _dl_tlsdesc_dynamic (tlsdesc *) ;
+
+ The second word of the descriptor points to a
+ tlsdesc_dynamic_arg structure.
+
+ Returns the offset between the thread pointer and the
+ object referenced by the argument.
+
+ ptrdiff_t
+ __attribute__ ((__regparm__ (1)))
+ _dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+ {
+ struct tlsdesc_dynamic_arg *td = tdp->arg;
+ dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + TCBHEAD_DTV);
+ if (__builtin_expect (td->gen_count <= dtv[0].counter
+ && (dtv[td->tlsinfo.ti_module].pointer.val
+ != TLS_DTV_UNALLOCATED),
+ 1))
+ return dtv[td->tlsinfo.ti_module].pointer.val
+ + td->tlsinfo.ti_offset
+ - __thread_pointer;
+
+ return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+ }
+ */
+ .hidden _dl_tlsdesc_dynamic
+ .global _dl_tlsdesc_dynamic
+ .type _dl_tlsdesc_dynamic,%function
+ cfi_startproc
+ .align 2
+_dl_tlsdesc_dynamic:
+ /* Save just enough registers to support fast path, if we fall
+ into slow path we will save additional registers. */
+ addi.d $r3,$r3,-24
+ REG_S t0, sp, 0
+ REG_S t1, sp, 8
+ REG_S t2, sp, 16
+
+ REG_L t0, tp, -SIZE_OF_DTV # dtv(t0) = tp + TCBHEAD_DTV dtv start
+ REG_L a0, a0, TLSDESC_ARG # td(a0) = tdp->arg
+ REG_L t1, a0, TLSDESC_GEN_COUNT # t1 = td->gen_count
+ REG_L t2, t0, DTV_COUNTER # t2 = dtv[0].counter
+ bltu t2, t1, .Lslow
+
+ REG_L t1, a0, TLSDESC_MODID # t1 = td->tlsinfo.ti_module
+ slli.d t1, t1, 3 + 1 # /* sizeof(dtv_t) == sizeof(void*) * 2 */
+ add.d t1, t1, t0 # t1 = dtv + ti_module * sizeof(dtv_t)
+ REG_L t1, t1, 0 # t1 = dtv[td->tlsinfo.ti_module].pointer.val
+ li.d t2, TLS_DTV_UNALLOCATED
+ beq t1, t2, .Lslow
+ REG_L t2, a0, TLSDESC_MODOFF # t2 = td->tlsinfo.ti_offset
+ # dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+ add.d a0, t1, t2
+.Lret:
+ sub.d a0, a0, tp
+ REG_L t0, sp, 0
+ REG_L t1, sp, 8
+ REG_L t2, sp, 16
+ addi.d sp, sp, 24
+ RET
+
+.Lslow:
+ /* This is the slow path. We need to call __tls_get_addr() which
+ means we need to save and restore all the register that the
+ callee will trash. */
+
+ /* Save the remaining registers that we must treat as caller save. */
+ addi.d sp, sp, -FRAME_SIZE
+ REG_S ra, sp, 0 * SZREG
+ REG_S a1, sp, 1 * SZREG
+ REG_S a2, sp, 2 * SZREG
+ REG_S a3, sp, 3 * SZREG
+ REG_S a4, sp, 4 * SZREG
+ REG_S a5, sp, 5 * SZREG
+ REG_S a6, sp, 6 * SZREG
+ REG_S a7, sp, 7 * SZREG
+ REG_S t4, sp, 8 * SZREG
+ REG_S t5, sp, 9 * SZREG
+ REG_S t6, sp, 10 * SZREG
+ REG_S t7, sp, 11 * SZREG
+ REG_S t8, sp, 12 * SZREG
+
+#ifdef USE_LASX
+ xvst xr0, sp, 13*SZREG + 0*SZXREG
+ xvst xr1, sp, 13*SZREG + 1*SZXREG
+ xvst xr2, sp, 13*SZREG + 2*SZXREG
+ xvst xr3, sp, 13*SZREG + 3*SZXREG
+ xvst xr4, sp, 13*SZREG + 4*SZXREG
+ xvst xr5, sp, 13*SZREG + 5*SZXREG
+ xvst xr6, sp, 13*SZREG + 6*SZXREG
+ xvst xr7, sp, 13*SZREG + 7*SZXREG
+ xvst xr8, sp, 13*SZREG + 8*SZXREG
+ xvst xr9, sp, 13*SZREG + 9*SZXREG
+ xvst xr10, sp, 13*SZREG + 10*SZXREG
+ xvst xr11, sp, 13*SZREG + 11*SZXREG
+ xvst xr12, sp, 13*SZREG + 12*SZXREG
+ xvst xr13, sp, 13*SZREG + 13*SZXREG
+ xvst xr14, sp, 13*SZREG + 14*SZXREG
+ xvst xr15, sp, 13*SZREG + 15*SZXREG
+ xvst xr16, sp, 13*SZREG + 16*SZXREG
+ xvst xr17, sp, 13*SZREG + 17*SZXREG
+ xvst xr18, sp, 13*SZREG + 18*SZXREG
+ xvst xr19, sp, 13*SZREG + 19*SZXREG
+ xvst xr20, sp, 13*SZREG + 20*SZXREG
+ xvst xr21, sp, 13*SZREG + 21*SZXREG
+ xvst xr22, sp, 13*SZREG + 22*SZXREG
+ xvst xr23, sp, 13*SZREG + 23*SZXREG
+ xvst xr23, sp, 13*SZREG + 24*SZXREG
+ xvst xr23, sp, 13*SZREG + 25*SZXREG
+ xvst xr23, sp, 13*SZREG + 26*SZXREG
+ xvst xr23, sp, 13*SZREG + 27*SZXREG
+ xvst xr23, sp, 13*SZREG + 28*SZXREG
+ xvst xr23, sp, 13*SZREG + 29*SZXREG
+ xvst xr23, sp, 13*SZREG + 30*SZXREG
+ xvst xr23, sp, 13*SZREG + 31*SZXREG
+#elif defined USE_LSX
+ vst vr0, sp, 13*SZREG + 0*SZVREG
+ vst vr1, sp, 13*SZREG + 1*SZVREG
+ vst vr2, sp, 13*SZREG + 2*SZVREG
+ vst vr3, sp, 13*SZREG + 3*SZVREG
+ vst vr4, sp, 13*SZREG + 4*SZVREG
+ vst vr5, sp, 13*SZREG + 5*SZVREG
+ vst vr6, sp, 13*SZREG + 6*SZVREG
+ vst vr7, sp, 13*SZREG + 7*SZVREG
+ vst vr8, sp, 13*SZREG + 8*SZVREG
+ vst vr9, sp, 13*SZREG + 9*SZVREG
+ vst vr10, sp, 13*SZREG + 10*SZVREG
+ vst vr11, sp, 13*SZREG + 11*SZVREG
+ vst vr12, sp, 13*SZREG + 12*SZVREG
+ vst vr13, sp, 13*SZREG + 13*SZVREG
+ vst vr14, sp, 13*SZREG + 14*SZVREG
+ vst vr15, sp, 13*SZREG + 15*SZVREG
+ vst vr16, sp, 13*SZREG + 16*SZVREG
+ vst vr17, sp, 13*SZREG + 17*SZVREG
+ vst vr18, sp, 13*SZREG + 18*SZVREG
+ vst vr19, sp, 13*SZREG + 19*SZVREG
+ vst vr20, sp, 13*SZREG + 20*SZVREG
+ vst vr21, sp, 13*SZREG + 21*SZVREG
+ vst vr22, sp, 13*SZREG + 22*SZVREG
+ vst vr23, sp, 13*SZREG + 23*SZVREG
+ vst vr23, sp, 13*SZREG + 24*SZVREG
+ vst vr23, sp, 13*SZREG + 25*SZVREG
+ vst vr23, sp, 13*SZREG + 26*SZVREG
+ vst vr23, sp, 13*SZREG + 27*SZVREG
+ vst vr23, sp, 13*SZREG + 28*SZVREG
+ vst vr23, sp, 13*SZREG + 29*SZVREG
+ vst vr23, sp, 13*SZREG + 30*SZVREG
+ vst vr23, sp, 13*SZREG + 31*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_S fa0, sp, 13*SZREG + 0*SZFREG
+ FREG_S fa1, sp, 13*SZREG + 1*SZFREG
+ FREG_S fa2, sp, 13*SZREG + 2*SZFREG
+ FREG_S fa3, sp, 13*SZREG + 3*SZFREG
+ FREG_S fa4, sp, 13*SZREG + 4*SZFREG
+ FREG_S fa5, sp, 13*SZREG + 5*SZFREG
+ FREG_S fa6, sp, 13*SZREG + 6*SZFREG
+ FREG_S fa7, sp, 13*SZREG + 7*SZFREG
+ FREG_S ft0, sp, 13*SZREG + 8*SZFREG
+ FREG_S ft1, sp, 13*SZREG + 9*SZFREG
+ FREG_S ft2, sp, 13*SZREG + 10*SZFREG
+ FREG_S ft3, sp, 13*SZREG + 11*SZFREG
+ FREG_S ft4, sp, 13*SZREG + 12*SZFREG
+ FREG_S ft5, sp, 13*SZREG + 13*SZFREG
+ FREG_S ft6, sp, 13*SZREG + 14*SZFREG
+ FREG_S ft7, sp, 13*SZREG + 15*SZFREG
+ FREG_S ft8, sp, 13*SZREG + 16*SZFREG
+ FREG_S ft9, sp, 13*SZREG + 17*SZFREG
+ FREG_S ft10, sp, 13*SZREG + 18*SZFREG
+ FREG_S ft11, sp, 13*SZREG + 19*SZFREG
+ FREG_S ft12, sp, 13*SZREG + 20*SZFREG
+ FREG_S ft13, sp, 13*SZREG + 21*SZFREG
+ FREG_S ft14, sp, 13*SZREG + 22*SZFREG
+ FREG_S ft15, sp, 13*SZREG + 23*SZFREG
+#endif /* #ifdef USE_LASX */
+
+ bl __tls_get_addr
+ addi.d a0, a0, -TLS_DTV_OFFSET
+
+ REG_L ra, sp, 0
+ REG_L a1, sp, 1 * 8
+ REG_L a2, sp, 2 * 8
+ REG_L a3, sp, 3 * 8
+ REG_L a4, sp, 4 * 8
+ REG_L a5, sp, 5 * 8
+ REG_L a6, sp, 6 * 8
+ REG_L a7, sp, 7 * 8
+ REG_L t4, sp, 8 * 8
+ REG_L t5, sp, 9 * 8
+ REG_L t6, sp, 10 * 8
+ REG_L t7, sp, 11 * 8
+ REG_L t8, sp, 12 * 8
+
+#ifdef USE_LASX
+ xvld xr0, sp, 13*SZREG + 0*SZXREG
+ xvld xr1, sp, 13*SZREG + 1*SZXREG
+ xvld xr2, sp, 13*SZREG + 2*SZXREG
+ xvld xr3, sp, 13*SZREG + 3*SZXREG
+ xvld xr4, sp, 13*SZREG + 4*SZXREG
+ xvld xr5, sp, 13*SZREG + 5*SZXREG
+ xvld xr6, sp, 13*SZREG + 6*SZXREG
+ xvld xr7, sp, 13*SZREG + 7*SZXREG
+ xvld xr8, sp, 13*SZREG + 8*SZXREG
+ xvld xr9, sp, 13*SZREG + 9*SZXREG
+ xvld xr10, sp, 13*SZREG + 10*SZXREG
+ xvld xr11, sp, 13*SZREG + 11*SZXREG
+ xvld xr12, sp, 13*SZREG + 12*SZXREG
+ xvld xr13, sp, 13*SZREG + 13*SZXREG
+ xvld xr14, sp, 13*SZREG + 14*SZXREG
+ xvld xr15, sp, 13*SZREG + 15*SZXREG
+ xvld xr16, sp, 13*SZREG + 16*SZXREG
+ xvld xr17, sp, 13*SZREG + 17*SZXREG
+ xvld xr18, sp, 13*SZREG + 18*SZXREG
+ xvld xr19, sp, 13*SZREG + 19*SZXREG
+ xvld xr20, sp, 13*SZREG + 20*SZXREG
+ xvld xr21, sp, 13*SZREG + 21*SZXREG
+ xvld xr22, sp, 13*SZREG + 22*SZXREG
+ xvld xr23, sp, 13*SZREG + 23*SZXREG
+ xvld xr24, sp, 13*SZREG + 24*SZXREG
+ xvld xr25, sp, 13*SZREG + 25*SZXREG
+ xvld xr26, sp, 13*SZREG + 26*SZXREG
+ xvld xr27, sp, 13*SZREG + 27*SZXREG
+ xvld xr28, sp, 13*SZREG + 28*SZXREG
+ xvld xr29, sp, 13*SZREG + 29*SZXREG
+ xvld xr30, sp, 13*SZREG + 30*SZXREG
+ xvld xr31, sp, 13*SZREG + 31*SZXREG
+#elif defined USE_LSX
+ vld vr0, sp, 13*SZREG + 0*SZVREG
+ vld vr1, sp, 13*SZREG + 1*SZVREG
+ vld vr2, sp, 13*SZREG + 2*SZVREG
+ vld vr3, sp, 13*SZREG + 3*SZVREG
+ vld vr4, sp, 13*SZREG + 4*SZVREG
+ vld vr5, sp, 13*SZREG + 5*SZVREG
+ vld vr6, sp, 13*SZREG + 6*SZVREG
+ vld vr7, sp, 13*SZREG + 7*SZVREG
+ vld vr8, sp, 13*SZREG + 8*SZVREG
+ vld vr9, sp, 13*SZREG + 9*SZVREG
+ vld vr10, sp, 13*SZREG + 10*SZVREG
+ vld vr11, sp, 13*SZREG + 11*SZVREG
+ vld vr12, sp, 13*SZREG + 12*SZVREG
+ vld vr13, sp, 13*SZREG + 13*SZVREG
+ vld vr14, sp, 13*SZREG + 14*SZVREG
+ vld vr15, sp, 13*SZREG + 15*SZVREG
+ vld vr16, sp, 13*SZREG + 16*SZVREG
+ vld vr17, sp, 13*SZREG + 17*SZVREG
+ vld vr18, sp, 13*SZREG + 18*SZVREG
+ vld vr19, sp, 13*SZREG + 19*SZVREG
+ vld vr20, sp, 13*SZREG + 20*SZVREG
+ vld vr21, sp, 13*SZREG + 21*SZVREG
+ vld vr22, sp, 13*SZREG + 22*SZVREG
+ vld vr23, sp, 13*SZREG + 23*SZVREG
+ vld vr24, sp, 13*SZREG + 24*SZVREG
+ vld vr25, sp, 13*SZREG + 25*SZVREG
+ vld vr26, sp, 13*SZREG + 26*SZVREG
+ vld vr27, sp, 13*SZREG + 27*SZVREG
+ vld vr28, sp, 13*SZREG + 28*SZVREG
+ vld vr29, sp, 13*SZREG + 29*SZVREG
+ vld vr30, sp, 13*SZREG + 30*SZVREG
+ vld vr31, sp, 13*SZREG + 31*SZVREG
+#elif !defined __loongarch_soft_float
+ FREG_L fa0, sp, 13*SZREG + 0*SZFREG
+ FREG_L fa1, sp, 13*SZREG + 1*SZFREG
+ FREG_L fa2, sp, 13*SZREG + 2*SZFREG
+ FREG_L fa3, sp, 13*SZREG + 3*SZFREG
+ FREG_L fa4, sp, 13*SZREG + 4*SZFREG
+ FREG_L fa5, sp, 13*SZREG + 5*SZFREG
+ FREG_L fa6, sp, 13*SZREG + 6*SZFREG
+ FREG_L fa7, sp, 13*SZREG + 7*SZFREG
+ FREG_L ft0, sp, 13*SZREG + 8*SZFREG
+ FREG_L ft1, sp, 13*SZREG + 9*SZFREG
+ FREG_L ft2, sp, 13*SZREG + 10*SZFREG
+ FREG_L ft3, sp, 13*SZREG + 11*SZFREG
+ FREG_L ft4, sp, 13*SZREG + 12*SZFREG
+ FREG_L ft5, sp, 13*SZREG + 13*SZFREG
+ FREG_L ft6, sp, 13*SZREG + 14*SZFREG
+ FREG_L ft7, sp, 13*SZREG + 15*SZFREG
+ FREG_L ft8, sp, 13*SZREG + 16*SZFREG
+ FREG_L ft9, sp, 13*SZREG + 17*SZFREG
+ FREG_L ft10, sp, 13*SZREG + 18*SZFREG
+ FREG_L ft11, sp, 13*SZREG + 19*SZFREG
+ FREG_L ft12, sp, 13*SZREG + 20*SZFREG
+ FREG_L ft13, sp, 13*SZREG + 21*SZFREG
+ FREG_L ft14, sp, 13*SZREG + 22*SZFREG
+ FREG_L ft15, sp, 13*SZREG + 23*SZFREG
+#endif /* #ifdef USE_LASX */
+
+ addi.d sp, sp, FRAME_SIZE
+ b .Lret
+ cfi_endproc
+ .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+#endif /* #ifdef SHARED */
new file mode 100644
@@ -0,0 +1,49 @@
+/* Thread-local storage descriptor handling in the ELF dynamic linker.
+ LoongArch version.
+ Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _DL_TLSDESC_H
+#define _DL_TLSDESC_H
+
+#include <dl-tls.h>
+
+/* Type used to represent a TLS descriptor in the GOT. */
+struct tlsdesc
+{
+ ptrdiff_t (*entry) (struct tlsdesc *);
+ void *arg;
+};
+
+/* Type used as the argument in a TLS descriptor for a symbol that
+ needs dynamic TLS offsets. */
+struct tlsdesc_dynamic_arg
+{
+ tls_index tlsinfo;
+ size_t gen_count;
+};
+
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_return (struct tlsdesc *);
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_undefweak (struct tlsdesc *);
+
+# ifdef SHARED
+extern void *_dl_make_tlsdesc_dynamic (struct link_map *, size_t);
+extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic (struct tlsdesc *);
+#endif
+
+#endif
@@ -19,4 +19,5 @@
struct link_map_machine
{
ElfW (Addr) plt; /* Address of .plt. */
+ void *tlsdesc_table; /* Address of TLS descriptor hash table. */
};
new file mode 100644
@@ -0,0 +1,39 @@
+/* Manage TLS descriptors. AArch64 version.
+
+ Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <tls.h>
+#include <dl-tlsdesc.h>
+#include <dl-unmap-segments.h>
+#include <tlsdeschtab.h>
+
+/* Unmap the dynamic object, but also release its TLS descriptor table
+ if there is one. */
+
+void
+_dl_unmap (struct link_map *map)
+{
+ _dl_unmap_segments (map);
+
+#ifdef SHARED
+ if (map->l_mach.tlsdesc_table)
+ htab_delete (map->l_mach.tlsdesc_table);
+#endif
+}
new file mode 100644
@@ -0,0 +1,19 @@
+#include <stddef.h>
+#include <sysdep.h>
+#include <tls.h>
+#include <link.h>
+#include <dl-tlsdesc.h>
+
+--
+
+-- Abuse tls.h macros to derive offsets relative to the thread register.
+
+TLSDESC_ARG offsetof(struct tlsdesc, arg)
+TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
+TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
+TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+TCBHEAD_DTV offsetof(tcbhead_t, dtv)
+DTV_COUNTER offsetof(dtv_t, counter)
+TLS_DTV_UNALLOCATED TLS_DTV_UNALLOCATED
+TLS_DTV_OFFSET TLS_DTV_OFFSET
+SIZE_OF_DTV sizeof(tcbhead_t)
@@ -5,3 +5,5 @@ libc.so: calloc
libc.so: free
libc.so: malloc
libc.so: realloc
+# The dynamic loader needs __tls_get_addr for TLS.
+ld.so: __tls_get_addr