[v7,2/2] x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers

Message ID 20240216002114.2255406-3-hjl.tools@gmail.com
State Superseded
Headers
Series x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Testing failed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed

Commit Message

H.J. Lu Feb. 16, 2024, 12:21 a.m. UTC
  Compiler generates the following instruction sequence for GNU2 dynamic
TLS access:

	leaq	tls_var@TLSDESC(%rip), %rax
	call	*tls_var@TLSCALL(%rax)

or

	leal	tls_var@TLSDESC(%ebx), %eax
	call	*tls_var@TLSCALL(%eax)

CALL instruction is transparent to compiler which assumes all registers,
except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
_dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
path.  __tls_get_addr is a normal function which doesn't preserve any
caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
caller-saved registers, but didn't preserve any other caller-saved
registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
XSAVE and XSAVEC to save and restore all caller-saved registers.  This
fixes BZ #31372.

Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
to optimize elf_machine_runtime_setup.
---
 elf/Makefile                                 |  36 +++-
 elf/malloc-for-test.c                        |  32 ++++
 elf/malloc-for-test.map.in                   |   8 +
 elf/tst-gnu2-tls2.c                          |  97 ++++++++++
 elf/tst-gnu2-tls2.h                          |  26 +++
 elf/tst-gnu2-tls2mod0.c                      |  28 +++
 elf/tst-gnu2-tls2mod1.c                      |  28 +++
 elf/tst-gnu2-tls2mod2.c                      |  28 +++
 sysdeps/i386/dl-machine.h                    |   2 +-
 sysdeps/i386/dl-tlsdesc-dynamic.h            | 190 +++++++++++++++++++
 sysdeps/i386/dl-tlsdesc.S                    | 115 +++++------
 sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
 sysdeps/x86/Makefile                         |   7 +-
 sysdeps/x86/cpu-features.c                   |  56 +++++-
 sysdeps/x86/dl-procinfo.c                    |  16 ++
 sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
 sysdeps/x86/malloc-for-test.c                |  33 ++++
 sysdeps/x86/sysdep.h                         |   6 +
 sysdeps/x86_64/Makefile                      |   2 +-
 sysdeps/x86_64/dl-machine.h                  |  19 +-
 sysdeps/x86_64/dl-procinfo.c                 |  16 ++
 sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
 sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
 sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
 sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
 sysdeps/x86_64/dl-trampoline.S               |  20 +-
 sysdeps/x86_64/dl-trampoline.h               |  34 +---
 27 files changed, 950 insertions(+), 215 deletions(-)
 create mode 100644 elf/malloc-for-test.c
 create mode 100644 elf/malloc-for-test.map.in
 create mode 100644 elf/tst-gnu2-tls2.c
 create mode 100644 elf/tst-gnu2-tls2.h
 create mode 100644 elf/tst-gnu2-tls2mod0.c
 create mode 100644 elf/tst-gnu2-tls2mod1.c
 create mode 100644 elf/tst-gnu2-tls2mod2.c
 create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
 create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
 rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
 create mode 100644 sysdeps/x86/malloc-for-test.c
 create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
  

Comments

Noah Goldstein Feb. 16, 2024, 8:04 a.m. UTC | #1
On Fri, Feb 16, 2024 at 12:21 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Compiler generates the following instruction sequence for GNU2 dynamic
> TLS access:
>
>         leaq    tls_var@TLSDESC(%rip), %rax
>         call    *tls_var@TLSCALL(%rax)
>
> or
>
>         leal    tls_var@TLSDESC(%ebx), %eax
>         call    *tls_var@TLSCALL(%eax)
>
> CALL instruction is transparent to compiler which assumes all registers,
> except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> path.  __tls_get_addr is a normal function which doesn't preserve any
> caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> caller-saved registers, but didn't preserve any other caller-saved
> registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> fixes BZ #31372.
>
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
>  elf/Makefile                                 |  36 +++-
>  elf/malloc-for-test.c                        |  32 ++++
>  elf/malloc-for-test.map.in                   |   8 +
>  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
>  elf/tst-gnu2-tls2.h                          |  26 +++
>  elf/tst-gnu2-tls2mod0.c                      |  28 +++
>  elf/tst-gnu2-tls2mod1.c                      |  28 +++
>  elf/tst-gnu2-tls2mod2.c                      |  28 +++
>  sysdeps/i386/dl-machine.h                    |   2 +-
>  sysdeps/i386/dl-tlsdesc-dynamic.h            | 190 +++++++++++++++++++
>  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++------
>  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
>  sysdeps/x86/Makefile                         |   7 +-
>  sysdeps/x86/cpu-features.c                   |  56 +++++-
>  sysdeps/x86/dl-procinfo.c                    |  16 ++
>  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
>  sysdeps/x86/malloc-for-test.c                |  33 ++++
>  sysdeps/x86/sysdep.h                         |   6 +
>  sysdeps/x86_64/Makefile                      |   2 +-
>  sysdeps/x86_64/dl-machine.h                  |  19 +-
>  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
>  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
>  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
>  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
>  sysdeps/x86_64/dl-trampoline.S               |  20 +-
>  sysdeps/x86_64/dl-trampoline.h               |  34 +---
>  27 files changed, 950 insertions(+), 215 deletions(-)
>  create mode 100644 elf/malloc-for-test.c
>  create mode 100644 elf/malloc-for-test.map.in
>  create mode 100644 elf/tst-gnu2-tls2.c
>  create mode 100644 elf/tst-gnu2-tls2.h
>  create mode 100644 elf/tst-gnu2-tls2mod0.c
>  create mode 100644 elf/tst-gnu2-tls2mod1.c
>  create mode 100644 elf/tst-gnu2-tls2mod2.c
>  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
>  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
>  create mode 100644 sysdeps/x86/malloc-for-test.c
>  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..94d00f02be 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -183,8 +183,16 @@ routines += unwind-dw2-fde-glibc
>  shared-only-routines += unwind-dw2-fde-glibc
>  endif
>
> -before-compile  += $(objpfx)trusted-dirs.h
> -generated      += trusted-dirs.h trusted-dirs.st for-renamed/renamed.so
> +before-compile += \
> +  $(objpfx)malloc-for-test.map \
> +  $(objpfx)trusted-dirs.h \
> +# before-compile
> +generated += \
> +  for-renamed/renamed.so \
> +  malloc-for-test.map \
> +  trusted-dirs.h \
> +  trusted-dirs.st \
> +# generated
>  generated-dirs += for-renamed
>
>  ifeq ($(build-shared),yes)
> @@ -424,6 +432,7 @@ tests += \
>    tst-glibc-hwcaps-prepend \
>    tst-global1 \
>    tst-global2 \
> +  tst-gnu2-tls2 \
>    tst-initfinilazyfail \
>    tst-initorder \
>    tst-initorder2 \
> @@ -699,6 +708,7 @@ modules-names += \
>    libtracemod5-1 \
>    ltglobmod1 \
>    ltglobmod2 \
> +  malloc-for-test \
>    neededobj1 \
>    neededobj2 \
>    neededobj3 \
> @@ -846,6 +856,9 @@ modules-names += \
>    tst-filterobj-flt \
>    tst-finilazyfailmod \
>    tst-globalmod2 \
> +  tst-gnu2-tls2mod0 \
> +  tst-gnu2-tls2mod1 \
> +  tst-gnu2-tls2mod2 \
>    tst-initlazyfailmod \
>    tst-initorder2a \
>    tst-initorder2b \
> @@ -3044,8 +3057,27 @@ $(objpfx)tst-tlsgap.out: \
>    $(objpfx)tst-tlsgap-mod0.so \
>    $(objpfx)tst-tlsgap-mod1.so \
>    $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: \
> +  $(shared-thread-library) \
> +  $(objpfx)malloc-for-test.so
> +$(objpfx)tst-gnu2-tls2.out: \
> +  $(objpfx)tst-gnu2-tls2mod0.so \
> +  $(objpfx)tst-gnu2-tls2mod1.so \
> +  $(objpfx)tst-gnu2-tls2mod2.so
> +
> +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> +
> +$(objpfx)malloc-for-test.map: $(objpfx)../abi-versions.h
> +       echo "#include \"malloc-for-test.map.in\"" \
> +         | $(CC) -E -I$(objpfx).. - \
> +         | sed -n '/GLIBC/,$$ p' | sed -n '/#/q;p' > $@
> +
>  ifeq (yes,$(have-mtls-dialect-gnu2))
>  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
>  endif
> diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> new file mode 100644
> index 0000000000..1bec69eda7
> --- /dev/null
> +++ b/elf/malloc-for-test.c
> @@ -0,0 +1,32 @@
> +/* A malloc for intercept test.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +
> +extern void * __libc_malloc (size_t);
> +
> +#ifndef PREPARE_MALLOC
> +# define PREPARE_MALLOC()
> +#endif
> +
> +void *
> +malloc (size_t n)
> +{
> +  PREPARE_MALLOC ();
> +  return __libc_malloc (n);
> +}
> diff --git a/elf/malloc-for-test.map.in b/elf/malloc-for-test.map.in
> new file mode 100644
> index 0000000000..2b96d95954
> --- /dev/null
> +++ b/elf/malloc-for-test.map.in
> @@ -0,0 +1,8 @@
> +#include <abi-versions.h>
> +
> +VERSION_libc_GLIBC_2_0 {
> +  global:
> +    malloc;
> +  local:
> +    *;
> +};
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..34427f9a0f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,97 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include <support/test-driver.h>
> +#include "tst-gnu2-tls2.h"
> +
> +#ifndef IS_SUPPORTED
> +# define IS_SUPPORTED() true
> +#endif
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> +  printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> +  xdlclose (mod[i]);
> +  mod[i] = NULL;
> +  printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> +  struct tls var = { -1, -1, -1, -1 };
> +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> +  struct tls *p = f (&var);
> +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> +  ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> +  /* The DTV generation is at the last dlopen of mod0 and the
> +     entry for mod1 is NULL.  */
> +
> +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> +
> +  /* Force the slow path in GNU2 TLS descriptor call.  */
> +  access_mod (1, "apply_tls");
> +
> +  return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  if (!IS_SUPPORTED ())
> +    return EXIT_UNSUPPORTED;
> +
> +  open_mod (0);
> +  open_mod (1);
> +  open_mod (2);
> +  close_mod (0);
> +  close_mod (1); /* Create modid gap at mod1.  */
> +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> +
> +  /* Create a thread where DTV of mod1 is NULL.  */
> +  pthread_t t = xpthread_create (NULL, start, NULL);
> +  xpthread_join (t);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> +  int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var0 = *p;
> +  return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var1[1] = *p;
> +  return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var2 = *p;
> +  return &tls_var2;
> +}
> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> index fc1ef96587..50d74fe6e9 100644
> --- a/sysdeps/i386/dl-machine.h
> +++ b/sysdeps/i386/dl-machine.h
> @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + (ElfW(Word))td->arg);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..c857c68c55
> --- /dev/null
> +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,190 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#undef REGISTER_SAVE_AREA
> +
> +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# ifdef USE_FNSAVE
> +#  error USE_FNSAVE shouldn't be defined
> +# endif
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save all registers.  */
> +#  define REGISTER_SAVE_AREA   512
> +# endif
> +#else
> +# ifdef USE_FNSAVE
> +/* Use fnsave to save x87 FPU stack registers.  */
> +#  define REGISTER_SAVE_AREA   108
> +# else
> +#  ifndef USE_FXSAVE
> +#   error USE_FXSAVE must be defined
> +#  endif
> +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> +   to 16 bytes.  */
> +#  define REGISTER_SAVE_AREA   (512 + 12)
> +# endif
> +#endif
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* This function is used for symbols that need dynamic TLS.
> +
nit: comment start at line start.
> +       %eax points to the TLS descriptor, such that 0(%eax) points to
> +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +__attribute__ ((__regparm__ (1)))
> +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       /* Like all TLS resolvers, preserve call-clobbered registers.
> +          We need two scratch regs anyway.  */
> +       subl    $32, %esp
> +       cfi_adjust_cfa_offset (32)
> +       movl    %ecx, 20(%esp)
> +       movl    %edx, 24(%esp)
> +       movl    TLSDESC_ARG(%eax), %eax
> +       movl    %gs:DTV_OFFSET, %edx
> +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> +       cmpl    (%edx), %ecx
> +       ja      2f
> +       movl    TLSDESC_MODID(%eax), %ecx
> +       movl    (%edx,%ecx,8), %edx
maybe 8 -> TLSDESC_DTV_SIZE?
> +       cmpl    $-1, %edx
-1 -> TLS_DTV_UNALLOCATED
> +       je      2f
> +       movl    TLSDESC_MODOFF(%eax), %eax
> +       addl    %edx, %eax
> +1:
> +       movl    20(%esp), %ecx
> +       subl    %gs:0, %eax
> +       movl    24(%esp), %edx
> +       addl    $32, %esp
> +       cfi_adjust_cfa_offset (-32)
> +       ret
> +       .p2align 4,,7
> +2:
> +       cfi_adjust_cfa_offset (32)
I still don't understand what this cfi is for?
You already have `cfi_adjust_cfa_offset (32)` above right after
the `subl $32, %esp`
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movl    %ebx, -28(%esp)
> +       movl    %esp, %ebx
> +       cfi_def_cfa_register(%ebx)
> +       and     $-STATE_SAVE_ALIGNMENT, %esp
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +       subl    $REGISTER_SAVE_AREA, %esp
> +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> +#  error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true
> +# endif
> +       # Allocate stack space of the required size to save the state.
nit: comment with /* or //
likewise below.
> +       LOAD_PIC_REG (cx)
> +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> +#endif
> +#ifdef USE_FNSAVE
> +       fnsave  (%esp)
> +#elif defined USE_FXSAVE
> +       fxsave  (%esp)
> +#else
> +       # Save the argument for ___tls_get_addr in EAX.
> +       movl    %eax, %ecx
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movl    %edx, (512)(%esp)
> +       movl    %edx, (512 + 4 * 1)(%esp)
> +       movl    %edx, (512 + 4 * 2)(%esp)
> +       movl    %edx, (512 + 4 * 3)(%esp)
> +# endif
> +       movl    %edx, (512 + 4 * 4)(%esp)
> +       movl    %edx, (512 + 4 * 5)(%esp)
> +       movl    %edx, (512 + 4 * 6)(%esp)
> +       movl    %edx, (512 + 4 * 7)(%esp)
> +       movl    %edx, (512 + 4 * 8)(%esp)
> +       movl    %edx, (512 + 4 * 9)(%esp)
> +       movl    %edx, (512 + 4 * 10)(%esp)
> +       movl    %edx, (512 + 4 * 11)(%esp)
> +       movl    %edx, (512 + 4 * 12)(%esp)
> +       movl    %edx, (512 + 4 * 13)(%esp)
> +       movl    %edx, (512 + 4 * 14)(%esp)
> +       movl    %edx, (512 + 4 * 15)(%esp)
> +# ifdef USE_XSAVE
> +       xsave   (%esp)
> +# else
> +       xsavec  (%esp)
> +# endif
> +       # Restore the argument for ___tls_get_addr in EAX.
> +       movl    %ecx, %eax
> +#endif
> +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FNSAVE
> +       frstor  (%esp)
> +#elif defined USE_FXSAVE
> +       fxrstor (%esp)
> +#else
> +       /* Save and retore ___tls_get_addr return value stored in EAX.  */
> +       movl    %eax, %ecx
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  (%esp)
> +       movl    %ecx, %eax
> +#endif
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %ebx, %esp
> +       cfi_def_cfa_register(%esp)
> +       movl    -28(%esp), %ebx
> +       cfi_restore(%ebx)
> +#else
> +       addl    $REGISTER_SAVE_AREA, %esp
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> index 90d93caa0c..f002feee56 100644
> --- a/sysdeps/i386/dl-tlsdesc.S
> +++ b/sysdeps/i386/dl-tlsdesc.S
> @@ -18,8 +18,27 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
>
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 4
> +#endif
> +
> +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> +
>         .text
>
>       /* This function is used to compute the TP offset for symbols in
> @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* This function is used for symbols that need dynamic TLS.
> -
> -       %eax points to the TLS descriptor, such that 0(%eax) points to
> -       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -__attribute__ ((__regparm__ (1)))
> -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       /* Like all TLS resolvers, preserve call-clobbered registers.
> -          We need two scratch regs anyway.  */
> -       subl    $28, %esp
> -       cfi_adjust_cfa_offset (28)
> -       movl    %ecx, 20(%esp)
> -       movl    %edx, 24(%esp)
> -       movl    TLSDESC_ARG(%eax), %eax
> -       movl    %gs:DTV_OFFSET, %edx
> -       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> -       cmpl    (%edx), %ecx
> -       ja      .Lslow
> -       movl    TLSDESC_MODID(%eax), %ecx
> -       movl    (%edx,%ecx,8), %edx
> -       cmpl    $-1, %edx
> -       je      .Lslow
> -       movl    TLSDESC_MODOFF(%eax), %eax
> -       addl    %edx, %eax
> -.Lret:
> -       movl    20(%esp), %ecx
> -       subl    %gs:0, %eax
> -       movl    24(%esp), %edx
> -       addl    $28, %esp
> -       cfi_adjust_cfa_offset (-28)
> -       ret
> -       .p2align 4,,7
> -.Lslow:
> -       cfi_adjust_cfa_offset (28)
> -       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FNSAVE
> +# define MINIMUM_ALIGNMENT     4
> +# define STATE_SAVE_ALIGNMENT  4
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fnsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef MINIMUM_ALIGNMENT
> +# undef USE_FNSAVE
> +
> +# define MINIMUM_ALIGNMENT     16
> +
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..92e7fbff89
> --- /dev/null
> +++ b/sysdeps/i386/tst-gnu2-tls2.c
> @@ -0,0 +1,5 @@
> +#include <sys/platform/x86.h>
> +
> +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> +
> +#include <elf/tst-gnu2-tls2.c>
> diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> index 73b29cc78c..581086305d 100644
> --- a/sysdeps/x86/Makefile
> +++ b/sysdeps/x86/Makefile
> @@ -1,5 +1,5 @@
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += cpu-features-offsets.sym
> +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
>  endif
>
>  ifeq ($(subdir),elf)
> @@ -86,6 +86,11 @@ endif
>  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
>  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
>  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> +
> +CFLAGS-malloc-for-test.c += -msse2
> +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
>  endif
>
>  ifeq ($(subdir),math)
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..835113b42f 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -27,8 +27,13 @@
>  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
>    attribute_hidden;
>
> -#if defined SHARED && defined __x86_64__
> -# include <dl-plt-rewrite.h>
> +#if defined SHARED
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +
> +# ifdef __x86_64__
> +#  include <dl-plt-rewrite.h>
>
>  static void
>  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
>                  : plt_rewrite_jmp);
>      }
>  }
> +# else
> +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> +# endif
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
>  #endif
>
>  #ifdef __LP64__
> @@ -1130,6 +1144,44 @@ no_cpuid:
>                TUNABLE_CALLBACK (set_x86_shstk));
>  #endif
>
> +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> +       {
> +#ifdef __x86_64__
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +#endif
> +#ifdef SHARED
> +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +#endif
> +       }
> +      else
> +       {
> +#ifdef __x86_64__
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +#endif
> +#ifdef SHARED
> +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +#endif
> +       }
> +    }
> +  else
> +    {
> +#ifdef __x86_64__
> +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> +#else
> +# ifdef SHARED
> +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +      else
> +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> +# endif
> +#endif
> +    }
> +
>  #ifdef SHARED
>  # ifdef __x86_64__
>    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> index ee957b4d70..5920d4b320 100644
> --- a/sysdeps/x86/dl-procinfo.c
> +++ b/sysdeps/x86/dl-procinfo.c
> @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
>  #else
>  ,
>  #endif
> +
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> +  ._dl_x86_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> similarity index 89%
> rename from sysdeps/x86_64/features-offsets.sym
> rename to sysdeps/x86/features-offsets.sym
> index 9e4be3393a..77e990c705 100644
> --- a/sysdeps/x86_64/features-offsets.sym
> +++ b/sysdeps/x86/features-offsets.sym
> @@ -3,4 +3,6 @@
>  #include <ldsodefs.h>
>
>  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> +#ifdef __x86_64__
>  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> +#endif
> diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> new file mode 100644
> index 0000000000..02f4dead5d
> --- /dev/null
> +++ b/sysdeps/x86/malloc-for-test.c
> @@ -0,0 +1,33 @@
> +/*  A malloc for intercept test.  x86 version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +/* Clear XMM0...XMM7  */
> +#define PREPARE_MALLOC()                               \
> +{                                                      \
> +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
> +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
> +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
> +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
> +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
> +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
> +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
> +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
> +}
> +
> +#include <elf/malloc-for-test.c>
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 837fd28734..485cad9c02 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -70,6 +70,12 @@
>     | (1 << X86_XSTATE_ZMM_H_ID))
>  #endif
>
> +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> +   Compiler assumes that all registers, including x87 FPU stack registers,
> +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> +#define TLSDESC_CALL_STATE_SAVE_MASK   \
> +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> +
>  /* Constants for bits in __x86_string_control:  */
>
>  /* Avoid short distance REP MOVSB.  */
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 145f25e7f6..9337e95093 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
>  endif
>
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += features-offsets.sym link-defines.sym
> +gen-as-const-headers += link-defines.sym
>  endif
>
>  ifeq ($(subdir),gmon)
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..ff5d45f7cb 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>                            int lazy, int profile)
>  {
>    Elf64_Addr *got;
> -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> -      const struct cpu_features* cpu_features = __get_cpu_features ();
> -
>  #ifdef SHARED
>        /* The got[2] entry contains the address of a function which gets
>          called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>          end in this function.  */
>        if (__glibc_unlikely (profile))
>         {
> +         const struct cpu_features* cpu_features = __get_cpu_features ();
>           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
>           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>           /* This function will get called to fix up the GOT entry
>              indicated by the offset on the stack, and then jump to
>              the resolved address.  */
> -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> -           *(ElfW(Addr) *) (got + 2)
> -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> -         else
> -           *(ElfW(Addr) *) (got + 2)
> -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> +         *(ElfW(Addr) *) (got + 2)
> +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
>         }
>      }
>
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + reloc->r_addend);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..06637a8154 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,21 @@
>
>  #include <sysdeps/x86/dl-procinfo.c>
>
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> +  ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
>  #undef PROCINFO_DECL
>  #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..ce0bc094ec
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,166 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef SECTION
> +# define SECTION(p)    p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> +       .section SECTION(.text),"ax",@progbits
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
basically same comments for x86 version as for i386.
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       _CET_ENDBR
> +       /* Preserve call-clobbered registers that we modify.
> +          We need two scratch regs anyway.  */
> +       movq    %rsi, -16(%rsp)
> +       mov     %fs:DTV_OFFSET, %RSI_LP
> +       movq    %rdi, -8(%rsp)
> +       movq    TLSDESC_ARG(%rax), %rdi
> +       movq    (%rsi), %rax
> +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> +       ja      2f
> +       movq    TLSDESC_MODID(%rdi), %rax
> +       salq    $4, %rax
> +       movq    (%rax,%rsi), %rax
> +       cmpq    $-1, %rax
> +       je      2f
> +       addq    TLSDESC_MODOFF(%rdi), %rax
> +1:
> +       movq    -16(%rsp), %rsi
> +       sub     %fs:0, %RAX_LP
> +       movq    -8(%rsp), %rdi
> +       ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movq    %rbx, -24(%rsp)
> +       mov     %RSP_LP, %RBX_LP
> +       cfi_def_cfa_register(%rbx)
> +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> +       # RBX above.
> +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +       # Allocate stack space of the required size to save the state.
> +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +#endif
> +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> +          r10 and r11.  */
> +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> +       fxsave  STATE_SAVE_OFFSET(%rsp)
> +#else
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> +# ifdef USE_XSAVE
> +       xsave   STATE_SAVE_OFFSET(%rsp)
> +# else
> +       xsavec  STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> +       /* %rdi already points to the tlsinfo data structure.  */
> +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FXSAVE
> +       fxrstor STATE_SAVE_OFFSET(%rsp)
> +#else
> +       /* Save and retore __tls_get_addr return value stored in RAX.  */
> +       mov     %RAX_LP, %RCX_LP
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  STATE_SAVE_OFFSET(%rsp)
> +       mov     %RCX_LP, %RAX_LP
> +#endif
> +       movq    REGISTER_SAVE_R11(%rsp), %r11
> +       movq    REGISTER_SAVE_R10(%rsp), %r10
> +       movq    REGISTER_SAVE_R9(%rsp), %r9
> +       movq    REGISTER_SAVE_R8(%rsp), %r8
> +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %RBX_LP, %RSP_LP
> +       cfi_def_cfa_register(%rsp)
> +       movq    -24(%rsp), %rbx
> +       cfi_restore(%rbx)
> +#else
> +       add     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> +   passing when calling _dl_tlsdesc_dynamic.  */
> +#define REGISTER_SAVE_RCX      0
> +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
>
>         .text
>
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       _CET_ENDBR
> -       /* Preserve call-clobbered registers that we modify.
> -          We need two scratch regs anyway.  */
> -       movq    %rsi, -16(%rsp)
> -       mov     %fs:DTV_OFFSET, %RSI_LP
> -       movq    %rdi, -8(%rsp)
> -       movq    TLSDESC_ARG(%rax), %rdi
> -       movq    (%rsi), %rax
> -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> -       ja      .Lslow
> -       movq    TLSDESC_MODID(%rdi), %rax
> -       salq    $4, %rax
> -       movq    (%rax,%rsi), %rax
> -       cmpq    $-1, %rax
> -       je      .Lslow
> -       addq    TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> -       movq    -16(%rsp), %rsi
> -       sub     %fs:0, %RAX_LP
> -       movq    -8(%rsp), %rdi
> -       ret
> -.Lslow:
> -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> -       subq    $72, %rsp
> -       cfi_adjust_cfa_offset (72)
> -       movq    %rdx, 8(%rsp)
> -       movq    %rcx, 16(%rsp)
> -       movq    %r8, 24(%rsp)
> -       movq    %r9, 32(%rsp)
> -       movq    %r10, 40(%rsp)
> -       movq    %r11, 48(%rsp)
> -       /* %rdi already points to the tlsinfo data structure.  */
> -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> -       movq    8(%rsp), %rdx
> -       movq    16(%rsp), %rcx
> -       movq    24(%rsp), %r8
> -       movq    32(%rsp), %r9
> -       movq    40(%rsp), %r10
> -       movq    48(%rsp), %r11
> -       addq    $72, %rsp
> -       cfi_adjust_cfa_offset (-72)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..84eac4a8ac
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,34 @@
> +/* x86-64 PLT trampoline register save macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> +   stack to 16 bytes before calling _dl_fixup.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX.  */
> +# define LOCAL_STORAGE_AREA    8
> +# define BASE                  rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers.  */
> +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> +#  if (REGISTER_SAVE_AREA % 16) != 0
> +#   error REGISTER_SAVE_AREA must be multiple of 16
> +#  endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +#  error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers.  */
> +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address:  All saved
> +   registers.  */
> +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> +# define BASE                  rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
>  #include <features-offsets.h>
>  #include <link-defines.h>
>  #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> -   that stack will be always aligned at 16 bytes.  We use unaligned
> -   16-byte move to load and store SSE registers, which has no penalty
> -   on modern processors if stack is 16-byte aligned.  */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> -   stack to 16 bytes before calling _dl_fixup.  */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> -   || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
>  # undef LOCAL_STORAGE_AREA
>  # undef BASE
>
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX.  */
> -#  define LOCAL_STORAGE_AREA   8
> -#  define BASE                 rbx
> -#  ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers.  */
> -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> -#   if (REGISTER_SAVE_AREA % 16) != 0
> -#    error REGISTER_SAVE_AREA must be multiple of 16
> -#   endif
> -#  endif
> -# else
> -#  ifndef USE_FXSAVE
> -#   error USE_FXSAVE must be defined
> -#  endif
> -/* Use fxsave to save XMM registers.  */
> -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address:  All saved
> -   registers.  */
> -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> -#  define BASE                 rsp
> -#  if (REGISTER_SAVE_AREA % 16) != 8
> -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> -#  endif
> -# endif
> +# include "dl-trampoline-state.h"
>
>         .globl _dl_runtime_resolve
>         .hidden _dl_runtime_resolve
> --
> 2.43.0
>
  
H.J. Lu Feb. 16, 2024, 12:15 p.m. UTC | #2
On Fri, Feb 16, 2024 at 12:05 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Feb 16, 2024 at 12:21 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Compiler generates the following instruction sequence for GNU2 dynamic
> > TLS access:
> >
> >         leaq    tls_var@TLSDESC(%rip), %rax
> >         call    *tls_var@TLSCALL(%rax)
> >
> > or
> >
> >         leal    tls_var@TLSDESC(%ebx), %eax
> >         call    *tls_var@TLSCALL(%eax)
> >
> > CALL instruction is transparent to compiler which assumes all registers,
> > except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> > _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> > path.  __tls_get_addr is a normal function which doesn't preserve any
> > caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> > caller-saved registers, but didn't preserve any other caller-saved
> > registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> > XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> > fixes BZ #31372.
> >
> > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> > to optimize elf_machine_runtime_setup.
> > ---
> >  elf/Makefile                                 |  36 +++-
> >  elf/malloc-for-test.c                        |  32 ++++
> >  elf/malloc-for-test.map.in                   |   8 +
> >  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
> >  elf/tst-gnu2-tls2.h                          |  26 +++
> >  elf/tst-gnu2-tls2mod0.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod1.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod2.c                      |  28 +++
> >  sysdeps/i386/dl-machine.h                    |   2 +-
> >  sysdeps/i386/dl-tlsdesc-dynamic.h            | 190 +++++++++++++++++++
> >  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++------
> >  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
> >  sysdeps/x86/Makefile                         |   7 +-
> >  sysdeps/x86/cpu-features.c                   |  56 +++++-
> >  sysdeps/x86/dl-procinfo.c                    |  16 ++
> >  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
> >  sysdeps/x86/malloc-for-test.c                |  33 ++++
> >  sysdeps/x86/sysdep.h                         |   6 +
> >  sysdeps/x86_64/Makefile                      |   2 +-
> >  sysdeps/x86_64/dl-machine.h                  |  19 +-
> >  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
> >  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
> >  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
> >  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
> >  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
> >  sysdeps/x86_64/dl-trampoline.S               |  20 +-
> >  sysdeps/x86_64/dl-trampoline.h               |  34 +---
> >  27 files changed, 950 insertions(+), 215 deletions(-)
> >  create mode 100644 elf/malloc-for-test.c
> >  create mode 100644 elf/malloc-for-test.map.in
> >  create mode 100644 elf/tst-gnu2-tls2.c
> >  create mode 100644 elf/tst-gnu2-tls2.h
> >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> >  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
> >  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
> >  create mode 100644 sysdeps/x86/malloc-for-test.c
> >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> >
...
> > diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > new file mode 100644
> > index 0000000000..c857c68c55
> > --- /dev/null
> > +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > @@ -0,0 +1,190 @@
> > +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#undef REGISTER_SAVE_AREA
> > +
> > +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > +#endif
> > +
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +# ifdef USE_FNSAVE
> > +#  error USE_FNSAVE shouldn't be defined
> > +# endif
> > +# ifdef USE_FXSAVE
> > +/* Use fxsave to save all registers.  */
> > +#  define REGISTER_SAVE_AREA   512
> > +# endif
> > +#else
> > +# ifdef USE_FNSAVE
> > +/* Use fnsave to save x87 FPU stack registers.  */
> > +#  define REGISTER_SAVE_AREA   108
> > +# else
> > +#  ifndef USE_FXSAVE
> > +#   error USE_FXSAVE must be defined
> > +#  endif
> > +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> > +   to 16 bytes.  */
> > +#  define REGISTER_SAVE_AREA   (512 + 12)
> > +# endif
> > +#endif
> > +
> > +       .hidden _dl_tlsdesc_dynamic
> > +       .global _dl_tlsdesc_dynamic
> > +       .type   _dl_tlsdesc_dynamic,@function
> > +
> > +     /* This function is used for symbols that need dynamic TLS.
> > +
> nit: comment start at line start.

This is copied from the original dl-tlsdesc.S.  I prefer to leave it
alone.

> > +       %eax points to the TLS descriptor, such that 0(%eax) points to
> > +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > +       between the thread pointer and the object denoted by the
> > +       argument, without clobbering any registers.
> > +
> > +       The assembly code that follows is a rendition of the following
> > +       C code, hand-optimized a little bit.
> > +
> > +ptrdiff_t
> > +__attribute__ ((__regparm__ (1)))
> > +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > +{
> > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > +                           != TLS_DTV_UNALLOCATED),
> > +                       1))
> > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > +      - __thread_pointer;
> > +
> > +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > +}
> > +*/
> > +       cfi_startproc
> > +       .align 16
> > +_dl_tlsdesc_dynamic:
> > +       /* Like all TLS resolvers, preserve call-clobbered registers.
> > +          We need two scratch regs anyway.  */
> > +       subl    $32, %esp
> > +       cfi_adjust_cfa_offset (32)
> > +       movl    %ecx, 20(%esp)
> > +       movl    %edx, 24(%esp)
> > +       movl    TLSDESC_ARG(%eax), %eax
> > +       movl    %gs:DTV_OFFSET, %edx
> > +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > +       cmpl    (%edx), %ecx
> > +       ja      2f
> > +       movl    TLSDESC_MODID(%eax), %ecx
> > +       movl    (%edx,%ecx,8), %edx
> maybe 8 -> TLSDESC_DTV_SIZE?

This is copied from the original dl-tlsdesc.S.  I prefer to leave it
alone.

> > +       cmpl    $-1, %edx
> -1 -> TLS_DTV_UNALLOCATED

This is copied from the original dl-tlsdesc.S.  I prefer to leave it
alone.

> > +       je      2f
> > +       movl    TLSDESC_MODOFF(%eax), %eax
> > +       addl    %edx, %eax
> > +1:
> > +       movl    20(%esp), %ecx
> > +       subl    %gs:0, %eax
> > +       movl    24(%esp), %edx
> > +       addl    $32, %esp
> > +       cfi_adjust_cfa_offset (-32)
> > +       ret
> > +       .p2align 4,,7
> > +2:
> > +       cfi_adjust_cfa_offset (32)
> I still don't understand what this cfi is for?
> You already have `cfi_adjust_cfa_offset (32)` above right after
> the `subl $32, %esp`

There are

        subl    $32, %esp
        cfi_adjust_cfa_offset (32)
...
        addl    $32, %esp
        cfi_adjust_cfa_offset (-32)
        ret
        .p2align 4,,7
2:

What is the CFA at the label 2 for GDB?  GDB only consumes CFI
directives.

> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       movl    %ebx, -28(%esp)
> > +       movl    %esp, %ebx
> > +       cfi_def_cfa_register(%ebx)
> > +       and     $-STATE_SAVE_ALIGNMENT, %esp
> > +#endif
> > +#ifdef REGISTER_SAVE_AREA
> > +       subl    $REGISTER_SAVE_AREA, %esp
> > +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > +# endif
> > +#else
> > +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +#  error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true
> > +# endif
> > +       # Allocate stack space of the required size to save the state.
> nit: comment with /* or //
> likewise below.

Will fix them.

> > +       LOAD_PIC_REG (cx)
> > +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> > +#endif
> > +#ifdef USE_FNSAVE
> > +       fnsave  (%esp)
> > +#elif defined USE_FXSAVE
> > +       fxsave  (%esp)
> > +#else
> > +       # Save the argument for ___tls_get_addr in EAX.
> > +       movl    %eax, %ecx
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       # Clear the XSAVE Header.
> > +# ifdef USE_XSAVE
> > +       movl    %edx, (512)(%esp)
> > +       movl    %edx, (512 + 4 * 1)(%esp)
> > +       movl    %edx, (512 + 4 * 2)(%esp)
> > +       movl    %edx, (512 + 4 * 3)(%esp)
> > +# endif
> > +       movl    %edx, (512 + 4 * 4)(%esp)
> > +       movl    %edx, (512 + 4 * 5)(%esp)
> > +       movl    %edx, (512 + 4 * 6)(%esp)
> > +       movl    %edx, (512 + 4 * 7)(%esp)
> > +       movl    %edx, (512 + 4 * 8)(%esp)
> > +       movl    %edx, (512 + 4 * 9)(%esp)
> > +       movl    %edx, (512 + 4 * 10)(%esp)
> > +       movl    %edx, (512 + 4 * 11)(%esp)
> > +       movl    %edx, (512 + 4 * 12)(%esp)
> > +       movl    %edx, (512 + 4 * 13)(%esp)
> > +       movl    %edx, (512 + 4 * 14)(%esp)
> > +       movl    %edx, (512 + 4 * 15)(%esp)
> > +# ifdef USE_XSAVE
> > +       xsave   (%esp)
> > +# else
> > +       xsavec  (%esp)
> > +# endif
> > +       # Restore the argument for ___tls_get_addr in EAX.
> > +       movl    %ecx, %eax
> > +#endif
> > +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > +       # Get register content back.
> > +#ifdef USE_FNSAVE
> > +       frstor  (%esp)
> > +#elif defined USE_FXSAVE
> > +       fxrstor (%esp)
> > +#else
> > +       /* Save and retore ___tls_get_addr return value stored in EAX.  */
> > +       movl    %eax, %ecx
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       xrstor  (%esp)
> > +       movl    %ecx, %eax
> > +#endif
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       mov     %ebx, %esp
> > +       cfi_def_cfa_register(%esp)
> > +       movl    -28(%esp), %ebx
> > +       cfi_restore(%ebx)
> > +#else
> > +       addl    $REGISTER_SAVE_AREA, %esp
> > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> > +#endif
> > +       jmp     1b
> > +       cfi_endproc
> > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +
> > +#undef STATE_SAVE_ALIGNMENT
> > diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> > index 90d93caa0c..f002feee56 100644
> > --- a/sysdeps/i386/dl-tlsdesc.S
> > +++ b/sysdeps/i386/dl-tlsdesc.S
> > @@ -18,8 +18,27 @@
> >
> >  #include <sysdep.h>
> >  #include <tls.h>
> > +#include <cpu-features-offsets.h>
> > +#include <features-offsets.h>
> >  #include "tlsdesc.h"
> >
> > +#ifndef DL_STACK_ALIGNMENT
> > +/* Due to GCC bug:
> > +
> > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > +
> > +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > +   that stack will be always aligned at 16 bytes.  */
> > +# define DL_STACK_ALIGNMENT 4
> > +#endif
> > +
> > +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> > +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> > +
> >         .text
> >
> >       /* This function is used to compute the TP offset for symbols in
> > @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
> >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> >
> >  #ifdef SHARED
> > -       .hidden _dl_tlsdesc_dynamic
> > -       .global _dl_tlsdesc_dynamic
> > -       .type   _dl_tlsdesc_dynamic,@function
> > -
> > -     /* This function is used for symbols that need dynamic TLS.
> > -
> > -       %eax points to the TLS descriptor, such that 0(%eax) points to
> > -       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > -       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > -       between the thread pointer and the object denoted by the
> > -       argument, without clobbering any registers.
> > -
> > -       The assembly code that follows is a rendition of the following
> > -       C code, hand-optimized a little bit.
> > -
> > -ptrdiff_t
> > -__attribute__ ((__regparm__ (1)))
> > -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > -{
> > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > -                           != TLS_DTV_UNALLOCATED),
> > -                       1))
> > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > -      - __thread_pointer;
> > -
> > -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > -}
> > -*/
> > -       cfi_startproc
> > -       .align 16
> > -_dl_tlsdesc_dynamic:
> > -       /* Like all TLS resolvers, preserve call-clobbered registers.
> > -          We need two scratch regs anyway.  */
> > -       subl    $28, %esp
> > -       cfi_adjust_cfa_offset (28)
> > -       movl    %ecx, 20(%esp)
> > -       movl    %edx, 24(%esp)
> > -       movl    TLSDESC_ARG(%eax), %eax
> > -       movl    %gs:DTV_OFFSET, %edx
> > -       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > -       cmpl    (%edx), %ecx
> > -       ja      .Lslow
> > -       movl    TLSDESC_MODID(%eax), %ecx
> > -       movl    (%edx,%ecx,8), %edx
> > -       cmpl    $-1, %edx
> > -       je      .Lslow
> > -       movl    TLSDESC_MODOFF(%eax), %eax
> > -       addl    %edx, %eax
> > -.Lret:
> > -       movl    20(%esp), %ecx
> > -       subl    %gs:0, %eax
> > -       movl    24(%esp), %edx
> > -       addl    $28, %esp
> > -       cfi_adjust_cfa_offset (-28)
> > -       ret
> > -       .p2align 4,,7
> > -.Lslow:
> > -       cfi_adjust_cfa_offset (28)
> > -       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > -       jmp     .Lret
> > -       cfi_endproc
> > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +# define USE_FNSAVE
> > +# define MINIMUM_ALIGNMENT     4
> > +# define STATE_SAVE_ALIGNMENT  4
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fnsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef MINIMUM_ALIGNMENT
> > +# undef USE_FNSAVE
> > +
> > +# define MINIMUM_ALIGNMENT     16
> > +
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_FXSAVE
> > +
> > +# define USE_XSAVE
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVE
> > +
> > +# define USE_XSAVEC
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVEC
> >  #endif /* SHARED */
> > diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> > new file mode 100644
> > index 0000000000..92e7fbff89
> > --- /dev/null
> > +++ b/sysdeps/i386/tst-gnu2-tls2.c
> > @@ -0,0 +1,5 @@
> > +#include <sys/platform/x86.h>
> > +
> > +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> > +
> > +#include <elf/tst-gnu2-tls2.c>
> > diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> > index 73b29cc78c..581086305d 100644
> > --- a/sysdeps/x86/Makefile
> > +++ b/sysdeps/x86/Makefile
> > @@ -1,5 +1,5 @@
> >  ifeq ($(subdir),csu)
> > -gen-as-const-headers += cpu-features-offsets.sym
> > +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
> >  endif
> >
> >  ifeq ($(subdir),elf)
> > @@ -86,6 +86,11 @@ endif
> >  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
> >  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
> >  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> > +
> > +CFLAGS-malloc-for-test.c += -msse2
> > +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> > +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> > +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
> >  endif
> >
> >  ifeq ($(subdir),math)
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 25e6622a79..835113b42f 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -27,8 +27,13 @@
> >  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
> >    attribute_hidden;
> >
> > -#if defined SHARED && defined __x86_64__
> > -# include <dl-plt-rewrite.h>
> > +#if defined SHARED
> > +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> > +
> > +# ifdef __x86_64__
> > +#  include <dl-plt-rewrite.h>
> >
> >  static void
> >  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> > @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> >                  : plt_rewrite_jmp);
> >      }
> >  }
> > +# else
> > +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> > +# endif
> > +#endif
> > +
> > +#ifdef __x86_64__
> > +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
> >  #endif
> >
> >  #ifdef __LP64__
> > @@ -1130,6 +1144,44 @@ no_cpuid:
> >                TUNABLE_CALLBACK (set_x86_shstk));
> >  #endif
> >
> > +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> > +       {
> > +#ifdef __x86_64__
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> > +#endif
> > +#ifdef SHARED
> > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> > +#endif
> > +       }
> > +      else
> > +       {
> > +#ifdef __x86_64__
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> > +#endif
> > +#ifdef SHARED
> > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> > +#endif
> > +       }
> > +    }
> > +  else
> > +    {
> > +#ifdef __x86_64__
> > +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> > +# ifdef SHARED
> > +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > +# endif
> > +#else
> > +# ifdef SHARED
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > +      else
> > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> > +# endif
> > +#endif
> > +    }
> > +
> >  #ifdef SHARED
> >  # ifdef __x86_64__
> >    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> > diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> > index ee957b4d70..5920d4b320 100644
> > --- a/sysdeps/x86/dl-procinfo.c
> > +++ b/sysdeps/x86/dl-procinfo.c
> > @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
> >  #else
> >  ,
> >  #endif
> > +
> > +#if defined SHARED && !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL
> > +  ._dl_x86_tlsdesc_dynamic
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# ifdef PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> > similarity index 89%
> > rename from sysdeps/x86_64/features-offsets.sym
> > rename to sysdeps/x86/features-offsets.sym
> > index 9e4be3393a..77e990c705 100644
> > --- a/sysdeps/x86_64/features-offsets.sym
> > +++ b/sysdeps/x86/features-offsets.sym
> > @@ -3,4 +3,6 @@
> >  #include <ldsodefs.h>
> >
> >  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> > +#ifdef __x86_64__
> >  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> > +#endif
> > diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> > new file mode 100644
> > index 0000000000..02f4dead5d
> > --- /dev/null
> > +++ b/sysdeps/x86/malloc-for-test.c
> > @@ -0,0 +1,33 @@
> > +/*  A malloc for intercept test.  x86 version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +
> > +/* Clear XMM0...XMM7  */
> > +#define PREPARE_MALLOC()                               \
> > +{                                                      \
> > +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
> > +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
> > +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
> > +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
> > +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
> > +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
> > +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
> > +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
> > +}
> > +
> > +#include <elf/malloc-for-test.c>
> > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > index 837fd28734..485cad9c02 100644
> > --- a/sysdeps/x86/sysdep.h
> > +++ b/sysdeps/x86/sysdep.h
> > @@ -70,6 +70,12 @@
> >     | (1 << X86_XSTATE_ZMM_H_ID))
> >  #endif
> >
> > +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> > +   Compiler assumes that all registers, including x87 FPU stack registers,
> > +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> > +#define TLSDESC_CALL_STATE_SAVE_MASK   \
> > +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> > +
> >  /* Constants for bits in __x86_string_control:  */
> >
> >  /* Avoid short distance REP MOVSB.  */
> > diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> > index 145f25e7f6..9337e95093 100644
> > --- a/sysdeps/x86_64/Makefile
> > +++ b/sysdeps/x86_64/Makefile
> > @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
> >  endif
> >
> >  ifeq ($(subdir),csu)
> > -gen-as-const-headers += features-offsets.sym link-defines.sym
> > +gen-as-const-headers += link-defines.sym
> >  endif
> >
> >  ifeq ($(subdir),gmon)
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index 6d605d0d32..ff5d45f7cb 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >                            int lazy, int profile)
> >  {
> >    Elf64_Addr *got;
> > -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> > @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >        /* Identify this shared object.  */
> >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> >
> > -      const struct cpu_features* cpu_features = __get_cpu_features ();
> > -
> >  #ifdef SHARED
> >        /* The got[2] entry contains the address of a function which gets
> >          called to get the address of a so far unresolved function and
> > @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >          end in this function.  */
> >        if (__glibc_unlikely (profile))
> >         {
> > +         const struct cpu_features* cpu_features = __get_cpu_features ();
> >           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> >           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >           /* This function will get called to fix up the GOT entry
> >              indicated by the offset on the stack, and then jump to
> >              the resolved address.  */
> > -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > -         else
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> > +         *(ElfW(Addr) *) (got + 2)
> > +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
> >         }
> >      }
> >
> > @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
> >                   {
> >                     td->arg = _dl_make_tlsdesc_dynamic
> >                       (sym_map, sym->st_value + reloc->r_addend);
> > -                   td->entry = _dl_tlsdesc_dynamic;
> > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> >                   }
> >                 else
> >  #  endif
> > diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> > index 4d1d790fbb..06637a8154 100644
> > --- a/sysdeps/x86_64/dl-procinfo.c
> > +++ b/sysdeps/x86_64/dl-procinfo.c
> > @@ -41,5 +41,21 @@
> >
> >  #include <sysdeps/x86/dl-procinfo.c>
> >
> > +#if !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL && defined SHARED
> > +  ._dl_x86_64_runtime_resolve
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# if !defined SHARED || defined PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > +
> >  #undef PROCINFO_DECL
> >  #undef PROCINFO_CLASS
> > diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > new file mode 100644
> > index 0000000000..ce0bc094ec
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > @@ -0,0 +1,166 @@
> > +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef SECTION
> > +# define SECTION(p)    p
> > +#endif
> > +
> > +#undef REGISTER_SAVE_AREA
> > +#undef LOCAL_STORAGE_AREA
> > +#undef BASE
> > +
> > +#include "dl-trampoline-state.h"
> > +
> > +       .section SECTION(.text),"ax",@progbits
> > +
> > +       .hidden _dl_tlsdesc_dynamic
> > +       .global _dl_tlsdesc_dynamic
> > +       .type   _dl_tlsdesc_dynamic,@function
> > +
> > +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > +       between the thread pointer and the object denoted by the
> > +       argument, without clobbering any registers.
> > +
> > +       The assembly code that follows is a rendition of the following
> > +       C code, hand-optimized a little bit.
> > +
> > +ptrdiff_t
> > +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > +{
> > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > +                           != TLS_DTV_UNALLOCATED),
> > +                       1))
> > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > +      - __thread_pointer;
> > +
> > +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > +}
> > +*/
> basically same comments for x86 version as for i386.

I will leave the original code alone and only change comments
to /* ... */.
  

Patch

diff --git a/elf/Makefile b/elf/Makefile
index 5d78b659ce..94d00f02be 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -183,8 +183,16 @@  routines += unwind-dw2-fde-glibc
 shared-only-routines += unwind-dw2-fde-glibc
 endif
 
-before-compile  += $(objpfx)trusted-dirs.h
-generated	+= trusted-dirs.h trusted-dirs.st for-renamed/renamed.so
+before-compile += \
+  $(objpfx)malloc-for-test.map \
+  $(objpfx)trusted-dirs.h \
+# before-compile
+generated += \
+  for-renamed/renamed.so \
+  malloc-for-test.map \
+  trusted-dirs.h \
+  trusted-dirs.st \
+# generated
 generated-dirs	+= for-renamed
 
 ifeq ($(build-shared),yes)
@@ -424,6 +432,7 @@  tests += \
   tst-glibc-hwcaps-prepend \
   tst-global1 \
   tst-global2 \
+  tst-gnu2-tls2 \
   tst-initfinilazyfail \
   tst-initorder \
   tst-initorder2 \
@@ -699,6 +708,7 @@  modules-names += \
   libtracemod5-1 \
   ltglobmod1 \
   ltglobmod2 \
+  malloc-for-test \
   neededobj1 \
   neededobj2 \
   neededobj3 \
@@ -846,6 +856,9 @@  modules-names += \
   tst-filterobj-flt \
   tst-finilazyfailmod \
   tst-globalmod2 \
+  tst-gnu2-tls2mod0 \
+  tst-gnu2-tls2mod1 \
+  tst-gnu2-tls2mod2 \
   tst-initlazyfailmod \
   tst-initorder2a \
   tst-initorder2b \
@@ -3044,8 +3057,27 @@  $(objpfx)tst-tlsgap.out: \
   $(objpfx)tst-tlsgap-mod0.so \
   $(objpfx)tst-tlsgap-mod1.so \
   $(objpfx)tst-tlsgap-mod2.so
+
+$(objpfx)tst-gnu2-tls2: \
+  $(shared-thread-library) \
+  $(objpfx)malloc-for-test.so
+$(objpfx)tst-gnu2-tls2.out: \
+  $(objpfx)tst-gnu2-tls2mod0.so \
+  $(objpfx)tst-gnu2-tls2mod1.so \
+  $(objpfx)tst-gnu2-tls2mod2.so
+
+LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
+
+$(objpfx)malloc-for-test.map: $(objpfx)../abi-versions.h
+	echo "#include \"malloc-for-test.map.in\"" \
+	  | $(CC) -E -I$(objpfx).. - \
+	  | sed -n '/GLIBC/,$$ p' | sed -n '/#/q;p' > $@
+
 ifeq (yes,$(have-mtls-dialect-gnu2))
 CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
 endif
diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
new file mode 100644
index 0000000000..1bec69eda7
--- /dev/null
+++ b/elf/malloc-for-test.c
@@ -0,0 +1,32 @@ 
+/* A malloc for intercept test.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern void * __libc_malloc (size_t);
+
+#ifndef PREPARE_MALLOC
+# define PREPARE_MALLOC()
+#endif
+
+void *
+malloc (size_t n)
+{
+  PREPARE_MALLOC ();
+  return __libc_malloc (n);
+}
diff --git a/elf/malloc-for-test.map.in b/elf/malloc-for-test.map.in
new file mode 100644
index 0000000000..2b96d95954
--- /dev/null
+++ b/elf/malloc-for-test.map.in
@@ -0,0 +1,8 @@ 
+#include <abi-versions.h>
+
+VERSION_libc_GLIBC_2_0 {
+  global:
+    malloc;
+  local:
+    *;
+};
diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
new file mode 100644
index 0000000000..34427f9a0f
--- /dev/null
+++ b/elf/tst-gnu2-tls2.c
@@ -0,0 +1,97 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <support/xdlfcn.h>
+#include <support/xthread.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+#include "tst-gnu2-tls2.h"
+
+#ifndef IS_SUPPORTED
+# define IS_SUPPORTED() true
+#endif
+
+static void *mod[3];
+#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
+static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
+#undef MOD
+
+static void
+open_mod (int i)
+{
+  mod[i] = xdlopen (modname[i], RTLD_LAZY);
+  printf ("open %s\n", modname[i]);
+}
+
+static void
+close_mod (int i)
+{
+  xdlclose (mod[i]);
+  mod[i] = NULL;
+  printf ("close %s\n", modname[i]);
+}
+
+static void
+access_mod (int i, const char *sym)
+{
+  struct tls var = { -1, -1, -1, -1 };
+  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
+  struct tls *p = f (&var);
+  printf ("access %s: %s() = %p\n", modname[i], sym, p);
+  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
+  ++(p->a);
+}
+
+static void *
+start (void *arg)
+{
+  /* The DTV generation is at the last dlopen of mod0 and the
+     entry for mod1 is NULL.  */
+
+  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
+
+  /* Force the slow path in GNU2 TLS descriptor call.  */
+  access_mod (1, "apply_tls");
+
+  return arg;
+}
+
+static int
+do_test (void)
+{
+  if (!IS_SUPPORTED ())
+    return EXIT_UNSUPPORTED;
+
+  open_mod (0);
+  open_mod (1);
+  open_mod (2);
+  close_mod (0);
+  close_mod (1); /* Create modid gap at mod1.  */
+  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
+
+  /* Create a thread where DTV of mod1 is NULL.  */
+  pthread_t t = xpthread_create (NULL, start, NULL);
+  xpthread_join (t);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
new file mode 100644
index 0000000000..e33f4dbe27
--- /dev/null
+++ b/elf/tst-gnu2-tls2.h
@@ -0,0 +1,26 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+struct tls
+{
+  int64_t a, b, c, d;
+};
+
+extern struct tls *apply_tls (struct tls *);
diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
new file mode 100644
index 0000000000..67dc0d464d
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod0.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var0 = *p;
+  return &tls_var0;
+}
diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
new file mode 100644
index 0000000000..a4ae6db24f
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod1.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var1[1] = *p;
+  return &tls_var1[1];
+}
diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
new file mode 100644
index 0000000000..2d13921717
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod2.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var2 = *p;
+  return &tls_var2;
+}
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index fc1ef96587..50d74fe6e9 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -347,7 +347,7 @@  and creates an unsatisfiable circular dependency.\n",
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
 		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
-		    td->entry = _dl_tlsdesc_dynamic;
+		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
 		  }
 		else
 #  endif
diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
new file mode 100644
index 0000000000..c857c68c55
--- /dev/null
+++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
@@ -0,0 +1,190 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2004-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#undef REGISTER_SAVE_AREA
+
+#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
+#endif
+
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+# ifdef USE_FNSAVE
+#  error USE_FNSAVE shouldn't be defined
+# endif
+# ifdef USE_FXSAVE
+/* Use fxsave to save all registers.  */
+#  define REGISTER_SAVE_AREA	512
+# endif
+#else
+# ifdef USE_FNSAVE
+/* Use fnsave to save x87 FPU stack registers.  */
+#  define REGISTER_SAVE_AREA	108
+# else
+#  ifndef USE_FXSAVE
+#   error USE_FXSAVE must be defined
+#  endif
+/* Use fxsave to save all registers.  Add 12 bytes to align the stack
+   to 16 bytes.  */
+#  define REGISTER_SAVE_AREA	(512 + 12)
+# endif
+#endif
+
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* This function is used for symbols that need dynamic TLS.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %eax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+__attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	/* Like all TLS resolvers, preserve call-clobbered registers.
+	   We need two scratch regs anyway.  */
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	movl	%ecx, 20(%esp)
+	movl	%edx, 24(%esp)
+	movl	TLSDESC_ARG(%eax), %eax
+	movl	%gs:DTV_OFFSET, %edx
+	movl	TLSDESC_GEN_COUNT(%eax), %ecx
+	cmpl	(%edx), %ecx
+	ja	2f
+	movl	TLSDESC_MODID(%eax), %ecx
+	movl	(%edx,%ecx,8), %edx
+	cmpl	$-1, %edx
+	je	2f
+	movl	TLSDESC_MODOFF(%eax), %eax
+	addl	%edx, %eax
+1:
+	movl	20(%esp), %ecx
+	subl	%gs:0, %eax
+	movl	24(%esp), %edx
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+	.p2align 4,,7
+2:
+	cfi_adjust_cfa_offset (32)
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	movl	%ebx, -28(%esp)
+	movl	%esp, %ebx
+	cfi_def_cfa_register(%ebx)
+	and	$-STATE_SAVE_ALIGNMENT, %esp
+#endif
+#ifdef REGISTER_SAVE_AREA
+	subl	$REGISTER_SAVE_AREA, %esp
+# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+# endif
+#else
+# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
+#  error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true
+# endif
+	# Allocate stack space of the required size to save the state.
+	LOAD_PIC_REG (cx)
+	subl	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
+#endif
+#ifdef USE_FNSAVE
+	fnsave	(%esp)
+#elif defined USE_FXSAVE
+	fxsave	(%esp)
+#else
+	# Save the argument for ___tls_get_addr in EAX.
+	movl	%eax, %ecx
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	# Clear the XSAVE Header.
+# ifdef USE_XSAVE
+	movl	%edx, (512)(%esp)
+	movl	%edx, (512 + 4 * 1)(%esp)
+	movl	%edx, (512 + 4 * 2)(%esp)
+	movl	%edx, (512 + 4 * 3)(%esp)
+# endif
+	movl	%edx, (512 + 4 * 4)(%esp)
+	movl	%edx, (512 + 4 * 5)(%esp)
+	movl	%edx, (512 + 4 * 6)(%esp)
+	movl	%edx, (512 + 4 * 7)(%esp)
+	movl	%edx, (512 + 4 * 8)(%esp)
+	movl	%edx, (512 + 4 * 9)(%esp)
+	movl	%edx, (512 + 4 * 10)(%esp)
+	movl	%edx, (512 + 4 * 11)(%esp)
+	movl	%edx, (512 + 4 * 12)(%esp)
+	movl	%edx, (512 + 4 * 13)(%esp)
+	movl	%edx, (512 + 4 * 14)(%esp)
+	movl	%edx, (512 + 4 * 15)(%esp)
+# ifdef USE_XSAVE
+	xsave	(%esp)
+# else
+	xsavec	(%esp)
+# endif
+	# Restore the argument for ___tls_get_addr in EAX.
+	movl	%ecx, %eax
+#endif
+	call	HIDDEN_JUMPTARGET (___tls_get_addr)
+	# Get register content back.
+#ifdef USE_FNSAVE
+	frstor	(%esp)
+#elif defined USE_FXSAVE
+	fxrstor	(%esp)
+#else
+	/* Save and retore ___tls_get_addr return value stored in EAX.  */
+	movl	%eax, %ecx
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	xrstor	(%esp)
+	movl	%ecx, %eax
+#endif
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov	%ebx, %esp
+	cfi_def_cfa_register(%esp)
+	movl	-28(%esp), %ebx
+	cfi_restore(%ebx)
+#else
+	addl	$REGISTER_SAVE_AREA, %esp
+	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
+#endif
+	jmp	1b
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
index 90d93caa0c..f002feee56 100644
--- a/sysdeps/i386/dl-tlsdesc.S
+++ b/sysdeps/i386/dl-tlsdesc.S
@@ -18,8 +18,27 @@ 
 
 #include <sysdep.h>
 #include <tls.h>
+#include <cpu-features-offsets.h>
+#include <features-offsets.h>
 #include "tlsdesc.h"
 
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 4-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that stack will be always aligned at 16 bytes.  */
+# define DL_STACK_ALIGNMENT 4
+#endif
+
+/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
+   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
+
 	.text
 
      /* This function is used to compute the TP offset for symbols in
@@ -65,69 +84,35 @@  _dl_tlsdesc_undefweak:
 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
 
 #ifdef SHARED
-	.hidden _dl_tlsdesc_dynamic
-	.global	_dl_tlsdesc_dynamic
-	.type	_dl_tlsdesc_dynamic,@function
-
-     /* This function is used for symbols that need dynamic TLS.
-
-	%eax points to the TLS descriptor, such that 0(%eax) points to
-	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
-	tlsdesc_dynamic_arg object.  It must return in %eax the offset
-	between the thread pointer and the object denoted by the
-	argument, without clobbering any registers.
-
-	The assembly code that follows is a rendition of the following
-	C code, hand-optimized a little bit.
-
-ptrdiff_t
-__attribute__ ((__regparm__ (1)))
-_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
-{
-  struct tlsdesc_dynamic_arg *td = tdp->arg;
-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
-  if (__builtin_expect (td->gen_count <= dtv[0].counter
-			&& (dtv[td->tlsinfo.ti_module].pointer.val
-			    != TLS_DTV_UNALLOCATED),
-			1))
-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
-      - __thread_pointer;
-
-  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
-}
-*/
-	cfi_startproc
-	.align 16
-_dl_tlsdesc_dynamic:
-	/* Like all TLS resolvers, preserve call-clobbered registers.
-	   We need two scratch regs anyway.  */
-	subl	$28, %esp
-	cfi_adjust_cfa_offset (28)
-	movl	%ecx, 20(%esp)
-	movl	%edx, 24(%esp)
-	movl	TLSDESC_ARG(%eax), %eax
-	movl	%gs:DTV_OFFSET, %edx
-	movl	TLSDESC_GEN_COUNT(%eax), %ecx
-	cmpl	(%edx), %ecx
-	ja	.Lslow
-	movl	TLSDESC_MODID(%eax), %ecx
-	movl	(%edx,%ecx,8), %edx
-	cmpl	$-1, %edx
-	je	.Lslow
-	movl	TLSDESC_MODOFF(%eax), %eax
-	addl	%edx, %eax
-.Lret:
-	movl	20(%esp), %ecx
-	subl	%gs:0, %eax
-	movl	24(%esp), %edx
-	addl	$28, %esp
-	cfi_adjust_cfa_offset (-28)
-	ret
-	.p2align 4,,7
-.Lslow:
-	cfi_adjust_cfa_offset (28)
-	call	HIDDEN_JUMPTARGET (___tls_get_addr)
-	jmp	.Lret
-	cfi_endproc
-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+# define USE_FNSAVE
+# define MINIMUM_ALIGNMENT	4
+# define STATE_SAVE_ALIGNMENT	4
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fnsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef MINIMUM_ALIGNMENT
+# undef USE_FNSAVE
+
+# define MINIMUM_ALIGNMENT	16
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_FXSAVE
+
+# define USE_XSAVE
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVE
+
+# define USE_XSAVEC
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVEC
 #endif /* SHARED */
diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
new file mode 100644
index 0000000000..92e7fbff89
--- /dev/null
+++ b/sysdeps/i386/tst-gnu2-tls2.c
@@ -0,0 +1,5 @@ 
+#include <sys/platform/x86.h>
+
+#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
+
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 73b29cc78c..581086305d 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,5 +1,5 @@ 
 ifeq ($(subdir),csu)
-gen-as-const-headers += cpu-features-offsets.sym
+gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
 endif
 
 ifeq ($(subdir),elf)
@@ -86,6 +86,11 @@  endif
 tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
 tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
 tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
+
+CFLAGS-malloc-for-test.c += -msse2
+CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
+CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
+CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
 endif
 
 ifeq ($(subdir),math)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 25e6622a79..835113b42f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -27,8 +27,13 @@ 
 extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
   attribute_hidden;
 
-#if defined SHARED && defined __x86_64__
-# include <dl-plt-rewrite.h>
+#if defined SHARED
+extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
+
+# ifdef __x86_64__
+#  include <dl-plt-rewrite.h>
 
 static void
 TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
@@ -47,6 +52,15 @@  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
 		 : plt_rewrite_jmp);
     }
 }
+# else
+extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
+# endif
+#endif
+
+#ifdef __x86_64__
+extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
 #endif
 
 #ifdef __LP64__
@@ -1130,6 +1144,44 @@  no_cpuid:
 	       TUNABLE_CALLBACK (set_x86_shstk));
 #endif
 
+  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
+	{
+#ifdef __x86_64__
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
+#endif
+#ifdef SHARED
+	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
+#endif
+	}
+      else
+	{
+#ifdef __x86_64__
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
+#endif
+#ifdef SHARED
+	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
+#endif
+	}
+    }
+  else
+    {
+#ifdef __x86_64__
+      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
+# ifdef SHARED
+      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
+# endif
+#else
+# ifdef SHARED
+      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
+	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
+      else
+	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
+# endif
+#endif
+    }
+
 #ifdef SHARED
 # ifdef __x86_64__
   TUNABLE_GET (plt_rewrite, tunable_val_t *,
diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
index ee957b4d70..5920d4b320 100644
--- a/sysdeps/x86/dl-procinfo.c
+++ b/sysdeps/x86/dl-procinfo.c
@@ -86,3 +86,19 @@  PROCINFO_CLASS const char _dl_x86_platforms[4][9]
 #else
 ,
 #endif
+
+#if defined SHARED && !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL
+  ._dl_x86_tlsdesc_dynamic
+# else
+PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# ifdef PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
similarity index 89%
rename from sysdeps/x86_64/features-offsets.sym
rename to sysdeps/x86/features-offsets.sym
index 9e4be3393a..77e990c705 100644
--- a/sysdeps/x86_64/features-offsets.sym
+++ b/sysdeps/x86/features-offsets.sym
@@ -3,4 +3,6 @@ 
 #include <ldsodefs.h>
 
 RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
+#ifdef __x86_64__
 RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
+#endif
diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
new file mode 100644
index 0000000000..02f4dead5d
--- /dev/null
+++ b/sysdeps/x86/malloc-for-test.c
@@ -0,0 +1,33 @@ 
+/*  A malloc for intercept test.  x86 version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* Clear XMM0...XMM7  */
+#define PREPARE_MALLOC()				\
+{							\
+  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" );	\
+  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" );	\
+  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" );	\
+  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" );	\
+  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" );	\
+  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" );	\
+  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" );	\
+  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" );	\
+}
+
+#include <elf/malloc-for-test.c>
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 837fd28734..485cad9c02 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -70,6 +70,12 @@ 
    | (1 << X86_XSTATE_ZMM_H_ID))
 #endif
 
+/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
+   Compiler assumes that all registers, including x87 FPU stack registers,
+   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
+#define TLSDESC_CALL_STATE_SAVE_MASK	\
+  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
+
 /* Constants for bits in __x86_string_control:  */
 
 /* Avoid short distance REP MOVSB.  */
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 145f25e7f6..9337e95093 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -10,7 +10,7 @@  LDFLAGS-rtld += -Wl,-z,nomark-plt
 endif
 
 ifeq ($(subdir),csu)
-gen-as-const-headers += features-offsets.sym link-defines.sym
+gen-as-const-headers += link-defines.sym
 endif
 
 ifeq ($(subdir),gmon)
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 6d605d0d32..ff5d45f7cb 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -71,9 +71,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 			   int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -96,8 +93,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
-      const struct cpu_features* cpu_features = __get_cpu_features ();
-
 #ifdef SHARED
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
@@ -107,6 +102,7 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
+	  const struct cpu_features* cpu_features = __get_cpu_features ();
 	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
 	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
@@ -126,15 +122,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
-	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
-	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
-		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
-		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
-	  else
-	    *(ElfW(Addr) *) (got + 2)
-	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
+	  *(ElfW(Addr) *) (got + 2)
+	    = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
 	}
     }
 
@@ -383,7 +372,7 @@  and creates an unsatisfiable circular dependency.\n",
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
 		      (sym_map, sym->st_value + reloc->r_addend);
-		    td->entry = _dl_tlsdesc_dynamic;
+		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
 		  }
 		else
 #  endif
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
index 4d1d790fbb..06637a8154 100644
--- a/sysdeps/x86_64/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -41,5 +41,21 @@ 
 
 #include <sysdeps/x86/dl-procinfo.c>
 
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_64_runtime_resolve
+# else
+PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #undef PROCINFO_DECL
 #undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
new file mode 100644
index 0000000000..ce0bc094ec
--- /dev/null
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -0,0 +1,166 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
+   Copyright (C) 2004-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef SECTION
+# define SECTION(p)	p
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+
+#include "dl-trampoline-state.h"
+
+	.section SECTION(.text),"ax",@progbits
+
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* %rax points to the TLS descriptor, such that 0(%rax) points to
+	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %rax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	_CET_ENDBR
+	/* Preserve call-clobbered registers that we modify.
+	   We need two scratch regs anyway.  */
+	movq	%rsi, -16(%rsp)
+	mov	%fs:DTV_OFFSET, %RSI_LP
+	movq	%rdi, -8(%rsp)
+	movq	TLSDESC_ARG(%rax), %rdi
+	movq	(%rsi), %rax
+	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
+	ja	2f
+	movq	TLSDESC_MODID(%rdi), %rax
+	salq	$4, %rax
+	movq	(%rax,%rsi), %rax
+	cmpq	$-1, %rax
+	je	2f
+	addq	TLSDESC_MODOFF(%rdi), %rax
+1:
+	movq	-16(%rsp), %rsi
+	sub	%fs:0, %RAX_LP
+	movq	-8(%rsp), %rdi
+	ret
+2:
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	movq	%rbx, -24(%rsp)
+	mov	%RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and	$-STATE_SAVE_ALIGNMENT, %RSP_LP
+#endif
+#ifdef REGISTER_SAVE_AREA
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	# STATE_SAVE_OFFSET has space for 8 integer registers.  But we
+	# need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
+	# RBX above.
+	sub	$(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
+# else
+	sub	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+# endif
+#else
+	# Allocate stack space of the required size to save the state.
+	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+#endif
+	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
+	   r10 and r11.  */
+	movq	%rcx, REGISTER_SAVE_RCX(%rsp)
+	movq	%rdx, REGISTER_SAVE_RDX(%rsp)
+	movq	%r8, REGISTER_SAVE_R8(%rsp)
+	movq	%r9, REGISTER_SAVE_R9(%rsp)
+	movq	%r10, REGISTER_SAVE_R10(%rsp)
+	movq	%r11, REGISTER_SAVE_R11(%rsp)
+#ifdef USE_FXSAVE
+	fxsave	STATE_SAVE_OFFSET(%rsp)
+#else
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	# Clear the XSAVE Header.
+# ifdef USE_XSAVE
+	movq	%rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
+# endif
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
+# ifdef USE_XSAVE
+	xsave	STATE_SAVE_OFFSET(%rsp)
+# else
+	xsavec	STATE_SAVE_OFFSET(%rsp)
+# endif
+#endif
+	/* %rdi already points to the tlsinfo data structure.  */
+	call	HIDDEN_JUMPTARGET (__tls_get_addr)
+	# Get register content back.
+#ifdef USE_FXSAVE
+	fxrstor	STATE_SAVE_OFFSET(%rsp)
+#else
+	/* Save and retore __tls_get_addr return value stored in RAX.  */
+	mov	%RAX_LP, %RCX_LP
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	xrstor	STATE_SAVE_OFFSET(%rsp)
+	mov	%RCX_LP, %RAX_LP
+#endif
+	movq	REGISTER_SAVE_R11(%rsp), %r11
+	movq	REGISTER_SAVE_R10(%rsp), %r10
+	movq	REGISTER_SAVE_R9(%rsp), %r9
+	movq	REGISTER_SAVE_R8(%rsp), %r8
+	movq	REGISTER_SAVE_RDX(%rsp), %rdx
+	movq	REGISTER_SAVE_RCX(%rsp), %rcx
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov	%RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq	-24(%rsp), %rbx
+	cfi_restore(%rbx)
+#else
+	add	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
+#endif
+	jmp	1b
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
index f748af2ece..ea69f5223a 100644
--- a/sysdeps/x86_64/dl-tlsdesc.S
+++ b/sysdeps/x86_64/dl-tlsdesc.S
@@ -18,7 +18,19 @@ 
 
 #include <sysdep.h>
 #include <tls.h>
+#include <cpu-features-offsets.h>
+#include <features-offsets.h>
 #include "tlsdesc.h"
+#include "dl-trampoline-save.h"
+
+/* Area on stack to save and restore registers used for parameter
+   passing when calling _dl_tlsdesc_dynamic.  */
+#define REGISTER_SAVE_RCX	0
+#define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
+#define REGISTER_SAVE_R8	(REGISTER_SAVE_RDX + 8)
+#define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
+#define REGISTER_SAVE_R10	(REGISTER_SAVE_R9 + 8)
+#define REGISTER_SAVE_R11	(REGISTER_SAVE_R10 + 8)
 
 	.text
 
@@ -67,80 +79,24 @@  _dl_tlsdesc_undefweak:
 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
 
 #ifdef SHARED
-	.hidden _dl_tlsdesc_dynamic
-	.global	_dl_tlsdesc_dynamic
-	.type	_dl_tlsdesc_dynamic,@function
-
-     /* %rax points to the TLS descriptor, such that 0(%rax) points to
-	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
-	tlsdesc_dynamic_arg object.  It must return in %rax the offset
-	between the thread pointer and the object denoted by the
-	argument, without clobbering any registers.
-
-	The assembly code that follows is a rendition of the following
-	C code, hand-optimized a little bit.
-
-ptrdiff_t
-_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
-{
-  struct tlsdesc_dynamic_arg *td = tdp->arg;
-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
-  if (__builtin_expect (td->gen_count <= dtv[0].counter
-			&& (dtv[td->tlsinfo.ti_module].pointer.val
-			    != TLS_DTV_UNALLOCATED),
-			1))
-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
-      - __thread_pointer;
-
-  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
-}
-*/
-	cfi_startproc
-	.align 16
-_dl_tlsdesc_dynamic:
-	_CET_ENDBR
-	/* Preserve call-clobbered registers that we modify.
-	   We need two scratch regs anyway.  */
-	movq	%rsi, -16(%rsp)
-	mov	%fs:DTV_OFFSET, %RSI_LP
-	movq	%rdi, -8(%rsp)
-	movq	TLSDESC_ARG(%rax), %rdi
-	movq	(%rsi), %rax
-	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
-	ja	.Lslow
-	movq	TLSDESC_MODID(%rdi), %rax
-	salq	$4, %rax
-	movq	(%rax,%rsi), %rax
-	cmpq	$-1, %rax
-	je	.Lslow
-	addq	TLSDESC_MODOFF(%rdi), %rax
-.Lret:
-	movq	-16(%rsp), %rsi
-	sub	%fs:0, %RAX_LP
-	movq	-8(%rsp), %rdi
-	ret
-.Lslow:
-	/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
-	   r10 and r11.  Also, align the stack, that's off by 8 bytes.	*/
-	subq	$72, %rsp
-	cfi_adjust_cfa_offset (72)
-	movq	%rdx, 8(%rsp)
-	movq	%rcx, 16(%rsp)
-	movq	%r8, 24(%rsp)
-	movq	%r9, 32(%rsp)
-	movq	%r10, 40(%rsp)
-	movq	%r11, 48(%rsp)
-	/* %rdi already points to the tlsinfo data structure.  */
-	call	HIDDEN_JUMPTARGET (__tls_get_addr)
-	movq	8(%rsp), %rdx
-	movq	16(%rsp), %rcx
-	movq	24(%rsp), %r8
-	movq	32(%rsp), %r9
-	movq	40(%rsp), %r10
-	movq	48(%rsp), %r11
-	addq	$72, %rsp
-	cfi_adjust_cfa_offset (-72)
-	jmp	.Lret
-	cfi_endproc
-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_FXSAVE
+
+# define USE_XSAVE
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVE
+
+# define USE_XSAVEC
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVEC
 #endif /* SHARED */
diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
new file mode 100644
index 0000000000..84eac4a8ac
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-save.h
@@ -0,0 +1,34 @@ 
+/* x86-64 PLT trampoline register save macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that stack will be always aligned at 16 bytes.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
+   stack to 16 bytes before calling _dl_fixup.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || 16 > DL_STACK_ALIGNMENT)
diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
new file mode 100644
index 0000000000..575f120797
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-state.h
@@ -0,0 +1,51 @@ 
+/* x86-64 PLT dl-trampoline state macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if (STATE_SAVE_ALIGNMENT % 16) != 0
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
+#endif
+
+#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
+# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
+#endif
+
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# ifdef USE_FXSAVE
+/* Use fxsave to save XMM registers.  */
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
+#  if (REGISTER_SAVE_AREA % 16) != 0
+#   error REGISTER_SAVE_AREA must be multiple of 16
+#  endif
+# endif
+#else
+# ifndef USE_FXSAVE
+#  error USE_FXSAVE must be defined
+# endif
+/* Use fxsave to save XMM registers.  */
+# define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multiple of 8
+# endif
+#endif
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index b2e7e0f69b..87c5137837 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -22,25 +22,7 @@ 
 #include <features-offsets.h>
 #include <link-defines.h>
 #include <isa-level.h>
-
-#ifndef DL_STACK_ALIGNMENT
-/* Due to GCC bug:
-
-   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
-
-   __tls_get_addr may be called with 8-byte stack alignment.  Although
-   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
-   that stack will be always aligned at 16 bytes.  We use unaligned
-   16-byte move to load and store SSE registers, which has no penalty
-   on modern processors if stack is 16-byte aligned.  */
-# define DL_STACK_ALIGNMENT 8
-#endif
-
-/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
-   stack to 16 bytes before calling _dl_fixup.  */
-#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
-  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
-   || 16 > DL_STACK_ALIGNMENT)
+#include "dl-trampoline-save.h"
 
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index f55c6ea040..d9ccfb40d4 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -27,39 +27,7 @@ 
 # undef LOCAL_STORAGE_AREA
 # undef BASE
 
-# if (STATE_SAVE_ALIGNMENT % 16) != 0
-#  error STATE_SAVE_ALIGNMENT must be multiple of 16
-# endif
-
-# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
-#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
-# endif
-
-# if DL_RUNTIME_RESOLVE_REALIGN_STACK
-/* Local stack area before jumping to function address: RBX.  */
-#  define LOCAL_STORAGE_AREA	8
-#  define BASE			rbx
-#  ifdef USE_FXSAVE
-/* Use fxsave to save XMM registers.  */
-#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
-#   if (REGISTER_SAVE_AREA % 16) != 0
-#    error REGISTER_SAVE_AREA must be multiple of 16
-#   endif
-#  endif
-# else
-#  ifndef USE_FXSAVE
-#   error USE_FXSAVE must be defined
-#  endif
-/* Use fxsave to save XMM registers.  */
-#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
-/* Local stack area before jumping to function address:  All saved
-   registers.  */
-#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
-#  define BASE			rsp
-#  if (REGISTER_SAVE_AREA % 16) != 8
-#   error REGISTER_SAVE_AREA must be odd multiple of 8
-#  endif
-# endif
+# include "dl-trampoline-state.h"
 
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve