[v4,2/2] x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers

Message ID 20240213041501.2494232-3-hjl.tools@gmail.com
State Superseded
Headers
Series x86: Update _dl_tlsdesc_dynamic to preserve caller-saved registers |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Testing failed

Commit Message

H.J. Lu Feb. 13, 2024, 4:15 a.m. UTC
  The compiler generates the following instruction sequence for GNU2 dynamic
TLS access:

	leaq	tls_var@TLSDESC(%rip), %rax
	call	*tls_var@TLSCALL(%rax)

or

	leal	tls_var@TLSDESC(%ebx), %eax
	call	*tls_var@TLSCALL(%eax)

The CALL instruction is transparent to the compiler, which assumes that all
registers, except for EFLAGS and RAX/EAX, are unchanged after the CALL.
On the slow path, _dl_tlsdesc_dynamic calls __tls_get_addr, which is a
normal function and does not preserve any caller-saved registers.
_dl_tlsdesc_dynamic saved and restored the integer caller-saved registers,
but did not preserve any other caller-saved registers.  Add
_dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE, XSAVE and XSAVEC
to save and restore all caller-saved registers.  This fixes BZ #31372.

Add GLRO(dl_x86_64_runtime_resolve), together with
GLRO(dl_x86_tlsdesc_dynamic), to optimize elf_machine_runtime_setup.
---
 elf/Makefile                                 |  19 ++
 elf/malloc-for-test.c                        |  32 ++++
 elf/malloc-for-test.map                      |   6 +
 elf/tst-gnu2-tls2.c                          |  97 ++++++++++
 elf/tst-gnu2-tls2.h                          |  26 +++
 elf/tst-gnu2-tls2mod0.c                      |  28 +++
 elf/tst-gnu2-tls2mod1.c                      |  28 +++
 elf/tst-gnu2-tls2mod2.c                      |  28 +++
 sysdeps/i386/dl-machine.h                    |   2 +-
 sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
 sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
 sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
 sysdeps/x86/Makefile                         |   7 +-
 sysdeps/x86/cpu-features.c                   |  56 +++++-
 sysdeps/x86/dl-procinfo.c                    |  16 ++
 sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
 sysdeps/x86/malloc-for-test.c                |  33 ++++
 sysdeps/x86/sysdep.h                         |   6 +
 sysdeps/x86_64/Makefile                      |   2 +-
 sysdeps/x86_64/dl-machine.h                  |  19 +-
 sysdeps/x86_64/dl-procinfo.c                 |  16 ++
 sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
 sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
 sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
 sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
 sysdeps/x86_64/dl-trampoline.S               |  20 +-
 sysdeps/x86_64/dl-trampoline.h               |  34 +---
 27 files changed, 930 insertions(+), 213 deletions(-)
 create mode 100644 elf/malloc-for-test.c
 create mode 100644 elf/malloc-for-test.map
 create mode 100644 elf/tst-gnu2-tls2.c
 create mode 100644 elf/tst-gnu2-tls2.h
 create mode 100644 elf/tst-gnu2-tls2mod0.c
 create mode 100644 elf/tst-gnu2-tls2mod1.c
 create mode 100644 elf/tst-gnu2-tls2mod2.c
 create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
 create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
 rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
 create mode 100644 sysdeps/x86/malloc-for-test.c
 create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
 create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
  

Comments

Noah Goldstein Feb. 14, 2024, 10:44 p.m. UTC | #1
On Tue, Feb 13, 2024 at 4:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Compiler generates the following instruction sequence for GNU2 dynamic
> TLS access:
>
>         leaq    tls_var@TLSDESC(%rip), %rax
>         call    *tls_var@TLSCALL(%rax)
>
> or
>
>         leal    tls_var@TLSDESC(%ebx), %eax
>         call    *tls_var@TLSCALL(%eax)
>
> CALL instruction is transparent to compiler which assumes all registers,
> except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> path.  __tls_get_addr is a normal function which doesn't preserve any
> caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> caller-saved registers, but didn't preserve any other caller-saved
> registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> fixes BZ #31372.
>
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
>  elf/Makefile                                 |  19 ++
>  elf/malloc-for-test.c                        |  32 ++++
>  elf/malloc-for-test.map                      |   6 +
>  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
>  elf/tst-gnu2-tls2.h                          |  26 +++
>  elf/tst-gnu2-tls2mod0.c                      |  28 +++
>  elf/tst-gnu2-tls2mod1.c                      |  28 +++
>  elf/tst-gnu2-tls2mod2.c                      |  28 +++
>  sysdeps/i386/dl-machine.h                    |   2 +-
>  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
>  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
>  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
>  sysdeps/x86/Makefile                         |   7 +-
>  sysdeps/x86/cpu-features.c                   |  56 +++++-
>  sysdeps/x86/dl-procinfo.c                    |  16 ++
>  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
>  sysdeps/x86/malloc-for-test.c                |  33 ++++
>  sysdeps/x86/sysdep.h                         |   6 +
>  sysdeps/x86_64/Makefile                      |   2 +-
>  sysdeps/x86_64/dl-machine.h                  |  19 +-
>  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
>  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
>  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
>  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
>  sysdeps/x86_64/dl-trampoline.S               |  20 +-
>  sysdeps/x86_64/dl-trampoline.h               |  34 +---
>  27 files changed, 930 insertions(+), 213 deletions(-)
>  create mode 100644 elf/malloc-for-test.c
>  create mode 100644 elf/malloc-for-test.map
>  create mode 100644 elf/tst-gnu2-tls2.c
>  create mode 100644 elf/tst-gnu2-tls2.h
>  create mode 100644 elf/tst-gnu2-tls2mod0.c
>  create mode 100644 elf/tst-gnu2-tls2mod1.c
>  create mode 100644 elf/tst-gnu2-tls2mod2.c
>  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
>  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
>  create mode 100644 sysdeps/x86/malloc-for-test.c
>  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..e0665d2007 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -424,6 +424,7 @@ tests += \
>    tst-glibc-hwcaps-prepend \
>    tst-global1 \
>    tst-global2 \
> +  tst-gnu2-tls2 \
>    tst-initfinilazyfail \
>    tst-initorder \
>    tst-initorder2 \
> @@ -699,6 +700,7 @@ modules-names += \
>    libtracemod5-1 \
>    ltglobmod1 \
>    ltglobmod2 \
> +  malloc-for-test \
>    neededobj1 \
>    neededobj2 \
>    neededobj3 \
> @@ -846,6 +848,9 @@ modules-names += \
>    tst-filterobj-flt \
>    tst-finilazyfailmod \
>    tst-globalmod2 \
> +  tst-gnu2-tls2mod0 \
> +  tst-gnu2-tls2mod1 \
> +  tst-gnu2-tls2mod2 \
>    tst-initlazyfailmod \
>    tst-initorder2a \
>    tst-initorder2b \
> @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
>    $(objpfx)tst-tlsgap-mod0.so \
>    $(objpfx)tst-tlsgap-mod1.so \
>    $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: \
> +  $(shared-thread-library) \
> +  $(objpfx)malloc-for-test.so
> +$(objpfx)tst-gnu2-tls2.out: \
> +  $(objpfx)tst-gnu2-tls2mod0.so \
> +  $(objpfx)tst-gnu2-tls2mod1.so \
> +  $(objpfx)tst-gnu2-tls2mod2.so
> +
> +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> +
>  ifeq (yes,$(have-mtls-dialect-gnu2))
>  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
>  endif
> diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> new file mode 100644
> index 0000000000..1bec69eda7
> --- /dev/null
> +++ b/elf/malloc-for-test.c
> @@ -0,0 +1,32 @@
> +/* A malloc for intercept test.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +
> +extern void * __libc_malloc (size_t);
> +
> +#ifndef PREPARE_MALLOC
> +# define PREPARE_MALLOC()
> +#endif
> +
> +void *
> +malloc (size_t n)
> +{
> +  PREPARE_MALLOC ();
> +  return __libc_malloc (n);
> +}
> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> new file mode 100644
> index 0000000000..8437cf4346
> --- /dev/null
> +++ b/elf/malloc-for-test.map
> @@ -0,0 +1,6 @@
> +GLIBC_2.0 {
> +  global:
> +    malloc;
> +  local:
> +    *;
> +};
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..34427f9a0f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,97 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include <support/test-driver.h>
> +#include "tst-gnu2-tls2.h"
> +
> +#ifndef IS_SUPPORTED
> +# define IS_SUPPORTED() true
> +#endif
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> +  printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> +  xdlclose (mod[i]);
> +  mod[i] = NULL;
> +  printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> +  struct tls var = { -1, -1, -1, -1 };
> +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> +  struct tls *p = f (&var);
> +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> +  ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> +  /* The DTV generation is at the last dlopen of mod0 and the
> +     entry for mod1 is NULL.  */
> +
> +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> +
> +  /* Force the slow path in GNU2 TLS descriptor call.  */
> +  access_mod (1, "apply_tls");
> +
> +  return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  if (!IS_SUPPORTED ())
> +    return EXIT_UNSUPPORTED;
> +
> +  open_mod (0);
> +  open_mod (1);
> +  open_mod (2);
> +  close_mod (0);
> +  close_mod (1); /* Create modid gap at mod1.  */
> +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> +
> +  /* Create a thread where DTV of mod1 is NULL.  */
> +  pthread_t t = xpthread_create (NULL, start, NULL);
> +  xpthread_join (t);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> +  int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var0 = *p;
> +  return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var1[1] = *p;
> +  return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var2 = *p;
> +  return &tls_var2;
> +}
> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> index fc1ef96587..50d74fe6e9 100644
> --- a/sysdeps/i386/dl-machine.h
> +++ b/sysdeps/i386/dl-machine.h
> @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + (ElfW(Word))td->arg);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..675e56d32d
> --- /dev/null
> +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,187 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#undef REGISTER_SAVE_AREA
> +
> +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# ifdef USE_FNSAVE
> +#  error USE_FNSAVE shouldn't be defined
> +# endif
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save all registers.  */
> +#  define REGISTER_SAVE_AREA   512
> +# endif
> +#else
> +# ifdef USE_FNSAVE
> +/* Use fnsave to save x87 FPU stack registers.  */
> +#  define REGISTER_SAVE_AREA   108
> +# else
> +#  ifndef USE_FXSAVE
> +#   error USE_FXSAVE must be defined
> +#  endif
> +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> +   to 16 bytes.  */
> +#  define REGISTER_SAVE_AREA   (512 + 12)
> +# endif
> +#endif
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* This function is used for symbols that need dynamic TLS.
> +
> +       %eax points to the TLS descriptor, such that 0(%eax) points to
> +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +__attribute__ ((__regparm__ (1)))
> +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       /* Like all TLS resolvers, preserve call-clobbered registers.
> +          We need two scratch regs anyway.  */
> +       subl    $32, %esp
> +       cfi_adjust_cfa_offset (32)
> +       movl    %ecx, 20(%esp)
> +       movl    %edx, 24(%esp)
> +       movl    TLSDESC_ARG(%eax), %eax
> +       movl    %gs:DTV_OFFSET, %edx
> +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> +       cmpl    (%edx), %ecx
> +       ja      2f
> +       movl    TLSDESC_MODID(%eax), %ecx
> +       movl    (%edx,%ecx,8), %edx
> +       cmpl    $-1, %edx
> +       je      2f
> +       movl    TLSDESC_MODOFF(%eax), %eax
> +       addl    %edx, %eax
> +1:
> +       movl    20(%esp), %ecx
> +       subl    %gs:0, %eax
> +       movl    24(%esp), %edx
> +       addl    $32, %esp
> +       cfi_adjust_cfa_offset (-32)
> +       ret
> +       .p2align 4,,7
> +2:
> +       cfi_adjust_cfa_offset (32)
This `cfi_adjust_cfa_offset (32)` looks extraneous AFAICT.

> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movl    %ebx, -28(%esp)
> +       movl    %esp, %ebx
> +       cfi_def_cfa_register(%ebx)
> +       and     $-STATE_SAVE_ALIGNMENT, %esp
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +       subl    $REGISTER_SAVE_AREA, %esp
> +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +       # Allocate stack space of the required size to save the state.
> +       LOAD_PIC_REG (cx)
> +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> +#endif
> +#ifdef USE_FNSAVE
> +       fnsave  (%esp)
> +#elif defined USE_FXSAVE
> +       fxsave  (%esp)
> +#else
> +       # Save the argument for ___tls_get_addr in EAX.
> +       movl    %eax, %ecx
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movl    %edx, (512)(%esp)
> +       movl    %edx, (512 + 4 * 1)(%esp)
> +       movl    %edx, (512 + 4 * 2)(%esp)
> +       movl    %edx, (512 + 4 * 3)(%esp)
> +# endif
> +       movl    %edx, (512 + 4 * 4)(%esp)
> +       movl    %edx, (512 + 4 * 5)(%esp)
> +       movl    %edx, (512 + 4 * 6)(%esp)
> +       movl    %edx, (512 + 4 * 7)(%esp)
> +       movl    %edx, (512 + 4 * 8)(%esp)
> +       movl    %edx, (512 + 4 * 9)(%esp)
> +       movl    %edx, (512 + 4 * 10)(%esp)
> +       movl    %edx, (512 + 4 * 11)(%esp)
> +       movl    %edx, (512 + 4 * 12)(%esp)
> +       movl    %edx, (512 + 4 * 13)(%esp)
> +       movl    %edx, (512 + 4 * 14)(%esp)
> +       movl    %edx, (512 + 4 * 15)(%esp)
> +# ifdef USE_XSAVE
> +       xsave   (%esp)
> +# else
> +       xsavec  (%esp)
> +# endif
> +       # Restore the argument for ___tls_get_addr in EAX.
> +       movl    %ecx, %eax
> +#endif
> +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FNSAVE
> +       frstor  (%esp)
> +#elif defined USE_FXSAVE
> +       fxrstor (%esp)
> +#else
> +       /* Save and restore ___tls_get_addr return value stored in EAX.  */
> +       movl    %eax, %ecx
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  (%esp)
> +       movl    %ecx, %eax
> +#endif
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %ebx, %esp
> +       cfi_def_cfa_register(%esp)
> +       movl    -28(%esp), %ebx
> +       cfi_restore(%ebx)
> +#else
> +       addl    $REGISTER_SAVE_AREA, %esp
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
The use of `REGISTER_SAVE_AREA` above is guarded by
`#ifdef REGISTER_SAVE_AREA`; otherwise the size is loaded from
`RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx)`.
Would expect the same guard here?
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> index 90d93caa0c..f002feee56 100644
> --- a/sysdeps/i386/dl-tlsdesc.S
> +++ b/sysdeps/i386/dl-tlsdesc.S
> @@ -18,8 +18,27 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
>
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 4
> +#endif
> +
> +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> +
>         .text
>
>       /* This function is used to compute the TP offset for symbols in
> @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* This function is used for symbols that need dynamic TLS.
> -
> -       %eax points to the TLS descriptor, such that 0(%eax) points to
> -       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -__attribute__ ((__regparm__ (1)))
> -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       /* Like all TLS resolvers, preserve call-clobbered registers.
> -          We need two scratch regs anyway.  */
> -       subl    $28, %esp
> -       cfi_adjust_cfa_offset (28)
> -       movl    %ecx, 20(%esp)
> -       movl    %edx, 24(%esp)
> -       movl    TLSDESC_ARG(%eax), %eax
> -       movl    %gs:DTV_OFFSET, %edx
> -       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> -       cmpl    (%edx), %ecx
> -       ja      .Lslow
> -       movl    TLSDESC_MODID(%eax), %ecx
> -       movl    (%edx,%ecx,8), %edx
> -       cmpl    $-1, %edx
> -       je      .Lslow
> -       movl    TLSDESC_MODOFF(%eax), %eax
> -       addl    %edx, %eax
> -.Lret:
> -       movl    20(%esp), %ecx
> -       subl    %gs:0, %eax
> -       movl    24(%esp), %edx
> -       addl    $28, %esp
> -       cfi_adjust_cfa_offset (-28)
> -       ret
> -       .p2align 4,,7
> -.Lslow:
> -       cfi_adjust_cfa_offset (28)
> -       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FNSAVE
> +# define MINIMUM_ALIGNMENT     4
> +# define STATE_SAVE_ALIGNMENT  4
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fnsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef MINIMUM_ALIGNMENT
> +# undef USE_FNSAVE
> +
> +# define MINIMUM_ALIGNMENT     16
> +
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..92e7fbff89
> --- /dev/null
> +++ b/sysdeps/i386/tst-gnu2-tls2.c
> @@ -0,0 +1,5 @@
> +#include <sys/platform/x86.h>
> +
> +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> +
> +#include <elf/tst-gnu2-tls2.c>
> diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> index 4d50b327b5..bc357f0099 100644
> --- a/sysdeps/x86/Makefile
> +++ b/sysdeps/x86/Makefile
> @@ -1,5 +1,5 @@
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += cpu-features-offsets.sym
> +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
>  endif
>
>  ifeq ($(subdir),elf)
> @@ -86,6 +86,11 @@ endif
>  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
>  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
>  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> +
> +CFLAGS-malloc-for-test.c += -msse2
> +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
>  endif
>
>  ifeq ($(subdir),math)
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..835113b42f 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -27,8 +27,13 @@
>  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
>    attribute_hidden;
>
> -#if defined SHARED && defined __x86_64__
> -# include <dl-plt-rewrite.h>
> +#if defined SHARED
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +
> +# ifdef __x86_64__
> +#  include <dl-plt-rewrite.h>
>
>  static void
>  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
>                  : plt_rewrite_jmp);
>      }
>  }
> +# else
> +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> +# endif
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
>  #endif
>
>  #ifdef __LP64__
> @@ -1130,6 +1144,44 @@ no_cpuid:
>                TUNABLE_CALLBACK (set_x86_shstk));
>  #endif
>
> +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> +       {
> +#ifdef __x86_64__
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +#endif
> +#ifdef SHARED
> +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +#endif
> +       }
> +      else
> +       {
> +#ifdef __x86_64__
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +#endif
> +#ifdef SHARED
> +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +#endif
> +       }
> +    }
> +  else
> +    {
> +#ifdef __x86_64__
> +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> +#else
> +# ifdef SHARED
> +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +      else
> +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> +# endif
> +#endif
> +    }
> +
>  #ifdef SHARED
>  # ifdef __x86_64__
>    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> index ee957b4d70..5920d4b320 100644
> --- a/sysdeps/x86/dl-procinfo.c
> +++ b/sysdeps/x86/dl-procinfo.c
> @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
>  #else
>  ,
>  #endif
> +
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> +  ._dl_x86_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> similarity index 89%
> rename from sysdeps/x86_64/features-offsets.sym
> rename to sysdeps/x86/features-offsets.sym
> index 9e4be3393a..77e990c705 100644
> --- a/sysdeps/x86_64/features-offsets.sym
> +++ b/sysdeps/x86/features-offsets.sym
> @@ -3,4 +3,6 @@
>  #include <ldsodefs.h>
>
>  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> +#ifdef __x86_64__
>  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> +#endif
> diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> new file mode 100644
> index 0000000000..02f4dead5d
> --- /dev/null
> +++ b/sysdeps/x86/malloc-for-test.c
> @@ -0,0 +1,33 @@
> +/*  A malloc for intercept test.  x86 version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +/* Clear XMM0...XMM7  */
> +#define PREPARE_MALLOC()                               \
> +{                                                      \
> +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
> +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
> +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
> +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
> +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
> +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
> +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
> +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
> +}
> +
> +#include <elf/malloc-for-test.c>
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 5c1f0bcf53..792e2ea5ed 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -68,6 +68,12 @@
>     | (1 << X86_XSTATE_ZMM_H_ID))
>  #endif
>
> +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> +   Compiler assumes that all registers, including x87 FPU stack registers,
> +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> +#define TLSDESC_CALL_STATE_SAVE_MASK   \
> +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> +
>  /* Constants for bits in __x86_string_control:  */
>
>  /* Avoid short distance REP MOVSB.  */
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 90f4ecfd26..e8babc9a4e 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
>  endif
>
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += features-offsets.sym link-defines.sym
> +gen-as-const-headers += link-defines.sym
>  endif
>
>  ifeq ($(subdir),gmon)
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..ff5d45f7cb 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>                            int lazy, int profile)
>  {
>    Elf64_Addr *got;
> -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> -      const struct cpu_features* cpu_features = __get_cpu_features ();
> -
>  #ifdef SHARED
>        /* The got[2] entry contains the address of a function which gets
>          called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>          end in this function.  */
>        if (__glibc_unlikely (profile))
>         {
> +         const struct cpu_features* cpu_features = __get_cpu_features ();
>           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
>           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>           /* This function will get called to fix up the GOT entry
>              indicated by the offset on the stack, and then jump to
>              the resolved address.  */
> -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> -           *(ElfW(Addr) *) (got + 2)
> -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> -         else
> -           *(ElfW(Addr) *) (got + 2)
> -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> +         *(ElfW(Addr) *) (got + 2)
> +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
>         }
>      }
>
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + reloc->r_addend);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..06637a8154 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,21 @@
>
>  #include <sysdeps/x86/dl-procinfo.c>
>
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> +  ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
>  #undef PROCINFO_DECL
>  #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..ce0bc094ec
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,166 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef SECTION
> +# define SECTION(p)    p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> +       .section SECTION(.text),"ax",@progbits
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       _CET_ENDBR
> +       /* Preserve call-clobbered registers that we modify.
> +          We need two scratch regs anyway.  */
> +       movq    %rsi, -16(%rsp)
> +       mov     %fs:DTV_OFFSET, %RSI_LP
> +       movq    %rdi, -8(%rsp)
> +       movq    TLSDESC_ARG(%rax), %rdi
> +       movq    (%rsi), %rax
> +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> +       ja      2f
> +       movq    TLSDESC_MODID(%rdi), %rax
> +       salq    $4, %rax
> +       movq    (%rax,%rsi), %rax
> +       cmpq    $-1, %rax
> +       je      2f
> +       addq    TLSDESC_MODOFF(%rdi), %rax
> +1:
> +       movq    -16(%rsp), %rsi
> +       sub     %fs:0, %RAX_LP
> +       movq    -8(%rsp), %rdi
> +       ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movq    %rbx, -24(%rsp)
> +       mov     %RSP_LP, %RBX_LP
> +       cfi_def_cfa_register(%rbx)
> +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> +       # RBX above.
> +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +       # Allocate stack space of the required size to save the state.
> +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +#endif
> +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> +          r10 and r11.  */
> +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> +       fxsave  STATE_SAVE_OFFSET(%rsp)
> +#else
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> +# ifdef USE_XSAVE
> +       xsave   STATE_SAVE_OFFSET(%rsp)
> +# else
> +       xsavec  STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> +       /* %rdi already points to the tlsinfo data structure.  */
> +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FXSAVE
> +       fxrstor STATE_SAVE_OFFSET(%rsp)
> +#else
> +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> +       mov     %RAX_LP, %RCX_LP
> +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  STATE_SAVE_OFFSET(%rsp)
> +       mov     %RCX_LP, %RAX_LP
> +#endif
> +       movq    REGISTER_SAVE_R11(%rsp), %r11
> +       movq    REGISTER_SAVE_R10(%rsp), %r10
> +       movq    REGISTER_SAVE_R9(%rsp), %r9
> +       movq    REGISTER_SAVE_R8(%rsp), %r8
> +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %RBX_LP, %RSP_LP
> +       cfi_def_cfa_register(%rsp)
> +       movq    -24(%rsp), %rbx
> +       cfi_restore(%rbx)
> +#else
> +       add     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
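In the stack allocation above, the use of `REGISTER_SAVE_AREA` is guarded by
`#ifdef REGISTER_SAVE_AREA`,
and the size is read from
`_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
otherwise.
Here `REGISTER_SAVE_AREA` is used without that guard; I would expect the same handling here?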
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
It seems to me that the x32 and x64 implementations could be merged,
using a few defines for the register names/register width,
plus the extra GPR saving that x64 needs.
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> +   passing when calling _dl_tlsdesc_dynamic.  */
> +#define REGISTER_SAVE_RCX      0
> +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
>
>         .text
>
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       _CET_ENDBR
> -       /* Preserve call-clobbered registers that we modify.
> -          We need two scratch regs anyway.  */
> -       movq    %rsi, -16(%rsp)
> -       mov     %fs:DTV_OFFSET, %RSI_LP
> -       movq    %rdi, -8(%rsp)
> -       movq    TLSDESC_ARG(%rax), %rdi
> -       movq    (%rsi), %rax
> -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> -       ja      .Lslow
> -       movq    TLSDESC_MODID(%rdi), %rax
> -       salq    $4, %rax
> -       movq    (%rax,%rsi), %rax
> -       cmpq    $-1, %rax
> -       je      .Lslow
> -       addq    TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> -       movq    -16(%rsp), %rsi
> -       sub     %fs:0, %RAX_LP
> -       movq    -8(%rsp), %rdi
> -       ret
> -.Lslow:
> -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> -       subq    $72, %rsp
> -       cfi_adjust_cfa_offset (72)
> -       movq    %rdx, 8(%rsp)
> -       movq    %rcx, 16(%rsp)
> -       movq    %r8, 24(%rsp)
> -       movq    %r9, 32(%rsp)
> -       movq    %r10, 40(%rsp)
> -       movq    %r11, 48(%rsp)
> -       /* %rdi already points to the tlsinfo data structure.  */
> -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> -       movq    8(%rsp), %rdx
> -       movq    16(%rsp), %rcx
> -       movq    24(%rsp), %r8
> -       movq    32(%rsp), %r9
> -       movq    40(%rsp), %r10
> -       movq    48(%rsp), %r11
> -       addq    $72, %rsp
> -       cfi_adjust_cfa_offset (-72)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..84eac4a8ac
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,34 @@
> +/* x86-64 PLT trampoline register save macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> +   stack to 16 bytes before calling _dl_fixup.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX.  */
> +# define LOCAL_STORAGE_AREA    8
> +# define BASE                  rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers.  */
> +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> +#  if (REGISTER_SAVE_AREA % 16) != 0
> +#   error REGISTER_SAVE_AREA must be multiple of 16
> +#  endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +#  error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers.  */
> +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address:  All saved
> +   registers.  */
> +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> +# define BASE                  rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
>  #include <features-offsets.h>
>  #include <link-defines.h>
>  #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> -   that stack will be always aligned at 16 bytes.  We use unaligned
> -   16-byte move to load and store SSE registers, which has no penalty
> -   on modern processors if stack is 16-byte aligned.  */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> -   stack to 16 bytes before calling _dl_fixup.  */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> -   || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
>  # undef LOCAL_STORAGE_AREA
>  # undef BASE
>
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX.  */
> -#  define LOCAL_STORAGE_AREA   8
> -#  define BASE                 rbx
> -#  ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers.  */
> -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> -#   if (REGISTER_SAVE_AREA % 16) != 0
> -#    error REGISTER_SAVE_AREA must be multiple of 16
> -#   endif
> -#  endif
> -# else
> -#  ifndef USE_FXSAVE
> -#   error USE_FXSAVE must be defined
> -#  endif
> -/* Use fxsave to save XMM registers.  */
> -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address:  All saved
> -   registers.  */
> -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> -#  define BASE                 rsp
> -#  if (REGISTER_SAVE_AREA % 16) != 8
> -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> -#  endif
> -# endif
> +# include "dl-trampoline-state.h"
>
>         .globl _dl_runtime_resolve
>         .hidden _dl_runtime_resolve
> --
> 2.43.0
>
  
H.J. Lu Feb. 14, 2024, 11:21 p.m. UTC | #2
On Wed, Feb 14, 2024 at 10:44:20PM +0000, Noah Goldstein wrote:
> On Tue, Feb 13, 2024 at 4:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Compiler generates the following instruction sequence for GNU2 dynamic
> > TLS access:
> >
> >         leaq    tls_var@TLSDESC(%rip), %rax
> >         call    *tls_var@TLSCALL(%rax)
> >
> > or
> >
> >         leal    tls_var@TLSDESC(%ebx), %eax
> >         call    *tls_var@TLSCALL(%eax)
> >
> > CALL instruction is transparent to compiler which assumes all registers,
> > except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> > _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> > path.  __tls_get_addr is a normal function which doesn't preserve any
> > caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> > caller-saved registers, but didn't preserve any other caller-saved
> > registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> > XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> > fixes BZ #31372.
> >
> > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> > to optimize elf_machine_runtime_setup.
> > ---
> >  elf/Makefile                                 |  19 ++
> >  elf/malloc-for-test.c                        |  32 ++++
> >  elf/malloc-for-test.map                      |   6 +
> >  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
> >  elf/tst-gnu2-tls2.h                          |  26 +++
> >  elf/tst-gnu2-tls2mod0.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod1.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod2.c                      |  28 +++
> >  sysdeps/i386/dl-machine.h                    |   2 +-
> >  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
> >  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
> >  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
> >  sysdeps/x86/Makefile                         |   7 +-
> >  sysdeps/x86/cpu-features.c                   |  56 +++++-
> >  sysdeps/x86/dl-procinfo.c                    |  16 ++
> >  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
> >  sysdeps/x86/malloc-for-test.c                |  33 ++++
> >  sysdeps/x86/sysdep.h                         |   6 +
> >  sysdeps/x86_64/Makefile                      |   2 +-
> >  sysdeps/x86_64/dl-machine.h                  |  19 +-
> >  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
> >  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
> >  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
> >  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
> >  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
> >  sysdeps/x86_64/dl-trampoline.S               |  20 +-
> >  sysdeps/x86_64/dl-trampoline.h               |  34 +---
> >  27 files changed, 930 insertions(+), 213 deletions(-)
> >  create mode 100644 elf/malloc-for-test.c
> >  create mode 100644 elf/malloc-for-test.map
> >  create mode 100644 elf/tst-gnu2-tls2.c
> >  create mode 100644 elf/tst-gnu2-tls2.h
> >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> >  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
> >  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
> >  create mode 100644 sysdeps/x86/malloc-for-test.c
> >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> >
> > diff --git a/elf/Makefile b/elf/Makefile
> > index 5d78b659ce..e0665d2007 100644
> > --- a/elf/Makefile
> > +++ b/elf/Makefile
> > @@ -424,6 +424,7 @@ tests += \
> >    tst-glibc-hwcaps-prepend \
> >    tst-global1 \
> >    tst-global2 \
> > +  tst-gnu2-tls2 \
> >    tst-initfinilazyfail \
> >    tst-initorder \
> >    tst-initorder2 \
> > @@ -699,6 +700,7 @@ modules-names += \
> >    libtracemod5-1 \
> >    ltglobmod1 \
> >    ltglobmod2 \
> > +  malloc-for-test \
> >    neededobj1 \
> >    neededobj2 \
> >    neededobj3 \
> > @@ -846,6 +848,9 @@ modules-names += \
> >    tst-filterobj-flt \
> >    tst-finilazyfailmod \
> >    tst-globalmod2 \
> > +  tst-gnu2-tls2mod0 \
> > +  tst-gnu2-tls2mod1 \
> > +  tst-gnu2-tls2mod2 \
> >    tst-initlazyfailmod \
> >    tst-initorder2a \
> >    tst-initorder2b \
> > @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
> >    $(objpfx)tst-tlsgap-mod0.so \
> >    $(objpfx)tst-tlsgap-mod1.so \
> >    $(objpfx)tst-tlsgap-mod2.so
> > +
> > +$(objpfx)tst-gnu2-tls2: \
> > +  $(shared-thread-library) \
> > +  $(objpfx)malloc-for-test.so
> > +$(objpfx)tst-gnu2-tls2.out: \
> > +  $(objpfx)tst-gnu2-tls2mod0.so \
> > +  $(objpfx)tst-gnu2-tls2mod1.so \
> > +  $(objpfx)tst-gnu2-tls2mod2.so
> > +
> > +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> > +
> >  ifeq (yes,$(have-mtls-dialect-gnu2))
> >  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> >  endif
> > diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> > new file mode 100644
> > index 0000000000..1bec69eda7
> > --- /dev/null
> > +++ b/elf/malloc-for-test.c
> > @@ -0,0 +1,32 @@
> > +/* A malloc for intercept test.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdlib.h>
> > +
> > +extern void * __libc_malloc (size_t);
> > +
> > +#ifndef PREPARE_MALLOC
> > +# define PREPARE_MALLOC()
> > +#endif
> > +
> > +void *
> > +malloc (size_t n)
> > +{
> > +  PREPARE_MALLOC ();
> > +  return __libc_malloc (n);
> > +}
> > diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> > new file mode 100644
> > index 0000000000..8437cf4346
> > --- /dev/null
> > +++ b/elf/malloc-for-test.map
> > @@ -0,0 +1,6 @@
> > +GLIBC_2.0 {
> > +  global:
> > +    malloc;
> > +  local:
> > +    *;
> > +};
> > diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> > new file mode 100644
> > index 0000000000..34427f9a0f
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2.c
> > @@ -0,0 +1,97 @@
> > +/* Test TLSDESC relocation.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdio.h>
> > +#include <string.h>
> > +#include <dlfcn.h>
> > +#include <pthread.h>
> > +#include <support/xdlfcn.h>
> > +#include <support/xthread.h>
> > +#include <support/check.h>
> > +#include <support/test-driver.h>
> > +#include "tst-gnu2-tls2.h"
> > +
> > +#ifndef IS_SUPPORTED
> > +# define IS_SUPPORTED() true
> > +#endif
> > +
> > +static void *mod[3];
> > +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> > +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> > +#undef MOD
> > +
> > +static void
> > +open_mod (int i)
> > +{
> > +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> > +  printf ("open %s\n", modname[i]);
> > +}
> > +
> > +static void
> > +close_mod (int i)
> > +{
> > +  xdlclose (mod[i]);
> > +  mod[i] = NULL;
> > +  printf ("close %s\n", modname[i]);
> > +}
> > +
> > +static void
> > +access_mod (int i, const char *sym)
> > +{
> > +  struct tls var = { -1, -1, -1, -1 };
> > +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> > +  struct tls *p = f (&var);
> > +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> > +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> > +  ++(p->a);
> > +}
> > +
> > +static void *
> > +start (void *arg)
> > +{
> > +  /* The DTV generation is at the last dlopen of mod0 and the
> > +     entry for mod1 is NULL.  */
> > +
> > +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> > +
> > +  /* Force the slow path in GNU2 TLS descriptor call.  */
> > +  access_mod (1, "apply_tls");
> > +
> > +  return arg;
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  if (!IS_SUPPORTED ())
> > +    return EXIT_UNSUPPORTED;
> > +
> > +  open_mod (0);
> > +  open_mod (1);
> > +  open_mod (2);
> > +  close_mod (0);
> > +  close_mod (1); /* Create modid gap at mod1.  */
> > +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> > +
> > +  /* Create a thread where DTV of mod1 is NULL.  */
> > +  pthread_t t = xpthread_create (NULL, start, NULL);
> > +  xpthread_join (t);
> > +  return 0;
> > +}
> > +
> > +#include <support/test-driver.c>
> > diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> > new file mode 100644
> > index 0000000000..e33f4dbe27
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2.h
> > @@ -0,0 +1,26 @@
> > +/* Test TLSDESC relocation.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdint.h>
> > +
> > +struct tls
> > +{
> > +  int64_t a, b, c, d;
> > +};
> > +
> > +extern struct tls *apply_tls (struct tls *);
> > diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> > new file mode 100644
> > index 0000000000..67dc0d464d
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod0.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var0 = *p;
> > +  return &tls_var0;
> > +}
> > diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> > new file mode 100644
> > index 0000000000..a4ae6db24f
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod1.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var1[1] = *p;
> > +  return &tls_var1[1];
> > +}
> > diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> > new file mode 100644
> > index 0000000000..2d13921717
> > --- /dev/null
> > +++ b/elf/tst-gnu2-tls2mod2.c
> > @@ -0,0 +1,28 @@
> > +/* DSO used by tst-gnu2-tls2.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "tst-gnu2-tls2.h"
> > +
> > +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> > +
> > +struct tls *
> > +apply_tls (struct tls *p)
> > +{
> > +  tls_var2 = *p;
> > +  return &tls_var2;
> > +}
> > diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> > index fc1ef96587..50d74fe6e9 100644
> > --- a/sysdeps/i386/dl-machine.h
> > +++ b/sysdeps/i386/dl-machine.h
> > @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
> >                   {
> >                     td->arg = _dl_make_tlsdesc_dynamic
> >                       (sym_map, sym->st_value + (ElfW(Word))td->arg);
> > -                   td->entry = _dl_tlsdesc_dynamic;
> > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> >                   }
> >                 else
> >  #  endif
> > diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > new file mode 100644
> > index 0000000000..675e56d32d
> > --- /dev/null
> > +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > @@ -0,0 +1,187 @@
> > +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#undef REGISTER_SAVE_AREA
> > +
> > +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > +#endif
> > +
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +# ifdef USE_FNSAVE
> > +#  error USE_FNSAVE shouldn't be defined
> > +# endif
> > +# ifdef USE_FXSAVE
> > +/* Use fxsave to save all registers.  */
> > +#  define REGISTER_SAVE_AREA   512
> > +# endif
> > +#else
> > +# ifdef USE_FNSAVE
> > +/* Use fnsave to save x87 FPU stack registers.  */
> > +#  define REGISTER_SAVE_AREA   108
> > +# else
> > +#  ifndef USE_FXSAVE
> > +#   error USE_FXSAVE must be defined
> > +#  endif
> > +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> > +   to 16 bytes.  */
> > +#  define REGISTER_SAVE_AREA   (512 + 12)
> > +# endif
> > +#endif
> > +
> > +       .hidden _dl_tlsdesc_dynamic
> > +       .global _dl_tlsdesc_dynamic
> > +       .type   _dl_tlsdesc_dynamic,@function
> > +
> > +     /* This function is used for symbols that need dynamic TLS.
> > +
> > +       %eax points to the TLS descriptor, such that 0(%eax) points to
> > +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > +       between the thread pointer and the object denoted by the
> > +       argument, without clobbering any registers.
> > +
> > +       The assembly code that follows is a rendition of the following
> > +       C code, hand-optimized a little bit.
> > +
> > +ptrdiff_t
> > +__attribute__ ((__regparm__ (1)))
> > +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > +{
> > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > +                           != TLS_DTV_UNALLOCATED),
> > +                       1))
> > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > +      - __thread_pointer;
> > +
> > +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > +}
> > +*/
> > +       cfi_startproc
> > +       .align 16
> > +_dl_tlsdesc_dynamic:
> > +       /* Like all TLS resolvers, preserve call-clobbered registers.
> > +          We need two scratch regs anyway.  */
> > +       subl    $32, %esp
> > +       cfi_adjust_cfa_offset (32)
> > +       movl    %ecx, 20(%esp)
> > +       movl    %edx, 24(%esp)
> > +       movl    TLSDESC_ARG(%eax), %eax
> > +       movl    %gs:DTV_OFFSET, %edx
> > +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > +       cmpl    (%edx), %ecx
> > +       ja      2f
> > +       movl    TLSDESC_MODID(%eax), %ecx
> > +       movl    (%edx,%ecx,8), %edx
> > +       cmpl    $-1, %edx
> > +       je      2f
> > +       movl    TLSDESC_MODOFF(%eax), %eax
> > +       addl    %edx, %eax
> > +1:
> > +       movl    20(%esp), %ecx
> > +       subl    %gs:0, %eax
> > +       movl    24(%esp), %edx
> > +       addl    $32, %esp
> > +       cfi_adjust_cfa_offset (-32)
> > +       ret
> > +       .p2align 4,,7
> > +2:
> > +       cfi_adjust_cfa_offset (32)
> Extraneous AFAICT.

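This was in the existing code.  Label 2 can only be reached by a
jump.  When label 2 is reached, the stack pointer is still 32 bytes
below its value at entry, so this CFA adjustment tells the debugger
that the CFA is not changed by the cfi_adjust_cfa_offset (-32)
directive above, which applies only to the fall-through return path.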

> 
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       movl    %ebx, -28(%esp)
> > +       movl    %esp, %ebx
> > +       cfi_def_cfa_register(%ebx)
> > +       and     $-STATE_SAVE_ALIGNMENT, %esp
> > +#endif
> > +#ifdef REGISTER_SAVE_AREA
> > +       subl    $REGISTER_SAVE_AREA, %esp
> > +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > +# endif
> > +#else
> > +       # Allocate stack space of the required size to save the state.
> > +       LOAD_PIC_REG (cx)
> > +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> > +#endif
> > +#ifdef USE_FNSAVE
> > +       fnsave  (%esp)
> > +#elif defined USE_FXSAVE
> > +       fxsave  (%esp)
> > +#else
> > +       # Save the argument for ___tls_get_addr in EAX.
> > +       movl    %eax, %ecx
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       # Clear the XSAVE Header.
> > +# ifdef USE_XSAVE
> > +       movl    %edx, (512)(%esp)
> > +       movl    %edx, (512 + 4 * 1)(%esp)
> > +       movl    %edx, (512 + 4 * 2)(%esp)
> > +       movl    %edx, (512 + 4 * 3)(%esp)
> > +# endif
> > +       movl    %edx, (512 + 4 * 4)(%esp)
> > +       movl    %edx, (512 + 4 * 5)(%esp)
> > +       movl    %edx, (512 + 4 * 6)(%esp)
> > +       movl    %edx, (512 + 4 * 7)(%esp)
> > +       movl    %edx, (512 + 4 * 8)(%esp)
> > +       movl    %edx, (512 + 4 * 9)(%esp)
> > +       movl    %edx, (512 + 4 * 10)(%esp)
> > +       movl    %edx, (512 + 4 * 11)(%esp)
> > +       movl    %edx, (512 + 4 * 12)(%esp)
> > +       movl    %edx, (512 + 4 * 13)(%esp)
> > +       movl    %edx, (512 + 4 * 14)(%esp)
> > +       movl    %edx, (512 + 4 * 15)(%esp)
> > +# ifdef USE_XSAVE
> > +       xsave   (%esp)
> > +# else
> > +       xsavec  (%esp)
> > +# endif
> > +       # Restore the argument for ___tls_get_addr in EAX.
> > +       movl    %ecx, %eax
> > +#endif
> > +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > +       # Get register content back.
> > +#ifdef USE_FNSAVE
> > +       frstor  (%esp)
> > +#elif defined USE_FXSAVE
> > +       fxrstor (%esp)
> > +#else
> > +       /* Save and retore ___tls_get_addr return value stored in EAX.  */
> > +       movl    %eax, %ecx
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       xrstor  (%esp)
> > +       movl    %ecx, %eax
> > +#endif
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       mov     %ebx, %esp
> > +       cfi_def_cfa_register(%esp)
> > +       movl    -28(%esp), %ebx
> > +       cfi_restore(%ebx)
> > +#else
> > +       addl    $REGISTER_SAVE_AREA, %esp
> > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> The use of `REGISTER_SAVE_AREA` above is guarded by an
> `#ifdef REGISTER_SAVE_AREA`
> and uses
> `_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
> otherwise.
> Would expect same here?

REGISTER_SAVE_AREA is only used by fnsave and fxsave, which need a
fixed-size save area.

_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)
is used by xsave and xsavec, whose save area size depends on
the enabled features.

These are two different cases.

> > +#endif
> > +       jmp     1b
> > +       cfi_endproc
> > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +
> > +#undef STATE_SAVE_ALIGNMENT
> > diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> > index 90d93caa0c..f002feee56 100644
> > --- a/sysdeps/i386/dl-tlsdesc.S
> > +++ b/sysdeps/i386/dl-tlsdesc.S
> > @@ -18,8 +18,27 @@
> >
> >  #include <sysdep.h>
> >  #include <tls.h>
> > +#include <cpu-features-offsets.h>
> > +#include <features-offsets.h>
> >  #include "tlsdesc.h"
> >
> > +#ifndef DL_STACK_ALIGNMENT
> > +/* Due to GCC bug:
> > +
> > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > +
> > +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > +   that stack will be always aligned at 16 bytes.  */
> > +# define DL_STACK_ALIGNMENT 4
> > +#endif
> > +
> > +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> > +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> > +
> >         .text
> >
> >       /* This function is used to compute the TP offset for symbols in
> > @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
> >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> >
> >  #ifdef SHARED
> > -       .hidden _dl_tlsdesc_dynamic
> > -       .global _dl_tlsdesc_dynamic
> > -       .type   _dl_tlsdesc_dynamic,@function
> > -
> > -     /* This function is used for symbols that need dynamic TLS.
> > -
> > -       %eax points to the TLS descriptor, such that 0(%eax) points to
> > -       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > -       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > -       between the thread pointer and the object denoted by the
> > -       argument, without clobbering any registers.
> > -
> > -       The assembly code that follows is a rendition of the following
> > -       C code, hand-optimized a little bit.
> > -
> > -ptrdiff_t
> > -__attribute__ ((__regparm__ (1)))
> > -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > -{
> > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > -                           != TLS_DTV_UNALLOCATED),
> > -                       1))
> > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > -      - __thread_pointer;
> > -
> > -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > -}
> > -*/
> > -       cfi_startproc
> > -       .align 16
> > -_dl_tlsdesc_dynamic:
> > -       /* Like all TLS resolvers, preserve call-clobbered registers.
> > -          We need two scratch regs anyway.  */
> > -       subl    $28, %esp
> > -       cfi_adjust_cfa_offset (28)
> > -       movl    %ecx, 20(%esp)
> > -       movl    %edx, 24(%esp)
> > -       movl    TLSDESC_ARG(%eax), %eax
> > -       movl    %gs:DTV_OFFSET, %edx
> > -       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > -       cmpl    (%edx), %ecx
> > -       ja      .Lslow
> > -       movl    TLSDESC_MODID(%eax), %ecx
> > -       movl    (%edx,%ecx,8), %edx
> > -       cmpl    $-1, %edx
> > -       je      .Lslow
> > -       movl    TLSDESC_MODOFF(%eax), %eax
> > -       addl    %edx, %eax
> > -.Lret:
> > -       movl    20(%esp), %ecx
> > -       subl    %gs:0, %eax
> > -       movl    24(%esp), %edx
> > -       addl    $28, %esp
> > -       cfi_adjust_cfa_offset (-28)
> > -       ret
> > -       .p2align 4,,7
> > -.Lslow:
> > -       cfi_adjust_cfa_offset (28)
> > -       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > -       jmp     .Lret
> > -       cfi_endproc
> > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +# define USE_FNSAVE
> > +# define MINIMUM_ALIGNMENT     4
> > +# define STATE_SAVE_ALIGNMENT  4
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fnsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef MINIMUM_ALIGNMENT
> > +# undef USE_FNSAVE
> > +
> > +# define MINIMUM_ALIGNMENT     16
> > +
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_FXSAVE
> > +
> > +# define USE_XSAVE
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVE
> > +
> > +# define USE_XSAVEC
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVEC
> >  #endif /* SHARED */
> > diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> > new file mode 100644
> > index 0000000000..92e7fbff89
> > --- /dev/null
> > +++ b/sysdeps/i386/tst-gnu2-tls2.c
> > @@ -0,0 +1,5 @@
> > +#include <sys/platform/x86.h>
> > +
> > +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> > +
> > +#include <elf/tst-gnu2-tls2.c>
> > diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> > index 4d50b327b5..bc357f0099 100644
> > --- a/sysdeps/x86/Makefile
> > +++ b/sysdeps/x86/Makefile
> > @@ -1,5 +1,5 @@
> >  ifeq ($(subdir),csu)
> > -gen-as-const-headers += cpu-features-offsets.sym
> > +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
> >  endif
> >
> >  ifeq ($(subdir),elf)
> > @@ -86,6 +86,11 @@ endif
> >  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
> >  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
> >  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> > +
> > +CFLAGS-malloc-for-test.c += -msse2
> > +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> > +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> > +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
> >  endif
> >
> >  ifeq ($(subdir),math)
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 25e6622a79..835113b42f 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -27,8 +27,13 @@
> >  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
> >    attribute_hidden;
> >
> > -#if defined SHARED && defined __x86_64__
> > -# include <dl-plt-rewrite.h>
> > +#if defined SHARED
> > +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> > +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> > +
> > +# ifdef __x86_64__
> > +#  include <dl-plt-rewrite.h>
> >
> >  static void
> >  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> > @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> >                  : plt_rewrite_jmp);
> >      }
> >  }
> > +# else
> > +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> > +# endif
> > +#endif
> > +
> > +#ifdef __x86_64__
> > +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> > +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
> >  #endif
> >
> >  #ifdef __LP64__
> > @@ -1130,6 +1144,44 @@ no_cpuid:
> >                TUNABLE_CALLBACK (set_x86_shstk));
> >  #endif
> >
> > +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> > +       {
> > +#ifdef __x86_64__
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> > +#endif
> > +#ifdef SHARED
> > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> > +#endif
> > +       }
> > +      else
> > +       {
> > +#ifdef __x86_64__
> > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> > +#endif
> > +#ifdef SHARED
> > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> > +#endif
> > +       }
> > +    }
> > +  else
> > +    {
> > +#ifdef __x86_64__
> > +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> > +# ifdef SHARED
> > +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > +# endif
> > +#else
> > +# ifdef SHARED
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > +      else
> > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> > +# endif
> > +#endif
> > +    }
> > +
> >  #ifdef SHARED
> >  # ifdef __x86_64__
> >    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> > diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> > index ee957b4d70..5920d4b320 100644
> > --- a/sysdeps/x86/dl-procinfo.c
> > +++ b/sysdeps/x86/dl-procinfo.c
> > @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
> >  #else
> >  ,
> >  #endif
> > +
> > +#if defined SHARED && !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL
> > +  ._dl_x86_tlsdesc_dynamic
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# ifdef PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> > similarity index 89%
> > rename from sysdeps/x86_64/features-offsets.sym
> > rename to sysdeps/x86/features-offsets.sym
> > index 9e4be3393a..77e990c705 100644
> > --- a/sysdeps/x86_64/features-offsets.sym
> > +++ b/sysdeps/x86/features-offsets.sym
> > @@ -3,4 +3,6 @@
> >  #include <ldsodefs.h>
> >
> >  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> > +#ifdef __x86_64__
> >  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> > +#endif
> > diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> > new file mode 100644
> > index 0000000000..02f4dead5d
> > --- /dev/null
> > +++ b/sysdeps/x86/malloc-for-test.c
> > @@ -0,0 +1,33 @@
> > +/*  A malloc for intercept test.  x86 version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +
> > +/* Clear XMM0...XMM7  */
> > +#define PREPARE_MALLOC()                               \
> > +{                                                      \
> > +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
> > +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
> > +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
> > +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
> > +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
> > +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
> > +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
> > +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
> > +}
> > +
> > +#include <elf/malloc-for-test.c>
> > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > index 5c1f0bcf53..792e2ea5ed 100644
> > --- a/sysdeps/x86/sysdep.h
> > +++ b/sysdeps/x86/sysdep.h
> > @@ -68,6 +68,12 @@
> >     | (1 << X86_XSTATE_ZMM_H_ID))
> >  #endif
> >
> > +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> > +   Compiler assumes that all registers, including x87 FPU stack registers,
> > +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> > +#define TLSDESC_CALL_STATE_SAVE_MASK   \
> > +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> > +
> >  /* Constants for bits in __x86_string_control:  */
> >
> >  /* Avoid short distance REP MOVSB.  */
> > diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> > index 90f4ecfd26..e8babc9a4e 100644
> > --- a/sysdeps/x86_64/Makefile
> > +++ b/sysdeps/x86_64/Makefile
> > @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
> >  endif
> >
> >  ifeq ($(subdir),csu)
> > -gen-as-const-headers += features-offsets.sym link-defines.sym
> > +gen-as-const-headers += link-defines.sym
> >  endif
> >
> >  ifeq ($(subdir),gmon)
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index 6d605d0d32..ff5d45f7cb 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >                            int lazy, int profile)
> >  {
> >    Elf64_Addr *got;
> > -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> > -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> >    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> > @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >        /* Identify this shared object.  */
> >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> >
> > -      const struct cpu_features* cpu_features = __get_cpu_features ();
> > -
> >  #ifdef SHARED
> >        /* The got[2] entry contains the address of a function which gets
> >          called to get the address of a so far unresolved function and
> > @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >          end in this function.  */
> >        if (__glibc_unlikely (profile))
> >         {
> > +         const struct cpu_features* cpu_features = __get_cpu_features ();
> >           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> >           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >           /* This function will get called to fix up the GOT entry
> >              indicated by the offset on the stack, and then jump to
> >              the resolved address.  */
> > -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > -         else
> > -           *(ElfW(Addr) *) (got + 2)
> > -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> > +         *(ElfW(Addr) *) (got + 2)
> > +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
> >         }
> >      }
> >
> > @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
> >                   {
> >                     td->arg = _dl_make_tlsdesc_dynamic
> >                       (sym_map, sym->st_value + reloc->r_addend);
> > -                   td->entry = _dl_tlsdesc_dynamic;
> > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> >                   }
> >                 else
> >  #  endif
> > diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> > index 4d1d790fbb..06637a8154 100644
> > --- a/sysdeps/x86_64/dl-procinfo.c
> > +++ b/sysdeps/x86_64/dl-procinfo.c
> > @@ -41,5 +41,21 @@
> >
> >  #include <sysdeps/x86/dl-procinfo.c>
> >
> > +#if !IS_IN (ldconfig)
> > +# if !defined PROCINFO_DECL && defined SHARED
> > +  ._dl_x86_64_runtime_resolve
> > +# else
> > +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> > +# endif
> > +# ifndef PROCINFO_DECL
> > += NULL
> > +# endif
> > +# if !defined SHARED || defined PROCINFO_DECL
> > +;
> > +# else
> > +,
> > +# endif
> > +#endif
> > +
> >  #undef PROCINFO_DECL
> >  #undef PROCINFO_CLASS
> > diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > new file mode 100644
> > index 0000000000..ce0bc094ec
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > @@ -0,0 +1,166 @@
> > +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef SECTION
> > +# define SECTION(p)    p
> > +#endif
> > +
> > +#undef REGISTER_SAVE_AREA
> > +#undef LOCAL_STORAGE_AREA
> > +#undef BASE
> > +
> > +#include "dl-trampoline-state.h"
> > +
> > +       .section SECTION(.text),"ax",@progbits
> > +
> > +       .hidden _dl_tlsdesc_dynamic
> > +       .global _dl_tlsdesc_dynamic
> > +       .type   _dl_tlsdesc_dynamic,@function
> > +
> > +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > +       between the thread pointer and the object denoted by the
> > +       argument, without clobbering any registers.
> > +
> > +       The assembly code that follows is a rendition of the following
> > +       C code, hand-optimized a little bit.
> > +
> > +ptrdiff_t
> > +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > +{
> > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > +                           != TLS_DTV_UNALLOCATED),
> > +                       1))
> > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > +      - __thread_pointer;
> > +
> > +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > +}
> > +*/
> > +       cfi_startproc
> > +       .align 16
> > +_dl_tlsdesc_dynamic:
> > +       _CET_ENDBR
> > +       /* Preserve call-clobbered registers that we modify.
> > +          We need two scratch regs anyway.  */
> > +       movq    %rsi, -16(%rsp)
> > +       mov     %fs:DTV_OFFSET, %RSI_LP
> > +       movq    %rdi, -8(%rsp)
> > +       movq    TLSDESC_ARG(%rax), %rdi
> > +       movq    (%rsi), %rax
> > +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > +       ja      2f
> > +       movq    TLSDESC_MODID(%rdi), %rax
> > +       salq    $4, %rax
> > +       movq    (%rax,%rsi), %rax
> > +       cmpq    $-1, %rax
> > +       je      2f
> > +       addq    TLSDESC_MODOFF(%rdi), %rax
> > +1:
> > +       movq    -16(%rsp), %rsi
> > +       sub     %fs:0, %RAX_LP
> > +       movq    -8(%rsp), %rdi
> > +       ret
> > +2:
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       movq    %rbx, -24(%rsp)
> > +       mov     %RSP_LP, %RBX_LP
> > +       cfi_def_cfa_register(%rbx)
> > +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> > +#endif
> > +#ifdef REGISTER_SAVE_AREA
> > +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> > +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> > +       # RBX above.
> > +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> > +# else
> > +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > +# endif
> > +#else
> > +       # Allocate stack space of the required size to save the state.
> > +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > +#endif
> > +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> > +          r10 and r11.  */
> > +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> > +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> > +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> > +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> > +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> > +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> > +#ifdef USE_FXSAVE
> > +       fxsave  STATE_SAVE_OFFSET(%rsp)
> > +#else
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       # Clear the XSAVE Header.
> > +# ifdef USE_XSAVE
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> > +# endif
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> > +# ifdef USE_XSAVE
> > +       xsave   STATE_SAVE_OFFSET(%rsp)
> > +# else
> > +       xsavec  STATE_SAVE_OFFSET(%rsp)
> > +# endif
> > +#endif
> > +       /* %rdi already points to the tlsinfo data structure.  */
> > +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > +       # Get register content back.
> > +#ifdef USE_FXSAVE
> > +       fxrstor STATE_SAVE_OFFSET(%rsp)
> > +#else
> > +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> > +       mov     %RAX_LP, %RCX_LP
> > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > +       xorl    %edx, %edx
> > +       xrstor  STATE_SAVE_OFFSET(%rsp)
> > +       mov     %RCX_LP, %RAX_LP
> > +#endif
> > +       movq    REGISTER_SAVE_R11(%rsp), %r11
> > +       movq    REGISTER_SAVE_R10(%rsp), %r10
> > +       movq    REGISTER_SAVE_R9(%rsp), %r9
> > +       movq    REGISTER_SAVE_R8(%rsp), %r8
> > +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> > +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +       mov     %RBX_LP, %RSP_LP
> > +       cfi_def_cfa_register(%rsp)
> > +       movq    -24(%rsp), %rbx
> > +       cfi_restore(%rbx)
> > +#else
> > +       add     $REGISTER_SAVE_AREA, %RSP_LP
> > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> The use of `REGISTER_SAVE_AREA` above is guarded by an
> `#ifdef REGISTER_SAVE_AREA`
> and the allocation uses
> `_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
> otherwise.
> Shouldn't the same guard apply here, where the stack pointer is restored?
> > +#endif
> > +       jmp     1b
> > +       cfi_endproc
> > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +
> Seems to me the x32 and x64 implementations could be merged with

Did you mean i386? x32 uses the same implementation as x64.

> a few defines for the registers/register width
> + the extra GPR saving in x64.

The i386 ISA is quite different from x86-64.  Merging them would
only make the code harder to follow.

> > +#undef STATE_SAVE_ALIGNMENT
> > diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> > index f748af2ece..ea69f5223a 100644
> > --- a/sysdeps/x86_64/dl-tlsdesc.S
> > +++ b/sysdeps/x86_64/dl-tlsdesc.S
> > @@ -18,7 +18,19 @@
> >
> >  #include <sysdep.h>
> >  #include <tls.h>
> > +#include <cpu-features-offsets.h>
> > +#include <features-offsets.h>
> >  #include "tlsdesc.h"
> > +#include "dl-trampoline-save.h"
> > +
> > +/* Area on stack to save and restore registers used for parameter
> > +   passing when calling _dl_tlsdesc_dynamic.  */
> > +#define REGISTER_SAVE_RCX      0
> > +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> > +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> > +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> > +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> > +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
> >
> >         .text
> >
> > @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
> >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> >
> >  #ifdef SHARED
> > -       .hidden _dl_tlsdesc_dynamic
> > -       .global _dl_tlsdesc_dynamic
> > -       .type   _dl_tlsdesc_dynamic,@function
> > -
> > -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > -       between the thread pointer and the object denoted by the
> > -       argument, without clobbering any registers.
> > -
> > -       The assembly code that follows is a rendition of the following
> > -       C code, hand-optimized a little bit.
> > -
> > -ptrdiff_t
> > -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > -{
> > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > -                           != TLS_DTV_UNALLOCATED),
> > -                       1))
> > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > -      - __thread_pointer;
> > -
> > -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > -}
> > -*/
> > -       cfi_startproc
> > -       .align 16
> > -_dl_tlsdesc_dynamic:
> > -       _CET_ENDBR
> > -       /* Preserve call-clobbered registers that we modify.
> > -          We need two scratch regs anyway.  */
> > -       movq    %rsi, -16(%rsp)
> > -       mov     %fs:DTV_OFFSET, %RSI_LP
> > -       movq    %rdi, -8(%rsp)
> > -       movq    TLSDESC_ARG(%rax), %rdi
> > -       movq    (%rsi), %rax
> > -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > -       ja      .Lslow
> > -       movq    TLSDESC_MODID(%rdi), %rax
> > -       salq    $4, %rax
> > -       movq    (%rax,%rsi), %rax
> > -       cmpq    $-1, %rax
> > -       je      .Lslow
> > -       addq    TLSDESC_MODOFF(%rdi), %rax
> > -.Lret:
> > -       movq    -16(%rsp), %rsi
> > -       sub     %fs:0, %RAX_LP
> > -       movq    -8(%rsp), %rdi
> > -       ret
> > -.Lslow:
> > -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> > -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> > -       subq    $72, %rsp
> > -       cfi_adjust_cfa_offset (72)
> > -       movq    %rdx, 8(%rsp)
> > -       movq    %rcx, 16(%rsp)
> > -       movq    %r8, 24(%rsp)
> > -       movq    %r9, 32(%rsp)
> > -       movq    %r10, 40(%rsp)
> > -       movq    %r11, 48(%rsp)
> > -       /* %rdi already points to the tlsinfo data structure.  */
> > -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > -       movq    8(%rsp), %rdx
> > -       movq    16(%rsp), %rcx
> > -       movq    24(%rsp), %r8
> > -       movq    32(%rsp), %r9
> > -       movq    40(%rsp), %r10
> > -       movq    48(%rsp), %r11
> > -       addq    $72, %rsp
> > -       cfi_adjust_cfa_offset (-72)
> > -       jmp     .Lret
> > -       cfi_endproc
> > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_FXSAVE
> > +
> > +# define USE_XSAVE
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVE
> > +
> > +# define USE_XSAVEC
> > +# define STATE_SAVE_ALIGNMENT  64
> > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > +# include "dl-tlsdesc-dynamic.h"
> > +# undef _dl_tlsdesc_dynamic
> > +# undef USE_XSAVEC
> >  #endif /* SHARED */
> > diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> > new file mode 100644
> > index 0000000000..84eac4a8ac
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-trampoline-save.h
> > @@ -0,0 +1,34 @@
> > +/* x86-64 PLT trampoline register save macros.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef DL_STACK_ALIGNMENT
> > +/* Due to GCC bug:
> > +
> > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > +
> > +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > +   that stack will be always aligned at 16 bytes.  */
> > +# define DL_STACK_ALIGNMENT 8
> > +#endif
> > +
> > +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > +   stack to 16 bytes before calling _dl_fixup.  */
> > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > +   || 16 > DL_STACK_ALIGNMENT)
> > diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> > new file mode 100644
> > index 0000000000..575f120797
> > --- /dev/null
> > +++ b/sysdeps/x86_64/dl-trampoline-state.h
> > @@ -0,0 +1,51 @@
> > +/* x86-64 PLT dl-trampoline state macros.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > +#endif
> > +
> > +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > +#endif
> > +
> > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > +/* Local stack area before jumping to function address: RBX.  */
> > +# define LOCAL_STORAGE_AREA    8
> > +# define BASE                  rbx
> > +# ifdef USE_FXSAVE
> > +/* Use fxsave to save XMM registers.  */
> > +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> > +#  if (REGISTER_SAVE_AREA % 16) != 0
> > +#   error REGISTER_SAVE_AREA must be multiple of 16
> > +#  endif
> > +# endif
> > +#else
> > +# ifndef USE_FXSAVE
> > +#  error USE_FXSAVE must be defined
> > +# endif
> > +/* Use fxsave to save XMM registers.  */
> > +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> > +/* Local stack area before jumping to function address:  All saved
> > +   registers.  */
> > +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> > +# define BASE                  rsp
> > +# if (REGISTER_SAVE_AREA % 16) != 8
> > +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> > +# endif
> > +#endif
> > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > index b2e7e0f69b..87c5137837 100644
> > --- a/sysdeps/x86_64/dl-trampoline.S
> > +++ b/sysdeps/x86_64/dl-trampoline.S
> > @@ -22,25 +22,7 @@
> >  #include <features-offsets.h>
> >  #include <link-defines.h>
> >  #include <isa-level.h>
> > -
> > -#ifndef DL_STACK_ALIGNMENT
> > -/* Due to GCC bug:
> > -
> > -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > -
> > -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > -   that stack will be always aligned at 16 bytes.  We use unaligned
> > -   16-byte move to load and store SSE registers, which has no penalty
> > -   on modern processors if stack is 16-byte aligned.  */
> > -# define DL_STACK_ALIGNMENT 8
> > -#endif
> > -
> > -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > -   stack to 16 bytes before calling _dl_fixup.  */
> > -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > -   || 16 > DL_STACK_ALIGNMENT)
> > +#include "dl-trampoline-save.h"
> >
> >  /* Area on stack to save and restore registers used for parameter
> >     passing when calling _dl_fixup.  */
> > diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> > index f55c6ea040..d9ccfb40d4 100644
> > --- a/sysdeps/x86_64/dl-trampoline.h
> > +++ b/sysdeps/x86_64/dl-trampoline.h
> > @@ -27,39 +27,7 @@
> >  # undef LOCAL_STORAGE_AREA
> >  # undef BASE
> >
> > -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> > -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> > -# endif
> > -
> > -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > -# endif
> > -
> > -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > -/* Local stack area before jumping to function address: RBX.  */
> > -#  define LOCAL_STORAGE_AREA   8
> > -#  define BASE                 rbx
> > -#  ifdef USE_FXSAVE
> > -/* Use fxsave to save XMM registers.  */
> > -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> > -#   if (REGISTER_SAVE_AREA % 16) != 0
> > -#    error REGISTER_SAVE_AREA must be multiple of 16
> > -#   endif
> > -#  endif
> > -# else
> > -#  ifndef USE_FXSAVE
> > -#   error USE_FXSAVE must be defined
> > -#  endif
> > -/* Use fxsave to save XMM registers.  */
> > -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> > -/* Local stack area before jumping to function address:  All saved
> > -   registers.  */
> > -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> > -#  define BASE                 rsp
> > -#  if (REGISTER_SAVE_AREA % 16) != 8
> > -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> > -#  endif
> > -# endif
> > +# include "dl-trampoline-state.h"
> >
> >         .globl _dl_runtime_resolve
> >         .hidden _dl_runtime_resolve
> > --
> > 2.43.0
> >

H.J.
  
Noah Goldstein Feb. 14, 2024, 11:57 p.m. UTC | #3
On Wed, Feb 14, 2024 at 11:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Feb 14, 2024 at 10:44:20PM +0000, Noah Goldstein wrote:
> > On Tue, Feb 13, 2024 at 4:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > Compiler generates the following instruction sequence for GNU2 dynamic
> > > TLS access:
> > >
> > >         leaq    tls_var@TLSDESC(%rip), %rax
> > >         call    *tls_var@TLSCALL(%rax)
> > >
> > > or
> > >
> > >         leal    tls_var@TLSDESC(%ebx), %eax
> > >         call    *tls_var@TLSCALL(%eax)
> > >
> > > CALL instruction is transparent to compiler which assumes all registers,
> > > except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> > > _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> > > path.  __tls_get_addr is a normal function which doesn't preserve any
> > > caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> > > caller-saved registers, but didn't preserve any other caller-saved
> > > registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> > > XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> > > fixes BZ #31372.
> > >
> > > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> > > to optimize elf_machine_runtime_setup.
> > > ---
> > >  elf/Makefile                                 |  19 ++
> > >  elf/malloc-for-test.c                        |  32 ++++
> > >  elf/malloc-for-test.map                      |   6 +
> > >  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
> > >  elf/tst-gnu2-tls2.h                          |  26 +++
> > >  elf/tst-gnu2-tls2mod0.c                      |  28 +++
> > >  elf/tst-gnu2-tls2mod1.c                      |  28 +++
> > >  elf/tst-gnu2-tls2mod2.c                      |  28 +++
> > >  sysdeps/i386/dl-machine.h                    |   2 +-
> > >  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
> > >  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
> > >  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
> > >  sysdeps/x86/Makefile                         |   7 +-
> > >  sysdeps/x86/cpu-features.c                   |  56 +++++-
> > >  sysdeps/x86/dl-procinfo.c                    |  16 ++
> > >  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
> > >  sysdeps/x86/malloc-for-test.c                |  33 ++++
> > >  sysdeps/x86/sysdep.h                         |   6 +
> > >  sysdeps/x86_64/Makefile                      |   2 +-
> > >  sysdeps/x86_64/dl-machine.h                  |  19 +-
> > >  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
> > >  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
> > >  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
> > >  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
> > >  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
> > >  sysdeps/x86_64/dl-trampoline.S               |  20 +-
> > >  sysdeps/x86_64/dl-trampoline.h               |  34 +---
> > >  27 files changed, 930 insertions(+), 213 deletions(-)
> > >  create mode 100644 elf/malloc-for-test.c
> > >  create mode 100644 elf/malloc-for-test.map
> > >  create mode 100644 elf/tst-gnu2-tls2.c
> > >  create mode 100644 elf/tst-gnu2-tls2.h
> > >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> > >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> > >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> > >  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
> > >  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
> > >  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
> > >  create mode 100644 sysdeps/x86/malloc-for-test.c
> > >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> > >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> > >
> > > diff --git a/elf/Makefile b/elf/Makefile
> > > index 5d78b659ce..e0665d2007 100644
> > > --- a/elf/Makefile
> > > +++ b/elf/Makefile
> > > @@ -424,6 +424,7 @@ tests += \
> > >    tst-glibc-hwcaps-prepend \
> > >    tst-global1 \
> > >    tst-global2 \
> > > +  tst-gnu2-tls2 \
> > >    tst-initfinilazyfail \
> > >    tst-initorder \
> > >    tst-initorder2 \
> > > @@ -699,6 +700,7 @@ modules-names += \
> > >    libtracemod5-1 \
> > >    ltglobmod1 \
> > >    ltglobmod2 \
> > > +  malloc-for-test \
> > >    neededobj1 \
> > >    neededobj2 \
> > >    neededobj3 \
> > > @@ -846,6 +848,9 @@ modules-names += \
> > >    tst-filterobj-flt \
> > >    tst-finilazyfailmod \
> > >    tst-globalmod2 \
> > > +  tst-gnu2-tls2mod0 \
> > > +  tst-gnu2-tls2mod1 \
> > > +  tst-gnu2-tls2mod2 \
> > >    tst-initlazyfailmod \
> > >    tst-initorder2a \
> > >    tst-initorder2b \
> > > @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
> > >    $(objpfx)tst-tlsgap-mod0.so \
> > >    $(objpfx)tst-tlsgap-mod1.so \
> > >    $(objpfx)tst-tlsgap-mod2.so
> > > +
> > > +$(objpfx)tst-gnu2-tls2: \
> > > +  $(shared-thread-library) \
> > > +  $(objpfx)malloc-for-test.so
> > > +$(objpfx)tst-gnu2-tls2.out: \
> > > +  $(objpfx)tst-gnu2-tls2mod0.so \
> > > +  $(objpfx)tst-gnu2-tls2mod1.so \
> > > +  $(objpfx)tst-gnu2-tls2mod2.so
> > > +
> > > +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> > > +
> > >  ifeq (yes,$(have-mtls-dialect-gnu2))
> > >  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> > >  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> > >  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> > > +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> > > +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> > > +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> > >  endif
> > > diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> > > new file mode 100644
> > > index 0000000000..1bec69eda7
> > > --- /dev/null
> > > +++ b/elf/malloc-for-test.c
> > > @@ -0,0 +1,32 @@
> > > +/* A malloc for intercept test.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <http://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <stdlib.h>
> > > +
> > > +extern void * __libc_malloc (size_t);
> > > +
> > > +#ifndef PREPARE_MALLOC
> > > +# define PREPARE_MALLOC()
> > > +#endif
> > > +
> > > +void *
> > > +malloc (size_t n)
> > > +{
> > > +  PREPARE_MALLOC ();
> > > +  return __libc_malloc (n);
> > > +}
> > > diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> > > new file mode 100644
> > > index 0000000000..8437cf4346
> > > --- /dev/null
> > > +++ b/elf/malloc-for-test.map
> > > @@ -0,0 +1,6 @@
> > > +GLIBC_2.0 {
> > > +  global:
> > > +    malloc;
> > > +  local:
> > > +    *;
> > > +};
> > > diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> > > new file mode 100644
> > > index 0000000000..34427f9a0f
> > > --- /dev/null
> > > +++ b/elf/tst-gnu2-tls2.c
> > > @@ -0,0 +1,97 @@
> > > +/* Test TLSDESC relocation.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <http://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <stdio.h>
> > > +#include <string.h>
> > > +#include <dlfcn.h>
> > > +#include <pthread.h>
> > > +#include <support/xdlfcn.h>
> > > +#include <support/xthread.h>
> > > +#include <support/check.h>
> > > +#include <support/test-driver.h>
> > > +#include "tst-gnu2-tls2.h"
> > > +
> > > +#ifndef IS_SUPPORTED
> > > +# define IS_SUPPORTED() true
> > > +#endif
> > > +
> > > +static void *mod[3];
> > > +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> > > +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> > > +#undef MOD
> > > +
> > > +static void
> > > +open_mod (int i)
> > > +{
> > > +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> > > +  printf ("open %s\n", modname[i]);
> > > +}
> > > +
> > > +static void
> > > +close_mod (int i)
> > > +{
> > > +  xdlclose (mod[i]);
> > > +  mod[i] = NULL;
> > > +  printf ("close %s\n", modname[i]);
> > > +}
> > > +
> > > +static void
> > > +access_mod (int i, const char *sym)
> > > +{
> > > +  struct tls var = { -1, -1, -1, -1 };
> > > +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> > > +  struct tls *p = f (&var);
> > > +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> > > +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> > > +  ++(p->a);
> > > +}
> > > +
> > > +static void *
> > > +start (void *arg)
> > > +{
> > > +  /* The DTV generation is at the last dlopen of mod0 and the
> > > +     entry for mod1 is NULL.  */
> > > +
> > > +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> > > +
> > > +  /* Force the slow path in GNU2 TLS descriptor call.  */
> > > +  access_mod (1, "apply_tls");
> > > +
> > > +  return arg;
> > > +}
> > > +
> > > +static int
> > > +do_test (void)
> > > +{
> > > +  if (!IS_SUPPORTED ())
> > > +    return EXIT_UNSUPPORTED;
> > > +
> > > +  open_mod (0);
> > > +  open_mod (1);
> > > +  open_mod (2);
> > > +  close_mod (0);
> > > +  close_mod (1); /* Create modid gap at mod1.  */
> > > +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> > > +
> > > +  /* Create a thread where DTV of mod1 is NULL.  */
> > > +  pthread_t t = xpthread_create (NULL, start, NULL);
> > > +  xpthread_join (t);
> > > +  return 0;
> > > +}
> > > +
> > > +#include <support/test-driver.c>
> > > diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> > > new file mode 100644
> > > index 0000000000..e33f4dbe27
> > > --- /dev/null
> > > +++ b/elf/tst-gnu2-tls2.h
> > > @@ -0,0 +1,26 @@
> > > +/* Test TLSDESC relocation.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +struct tls
> > > +{
> > > +  int64_t a, b, c, d;
> > > +};
> > > +
> > > +extern struct tls *apply_tls (struct tls *);
> > > diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> > > new file mode 100644
> > > index 0000000000..67dc0d464d
> > > --- /dev/null
> > > +++ b/elf/tst-gnu2-tls2mod0.c
> > > @@ -0,0 +1,28 @@
> > > +/* DSO used by tst-gnu2-tls2.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include "tst-gnu2-tls2.h"
> > > +
> > > +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> > > +
> > > +struct tls *
> > > +apply_tls (struct tls *p)
> > > +{
> > > +  tls_var0 = *p;
> > > +  return &tls_var0;
> > > +}
> > > diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> > > new file mode 100644
> > > index 0000000000..a4ae6db24f
> > > --- /dev/null
> > > +++ b/elf/tst-gnu2-tls2mod1.c
> > > @@ -0,0 +1,28 @@
> > > +/* DSO used by tst-gnu2-tls2.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include "tst-gnu2-tls2.h"
> > > +
> > > +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> > > +
> > > +struct tls *
> > > +apply_tls (struct tls *p)
> > > +{
> > > +  tls_var1[1] = *p;
> > > +  return &tls_var1[1];
> > > +}
> > > diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> > > new file mode 100644
> > > index 0000000000..2d13921717
> > > --- /dev/null
> > > +++ b/elf/tst-gnu2-tls2mod2.c
> > > @@ -0,0 +1,28 @@
> > > +/* DSO used by tst-gnu2-tls2.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include "tst-gnu2-tls2.h"
> > > +
> > > +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> > > +
> > > +struct tls *
> > > +apply_tls (struct tls *p)
> > > +{
> > > +  tls_var2 = *p;
> > > +  return &tls_var2;
> > > +}
> > > diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> > > index fc1ef96587..50d74fe6e9 100644
> > > --- a/sysdeps/i386/dl-machine.h
> > > +++ b/sysdeps/i386/dl-machine.h
> > > @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
> > >                   {
> > >                     td->arg = _dl_make_tlsdesc_dynamic
> > >                       (sym_map, sym->st_value + (ElfW(Word))td->arg);
> > > -                   td->entry = _dl_tlsdesc_dynamic;
> > > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> > >                   }
> > >                 else
> > >  #  endif
> > > diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > > new file mode 100644
> > > index 0000000000..675e56d32d
> > > --- /dev/null
> > > +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > > @@ -0,0 +1,187 @@
> > > +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> > > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#undef REGISTER_SAVE_AREA
> > > +
> > > +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> > > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > > +#endif
> > > +
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +# ifdef USE_FNSAVE
> > > +#  error USE_FNSAVE shouldn't be defined
> > > +# endif
> > > +# ifdef USE_FXSAVE
> > > +/* Use fxsave to save all registers.  */
> > > +#  define REGISTER_SAVE_AREA   512
> > > +# endif
> > > +#else
> > > +# ifdef USE_FNSAVE
> > > +/* Use fnsave to save x87 FPU stack registers.  */
> > > +#  define REGISTER_SAVE_AREA   108
> > > +# else
> > > +#  ifndef USE_FXSAVE
> > > +#   error USE_FXSAVE must be defined
> > > +#  endif
> > > +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> > > +   to 16 bytes.  */
> > > +#  define REGISTER_SAVE_AREA   (512 + 12)
> > > +# endif
> > > +#endif
> > > +
> > > +       .hidden _dl_tlsdesc_dynamic
> > > +       .global _dl_tlsdesc_dynamic
> > > +       .type   _dl_tlsdesc_dynamic,@function
> > > +
> > > +     /* This function is used for symbols that need dynamic TLS.
> > > +
> > > +       %eax points to the TLS descriptor, such that 0(%eax) points to
> > > +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > > +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > > +       between the thread pointer and the object denoted by the
> > > +       argument, without clobbering any registers.
> > > +
> > > +       The assembly code that follows is a rendition of the following
> > > +       C code, hand-optimized a little bit.
> > > +
> > > +ptrdiff_t
> > > +__attribute__ ((__regparm__ (1)))
> > > +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > > +{
> > > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > > +                           != TLS_DTV_UNALLOCATED),
> > > +                       1))
> > > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > > +      - __thread_pointer;
> > > +
> > > +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > > +}
> > > +*/
> > > +       cfi_startproc
> > > +       .align 16
> > > +_dl_tlsdesc_dynamic:
> > > +       /* Like all TLS resolvers, preserve call-clobbered registers.
> > > +          We need two scratch regs anyway.  */
> > > +       subl    $32, %esp
> > > +       cfi_adjust_cfa_offset (32)
> > > +       movl    %ecx, 20(%esp)
> > > +       movl    %edx, 24(%esp)
> > > +       movl    TLSDESC_ARG(%eax), %eax
> > > +       movl    %gs:DTV_OFFSET, %edx
> > > +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > > +       cmpl    (%edx), %ecx
> > > +       ja      2f
> > > +       movl    TLSDESC_MODID(%eax), %ecx
> > > +       movl    (%edx,%ecx,8), %edx
> > > +       cmpl    $-1, %edx
> > > +       je      2f
> > > +       movl    TLSDESC_MODOFF(%eax), %eax
> > > +       addl    %edx, %eax
> > > +1:
> > > +       movl    20(%esp), %ecx
> > > +       subl    %gs:0, %eax
> > > +       movl    24(%esp), %edx
> > > +       addl    $32, %esp
> > > +       cfi_adjust_cfa_offset (-32)
> > > +       ret
> > > +       .p2align 4,,7
> > > +2:
> > > +       cfi_adjust_cfa_offset (32)
> > This cfi_adjust_cfa_offset looks extraneous, AFAICT.
>
> This was in the existing code.  Label 2 can only be reached by
> a jump, taken while the stack is still adjusted by 32 bytes.  This
> CFA adjustment tells the debugger that, at label 2, the CFA has not
> been changed by the cfi_adjust_cfa_offset (-32) directive above.
>
> >
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       movl    %ebx, -28(%esp)
> > > +       movl    %esp, %ebx
> > > +       cfi_def_cfa_register(%ebx)
> > > +       and     $-STATE_SAVE_ALIGNMENT, %esp
> > > +#endif
> > > +#ifdef REGISTER_SAVE_AREA
> > > +       subl    $REGISTER_SAVE_AREA, %esp
> > > +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > > +# endif
> > > +#else
> > > +       # Allocate stack space of the required size to save the state.
> > > +       LOAD_PIC_REG (cx)
> > > +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> > > +#endif
> > > +#ifdef USE_FNSAVE
> > > +       fnsave  (%esp)
> > > +#elif defined USE_FXSAVE
> > > +       fxsave  (%esp)
> > > +#else
> > > +       # Save the argument for ___tls_get_addr in EAX.
> > > +       movl    %eax, %ecx
> > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > +       xorl    %edx, %edx
> > > +       # Clear the XSAVE Header.
> > > +# ifdef USE_XSAVE
> > > +       movl    %edx, (512)(%esp)
> > > +       movl    %edx, (512 + 4 * 1)(%esp)
> > > +       movl    %edx, (512 + 4 * 2)(%esp)
> > > +       movl    %edx, (512 + 4 * 3)(%esp)
> > > +# endif
> > > +       movl    %edx, (512 + 4 * 4)(%esp)
> > > +       movl    %edx, (512 + 4 * 5)(%esp)
> > > +       movl    %edx, (512 + 4 * 6)(%esp)
> > > +       movl    %edx, (512 + 4 * 7)(%esp)
> > > +       movl    %edx, (512 + 4 * 8)(%esp)
> > > +       movl    %edx, (512 + 4 * 9)(%esp)
> > > +       movl    %edx, (512 + 4 * 10)(%esp)
> > > +       movl    %edx, (512 + 4 * 11)(%esp)
> > > +       movl    %edx, (512 + 4 * 12)(%esp)
> > > +       movl    %edx, (512 + 4 * 13)(%esp)
> > > +       movl    %edx, (512 + 4 * 14)(%esp)
> > > +       movl    %edx, (512 + 4 * 15)(%esp)
> > > +# ifdef USE_XSAVE
> > > +       xsave   (%esp)
> > > +# else
> > > +       xsavec  (%esp)
> > > +# endif
> > > +       # Restore the argument for ___tls_get_addr in EAX.
> > > +       movl    %ecx, %eax
> > > +#endif
> > > +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > > +       # Get register content back.
> > > +#ifdef USE_FNSAVE
> > > +       frstor  (%esp)
> > > +#elif defined USE_FXSAVE
> > > +       fxrstor (%esp)
> > > +#else
> > > +       /* Save and restore ___tls_get_addr return value stored in EAX.  */
> > > +       movl    %eax, %ecx
> > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > +       xorl    %edx, %edx
> > > +       xrstor  (%esp)
> > > +       movl    %ecx, %eax
> > > +#endif
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       mov     %ebx, %esp
> > > +       cfi_def_cfa_register(%esp)
> > > +       movl    -28(%esp), %ebx
> > > +       cfi_restore(%ebx)
> > > +#else
> > > +       addl    $REGISTER_SAVE_AREA, %esp
> > > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> > The use of `REGISTER_SAVE_AREA` above is guarded by an
> > `#ifdef REGISTER_SAVE_AREA`
> > and the code uses
> > `_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
> > otherwise.
> > I would expect the same here?
>
> REGISTER_SAVE_AREA is only used by fnsave and fxsave, which
> need a fixed-size save area.
>
> _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)
> is used by xsave and xsavec, whose save area size depends on
> the enabled features.
>
> These are two different cases.

My point is that we set up the stack above under an ifdef, i.e.
```
#ifdef REGISTER_SAVE_AREA
       subl    $REGISTER_SAVE_AREA, %esp
#else
       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
#endif
```
Shouldn't you have the same ifdef for restoring?
>
> > > +#endif
> > > +       jmp     1b
> > > +       cfi_endproc
> > > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > > +
> > > +#undef STATE_SAVE_ALIGNMENT
> > > diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> > > index 90d93caa0c..f002feee56 100644
> > > --- a/sysdeps/i386/dl-tlsdesc.S
> > > +++ b/sysdeps/i386/dl-tlsdesc.S
> > > @@ -18,8 +18,27 @@
> > >
> > >  #include <sysdep.h>
> > >  #include <tls.h>
> > > +#include <cpu-features-offsets.h>
> > > +#include <features-offsets.h>
> > >  #include "tlsdesc.h"
> > >
> > > +#ifndef DL_STACK_ALIGNMENT
> > > +/* Due to GCC bug:
> > > +
> > > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > > +
> > > +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> > > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > > +   that stack will be always aligned at 16 bytes.  */
> > > +# define DL_STACK_ALIGNMENT 4
> > > +#endif
> > > +
> > > +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> > > +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> > > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > > +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> > > +
> > >         .text
> > >
> > >       /* This function is used to compute the TP offset for symbols in
> > > @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
> > >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> > >
> > >  #ifdef SHARED
> > > -       .hidden _dl_tlsdesc_dynamic
> > > -       .global _dl_tlsdesc_dynamic
> > > -       .type   _dl_tlsdesc_dynamic,@function
> > > -
> > > -     /* This function is used for symbols that need dynamic TLS.
> > > -
> > > -       %eax points to the TLS descriptor, such that 0(%eax) points to
> > > -       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > > -       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > > -       between the thread pointer and the object denoted by the
> > > -       argument, without clobbering any registers.
> > > -
> > > -       The assembly code that follows is a rendition of the following
> > > -       C code, hand-optimized a little bit.
> > > -
> > > -ptrdiff_t
> > > -__attribute__ ((__regparm__ (1)))
> > > -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > > -{
> > > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > > -                           != TLS_DTV_UNALLOCATED),
> > > -                       1))
> > > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > > -      - __thread_pointer;
> > > -
> > > -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > > -}
> > > -*/
> > > -       cfi_startproc
> > > -       .align 16
> > > -_dl_tlsdesc_dynamic:
> > > -       /* Like all TLS resolvers, preserve call-clobbered registers.
> > > -          We need two scratch regs anyway.  */
> > > -       subl    $28, %esp
> > > -       cfi_adjust_cfa_offset (28)
> > > -       movl    %ecx, 20(%esp)
> > > -       movl    %edx, 24(%esp)
> > > -       movl    TLSDESC_ARG(%eax), %eax
> > > -       movl    %gs:DTV_OFFSET, %edx
> > > -       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > > -       cmpl    (%edx), %ecx
> > > -       ja      .Lslow
> > > -       movl    TLSDESC_MODID(%eax), %ecx
> > > -       movl    (%edx,%ecx,8), %edx
> > > -       cmpl    $-1, %edx
> > > -       je      .Lslow
> > > -       movl    TLSDESC_MODOFF(%eax), %eax
> > > -       addl    %edx, %eax
> > > -.Lret:
> > > -       movl    20(%esp), %ecx
> > > -       subl    %gs:0, %eax
> > > -       movl    24(%esp), %edx
> > > -       addl    $28, %esp
> > > -       cfi_adjust_cfa_offset (-28)
> > > -       ret
> > > -       .p2align 4,,7
> > > -.Lslow:
> > > -       cfi_adjust_cfa_offset (28)
> > > -       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > > -       jmp     .Lret
> > > -       cfi_endproc
> > > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > > +# define USE_FNSAVE
> > > +# define MINIMUM_ALIGNMENT     4
> > > +# define STATE_SAVE_ALIGNMENT  4
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fnsave
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef MINIMUM_ALIGNMENT
> > > +# undef USE_FNSAVE
> > > +
> > > +# define MINIMUM_ALIGNMENT     16
> > > +
> > > +# define USE_FXSAVE
> > > +# define STATE_SAVE_ALIGNMENT  16
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_FXSAVE
> > > +
> > > +# define USE_XSAVE
> > > +# define STATE_SAVE_ALIGNMENT  64
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_XSAVE
> > > +
> > > +# define USE_XSAVEC
> > > +# define STATE_SAVE_ALIGNMENT  64
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_XSAVEC
> > >  #endif /* SHARED */
> > > diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> > > new file mode 100644
> > > index 0000000000..92e7fbff89
> > > --- /dev/null
> > > +++ b/sysdeps/i386/tst-gnu2-tls2.c
> > > @@ -0,0 +1,5 @@
> > > +#include <sys/platform/x86.h>
> > > +
> > > +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> > > +
> > > +#include <elf/tst-gnu2-tls2.c>
> > > diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> > > index 4d50b327b5..bc357f0099 100644
> > > --- a/sysdeps/x86/Makefile
> > > +++ b/sysdeps/x86/Makefile
> > > @@ -1,5 +1,5 @@
> > >  ifeq ($(subdir),csu)
> > > -gen-as-const-headers += cpu-features-offsets.sym
> > > +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
> > >  endif
> > >
> > >  ifeq ($(subdir),elf)
> > > @@ -86,6 +86,11 @@ endif
> > >  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
> > >  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
> > >  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> > > +
> > > +CFLAGS-malloc-for-test.c += -msse2
> > > +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> > > +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> > > +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
> > >  endif
> > >
> > >  ifeq ($(subdir),math)
> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > > index 25e6622a79..835113b42f 100644
> > > --- a/sysdeps/x86/cpu-features.c
> > > +++ b/sysdeps/x86/cpu-features.c
> > > @@ -27,8 +27,13 @@
> > >  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
> > >    attribute_hidden;
> > >
> > > -#if defined SHARED && defined __x86_64__
> > > -# include <dl-plt-rewrite.h>
> > > +#if defined SHARED
> > > +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> > > +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> > > +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> > > +
> > > +# ifdef __x86_64__
> > > +#  include <dl-plt-rewrite.h>
> > >
> > >  static void
> > >  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> > > @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> > >                  : plt_rewrite_jmp);
> > >      }
> > >  }
> > > +# else
> > > +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> > > +# endif
> > > +#endif
> > > +
> > > +#ifdef __x86_64__
> > > +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> > > +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> > > +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
> > >  #endif
> > >
> > >  #ifdef __LP64__
> > > @@ -1130,6 +1144,44 @@ no_cpuid:
> > >                TUNABLE_CALLBACK (set_x86_shstk));
> > >  #endif
> > >
> > > +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > +    {
> > > +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> > > +       {
> > > +#ifdef __x86_64__
> > > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> > > +#endif
> > > +#ifdef SHARED
> > > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> > > +#endif
> > > +       }
> > > +      else
> > > +       {
> > > +#ifdef __x86_64__
> > > +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> > > +#endif
> > > +#ifdef SHARED
> > > +         GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> > > +#endif
> > > +       }
> > > +    }
> > > +  else
> > > +    {
> > > +#ifdef __x86_64__
> > > +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> > > +# ifdef SHARED
> > > +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > > +# endif
> > > +#else
> > > +# ifdef SHARED
> > > +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> > > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> > > +      else
> > > +       GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> > > +# endif
> > > +#endif
> > > +    }
> > > +
> > >  #ifdef SHARED
> > >  # ifdef __x86_64__
> > >    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> > > diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> > > index ee957b4d70..5920d4b320 100644
> > > --- a/sysdeps/x86/dl-procinfo.c
> > > +++ b/sysdeps/x86/dl-procinfo.c
> > > @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
> > >  #else
> > >  ,
> > >  #endif
> > > +
> > > +#if defined SHARED && !IS_IN (ldconfig)
> > > +# if !defined PROCINFO_DECL
> > > +  ._dl_x86_tlsdesc_dynamic
> > > +# else
> > > +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> > > +# endif
> > > +# ifndef PROCINFO_DECL
> > > += NULL
> > > +# endif
> > > +# ifdef PROCINFO_DECL
> > > +;
> > > +# else
> > > +,
> > > +# endif
> > > +#endif
> > > diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> > > similarity index 89%
> > > rename from sysdeps/x86_64/features-offsets.sym
> > > rename to sysdeps/x86/features-offsets.sym
> > > index 9e4be3393a..77e990c705 100644
> > > --- a/sysdeps/x86_64/features-offsets.sym
> > > +++ b/sysdeps/x86/features-offsets.sym
> > > @@ -3,4 +3,6 @@
> > >  #include <ldsodefs.h>
> > >
> > >  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> > > +#ifdef __x86_64__
> > >  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> > > +#endif
> > > diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> > > new file mode 100644
> > > index 0000000000..02f4dead5d
> > > --- /dev/null
> > > +++ b/sysdeps/x86/malloc-for-test.c
> > > @@ -0,0 +1,33 @@
> > > +/*  A malloc for intercept test.  x86 version.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <http://www.gnu.org/licenses/>.  */
> > > +
> > > +
> > > +/* Clear XMM0...XMM7  */
> > > +#define PREPARE_MALLOC()                               \
> > > +{                                                      \
> > > +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
> > > +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
> > > +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
> > > +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
> > > +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
> > > +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
> > > +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
> > > +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
> > > +}
> > > +
> > > +#include <elf/malloc-for-test.c>
> > > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > > index 5c1f0bcf53..792e2ea5ed 100644
> > > --- a/sysdeps/x86/sysdep.h
> > > +++ b/sysdeps/x86/sysdep.h
> > > @@ -68,6 +68,12 @@
> > >     | (1 << X86_XSTATE_ZMM_H_ID))
> > >  #endif
> > >
> > > +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> > > +   Compiler assumes that all registers, including x87 FPU stack registers,
> > > +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> > > +#define TLSDESC_CALL_STATE_SAVE_MASK   \
> > > +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> > > +
> > >  /* Constants for bits in __x86_string_control:  */
> > >
> > >  /* Avoid short distance REP MOVSB.  */
> > > diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> > > index 90f4ecfd26..e8babc9a4e 100644
> > > --- a/sysdeps/x86_64/Makefile
> > > +++ b/sysdeps/x86_64/Makefile
> > > @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
> > >  endif
> > >
> > >  ifeq ($(subdir),csu)
> > > -gen-as-const-headers += features-offsets.sym link-defines.sym
> > > +gen-as-const-headers += link-defines.sym
> > >  endif
> > >
> > >  ifeq ($(subdir),gmon)
> > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > index 6d605d0d32..ff5d45f7cb 100644
> > > --- a/sysdeps/x86_64/dl-machine.h
> > > +++ b/sysdeps/x86_64/dl-machine.h
> > > @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >                            int lazy, int profile)
> > >  {
> > >    Elf64_Addr *got;
> > > -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> > > -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> > > -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
> > >    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> > >    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> > >    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> > > @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >        /* Identify this shared object.  */
> > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > >
> > > -      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > -
> > >  #ifdef SHARED
> > >        /* The got[2] entry contains the address of a function which gets
> > >          called to get the address of a so far unresolved function and
> > > @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >          end in this function.  */
> > >        if (__glibc_unlikely (profile))
> > >         {
> > > +         const struct cpu_features* cpu_features = __get_cpu_features ();
> > >           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > >           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >           /* This function will get called to fix up the GOT entry
> > >              indicated by the offset on the stack, and then jump to
> > >              the resolved address.  */
> > > -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > -           *(ElfW(Addr) *) (got + 2)
> > > -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > -         else
> > > -           *(ElfW(Addr) *) (got + 2)
> > > -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> > > +         *(ElfW(Addr) *) (got + 2)
> > > +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
> > >         }
> > >      }
> > >
> > > @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
> > >                   {
> > >                     td->arg = _dl_make_tlsdesc_dynamic
> > >                       (sym_map, sym->st_value + reloc->r_addend);
> > > -                   td->entry = _dl_tlsdesc_dynamic;
> > > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> > >                   }
> > >                 else
> > >  #  endif
> > > diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> > > index 4d1d790fbb..06637a8154 100644
> > > --- a/sysdeps/x86_64/dl-procinfo.c
> > > +++ b/sysdeps/x86_64/dl-procinfo.c
> > > @@ -41,5 +41,21 @@
> > >
> > >  #include <sysdeps/x86/dl-procinfo.c>
> > >
> > > +#if !IS_IN (ldconfig)
> > > +# if !defined PROCINFO_DECL && defined SHARED
> > > +  ._dl_x86_64_runtime_resolve
> > > +# else
> > > +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> > > +# endif
> > > +# ifndef PROCINFO_DECL
> > > += NULL
> > > +# endif
> > > +# if !defined SHARED || defined PROCINFO_DECL
> > > +;
> > > +# else
> > > +,
> > > +# endif
> > > +#endif
> > > +
> > >  #undef PROCINFO_DECL
> > >  #undef PROCINFO_CLASS
> > > diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > > new file mode 100644
> > > index 0000000000..ce0bc094ec
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > > @@ -0,0 +1,166 @@
> > > +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> > > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef SECTION
> > > +# define SECTION(p)    p
> > > +#endif
> > > +
> > > +#undef REGISTER_SAVE_AREA
> > > +#undef LOCAL_STORAGE_AREA
> > > +#undef BASE
> > > +
> > > +#include "dl-trampoline-state.h"
> > > +
> > > +       .section SECTION(.text),"ax",@progbits
> > > +
> > > +       .hidden _dl_tlsdesc_dynamic
> > > +       .global _dl_tlsdesc_dynamic
> > > +       .type   _dl_tlsdesc_dynamic,@function
> > > +
> > > +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > > +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > > +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > > +       between the thread pointer and the object denoted by the
> > > +       argument, without clobbering any registers.
> > > +
> > > +       The assembly code that follows is a rendition of the following
> > > +       C code, hand-optimized a little bit.
> > > +
> > > +ptrdiff_t
> > > +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > > +{
> > > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > > +                           != TLS_DTV_UNALLOCATED),
> > > +                       1))
> > > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > > +      - __thread_pointer;
> > > +
> > > +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > > +}
> > > +*/
> > > +       cfi_startproc
> > > +       .align 16
> > > +_dl_tlsdesc_dynamic:
> > > +       _CET_ENDBR
> > > +       /* Preserve call-clobbered registers that we modify.
> > > +          We need two scratch regs anyway.  */
> > > +       movq    %rsi, -16(%rsp)
> > > +       mov     %fs:DTV_OFFSET, %RSI_LP
> > > +       movq    %rdi, -8(%rsp)
> > > +       movq    TLSDESC_ARG(%rax), %rdi
> > > +       movq    (%rsi), %rax
> > > +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > > +       ja      2f
> > > +       movq    TLSDESC_MODID(%rdi), %rax
> > > +       salq    $4, %rax
> > > +       movq    (%rax,%rsi), %rax
> > > +       cmpq    $-1, %rax
> > > +       je      2f
> > > +       addq    TLSDESC_MODOFF(%rdi), %rax
> > > +1:
> > > +       movq    -16(%rsp), %rsi
> > > +       sub     %fs:0, %RAX_LP
> > > +       movq    -8(%rsp), %rdi
> > > +       ret
> > > +2:
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       movq    %rbx, -24(%rsp)
> > > +       mov     %RSP_LP, %RBX_LP
> > > +       cfi_def_cfa_register(%rbx)
> > > +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> > > +#endif
> > > +#ifdef REGISTER_SAVE_AREA
> > > +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> > > +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> > > +       # RBX above.
> > > +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> > > +# else
> > > +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> > > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > > +# endif
> > > +#else
> > > +       # Allocate stack space of the required size to save the state.
> > > +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > > +#endif
> > > +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> > > +          r10 and r11.  */
> > > +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> > > +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> > > +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> > > +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> > > +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> > > +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> > > +#ifdef USE_FXSAVE
> > > +       fxsave  STATE_SAVE_OFFSET(%rsp)
> > > +#else
> > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > +       xorl    %edx, %edx
> > > +       # Clear the XSAVE Header.
> > > +# ifdef USE_XSAVE
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> > > +# endif
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> > > +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> > > +# ifdef USE_XSAVE
> > > +       xsave   STATE_SAVE_OFFSET(%rsp)
> > > +# else
> > > +       xsavec  STATE_SAVE_OFFSET(%rsp)
> > > +# endif
> > > +#endif
> > > +       /* %rdi already points to the tlsinfo data structure.  */
> > > +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > > +       # Get register content back.
> > > +#ifdef USE_FXSAVE
> > > +       fxrstor STATE_SAVE_OFFSET(%rsp)
> > > +#else
> > > +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> > > +       mov     %RAX_LP, %RCX_LP
> > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > +       xorl    %edx, %edx
> > > +       xrstor  STATE_SAVE_OFFSET(%rsp)
> > > +       mov     %RCX_LP, %RAX_LP
> > > +#endif
> > > +       movq    REGISTER_SAVE_R11(%rsp), %r11
> > > +       movq    REGISTER_SAVE_R10(%rsp), %r10
> > > +       movq    REGISTER_SAVE_R9(%rsp), %r9
> > > +       movq    REGISTER_SAVE_R8(%rsp), %r8
> > > +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> > > +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +       mov     %RBX_LP, %RSP_LP
> > > +       cfi_def_cfa_register(%rsp)
> > > +       movq    -24(%rsp), %rbx
> > > +       cfi_restore(%rbx)
> > > +#else
> > > +       add     $REGISTER_SAVE_AREA, %RSP_LP
> > > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> > The use of `REGISTER_SAVE_AREA` above is guarded by an
> > `#ifdef REGISTER_SAVE_AREA`
> > and uses
> > `_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
> > otherwise.
> > Shouldn't the same distinction be made here?
> > > +#endif
> > > +       jmp     1b
> > > +       cfi_endproc
> > > +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > > +
> > Seems to me the x32 and x64 implementations could be merged with
>
> Did you mean i386? x32 uses the same implementation as x64.
>
> > a few defines for the registers/register width
> > + the extra GPR saving in x64.
>
> The i386 ISA is quite different from x86-64.  Merging them will
> only make the code hard to follow.
>
> > > +#undef STATE_SAVE_ALIGNMENT
> > > diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> > > index f748af2ece..ea69f5223a 100644
> > > --- a/sysdeps/x86_64/dl-tlsdesc.S
> > > +++ b/sysdeps/x86_64/dl-tlsdesc.S
> > > @@ -18,7 +18,19 @@
> > >
> > >  #include <sysdep.h>
> > >  #include <tls.h>
> > > +#include <cpu-features-offsets.h>
> > > +#include <features-offsets.h>
> > >  #include "tlsdesc.h"
> > > +#include "dl-trampoline-save.h"
> > > +
> > > +/* Area on stack to save and restore registers used for parameter
> > > +   passing when calling _dl_tlsdesc_dynamic.  */
> > > +#define REGISTER_SAVE_RCX      0
> > > +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> > > +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> > > +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> > > +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> > > +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
> > >
> > >         .text
> > >
> > > @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
> > >         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
> > >
> > >  #ifdef SHARED
> > > -       .hidden _dl_tlsdesc_dynamic
> > > -       .global _dl_tlsdesc_dynamic
> > > -       .type   _dl_tlsdesc_dynamic,@function
> > > -
> > > -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> > > -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> > > -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> > > -       between the thread pointer and the object denoted by the
> > > -       argument, without clobbering any registers.
> > > -
> > > -       The assembly code that follows is a rendition of the following
> > > -       C code, hand-optimized a little bit.
> > > -
> > > -ptrdiff_t
> > > -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> > > -{
> > > -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > > -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > > -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > > -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > > -                           != TLS_DTV_UNALLOCATED),
> > > -                       1))
> > > -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > > -      - __thread_pointer;
> > > -
> > > -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> > > -}
> > > -*/
> > > -       cfi_startproc
> > > -       .align 16
> > > -_dl_tlsdesc_dynamic:
> > > -       _CET_ENDBR
> > > -       /* Preserve call-clobbered registers that we modify.
> > > -          We need two scratch regs anyway.  */
> > > -       movq    %rsi, -16(%rsp)
> > > -       mov     %fs:DTV_OFFSET, %RSI_LP
> > > -       movq    %rdi, -8(%rsp)
> > > -       movq    TLSDESC_ARG(%rax), %rdi
> > > -       movq    (%rsi), %rax
> > > -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> > > -       ja      .Lslow
> > > -       movq    TLSDESC_MODID(%rdi), %rax
> > > -       salq    $4, %rax
> > > -       movq    (%rax,%rsi), %rax
> > > -       cmpq    $-1, %rax
> > > -       je      .Lslow
> > > -       addq    TLSDESC_MODOFF(%rdi), %rax
> > > -.Lret:
> > > -       movq    -16(%rsp), %rsi
> > > -       sub     %fs:0, %RAX_LP
> > > -       movq    -8(%rsp), %rdi
> > > -       ret
> > > -.Lslow:
> > > -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> > > -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> > > -       subq    $72, %rsp
> > > -       cfi_adjust_cfa_offset (72)
> > > -       movq    %rdx, 8(%rsp)
> > > -       movq    %rcx, 16(%rsp)
> > > -       movq    %r8, 24(%rsp)
> > > -       movq    %r9, 32(%rsp)
> > > -       movq    %r10, 40(%rsp)
> > > -       movq    %r11, 48(%rsp)
> > > -       /* %rdi already points to the tlsinfo data structure.  */
> > > -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> > > -       movq    8(%rsp), %rdx
> > > -       movq    16(%rsp), %rcx
> > > -       movq    24(%rsp), %r8
> > > -       movq    32(%rsp), %r9
> > > -       movq    40(%rsp), %r10
> > > -       movq    48(%rsp), %r11
> > > -       addq    $72, %rsp
> > > -       cfi_adjust_cfa_offset (-72)
> > > -       jmp     .Lret
> > > -       cfi_endproc
> > > -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> > > +# define USE_FXSAVE
> > > +# define STATE_SAVE_ALIGNMENT  16
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_FXSAVE
> > > +
> > > +# define USE_XSAVE
> > > +# define STATE_SAVE_ALIGNMENT  64
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_XSAVE
> > > +
> > > +# define USE_XSAVEC
> > > +# define STATE_SAVE_ALIGNMENT  64
> > > +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> > > +# include "dl-tlsdesc-dynamic.h"
> > > +# undef _dl_tlsdesc_dynamic
> > > +# undef USE_XSAVEC
> > >  #endif /* SHARED */
> > > diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> > > new file mode 100644
> > > index 0000000000..84eac4a8ac
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/dl-trampoline-save.h
> > > @@ -0,0 +1,34 @@
> > > +/* x86-64 PLT trampoline register save macros.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#ifndef DL_STACK_ALIGNMENT
> > > +/* Due to GCC bug:
> > > +
> > > +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > > +
> > > +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > > +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > > +   that stack will be always aligned at 16 bytes.  */
> > > +# define DL_STACK_ALIGNMENT 8
> > > +#endif
> > > +
> > > +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > > +   stack to 16 bytes before calling _dl_fixup.  */
> > > +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > > +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > > +   || 16 > DL_STACK_ALIGNMENT)
> > > diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> > > new file mode 100644
> > > index 0000000000..575f120797
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/dl-trampoline-state.h
> > > @@ -0,0 +1,51 @@
> > > +/* x86-64 PLT dl-trampoline state macros.
> > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> > > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > > +#endif
> > > +
> > > +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > > +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > > +#endif
> > > +
> > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > +/* Local stack area before jumping to function address: RBX.  */
> > > +# define LOCAL_STORAGE_AREA    8
> > > +# define BASE                  rbx
> > > +# ifdef USE_FXSAVE
> > > +/* Use fxsave to save XMM registers.  */
> > > +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> > > +#  if (REGISTER_SAVE_AREA % 16) != 0
> > > +#   error REGISTER_SAVE_AREA must be multiple of 16
> > > +#  endif
> > > +# endif
> > > +#else
> > > +# ifndef USE_FXSAVE
> > > +#  error USE_FXSAVE must be defined
> > > +# endif
> > > +/* Use fxsave to save XMM registers.  */
> > > +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> > > +/* Local stack area before jumping to function address:  All saved
> > > +   registers.  */
> > > +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> > > +# define BASE                  rsp
> > > +# if (REGISTER_SAVE_AREA % 16) != 8
> > > +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> > > +# endif
> > > +#endif
> > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > index b2e7e0f69b..87c5137837 100644
> > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > @@ -22,25 +22,7 @@
> > >  #include <features-offsets.h>
> > >  #include <link-defines.h>
> > >  #include <isa-level.h>
> > > -
> > > -#ifndef DL_STACK_ALIGNMENT
> > > -/* Due to GCC bug:
> > > -
> > > -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> > > -
> > > -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> > > -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> > > -   that stack will be always aligned at 16 bytes.  We use unaligned
> > > -   16-byte move to load and store SSE registers, which has no penalty
> > > -   on modern processors if stack is 16-byte aligned.  */
> > > -# define DL_STACK_ALIGNMENT 8
> > > -#endif
> > > -
> > > -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> > > -   stack to 16 bytes before calling _dl_fixup.  */
> > > -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> > > -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> > > -   || 16 > DL_STACK_ALIGNMENT)
> > > +#include "dl-trampoline-save.h"
> > >
> > >  /* Area on stack to save and restore registers used for parameter
> > >     passing when calling _dl_fixup.  */
> > > diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> > > index f55c6ea040..d9ccfb40d4 100644
> > > --- a/sysdeps/x86_64/dl-trampoline.h
> > > +++ b/sysdeps/x86_64/dl-trampoline.h
> > > @@ -27,39 +27,7 @@
> > >  # undef LOCAL_STORAGE_AREA
> > >  # undef BASE
> > >
> > > -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> > > -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> > > -# endif
> > > -
> > > -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> > > -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> > > -# endif
> > > -
> > > -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > -/* Local stack area before jumping to function address: RBX.  */
> > > -#  define LOCAL_STORAGE_AREA   8
> > > -#  define BASE                 rbx
> > > -#  ifdef USE_FXSAVE
> > > -/* Use fxsave to save XMM registers.  */
> > > -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> > > -#   if (REGISTER_SAVE_AREA % 16) != 0
> > > -#    error REGISTER_SAVE_AREA must be multiple of 16
> > > -#   endif
> > > -#  endif
> > > -# else
> > > -#  ifndef USE_FXSAVE
> > > -#   error USE_FXSAVE must be defined
> > > -#  endif
> > > -/* Use fxsave to save XMM registers.  */
> > > -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> > > -/* Local stack area before jumping to function address:  All saved
> > > -   registers.  */
> > > -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> > > -#  define BASE                 rsp
> > > -#  if (REGISTER_SAVE_AREA % 16) != 8
> > > -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> > > -#  endif
> > > -# endif
> > > +# include "dl-trampoline-state.h"
> > >
> > >         .globl _dl_runtime_resolve
> > >         .hidden _dl_runtime_resolve
> > > --
> > > 2.43.0
> > >
>
> H.J.
  
H.J. Lu Feb. 15, 2024, 12:23 a.m. UTC | #4
On Wed, Feb 14, 2024 at 11:57:20PM +0000, Noah Goldstein wrote:
> On Wed, Feb 14, 2024 at 11:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Feb 14, 2024 at 10:44:20PM +0000, Noah Goldstein wrote:
> > > On Tue, Feb 13, 2024 at 4:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > Compiler generates the following instruction sequence for GNU2 dynamic
> > > > TLS access:
> > > >
> > > >         leaq    tls_var@TLSDESC(%rip), %rax
> > > >         call    *tls_var@TLSCALL(%rax)
> > > >
> > > > or
> > > >
> > > >         leal    tls_var@TLSDESC(%ebx), %eax
> > > >         call    *tls_var@TLSCALL(%eax)
> > > >
> > > > CALL instruction is transparent to compiler which assumes all registers,
> > > > except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> > > > _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> > > > path.  __tls_get_addr is a normal function which doesn't preserve any
> > > > caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> > > > caller-saved registers, but didn't preserve any other caller-saved
> > > > registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> > > > XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> > > > fixes BZ #31372.
> > > >
> > > > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> > > > to optimize elf_machine_runtime_setup.
> > > > ---
> > > >  elf/Makefile                                 |  19 ++
> > > >  elf/malloc-for-test.c                        |  32 ++++
> > > >  elf/malloc-for-test.map                      |   6 +
> > > >  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
> > > >  elf/tst-gnu2-tls2.h                          |  26 +++
> > > >  elf/tst-gnu2-tls2mod0.c                      |  28 +++
> > > >  elf/tst-gnu2-tls2mod1.c                      |  28 +++
> > > >  elf/tst-gnu2-tls2mod2.c                      |  28 +++
> > > >  sysdeps/i386/dl-machine.h                    |   2 +-
> > > >  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
> > > >  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
> > > >  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
> > > >  sysdeps/x86/Makefile                         |   7 +-
> > > >  sysdeps/x86/cpu-features.c                   |  56 +++++-
> > > >  sysdeps/x86/dl-procinfo.c                    |  16 ++
> > > >  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
> > > >  sysdeps/x86/malloc-for-test.c                |  33 ++++
> > > >  sysdeps/x86/sysdep.h                         |   6 +
> > > >  sysdeps/x86_64/Makefile                      |   2 +-
> > > >  sysdeps/x86_64/dl-machine.h                  |  19 +-
> > > >  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
> > > >  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
> > > >  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
> > > >  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
> > > >  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
> > > >  sysdeps/x86_64/dl-trampoline.S               |  20 +-
> > > >  sysdeps/x86_64/dl-trampoline.h               |  34 +---
> > > >  27 files changed, 930 insertions(+), 213 deletions(-)
> > > >  create mode 100644 elf/malloc-for-test.c
> > > >  create mode 100644 elf/malloc-for-test.map
> > > >  create mode 100644 elf/tst-gnu2-tls2.c
> > > >  create mode 100644 elf/tst-gnu2-tls2.h
> > > >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> > > >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> > > >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> > > >  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
> > > >  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
> > > >  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
> > > >  create mode 100644 sysdeps/x86/malloc-for-test.c
> > > >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> > > >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> > > >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> > > >
> > > > diff --git a/elf/Makefile b/elf/Makefile
> > > > index 5d78b659ce..e0665d2007 100644
> > > > --- a/elf/Makefile
> > > > +++ b/elf/Makefile
> > > > @@ -424,6 +424,7 @@ tests += \
> > > >    tst-glibc-hwcaps-prepend \
> > > >    tst-global1 \
> > > >    tst-global2 \
> > > > +  tst-gnu2-tls2 \
> > > >    tst-initfinilazyfail \
> > > >    tst-initorder \
> > > >    tst-initorder2 \
> > > > @@ -699,6 +700,7 @@ modules-names += \
> > > >    libtracemod5-1 \
> > > >    ltglobmod1 \
> > > >    ltglobmod2 \
> > > > +  malloc-for-test \
> > > >    neededobj1 \
> > > >    neededobj2 \
> > > >    neededobj3 \
> > > > @@ -846,6 +848,9 @@ modules-names += \
> > > >    tst-filterobj-flt \
> > > >    tst-finilazyfailmod \
> > > >    tst-globalmod2 \
> > > > +  tst-gnu2-tls2mod0 \
> > > > +  tst-gnu2-tls2mod1 \
> > > > +  tst-gnu2-tls2mod2 \
> > > >    tst-initlazyfailmod \
> > > >    tst-initorder2a \
> > > >    tst-initorder2b \
> > > > @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
> > > >    $(objpfx)tst-tlsgap-mod0.so \
> > > >    $(objpfx)tst-tlsgap-mod1.so \
> > > >    $(objpfx)tst-tlsgap-mod2.so
> > > > +
> > > > +$(objpfx)tst-gnu2-tls2: \
> > > > +  $(shared-thread-library) \
> > > > +  $(objpfx)malloc-for-test.so
> > > > +$(objpfx)tst-gnu2-tls2.out: \
> > > > +  $(objpfx)tst-gnu2-tls2mod0.so \
> > > > +  $(objpfx)tst-gnu2-tls2mod1.so \
> > > > +  $(objpfx)tst-gnu2-tls2mod2.so
> > > > +
> > > > +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> > > > +
> > > >  ifeq (yes,$(have-mtls-dialect-gnu2))
> > > >  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> > > >  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> > > >  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> > > > +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> > > > +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> > > > +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> > > >  endif
> > > > diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> > > > new file mode 100644
> > > > index 0000000000..1bec69eda7
> > > > --- /dev/null
> > > > +++ b/elf/malloc-for-test.c
> > > > @@ -0,0 +1,32 @@
> > > > +/* A malloc for intercept test.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <http://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <stdlib.h>
> > > > +
> > > > +extern void * __libc_malloc (size_t);
> > > > +
> > > > +#ifndef PREPARE_MALLOC
> > > > +# define PREPARE_MALLOC()
> > > > +#endif
> > > > +
> > > > +void *
> > > > +malloc (size_t n)
> > > > +{
> > > > +  PREPARE_MALLOC ();
> > > > +  return __libc_malloc (n);
> > > > +}
> > > > diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> > > > new file mode 100644
> > > > index 0000000000..8437cf4346
> > > > --- /dev/null
> > > > +++ b/elf/malloc-for-test.map
> > > > @@ -0,0 +1,6 @@
> > > > +GLIBC_2.0 {
> > > > +  global:
> > > > +    malloc;
> > > > +  local:
> > > > +    *;
> > > > +};
> > > > diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> > > > new file mode 100644
> > > > index 0000000000..34427f9a0f
> > > > --- /dev/null
> > > > +++ b/elf/tst-gnu2-tls2.c
> > > > @@ -0,0 +1,97 @@
> > > > +/* Test TLSDESC relocation.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <http://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <stdio.h>
> > > > +#include <string.h>
> > > > +#include <dlfcn.h>
> > > > +#include <pthread.h>
> > > > +#include <support/xdlfcn.h>
> > > > +#include <support/xthread.h>
> > > > +#include <support/check.h>
> > > > +#include <support/test-driver.h>
> > > > +#include "tst-gnu2-tls2.h"
> > > > +
> > > > +#ifndef IS_SUPPORTED
> > > > +# define IS_SUPPORTED() true
> > > > +#endif
> > > > +
> > > > +static void *mod[3];
> > > > +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> > > > +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> > > > +#undef MOD
> > > > +
> > > > +static void
> > > > +open_mod (int i)
> > > > +{
> > > > +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> > > > +  printf ("open %s\n", modname[i]);
> > > > +}
> > > > +
> > > > +static void
> > > > +close_mod (int i)
> > > > +{
> > > > +  xdlclose (mod[i]);
> > > > +  mod[i] = NULL;
> > > > +  printf ("close %s\n", modname[i]);
> > > > +}
> > > > +
> > > > +static void
> > > > +access_mod (int i, const char *sym)
> > > > +{
> > > > +  struct tls var = { -1, -1, -1, -1 };
> > > > +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> > > > +  struct tls *p = f (&var);
> > > > +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> > > > +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> > > > +  ++(p->a);
> > > > +}
> > > > +
> > > > +static void *
> > > > +start (void *arg)
> > > > +{
> > > > +  /* The DTV generation is at the last dlopen of mod0 and the
> > > > +     entry for mod1 is NULL.  */
> > > > +
> > > > +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> > > > +
> > > > +  /* Force the slow path in GNU2 TLS descriptor call.  */
> > > > +  access_mod (1, "apply_tls");
> > > > +
> > > > +  return arg;
> > > > +}
> > > > +
> > > > +static int
> > > > +do_test (void)
> > > > +{
> > > > +  if (!IS_SUPPORTED ())
> > > > +    return EXIT_UNSUPPORTED;
> > > > +
> > > > +  open_mod (0);
> > > > +  open_mod (1);
> > > > +  open_mod (2);
> > > > +  close_mod (0);
> > > > +  close_mod (1); /* Create modid gap at mod1.  */
> > > > +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> > > > +
> > > > +  /* Create a thread where DTV of mod1 is NULL.  */
> > > > +  pthread_t t = xpthread_create (NULL, start, NULL);
> > > > +  xpthread_join (t);
> > > > +  return 0;
> > > > +}
> > > > +
> > > > +#include <support/test-driver.c>
> > > > diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> > > > new file mode 100644
> > > > index 0000000000..e33f4dbe27
> > > > --- /dev/null
> > > > +++ b/elf/tst-gnu2-tls2.h
> > > > @@ -0,0 +1,26 @@
> > > > +/* Test TLSDESC relocation.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +struct tls
> > > > +{
> > > > +  int64_t a, b, c, d;
> > > > +};
> > > > +
> > > > +extern struct tls *apply_tls (struct tls *);
> > > > diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> > > > new file mode 100644
> > > > index 0000000000..67dc0d464d
> > > > --- /dev/null
> > > > +++ b/elf/tst-gnu2-tls2mod0.c
> > > > @@ -0,0 +1,28 @@
> > > > +/* DSO used by tst-gnu2-tls2.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include "tst-gnu2-tls2.h"
> > > > +
> > > > +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> > > > +
> > > > +struct tls *
> > > > +apply_tls (struct tls *p)
> > > > +{
> > > > +  tls_var0 = *p;
> > > > +  return &tls_var0;
> > > > +}
> > > > diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> > > > new file mode 100644
> > > > index 0000000000..a4ae6db24f
> > > > --- /dev/null
> > > > +++ b/elf/tst-gnu2-tls2mod1.c
> > > > @@ -0,0 +1,28 @@
> > > > +/* DSO used by tst-gnu2-tls2.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include "tst-gnu2-tls2.h"
> > > > +
> > > > +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> > > > +
> > > > +struct tls *
> > > > +apply_tls (struct tls *p)
> > > > +{
> > > > +  tls_var1[1] = *p;
> > > > +  return &tls_var1[1];
> > > > +}
> > > > diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> > > > new file mode 100644
> > > > index 0000000000..2d13921717
> > > > --- /dev/null
> > > > +++ b/elf/tst-gnu2-tls2mod2.c
> > > > @@ -0,0 +1,28 @@
> > > > +/* DSO used by tst-gnu2-tls2.
> > > > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include "tst-gnu2-tls2.h"
> > > > +
> > > > +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> > > > +
> > > > +struct tls *
> > > > +apply_tls (struct tls *p)
> > > > +{
> > > > +  tls_var2 = *p;
> > > > +  return &tls_var2;
> > > > +}
> > > > diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> > > > index fc1ef96587..50d74fe6e9 100644
> > > > --- a/sysdeps/i386/dl-machine.h
> > > > +++ b/sysdeps/i386/dl-machine.h
> > > > @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
> > > >                   {
> > > >                     td->arg = _dl_make_tlsdesc_dynamic
> > > >                       (sym_map, sym->st_value + (ElfW(Word))td->arg);
> > > > -                   td->entry = _dl_tlsdesc_dynamic;
> > > > +                   td->entry = GLRO(dl_x86_tlsdesc_dynamic);
> > > >                   }
> > > >                 else
> > > >  #  endif
> > > > diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > > > new file mode 100644
> > > > index 0000000000..675e56d32d
> > > > --- /dev/null
> > > > +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> > > > @@ -0,0 +1,187 @@
> > > > +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> > > > +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#undef REGISTER_SAVE_AREA
> > > > +
> > > > +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> > > > +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> > > > +#endif
> > > > +
> > > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > > +# ifdef USE_FNSAVE
> > > > +#  error USE_FNSAVE shouldn't be defined
> > > > +# endif
> > > > +# ifdef USE_FXSAVE
> > > > +/* Use fxsave to save all registers.  */
> > > > +#  define REGISTER_SAVE_AREA   512
> > > > +# endif
> > > > +#else
> > > > +# ifdef USE_FNSAVE
> > > > +/* Use fnsave to save x87 FPU stack registers.  */
> > > > +#  define REGISTER_SAVE_AREA   108
> > > > +# else
> > > > +#  ifndef USE_FXSAVE
> > > > +#   error USE_FXSAVE must be defined
> > > > +#  endif
> > > > +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> > > > +   to 16 bytes.  */
> > > > +#  define REGISTER_SAVE_AREA   (512 + 12)
> > > > +# endif
> > > > +#endif
> > > > +
> > > > +       .hidden _dl_tlsdesc_dynamic
> > > > +       .global _dl_tlsdesc_dynamic
> > > > +       .type   _dl_tlsdesc_dynamic,@function
> > > > +
> > > > +     /* This function is used for symbols that need dynamic TLS.
> > > > +
> > > > +       %eax points to the TLS descriptor, such that 0(%eax) points to
> > > > +       _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> > > > +       tlsdesc_dynamic_arg object.  It must return in %eax the offset
> > > > +       between the thread pointer and the object denoted by the
> > > > +       argument, without clobbering any registers.
> > > > +
> > > > +       The assembly code that follows is a rendition of the following
> > > > +       C code, hand-optimized a little bit.
> > > > +
> > > > +ptrdiff_t
> > > > +__attribute__ ((__regparm__ (1)))
> > > > +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> > > > +{
> > > > +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> > > > +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> > > > +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> > > > +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> > > > +                           != TLS_DTV_UNALLOCATED),
> > > > +                       1))
> > > > +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> > > > +      - __thread_pointer;
> > > > +
> > > > +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> > > > +}
> > > > +*/
> > > > +       cfi_startproc
> > > > +       .align 16
> > > > +_dl_tlsdesc_dynamic:
> > > > +       /* Like all TLS resolvers, preserve call-clobbered registers.
> > > > +          We need two scratch regs anyway.  */
> > > > +       subl    $32, %esp
> > > > +       cfi_adjust_cfa_offset (32)
> > > > +       movl    %ecx, 20(%esp)
> > > > +       movl    %edx, 24(%esp)
> > > > +       movl    TLSDESC_ARG(%eax), %eax
> > > > +       movl    %gs:DTV_OFFSET, %edx
> > > > +       movl    TLSDESC_GEN_COUNT(%eax), %ecx
> > > > +       cmpl    (%edx), %ecx
> > > > +       ja      2f
> > > > +       movl    TLSDESC_MODID(%eax), %ecx
> > > > +       movl    (%edx,%ecx,8), %edx
> > > > +       cmpl    $-1, %edx
> > > > +       je      2f
> > > > +       movl    TLSDESC_MODOFF(%eax), %eax
> > > > +       addl    %edx, %eax
> > > > +1:
> > > > +       movl    20(%esp), %ecx
> > > > +       subl    %gs:0, %eax
> > > > +       movl    24(%esp), %edx
> > > > +       addl    $32, %esp
> > > > +       cfi_adjust_cfa_offset (-32)
> > > > +       ret
> > > > +       .p2align 4,,7
> > > > +2:
> > > > +       cfi_adjust_cfa_offset (32)
> > > Extraneous AFAICT.
> >
> > This was in the existing code.  Label 2 can only be reached by
> > a jump.  When label 2 is reached, this CFA adjustment tells the
> > debugger that the CFA wasn't changed by the CFA directive above.
> >
> > >
> > > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > > +       movl    %ebx, -28(%esp)
> > > > +       movl    %esp, %ebx
> > > > +       cfi_def_cfa_register(%ebx)
> > > > +       and     $-STATE_SAVE_ALIGNMENT, %esp
> > > > +#endif
> > > > +#ifdef REGISTER_SAVE_AREA
> > > > +       subl    $REGISTER_SAVE_AREA, %esp
> > > > +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > > +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> > > > +# endif
> > > > +#else
> > > > +       # Allocate stack space of the required size to save the state.
> > > > +       LOAD_PIC_REG (cx)
> > > > +       subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> > > > +#endif
> > > > +#ifdef USE_FNSAVE
> > > > +       fnsave  (%esp)
> > > > +#elif defined USE_FXSAVE
> > > > +       fxsave  (%esp)
> > > > +#else
> > > > +       # Save the argument for ___tls_get_addr in EAX.
> > > > +       movl    %eax, %ecx
> > > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > > +       xorl    %edx, %edx
> > > > +       # Clear the XSAVE Header.
> > > > +# ifdef USE_XSAVE
> > > > +       movl    %edx, (512)(%esp)
> > > > +       movl    %edx, (512 + 4 * 1)(%esp)
> > > > +       movl    %edx, (512 + 4 * 2)(%esp)
> > > > +       movl    %edx, (512 + 4 * 3)(%esp)
> > > > +# endif
> > > > +       movl    %edx, (512 + 4 * 4)(%esp)
> > > > +       movl    %edx, (512 + 4 * 5)(%esp)
> > > > +       movl    %edx, (512 + 4 * 6)(%esp)
> > > > +       movl    %edx, (512 + 4 * 7)(%esp)
> > > > +       movl    %edx, (512 + 4 * 8)(%esp)
> > > > +       movl    %edx, (512 + 4 * 9)(%esp)
> > > > +       movl    %edx, (512 + 4 * 10)(%esp)
> > > > +       movl    %edx, (512 + 4 * 11)(%esp)
> > > > +       movl    %edx, (512 + 4 * 12)(%esp)
> > > > +       movl    %edx, (512 + 4 * 13)(%esp)
> > > > +       movl    %edx, (512 + 4 * 14)(%esp)
> > > > +       movl    %edx, (512 + 4 * 15)(%esp)
> > > > +# ifdef USE_XSAVE
> > > > +       xsave   (%esp)
> > > > +# else
> > > > +       xsavec  (%esp)
> > > > +# endif
> > > > +       # Restore the argument for ___tls_get_addr in EAX.
> > > > +       movl    %ecx, %eax
> > > > +#endif
> > > > +       call    HIDDEN_JUMPTARGET (___tls_get_addr)
> > > > +       # Get register content back.
> > > > +#ifdef USE_FNSAVE
> > > > +       frstor  (%esp)
> > > > +#elif defined USE_FXSAVE
> > > > +       fxrstor (%esp)
> > > > +#else
> > > > +       /* Save and restore ___tls_get_addr return value stored in EAX.  */
> > > > +       movl    %eax, %ecx
> > > > +       movl    $TLSDESC_CALL_STATE_SAVE_MASK, %eax
> > > > +       xorl    %edx, %edx
> > > > +       xrstor  (%esp)
> > > > +       movl    %ecx, %eax
> > > > +#endif
> > > > +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> > > > +       mov     %ebx, %esp
> > > > +       cfi_def_cfa_register(%esp)
> > > > +       movl    -28(%esp), %ebx
> > > > +       cfi_restore(%ebx)
> > > > +#else
> > > > +       addl    $REGISTER_SAVE_AREA, %esp
> > > > +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> > > The use of `REGISTER_SAVE_AREA` above is guarded by an
> > > `#ifdef REGISTER_SAVE_AREA`
> > > and uses
> > > `_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)`
> > > otherwise.
> > > Would expect same here?
> >
> > REGISTER_SAVE_AREA is only used by fnsave and fxsave which
> > expect the fixed area.
> >
> > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip)
> > is used by xsave and xsavec whose saved area size depends on
> > the enabled features.
> >
> > These are two different things.
> 
> My point is that we set up the stack above with an #ifdef, i.e.
> ```
> #ifdef REGISTER_SAVE_AREA
>        subl    $REGISTER_SAVE_AREA, %esp
> #else
>        subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx),
> %esp
> #endif
> ```
> Shouldn't you have the same #ifdef for restoring?

The actual code is

#ifdef REGISTER_SAVE_AREA
        subl    $REGISTER_SAVE_AREA, %esp
# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
        cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
# endif
#else
        # Allocate stack space of the required size to save the state.
        LOAD_PIC_REG (cx)
        subl    RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
#endif

I am not sure how your suggestion should work.

H.J.
  
Adhemerval Zanella Netto Feb. 15, 2024, 11:05 p.m. UTC | #5
On 13/02/24 01:15, H.J. Lu wrote:
> Compiler generates the following instruction sequence for GNU2 dynamic
> TLS access:
> 
> 	leaq	tls_var@TLSDESC(%rip), %rax
> 	call	*tls_var@TLSCALL(%rax)
> 
> or
> 
> 	leal	tls_var@TLSDESC(%ebx), %eax
> 	call	*tls_var@TLSCALL(%eax)
> 
> CALL instruction is transparent to compiler which assumes all registers,
> except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> path.  __tls_get_addr is a normal function which doesn't preserve any
> caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> caller-saved registers, but didn't preserve any other caller-saved
> registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> fixes BZ #31372.
> 
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
>  elf/Makefile                                 |  19 ++
>  elf/malloc-for-test.c                        |  32 ++++
>  elf/malloc-for-test.map                      |   6 +
>  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
>  elf/tst-gnu2-tls2.h                          |  26 +++
>  elf/tst-gnu2-tls2mod0.c                      |  28 +++
>  elf/tst-gnu2-tls2mod1.c                      |  28 +++
>  elf/tst-gnu2-tls2mod2.c                      |  28 +++
>  sysdeps/i386/dl-machine.h                    |   2 +-
>  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
>  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
>  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
>  sysdeps/x86/Makefile                         |   7 +-
>  sysdeps/x86/cpu-features.c                   |  56 +++++-
>  sysdeps/x86/dl-procinfo.c                    |  16 ++
>  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
>  sysdeps/x86/malloc-for-test.c                |  33 ++++
>  sysdeps/x86/sysdep.h                         |   6 +
>  sysdeps/x86_64/Makefile                      |   2 +-
>  sysdeps/x86_64/dl-machine.h                  |  19 +-
>  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
>  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
>  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
>  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
>  sysdeps/x86_64/dl-trampoline.S               |  20 +-
>  sysdeps/x86_64/dl-trampoline.h               |  34 +---
>  27 files changed, 930 insertions(+), 213 deletions(-)
>  create mode 100644 elf/malloc-for-test.c
>  create mode 100644 elf/malloc-for-test.map
>  create mode 100644 elf/tst-gnu2-tls2.c
>  create mode 100644 elf/tst-gnu2-tls2.h
>  create mode 100644 elf/tst-gnu2-tls2mod0.c
>  create mode 100644 elf/tst-gnu2-tls2mod1.c
>  create mode 100644 elf/tst-gnu2-tls2mod2.c
>  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
>  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
>  create mode 100644 sysdeps/x86/malloc-for-test.c
>  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> 
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..e0665d2007 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -424,6 +424,7 @@ tests += \
>    tst-glibc-hwcaps-prepend \
>    tst-global1 \
>    tst-global2 \
> +  tst-gnu2-tls2 \
>    tst-initfinilazyfail \
>    tst-initorder \
>    tst-initorder2 \
> @@ -699,6 +700,7 @@ modules-names += \
>    libtracemod5-1 \
>    ltglobmod1 \
>    ltglobmod2 \
> +  malloc-for-test \
>    neededobj1 \
>    neededobj2 \
>    neededobj3 \
> @@ -846,6 +848,9 @@ modules-names += \
>    tst-filterobj-flt \
>    tst-finilazyfailmod \
>    tst-globalmod2 \
> +  tst-gnu2-tls2mod0 \
> +  tst-gnu2-tls2mod1 \
> +  tst-gnu2-tls2mod2 \
>    tst-initlazyfailmod \
>    tst-initorder2a \
>    tst-initorder2b \
> @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
>    $(objpfx)tst-tlsgap-mod0.so \
>    $(objpfx)tst-tlsgap-mod1.so \
>    $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: \
> +  $(shared-thread-library) \
> +  $(objpfx)malloc-for-test.so
> +$(objpfx)tst-gnu2-tls2.out: \
> +  $(objpfx)tst-gnu2-tls2mod0.so \
> +  $(objpfx)tst-gnu2-tls2mod1.so \
> +  $(objpfx)tst-gnu2-tls2mod2.so
> +
> +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> +
>  ifeq (yes,$(have-mtls-dialect-gnu2))
>  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
>  endif
> diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> new file mode 100644
> index 0000000000..1bec69eda7
> --- /dev/null
> +++ b/elf/malloc-for-test.c
> @@ -0,0 +1,32 @@
> +/* A malloc for intercept test.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdlib.h>
> +
> +extern void * __libc_malloc (size_t);
> +
> +#ifndef PREPARE_MALLOC
> +# define PREPARE_MALLOC()
> +#endif
> +
> +void *
> +malloc (size_t n)
> +{
> +  PREPARE_MALLOC ();

It is not clear to me how exactly this adds proper coverage without
actually setting the affected registers *before* the TLS variable access
and later checking that their values have not changed.  In fact, on x86_64
it passes on my system even though the test does not actually work as
expected (see below).

And I think we also need to add arch-specific rules to build the test
with only the base ABI, and add extra macros to clobber and check
the expected registers that _dl_tlsdesc_dynamic should save/restore.

It would be slightly trickier on ABIs that already have a large set
of registers (like x86_64-v1 and armv8-a).

> +  return __libc_malloc (n);
> +}
> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> new file mode 100644
> index 0000000000..8437cf4346
> --- /dev/null
> +++ b/elf/malloc-for-test.map
> @@ -0,0 +1,6 @@
> +GLIBC_2.0 {

You need to use the correct version to override the malloc:

$ gdb --args tst-gnu2-tls2 --direct
[...]
(gdb) b apply_tls
(gdb) r
Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
25 {
(gdb) b malloc
Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
(gdb) c
Continuing.

Thread 2 "tst-gnu2-tls2" hit Breakpoint 2, __GI___libc_malloc (bytes=3200) at malloc.c:3294
3294 {
(gdb) bt
#0 __GI___libc_malloc (bytes=3200) at malloc.c:3294
#1 0x00007ffff7fda3de in malloc (size=<optimized out>) at ../include/rtld-malloc.h:56
#2 allocate_dtv_entry (size=<optimized out>, alignment=16) at ../elf/dl-tls.c:679
#3 allocate_and_init (map=0x7ffff0000bd0) at ../elf/dl-tls.c:704
#4 tls_get_addr_tail (ti=0x7ffff0001240, dtv=0x55555555e340, the_map=0x7ffff0000bd0) at ../elf/dl-tls.c:904
#5 0x00007ffff7fdda2e in _dl_tlsdesc_dynamic_xsavec () at ../sysdeps/x86_64/dl-tlsdesc-dynamic.h:135
#6 0x00007ffff7fb0155 in apply_tls (p=0xc80) at tst-gnu2-tls2mod1.c:27
#7 0x0000555555556965 in access_mod (i=1, sym=0x555555559022 "apply_tls") at tst-gnu2-tls2.c:58
#8 start (arg=0x0) at tst-gnu2-tls2.c:73
#9 0x00007ffff7c96a82 in start_thread (arg=<optimized out>) at pthread_create.c:447
#10 0x00007ffff7d1b13c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78

By using GLIBC_2.2.5 instead of GLIBC_2.0, now I see:

(gdb) bt
#0 malloc (n=3200) at ../elf/malloc-for-test.c:29
#1 0x00007ffff7fda3de in malloc (size=<optimized out>) at ../include/rtld-malloc.h:56
#2 allocate_dtv_entry (size=<optimized out>, alignment=16) at ../elf/dl-tls.c:679
#3 allocate_and_init (map=0x7ffff0000bd0) at ../elf/dl-tls.c:704
#4 tls_get_addr_tail (ti=0x7ffff0001240, dtv=0x55555555e340, the_map=0x7ffff0000bd0) at ../elf/dl-tls.c:904
#5 0x00007ffff7fdda2e in _dl_tlsdesc_dynamic_xsavec () at ../sysdeps/x86_64/dl-tlsdesc-dynamic.h:135
#6 0x00007ffff7fb0155 in apply_tls (p=0xc80) at tst-gnu2-tls2mod1.c:27
#7 0x0000555555556965 in access_mod (i=1, sym=0x555555559022 "apply_tls") at tst-gnu2-tls2.c:58
#8 start (arg=0x0) at tst-gnu2-tls2.c:73
#9 0x00007ffff7c96a82 in start_thread (arg=<optimized out>) at pthread_create.c:447
#10 0x00007ffff7d1b13c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78

So you will need either to use the correct symbol version or to
parameterize the map file for each ABI (only a handful of them actually
support TLS descriptors).

This made me realize that a preloaded malloc has not interposed the
ld.so implementation since 3a0ecccb599a6b1ad4b149dc569c0080e92d057b unless
the malloc replacement also exports malloc with the proper symbol
version.

I think it is an unexpected change from BZ#25486, but at the same
time I don't think it is a bad one. We will need to add all the
machinery required to save/restore the caller-saved registers for
each ABI that supports TLS descriptors, because even glibc malloc
might call internal functions that might use such ABI extensions
(for instance the mem* and str* functions).

All this made me realize that the TLS descriptor slow path
is *far* from maintainable, as we discussed on the weekly call.
And I think we *should* move away from it. Some issues:

 * To properly support _dl_tlsdesc_dynamic on ABIs with vector
   extensions, we would either need to pessimize code generation
   for TLS access (so the compiler would add all the required
   instructions to save/restore the caller-saved registers) or
   move the complexity to libc.

 * The latter would require libc to provide a quite complex
   _dl_tlsdesc_dynamic, which would need either to probe hardware
   support to provide the multiple code paths or to add the support
   through IFUNC.

 * ARM also has the issue, and I think it has not been noticed there
   because gnu2 is not the default TLS ABI and gcc likely won't change
   that in the near future. To properly fix it, ARM would require
   something like what you are doing for x86 to support the multiple
   vector extensions (VFP, VFP3, NEON).

 * Loongson is finishing its TLSDESC ABI support in gcc/binutils, and
   most likely will require quite similar support to properly handle
   LSX and LASX.

 * I think RISC-V would also have a similar issue for its vector ABI.

So I think we really should reevaluate the BZ#16133 fix that we reverted
in 2.20 [1] [2]. If I recall correctly (I need to go through my
notes about this issue again), two main issues triggered the revert:

 1. It broke LSAN;
 2. Lazy allocation is an explicit feature [3].

For 1., I think it should be doable to fix in the sanitizer, either by adding
more hacks to get the correct TLS size or by providing a proper ABI.

However, for 2., I think it is past time that we accept that lazy allocation
was a nice idea, but it adds a *lot* of maintenance burden that
is not paying off.

[1] https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=1f33d36a8a9e78c81bed59b47f260723f56bb7e6
[2] https://sourceware.org/legacy-ml/libc-alpha/2013-09/msg00721.html
[3] https://sourceware.org/legacy-ml/libc-alpha/2014-01/msg00287.html

> +  global:
> +    malloc;
> +  local:
> +    *;
> +};
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..34427f9a0f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,97 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include <support/test-driver.h>
> +#include "tst-gnu2-tls2.h"
> +
> +#ifndef IS_SUPPORTED
> +# define IS_SUPPORTED() true
> +#endif
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> +  printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> +  xdlclose (mod[i]);
> +  mod[i] = NULL;
> +  printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> +  struct tls var = { -1, -1, -1, -1 };
> +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> +  struct tls *p = f (&var);
> +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> +  ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> +  /* The DTV generation is at the last dlopen of mod0 and the
> +     entry for mod1 is NULL.  */
> +
> +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> +
> +  /* Force the slow path in GNU2 TLS descriptor call.  */
> +  access_mod (1, "apply_tls");
> +
> +  return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  if (!IS_SUPPORTED ())
> +    return EXIT_UNSUPPORTED;
> +
> +  open_mod (0);
> +  open_mod (1);
> +  open_mod (2);
> +  close_mod (0);
> +  close_mod (1); /* Create modid gap at mod1.  */
> +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> +
> +  /* Create a thread where DTV of mod1 is NULL.  */
> +  pthread_t t = xpthread_create (NULL, start, NULL);
> +  xpthread_join (t);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> +  int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var0 = *p;
> +  return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var1[1] = *p;
> +  return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var2 = *p;
> +  return &tls_var2;
> +}
> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> index fc1ef96587..50d74fe6e9 100644
> --- a/sysdeps/i386/dl-machine.h
> +++ b/sysdeps/i386/dl-machine.h
> @@ -347,7 +347,7 @@ and creates an unsatisfiable circular dependency.\n",
>  		  {
>  		    td->arg = _dl_make_tlsdesc_dynamic
>  		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
> -		    td->entry = _dl_tlsdesc_dynamic;
> +		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>  		  }
>  		else
>  #  endif
> diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..675e56d32d
> --- /dev/null
> +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,187 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#undef REGISTER_SAVE_AREA
> +
> +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# ifdef USE_FNSAVE
> +#  error USE_FNSAVE shouldn't be defined
> +# endif
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save all registers.  */
> +#  define REGISTER_SAVE_AREA	512
> +# endif
> +#else
> +# ifdef USE_FNSAVE
> +/* Use fnsave to save x87 FPU stack registers.  */
> +#  define REGISTER_SAVE_AREA	108
> +# else
> +#  ifndef USE_FXSAVE
> +#   error USE_FXSAVE must be defined
> +#  endif
> +/* Use fxsave to save all registers.  Add 12 bytes to align the stack
> +   to 16 bytes.  */
> +#  define REGISTER_SAVE_AREA	(512 + 12)
> +# endif
> +#endif
> +
> +	.hidden _dl_tlsdesc_dynamic
> +	.global	_dl_tlsdesc_dynamic
> +	.type	_dl_tlsdesc_dynamic,@function
> +
> +     /* This function is used for symbols that need dynamic TLS.
> +
> +	%eax points to the TLS descriptor, such that 0(%eax) points to
> +	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> +	tlsdesc_dynamic_arg object.  It must return in %eax the offset
> +	between the thread pointer and the object denoted by the
> +	argument, without clobbering any registers.
> +
> +	The assembly code that follows is a rendition of the following
> +	C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +__attribute__ ((__regparm__ (1)))
> +_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +			&& (dtv[td->tlsinfo.ti_module].pointer.val
> +			    != TLS_DTV_UNALLOCATED),
> +			1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +	cfi_startproc
> +	.align 16
> +_dl_tlsdesc_dynamic:
> +	/* Like all TLS resolvers, preserve call-clobbered registers.
> +	   We need two scratch regs anyway.  */
> +	subl	$32, %esp
> +	cfi_adjust_cfa_offset (32)
> +	movl	%ecx, 20(%esp)
> +	movl	%edx, 24(%esp)
> +	movl	TLSDESC_ARG(%eax), %eax
> +	movl	%gs:DTV_OFFSET, %edx
> +	movl	TLSDESC_GEN_COUNT(%eax), %ecx
> +	cmpl	(%edx), %ecx
> +	ja	2f
> +	movl	TLSDESC_MODID(%eax), %ecx
> +	movl	(%edx,%ecx,8), %edx
> +	cmpl	$-1, %edx
> +	je	2f
> +	movl	TLSDESC_MODOFF(%eax), %eax
> +	addl	%edx, %eax
> +1:
> +	movl	20(%esp), %ecx
> +	subl	%gs:0, %eax
> +	movl	24(%esp), %edx
> +	addl	$32, %esp
> +	cfi_adjust_cfa_offset (-32)
> +	ret
> +	.p2align 4,,7
> +2:
> +	cfi_adjust_cfa_offset (32)
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	movl	%ebx, -28(%esp)
> +	movl	%esp, %ebx
> +	cfi_def_cfa_register(%ebx)
> +	and	$-STATE_SAVE_ALIGNMENT, %esp
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +	subl	$REGISTER_SAVE_AREA, %esp
> +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +	# Allocate stack space of the required size to save the state.
> +	LOAD_PIC_REG (cx)
> +	subl	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
> +#endif
> +#ifdef USE_FNSAVE
> +	fnsave	(%esp)
> +#elif defined USE_FXSAVE
> +	fxsave	(%esp)
> +#else
> +	# Save the argument for ___tls_get_addr in EAX.
> +	movl	%eax, %ecx
> +	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +	xorl	%edx, %edx
> +	# Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +	movl	%edx, (512)(%esp)
> +	movl	%edx, (512 + 4 * 1)(%esp)
> +	movl	%edx, (512 + 4 * 2)(%esp)
> +	movl	%edx, (512 + 4 * 3)(%esp)
> +# endif
> +	movl	%edx, (512 + 4 * 4)(%esp)
> +	movl	%edx, (512 + 4 * 5)(%esp)
> +	movl	%edx, (512 + 4 * 6)(%esp)
> +	movl	%edx, (512 + 4 * 7)(%esp)
> +	movl	%edx, (512 + 4 * 8)(%esp)
> +	movl	%edx, (512 + 4 * 9)(%esp)
> +	movl	%edx, (512 + 4 * 10)(%esp)
> +	movl	%edx, (512 + 4 * 11)(%esp)
> +	movl	%edx, (512 + 4 * 12)(%esp)
> +	movl	%edx, (512 + 4 * 13)(%esp)
> +	movl	%edx, (512 + 4 * 14)(%esp)
> +	movl	%edx, (512 + 4 * 15)(%esp)
> +# ifdef USE_XSAVE
> +	xsave	(%esp)
> +# else
> +	xsavec	(%esp)
> +# endif
> +	# Restore the argument for ___tls_get_addr in EAX.
> +	movl	%ecx, %eax
> +#endif
> +	call	HIDDEN_JUMPTARGET (___tls_get_addr)
> +	# Get register content back.
> +#ifdef USE_FNSAVE
> +	frstor	(%esp)
> +#elif defined USE_FXSAVE
> +	fxrstor	(%esp)
> +#else
> +	/* Save and restore ___tls_get_addr return value stored in EAX.  */
> +	movl	%eax, %ecx
> +	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +	xorl	%edx, %edx
> +	xrstor	(%esp)
> +	movl	%ecx, %eax
> +#endif
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	mov	%ebx, %esp
> +	cfi_def_cfa_register(%esp)
> +	movl	-28(%esp), %ebx
> +	cfi_restore(%ebx)
> +#else
> +	addl	$REGISTER_SAVE_AREA, %esp
> +	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +	jmp	1b
> +	cfi_endproc
> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
> index 90d93caa0c..f002feee56 100644
> --- a/sysdeps/i386/dl-tlsdesc.S
> +++ b/sysdeps/i386/dl-tlsdesc.S
> @@ -18,8 +18,27 @@
>  
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
>  
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 4-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 4
> +#endif
> +
> +/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
> +   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
> +
>  	.text
>  
>       /* This function is used to compute the TP offset for symbols in
> @@ -65,69 +84,35 @@ _dl_tlsdesc_undefweak:
>  	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>  
>  #ifdef SHARED
> -	.hidden _dl_tlsdesc_dynamic
> -	.global	_dl_tlsdesc_dynamic
> -	.type	_dl_tlsdesc_dynamic,@function
> -
> -     /* This function is used for symbols that need dynamic TLS.
> -
> -	%eax points to the TLS descriptor, such that 0(%eax) points to
> -	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
> -	tlsdesc_dynamic_arg object.  It must return in %eax the offset
> -	between the thread pointer and the object denoted by the
> -	argument, without clobbering any registers.
> -
> -	The assembly code that follows is a rendition of the following
> -	C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -__attribute__ ((__regparm__ (1)))
> -_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -			&& (dtv[td->tlsinfo.ti_module].pointer.val
> -			    != TLS_DTV_UNALLOCATED),
> -			1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -	cfi_startproc
> -	.align 16
> -_dl_tlsdesc_dynamic:
> -	/* Like all TLS resolvers, preserve call-clobbered registers.
> -	   We need two scratch regs anyway.  */
> -	subl	$28, %esp
> -	cfi_adjust_cfa_offset (28)
> -	movl	%ecx, 20(%esp)
> -	movl	%edx, 24(%esp)
> -	movl	TLSDESC_ARG(%eax), %eax
> -	movl	%gs:DTV_OFFSET, %edx
> -	movl	TLSDESC_GEN_COUNT(%eax), %ecx
> -	cmpl	(%edx), %ecx
> -	ja	.Lslow
> -	movl	TLSDESC_MODID(%eax), %ecx
> -	movl	(%edx,%ecx,8), %edx
> -	cmpl	$-1, %edx
> -	je	.Lslow
> -	movl	TLSDESC_MODOFF(%eax), %eax
> -	addl	%edx, %eax
> -.Lret:
> -	movl	20(%esp), %ecx
> -	subl	%gs:0, %eax
> -	movl	24(%esp), %edx
> -	addl	$28, %esp
> -	cfi_adjust_cfa_offset (-28)
> -	ret
> -	.p2align 4,,7
> -.Lslow:
> -	cfi_adjust_cfa_offset (28)
> -	call	HIDDEN_JUMPTARGET (___tls_get_addr)
> -	jmp	.Lret
> -	cfi_endproc
> -	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FNSAVE
> +# define MINIMUM_ALIGNMENT	4
> +# define STATE_SAVE_ALIGNMENT	4
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fnsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef MINIMUM_ALIGNMENT
> +# undef USE_FNSAVE
> +
> +# define MINIMUM_ALIGNMENT	16
> +
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT	16
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT	64
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT	64
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..92e7fbff89
> --- /dev/null
> +++ b/sysdeps/i386/tst-gnu2-tls2.c
> @@ -0,0 +1,5 @@
> +#include <sys/platform/x86.h>
> +
> +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
> +
> +#include <elf/tst-gnu2-tls2.c>
> diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> index 4d50b327b5..bc357f0099 100644
> --- a/sysdeps/x86/Makefile
> +++ b/sysdeps/x86/Makefile
> @@ -1,5 +1,5 @@
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += cpu-features-offsets.sym
> +gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
>  endif
>  
>  ifeq ($(subdir),elf)
> @@ -86,6 +86,11 @@ endif
>  tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
>  tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
>  tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
> +
> +CFLAGS-malloc-for-test.c += -msse2
> +CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
> +CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
>  endif
>  
>  ifeq ($(subdir),math)
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..835113b42f 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -27,8 +27,13 @@
>  extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
>    attribute_hidden;
>  
> -#if defined SHARED && defined __x86_64__
> -# include <dl-plt-rewrite.h>
> +#if defined SHARED
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +
> +# ifdef __x86_64__
> +#  include <dl-plt-rewrite.h>
>  
>  static void
>  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> @@ -47,6 +52,15 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
>  		 : plt_rewrite_jmp);
>      }
>  }
> +# else
> +extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
> +# endif
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
>  #endif
>  
>  #ifdef __LP64__
> @@ -1130,6 +1144,44 @@ no_cpuid:
>  	       TUNABLE_CALLBACK (set_x86_shstk));
>  #endif
>  
> +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> +	{
> +#ifdef __x86_64__
> +	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +#endif
> +#ifdef SHARED
> +	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +#endif
> +	}
> +      else
> +	{
> +#ifdef __x86_64__
> +	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +#endif
> +#ifdef SHARED
> +	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +#endif
> +	}
> +    }
> +  else
> +    {
> +#ifdef __x86_64__
> +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> +      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> +#else
> +# ifdef SHARED
> +      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
> +	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +      else
> +	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
> +# endif
> +#endif
> +    }
> +
>  #ifdef SHARED
>  # ifdef __x86_64__
>    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
> index ee957b4d70..5920d4b320 100644
> --- a/sysdeps/x86/dl-procinfo.c
> +++ b/sysdeps/x86/dl-procinfo.c
> @@ -86,3 +86,19 @@ PROCINFO_CLASS const char _dl_x86_platforms[4][9]
>  #else
>  ,
>  #endif
> +
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> +  ._dl_x86_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
> similarity index 89%
> rename from sysdeps/x86_64/features-offsets.sym
> rename to sysdeps/x86/features-offsets.sym
> index 9e4be3393a..77e990c705 100644
> --- a/sysdeps/x86_64/features-offsets.sym
> +++ b/sysdeps/x86/features-offsets.sym
> @@ -3,4 +3,6 @@
>  #include <ldsodefs.h>
>  
>  RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
> +#ifdef __x86_64__
>  RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
> +#endif
> diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
> new file mode 100644
> index 0000000000..02f4dead5d
> --- /dev/null
> +++ b/sysdeps/x86/malloc-for-test.c
> @@ -0,0 +1,33 @@
> +/*  A malloc for intercept test.  x86 version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +/* Clear XMM0...XMM7  */
> +#define PREPARE_MALLOC()				\
> +{							\
> +  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" );	\
> +  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" );	\
> +  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" );	\
> +  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" );	\
> +  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" );	\
> +  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" );	\
> +  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" );	\
> +  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" );	\
> +}
> +
> +#include <elf/malloc-for-test.c>
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index 5c1f0bcf53..792e2ea5ed 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -68,6 +68,12 @@
>     | (1 << X86_XSTATE_ZMM_H_ID))
>  #endif
>  
> +/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
> +   Compiler assumes that all registers, including x87 FPU stack registers,
> +   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
> +#define TLSDESC_CALL_STATE_SAVE_MASK	\
> +  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
> +
>  /* Constants for bits in __x86_string_control:  */
>  
>  /* Avoid short distance REP MOVSB.  */
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 90f4ecfd26..e8babc9a4e 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -10,7 +10,7 @@ LDFLAGS-rtld += -Wl,-z,nomark-plt
>  endif
>  
>  ifeq ($(subdir),csu)
> -gen-as-const-headers += features-offsets.sym link-defines.sym
> +gen-as-const-headers += link-defines.sym
>  endif
>  
>  ifeq ($(subdir),gmon)
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..ff5d45f7cb 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>  			   int lazy, int profile)
>  {
>    Elf64_Addr *got;
> -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>  
> -      const struct cpu_features* cpu_features = __get_cpu_features ();
> -
>  #ifdef SHARED
>        /* The got[2] entry contains the address of a function which gets
>  	 called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>  	 end in this function.  */
>        if (__glibc_unlikely (profile))
>  	{
> +	  const struct cpu_features* cpu_features = __get_cpu_features ();
>  	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>  	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
>  	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>  	  /* This function will get called to fix up the GOT entry
>  	     indicated by the offset on the stack, and then jump to
>  	     the resolved address.  */
> -	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> -	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> -	    *(ElfW(Addr) *) (got + 2)
> -	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> -		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> -		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> -	  else
> -	    *(ElfW(Addr) *) (got + 2)
> -	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> +	  *(ElfW(Addr) *) (got + 2)
> +	    = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
>  	}
>      }
>  
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
>  		  {
>  		    td->arg = _dl_make_tlsdesc_dynamic
>  		      (sym_map, sym->st_value + reloc->r_addend);
> -		    td->entry = _dl_tlsdesc_dynamic;
> +		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
>  		  }
>  		else
>  #  endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..06637a8154 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,21 @@
>  
>  #include <sysdeps/x86/dl-procinfo.c>
>  
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> +  ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
>  #undef PROCINFO_DECL
>  #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..ce0bc094ec
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,166 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef SECTION
> +# define SECTION(p)	p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> +	.section SECTION(.text),"ax",@progbits
> +
> +	.hidden _dl_tlsdesc_dynamic
> +	.global	_dl_tlsdesc_dynamic
> +	.type	_dl_tlsdesc_dynamic,@function
> +
> +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> +	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> +	tlsdesc_dynamic_arg object.  It must return in %rax the offset
> +	between the thread pointer and the object denoted by the
> +	argument, without clobbering any registers.
> +
> +	The assembly code that follows is a rendition of the following
> +	C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +			&& (dtv[td->tlsinfo.ti_module].pointer.val
> +			    != TLS_DTV_UNALLOCATED),
> +			1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +	cfi_startproc
> +	.align 16
> +_dl_tlsdesc_dynamic:
> +	_CET_ENDBR
> +	/* Preserve call-clobbered registers that we modify.
> +	   We need two scratch regs anyway.  */
> +	movq	%rsi, -16(%rsp)
> +	mov	%fs:DTV_OFFSET, %RSI_LP
> +	movq	%rdi, -8(%rsp)
> +	movq	TLSDESC_ARG(%rax), %rdi
> +	movq	(%rsi), %rax
> +	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
> +	ja	2f
> +	movq	TLSDESC_MODID(%rdi), %rax
> +	salq	$4, %rax
> +	movq	(%rax,%rsi), %rax
> +	cmpq	$-1, %rax
> +	je	2f
> +	addq	TLSDESC_MODOFF(%rdi), %rax
> +1:
> +	movq	-16(%rsp), %rsi
> +	sub	%fs:0, %RAX_LP
> +	movq	-8(%rsp), %rdi
> +	ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	movq	%rbx, -24(%rsp)
> +	mov	%RSP_LP, %RBX_LP
> +	cfi_def_cfa_register(%rbx)
> +	and	$-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	# STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> +	# need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> +	# RBX above.
> +	sub	$(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> +	sub	$REGISTER_SAVE_AREA, %RSP_LP
> +	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +	# Allocate stack space of the required size to save the state.
> +	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +#endif
> +	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> +	   r10 and r11.  */
> +	movq	%rcx, REGISTER_SAVE_RCX(%rsp)
> +	movq	%rdx, REGISTER_SAVE_RDX(%rsp)
> +	movq	%r8, REGISTER_SAVE_R8(%rsp)
> +	movq	%r9, REGISTER_SAVE_R9(%rsp)
> +	movq	%r10, REGISTER_SAVE_R10(%rsp)
> +	movq	%r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> +	fxsave	STATE_SAVE_OFFSET(%rsp)
> +#else
> +	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +	xorl	%edx, %edx
> +	# Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> +	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> +# ifdef USE_XSAVE
> +	xsave	STATE_SAVE_OFFSET(%rsp)
> +# else
> +	xsavec	STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> +	/* %rdi already points to the tlsinfo data structure.  */
> +	call	HIDDEN_JUMPTARGET (__tls_get_addr)
> +	# Get register content back.
> +#ifdef USE_FXSAVE
> +	fxrstor	STATE_SAVE_OFFSET(%rsp)
> +#else
> +	/* Save and restore __tls_get_addr return value stored in RAX.  */
> +	mov	%RAX_LP, %RCX_LP
> +	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
> +	xorl	%edx, %edx
> +	xrstor	STATE_SAVE_OFFSET(%rsp)
> +	mov	%RCX_LP, %RAX_LP
> +#endif
> +	movq	REGISTER_SAVE_R11(%rsp), %r11
> +	movq	REGISTER_SAVE_R10(%rsp), %r10
> +	movq	REGISTER_SAVE_R9(%rsp), %r9
> +	movq	REGISTER_SAVE_R8(%rsp), %r8
> +	movq	REGISTER_SAVE_RDX(%rsp), %rdx
> +	movq	REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +	mov	%RBX_LP, %RSP_LP
> +	cfi_def_cfa_register(%rsp)
> +	movq	-24(%rsp), %rbx
> +	cfi_restore(%rbx)
> +#else
> +	add	$REGISTER_SAVE_AREA, %RSP_LP
> +	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +	jmp	1b
> +	cfi_endproc
> +	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>  
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> +   passing when calling _dl_tlsdesc_dynamic.  */
> +#define REGISTER_SAVE_RCX	0
> +#define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8	(REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10	(REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11	(REGISTER_SAVE_R10 + 8)
>  
>  	.text
>  
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
>  	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>  
>  #ifdef SHARED
> -	.hidden _dl_tlsdesc_dynamic
> -	.global	_dl_tlsdesc_dynamic
> -	.type	_dl_tlsdesc_dynamic,@function
> -
> -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> -	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> -	tlsdesc_dynamic_arg object.  It must return in %rax the offset
> -	between the thread pointer and the object denoted by the
> -	argument, without clobbering any registers.
> -
> -	The assembly code that follows is a rendition of the following
> -	C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -			&& (dtv[td->tlsinfo.ti_module].pointer.val
> -			    != TLS_DTV_UNALLOCATED),
> -			1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -	cfi_startproc
> -	.align 16
> -_dl_tlsdesc_dynamic:
> -	_CET_ENDBR
> -	/* Preserve call-clobbered registers that we modify.
> -	   We need two scratch regs anyway.  */
> -	movq	%rsi, -16(%rsp)
> -	mov	%fs:DTV_OFFSET, %RSI_LP
> -	movq	%rdi, -8(%rsp)
> -	movq	TLSDESC_ARG(%rax), %rdi
> -	movq	(%rsi), %rax
> -	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
> -	ja	.Lslow
> -	movq	TLSDESC_MODID(%rdi), %rax
> -	salq	$4, %rax
> -	movq	(%rax,%rsi), %rax
> -	cmpq	$-1, %rax
> -	je	.Lslow
> -	addq	TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> -	movq	-16(%rsp), %rsi
> -	sub	%fs:0, %RAX_LP
> -	movq	-8(%rsp), %rdi
> -	ret
> -.Lslow:
> -	/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> -	   r10 and r11.  Also, align the stack, that's off by 8 bytes.	*/
> -	subq	$72, %rsp
> -	cfi_adjust_cfa_offset (72)
> -	movq	%rdx, 8(%rsp)
> -	movq	%rcx, 16(%rsp)
> -	movq	%r8, 24(%rsp)
> -	movq	%r9, 32(%rsp)
> -	movq	%r10, 40(%rsp)
> -	movq	%r11, 48(%rsp)
> -	/* %rdi already points to the tlsinfo data structure.  */
> -	call	HIDDEN_JUMPTARGET (__tls_get_addr)
> -	movq	8(%rsp), %rdx
> -	movq	16(%rsp), %rcx
> -	movq	24(%rsp), %r8
> -	movq	32(%rsp), %r9
> -	movq	40(%rsp), %r10
> -	movq	48(%rsp), %r11
> -	addq	$72, %rsp
> -	cfi_adjust_cfa_offset (-72)
> -	jmp	.Lret
> -	cfi_endproc
> -	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT	16
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT	64
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT	64
> +# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..84eac4a8ac
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,34 @@
> +/* x86-64 PLT trampoline register save macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> +   stack to 16 bytes before calling _dl_fixup.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX.  */
> +# define LOCAL_STORAGE_AREA	8
> +# define BASE			rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers.  */
> +#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
> +#  if (REGISTER_SAVE_AREA % 16) != 0
> +#   error REGISTER_SAVE_AREA must be multiple of 16
> +#  endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +#  error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers.  */
> +# define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address:  All saved
> +   registers.  */
> +# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
> +# define BASE			rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
>  #include <features-offsets.h>
>  #include <link-defines.h>
>  #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> -   that stack will be always aligned at 16 bytes.  We use unaligned
> -   16-byte move to load and store SSE registers, which has no penalty
> -   on modern processors if stack is 16-byte aligned.  */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> -   stack to 16 bytes before calling _dl_fixup.  */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> -   || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>  
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
>  # undef LOCAL_STORAGE_AREA
>  # undef BASE
>  
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX.  */
> -#  define LOCAL_STORAGE_AREA	8
> -#  define BASE			rbx
> -#  ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers.  */
> -#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
> -#   if (REGISTER_SAVE_AREA % 16) != 0
> -#    error REGISTER_SAVE_AREA must be multiple of 16
> -#   endif
> -#  endif
> -# else
> -#  ifndef USE_FXSAVE
> -#   error USE_FXSAVE must be defined
> -#  endif
> -/* Use fxsave to save XMM registers.  */
> -#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address:  All saved
> -   registers.  */
> -#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
> -#  define BASE			rsp
> -#  if (REGISTER_SAVE_AREA % 16) != 8
> -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> -#  endif
> -# endif
> +# include "dl-trampoline-state.h"
>  
>  	.globl _dl_runtime_resolve
>  	.hidden _dl_runtime_resolve
  
H.J. Lu Feb. 15, 2024, 11:15 p.m. UTC | #6
On Thu, Feb 15, 2024 at 3:05 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/02/24 01:15, H.J. Lu wrote:
> > Compiler generates the following instruction sequence for GNU2 dynamic
> > TLS access:
> >
> >       leaq    tls_var@TLSDESC(%rip), %rax
> >       call    *tls_var@TLSCALL(%rax)
> >
> > or
> >
> >       leal    tls_var@TLSDESC(%ebx), %eax
> >       call    *tls_var@TLSCALL(%eax)
> >
> > CALL instruction is transparent to compiler which assumes all registers,
> > except for EFLAGS and RAX/EAX, are unchanged after CALL.  When
> > _dl_tlsdesc_dynamic is called, it calls __tls_get_addr on the slow
> > path.  __tls_get_addr is a normal function which doesn't preserve any
> > caller-saved registers.  _dl_tlsdesc_dynamic saved and restored integer
> > caller-saved registers, but didn't preserve any other caller-saved
> > registers.  Add _dl_tlsdesc_dynamic IFUNC functions for FNSAVE, FXSAVE,
> > XSAVE and XSAVEC to save and restore all caller-saved registers.  This
> > fixes BZ #31372.
> >
> > Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_tlsdesc_dynamic)
> > to optimize elf_machine_runtime_setup.
> > ---
> >  elf/Makefile                                 |  19 ++
> >  elf/malloc-for-test.c                        |  32 ++++
> >  elf/malloc-for-test.map                      |   6 +
> >  elf/tst-gnu2-tls2.c                          |  97 ++++++++++
> >  elf/tst-gnu2-tls2.h                          |  26 +++
> >  elf/tst-gnu2-tls2mod0.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod1.c                      |  28 +++
> >  elf/tst-gnu2-tls2mod2.c                      |  28 +++
> >  sysdeps/i386/dl-machine.h                    |   2 +-
> >  sysdeps/i386/dl-tlsdesc-dynamic.h            | 187 +++++++++++++++++++
> >  sysdeps/i386/dl-tlsdesc.S                    | 115 +++++-------
> >  sysdeps/i386/tst-gnu2-tls2.c                 |   5 +
> >  sysdeps/x86/Makefile                         |   7 +-
> >  sysdeps/x86/cpu-features.c                   |  56 +++++-
> >  sysdeps/x86/dl-procinfo.c                    |  16 ++
> >  sysdeps/{x86_64 => x86}/features-offsets.sym |   2 +
> >  sysdeps/x86/malloc-for-test.c                |  33 ++++
> >  sysdeps/x86/sysdep.h                         |   6 +
> >  sysdeps/x86_64/Makefile                      |   2 +-
> >  sysdeps/x86_64/dl-machine.h                  |  19 +-
> >  sysdeps/x86_64/dl-procinfo.c                 |  16 ++
> >  sysdeps/x86_64/dl-tlsdesc-dynamic.h          | 166 ++++++++++++++++
> >  sysdeps/x86_64/dl-tlsdesc.S                  | 108 ++++-------
> >  sysdeps/x86_64/dl-trampoline-save.h          |  34 ++++
> >  sysdeps/x86_64/dl-trampoline-state.h         |  51 +++++
> >  sysdeps/x86_64/dl-trampoline.S               |  20 +-
> >  sysdeps/x86_64/dl-trampoline.h               |  34 +---
> >  27 files changed, 930 insertions(+), 213 deletions(-)
> >  create mode 100644 elf/malloc-for-test.c
> >  create mode 100644 elf/malloc-for-test.map
> >  create mode 100644 elf/tst-gnu2-tls2.c
> >  create mode 100644 elf/tst-gnu2-tls2.h
> >  create mode 100644 elf/tst-gnu2-tls2mod0.c
> >  create mode 100644 elf/tst-gnu2-tls2mod1.c
> >  create mode 100644 elf/tst-gnu2-tls2mod2.c
> >  create mode 100644 sysdeps/i386/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/i386/tst-gnu2-tls2.c
> >  rename sysdeps/{x86_64 => x86}/features-offsets.sym (89%)
> >  create mode 100644 sysdeps/x86/malloc-for-test.c
> >  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> >  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
> >
> > diff --git a/elf/Makefile b/elf/Makefile
> > index 5d78b659ce..e0665d2007 100644
> > --- a/elf/Makefile
> > +++ b/elf/Makefile
> > @@ -424,6 +424,7 @@ tests += \
> >    tst-glibc-hwcaps-prepend \
> >    tst-global1 \
> >    tst-global2 \
> > +  tst-gnu2-tls2 \
> >    tst-initfinilazyfail \
> >    tst-initorder \
> >    tst-initorder2 \
> > @@ -699,6 +700,7 @@ modules-names += \
> >    libtracemod5-1 \
> >    ltglobmod1 \
> >    ltglobmod2 \
> > +  malloc-for-test \
> >    neededobj1 \
> >    neededobj2 \
> >    neededobj3 \
> > @@ -846,6 +848,9 @@ modules-names += \
> >    tst-filterobj-flt \
> >    tst-finilazyfailmod \
> >    tst-globalmod2 \
> > +  tst-gnu2-tls2mod0 \
> > +  tst-gnu2-tls2mod1 \
> > +  tst-gnu2-tls2mod2 \
> >    tst-initlazyfailmod \
> >    tst-initorder2a \
> >    tst-initorder2b \
> > @@ -3044,8 +3049,22 @@ $(objpfx)tst-tlsgap.out: \
> >    $(objpfx)tst-tlsgap-mod0.so \
> >    $(objpfx)tst-tlsgap-mod1.so \
> >    $(objpfx)tst-tlsgap-mod2.so
> > +
> > +$(objpfx)tst-gnu2-tls2: \
> > +  $(shared-thread-library) \
> > +  $(objpfx)malloc-for-test.so
> > +$(objpfx)tst-gnu2-tls2.out: \
> > +  $(objpfx)tst-gnu2-tls2mod0.so \
> > +  $(objpfx)tst-gnu2-tls2mod1.so \
> > +  $(objpfx)tst-gnu2-tls2mod2.so
> > +
> > +LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
> > +
> >  ifeq (yes,$(have-mtls-dialect-gnu2))
> >  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> >  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> > +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> >  endif
> > diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
> > new file mode 100644
> > index 0000000000..1bec69eda7
> > --- /dev/null
> > +++ b/elf/malloc-for-test.c
> > @@ -0,0 +1,32 @@
> > +/* A malloc for intercept test.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include <stdlib.h>
> > +
> > +extern void * __libc_malloc (size_t);
> > +
> > +#ifndef PREPARE_MALLOC
> > +# define PREPARE_MALLOC()
> > +#endif
> > +
> > +void *
> > +malloc (size_t n)
> > +{
> > +  PREPARE_MALLOC ();
>
> It is not clear to me how exactly this adds proper coverage without
> actually setting the affected registers *before* the TLS variable access
> and later checking that their values have not changed. In fact, on x86_64
> it is passing on my system even without the test actually working as
> expected (see below).

It depends on the compiler version.  Newer GCC will generate vector
load/store on x86-64 to copy a structure.

> And I think we also need to add arch-specific rules to build the test
> with only the base ABI, and add extra macros to clobber and check
> the expected registers that _dl_tlsdesc_dynamic should save/restore.
>
> It would be slightly more tricky on ABIs that already have a large set
> of registers (like x86_64-v1 and armv8-a).
>
> > +  return __libc_malloc (n);
> > +}
> > diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> > new file mode 100644
> > index 0000000000..8437cf4346
> > --- /dev/null
> > +++ b/elf/malloc-for-test.map
> > @@ -0,0 +1,6 @@
> > +GLIBC_2.0 {
>
> You need to use the correct version to override the malloc:
>
> $ gdb --args tst-gnu2-tls2 --direct
> [...]
> (gdb) b apply_tls
> (gdb) r
> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> 25 {
> (gdb) b malloc
> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> (gdb) c
> Continuing.
>
> Thread 2 "test-gnu2-tls2" hit Breakpoint 2, __GI___libc_malloc (bytes=3200) at malloc.c:3294
> 3294 {
> (gdb) bt
> #0 __GI___libc_malloc (bytes=3200) at malloc.c:3294
> #1 0x00007ffff7fda3de in malloc (size=<optimized out>) at ../include/rtld-malloc.h:56
> #2 allocate_dtv_entry (size=<optimized out>, alignment=16) at ../elf/dl-tls.c:679
> #3 allocate_and_init (map=0x7ffff0000bd0) at ../elf/dl-tls.c:704
> #4 tls_get_addr_tail (ti=0x7ffff0001240, dtv=0x55555555e340, the_map=0x7ffff0000bd0) at ../elf/dl-tls.c:904
> #5 0x00007ffff7fdda2e in _dl_tlsdesc_dynamic_xsavec () at ../sysdeps/x86_64/dl-tlsdesc-dynamic.h:135
> #6 0x00007ffff7fb0155 in apply_tls (p=0xc80) at tst-gnu2-tls2mod1.c:27
> #7 0x0000555555556965 in access_mod (i=1, sym=0x555555559022 "apply_tls") at tst-gnu2-tls2.c:58
> #8 start (arg=0x0) at tst-gnu2-tls2.c:73
> #9 0x00007ffff7c96a82 in start_thread (arg=<optimized out>) at pthread_create.c:447
> #10 0x00007ffff7d1b13c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78
>
> By using GLIBC_2.2.5 instead of GLIBC_2.0, now I see:
>
> (gdb) bt
> #0 malloc (n=3200) at ../elf/malloc-for-test.c:29
> #1 0x00007ffff7fda3de in malloc (size=<optimized out>) at ../include/rtld-malloc.h:56
> #2 allocate_dtv_entry (size=<optimized out>, alignment=16) at ../elf/dl-tls.c:679
> #3 allocate_and_init (map=0x7ffff0000bd0) at ../elf/dl-tls.c:704
> #4 tls_get_addr_tail (ti=0x7ffff0001240, dtv=0x55555555e340, the_map=0x7ffff0000bd0) at ../elf/dl-tls.c:904
> #5 0x00007ffff7fdda2e in _dl_tlsdesc_dynamic_xsavec () at ../sysdeps/x86_64/dl-tlsdesc-dynamic.h:135
> #6 0x00007ffff7fb0155 in apply_tls (p=0xc80) at tst-gnu2-tls2mod1.c:27
> #7 0x0000555555556965 in access_mod (i=1, sym=0x555555559022 "apply_tls") at tst-gnu2-tls2.c:58
> #8 start (arg=0x0) at tst-gnu2-tls2.c:73
> #9 0x00007ffff7c96a82 in start_thread (arg=<optimized out>) at pthread_create.c:447
> #10 0x00007ffff7d1b13c in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:78
>
> So you will need either to get the correct version or parameterize
> the map file for each ABI (which is just a handful that actually
> support TLS descriptors).

An arch may need a different version map.

> Which made me realize that preloading malloc won't interpose ld
> implementation since 3a0ecccb599a6b1ad4b149dc569c0080e92d057b unless
> the malloc replacement also exports the malloc with the proper ABI
> version.

True.

> I think it is an unexpected change from BZ#25486, but at the same
> time I don't think this is a bad one. We will need to add all the
> machinery required to save/restore the caller-saved register for
> each ABI that supports TLS descriptors because even glibc malloc
> might call internal functions that might use such ABI extension
> (for instance mem* and str* functions).
>
> All this made me realize that the TLS descriptor slow path
> is *far* from maintainable, as we discussed on the weekly call.
> And I think we *should* move away from it. Some issues:
>
>  * To properly support _dl_tlsdesc_dynamic on ABI with vector
>    extensions, it would either need to pessimize code generation
>    for TLS access (so the compiler would add all the required
>    instructions to save/restore  the caller-saved registers) or
>    move the complexity to libc.
>
>  * The latter would require libc to carry a quite complex
>    _dl_tlsdesc_dynamic, which would either need to probe hardware
>    support to provide the multiple code paths or add the support
>    through IFUNC.

True.

>  * ARM also has the issue, and I think it has not been noticed there
>    because gnu2 is not the default TLS ABI and gcc likely won't change
>    that in the near future. To properly fix it, ARM would need
>    something like what you are doing for x86 to support the multiple
>    vector extensions (VFP, VFP3, NEON).

True.

>  * Loongson is finishing its TLSDESC ABI support in gcc/binutils, and
>    most likely would require quite similar support to properly support
>    LSX and LASX.
>
>  * I think RISC-V would also have a similar issue for its vector ABI.
>
> So I think we really should reevaluate the BZ#16133 fix that we reverted
> on 2.20 [1] [2]. If I recall correctly (I need to go through my notes
> about this issue again), two main issues triggered the revert:
>
>  1. It broke LSAN;
>  2. Lazy allocation is an explicit feature [3].
>
> For 1. I think it should be doable to fix on sanitizer, either by adding
> more hacks to get the correct TLS size or by providing a proper ABI.
>
> However for 2. I think it is past time that we accept that lazy allocation
> was a nice idea, but it adds a *lot* of maintainability burden that
> it is not paying off.
>
> [1] https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=1f33d36a8a9e78c81bed59b47f260723f56bb7e6
> [2] https://sourceware.org/legacy-ml/libc-alpha/2013-09/msg00721.html
> [3] https://sourceware.org/legacy-ml/libc-alpha/2014-01/msg00287.html
>

We need a short-term fix before the slow path is removed.
  
Florian Weimer Feb. 16, 2024, 6:23 a.m. UTC | #7
* Adhemerval Zanella Netto:

>> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
>> new file mode 100644
>> index 0000000000..8437cf4346
>> --- /dev/null
>> +++ b/elf/malloc-for-test.map
>> @@ -0,0 +1,6 @@
>> +GLIBC_2.0 {
>
> You need to use the correct version to override the malloc:
>
> $ gdb --args tst-gnu2-tls2 --direct
> [...]
> (gdb) b apply_tls
> (gdb) r
> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> 25 {
> (gdb) b malloc
> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> (gdb) c
> Continuing.

Why do we need to set a symbol version here?  I think this can be
removed.

Thanks,
Florian
  
H.J. Lu Feb. 16, 2024, 11:59 a.m. UTC | #8
On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Adhemerval Zanella Netto:
>
> >> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> >> new file mode 100644
> >> index 0000000000..8437cf4346
> >> --- /dev/null
> >> +++ b/elf/malloc-for-test.map
> >> @@ -0,0 +1,6 @@
> >> +GLIBC_2.0 {
> >
> > You need to use the correct version to override the malloc:
> >
> > $ gdb --args tst-gnu2-tls2 --direct
> > [...]
> > (gdb) b apply_tls
> > (gdb) r
> > Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> > 25 {
> > (gdb) b malloc
> > Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> > (gdb) c
> > Continuing.
>
> Why do we need to set a symbol version here?  I think this can be
> removed.

malloc-for-test.so provides a fake malloc to clobber caller-save
registers.  Since malloc in ld.so has a symbol version,

  struct r_found_version version;
  version.name = symbol_version_string (libc, GLIBC_2_0);
  version.hidden = 0;
  version.hash = _dl_elf_hash (version.name);
  version.filename = NULL;

  void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
  void *new_free = lookup_malloc_symbol (main_map, "free", &version);
  void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
  void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);

malloc in malloc-for-test.so must have the same version.  Otherwise,
it won't be used in the test.
  
Florian Weimer Feb. 16, 2024, 12:18 p.m. UTC | #9
* H. J. Lu:

> On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Adhemerval Zanella Netto:
>>
>> >> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
>> >> new file mode 100644
>> >> index 0000000000..8437cf4346
>> >> --- /dev/null
>> >> +++ b/elf/malloc-for-test.map
>> >> @@ -0,0 +1,6 @@
>> >> +GLIBC_2.0 {
>> >
>> > You need to use the correct version to override the malloc:
>> >
>> > $ gdb --args tst-gnu2-tls2 --direct
>> > [...]
>> > (gdb) b apply_tls
>> > (gdb) r
>> > Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
>> > 25 {
>> > (gdb) b malloc
>> > Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
>> > (gdb) c
>> > Continuing.
>>
>> Why do we need to set a symbol version here?  I think this can be
>> removed.
>
> malloc-for-test.so provides a fake malloc to clobber caller-save
> registers.  Since malloc in ld.so has a symbol version,
>
>   struct r_found_version version;
>   version.name = symbol_version_string (libc, GLIBC_2_0);
>   version.hidden = 0;
>   version.hash = _dl_elf_hash (version.name);
>   version.filename = NULL;
>
>   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
>   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
>   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
>   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
>
> malloc in malloc-for-test.so must have the same version.  Otherwise,
> it won't be used in the test.

I thought that unversioned symbols interpose all versioned symbols.  Has
this changed in the dynamic linker?

Thanks,
Florian
  
H.J. Lu Feb. 16, 2024, 12:20 p.m. UTC | #10
On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> > On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Adhemerval Zanella Netto:
> >>
> >> >> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> >> >> new file mode 100644
> >> >> index 0000000000..8437cf4346
> >> >> --- /dev/null
> >> >> +++ b/elf/malloc-for-test.map
> >> >> @@ -0,0 +1,6 @@
> >> >> +GLIBC_2.0 {
> >> >
> >> > You need to use the correct version to override the malloc:
> >> >
> >> > $ gdb --args tst-gnu2-tls2 --direct
> >> > [...]
> >> > (gdb) b apply_tls
> >> > (gdb) r
> >> > Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> >> > 25 {
> >> > (gdb) b malloc
> >> > Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> >> > (gdb) c
> >> > Continuing.
> >>
> >> Why do we need to set a symbol version here?  I think this can be
> >> removed.
> >
> > malloc-for-test.so provides a fake malloc to clobber caller-save
> > registers.  Since malloc in ld.so has a symbol version,
> >
> >   struct r_found_version version;
> >   version.name = symbol_version_string (libc, GLIBC_2_0);
> >   version.hidden = 0;
> >   version.hash = _dl_elf_hash (version.name);
> >   version.filename = NULL;
> >
> >   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
> >   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
> >   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
> >   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
> >
> > malloc in malloc-for-test.so must have the same version.  Otherwise,
> > it won't be used in the test.
>
> I thought that unversioned symbols interpose all versioned symbols.  Has
> this changed in the dynamic linker?

Only for this case.

> Thanks,
> Florian
>
  
H.J. Lu Feb. 16, 2024, 12:37 p.m. UTC | #11
On Fri, Feb 16, 2024 at 4:20 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
> >
> > * H. J. Lu:
> >
> > > On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
> > >>
> > >> * Adhemerval Zanella Netto:
> > >>
> > >> >> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> > >> >> new file mode 100644
> > >> >> index 0000000000..8437cf4346
> > >> >> --- /dev/null
> > >> >> +++ b/elf/malloc-for-test.map
> > >> >> @@ -0,0 +1,6 @@
> > >> >> +GLIBC_2.0 {
> > >> >
> > >> > You need to use the correct version to override the malloc:
> > >> >
> > >> > $ gdb --args tst-gnu2-tls2 --direct
> > >> > [...]
> > >> > (gdb) b apply_tls
> > >> > (gdb) r
> > >> > Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> > >> > 25 {
> > >> > (gdb) b malloc
> > >> > Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> > >> > (gdb) c
> > >> > Continuing.
> > >>
> > >> Why do we need to set a symbol version here?  I think this can be
> > >> removed.
> > >
> > > malloc-for-test.so provides a fake malloc to clobber caller-save
> > > registers.  Since malloc in ld.so has a symbol version,
> > >
> > >   struct r_found_version version;
> > >   version.name = symbol_version_string (libc, GLIBC_2_0);
> > >   version.hidden = 0;
> > >   version.hash = _dl_elf_hash (version.name);
> > >   version.filename = NULL;
> > >
> > >   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
> > >   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
> > >   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
> > >   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
> > >
> > > malloc in malloc-for-test.so must have the same version.  Otherwise,
> > > it won't be used in the test.
> >
> > I thought that unversioned symbols interpose all versioned symbols.  Has
> > this changed in the dynamic linker?
>
> Only for this case.
>

I was wrong.  Version isn't needed.  Will fix it.

Thanks.
  
Adhemerval Zanella Netto Feb. 16, 2024, 12:47 p.m. UTC | #12
On 16/02/24 09:37, H.J. Lu wrote:
> On Fri, Feb 16, 2024 at 4:20 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
>>>
>>> * H. J. Lu:
>>>
>>>> On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
>>>>>
>>>>> * Adhemerval Zanella Netto:
>>>>>
>>>>>>> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
>>>>>>> new file mode 100644
>>>>>>> index 0000000000..8437cf4346
>>>>>>> --- /dev/null
>>>>>>> +++ b/elf/malloc-for-test.map
>>>>>>> @@ -0,0 +1,6 @@
>>>>>>> +GLIBC_2.0 {
>>>>>>
>>>>>> You need to use the correct version to override the malloc:
>>>>>>
>>>>>> $ gdb --args tst-gnu2-tls2 --direct
>>>>>> [...]
>>>>>> (gdb) b apply_tls
>>>>>> (gdb) r
>>>>>> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
>>>>>> 25 {
>>>>>> (gdb) b malloc
>>>>>> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
>>>>>> (gdb) c
>>>>>> Continuing.
>>>>>
>>>>> Why do we need to set a symbol version here?  I think this can be
>>>>> removed.
>>>>
>>>> malloc-for-test.so provides a fake malloc to clobber caller-save
>>>> registers.  Since malloc in ld.so has a symbol version,
>>>>
>>>>   struct r_found_version version;
>>>>   version.name = symbol_version_string (libc, GLIBC_2_0);
>>>>   version.hidden = 0;
>>>>   version.hash = _dl_elf_hash (version.name);
>>>>   version.filename = NULL;
>>>>
>>>>   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
>>>>   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
>>>>   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
>>>>   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
>>>>
>>>> malloc in malloc-for-test.so must have the same version.  Otherwise,
>>>> it won't be used in the test.
>>>
>>> I thought that unversioned symbols interpose all versioned symbols.  Has
>>> this changed in the dynamic linker?
>>
>> Only for this case.
>>
> 
> I was wrong.  Version isn't needed.  Will fix it.
> 

Indeed, but I think we should also improve the register check testing, since just
clobbering registers in malloc does not really provide much coverage.
  
H.J. Lu Feb. 16, 2024, 12:58 p.m. UTC | #13
On Fri, Feb 16, 2024 at 4:47 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 16/02/24 09:37, H.J. Lu wrote:
> > On Fri, Feb 16, 2024 at 4:20 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>>
> >>> * H. J. Lu:
> >>>
> >>>> On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
> >>>>>
> >>>>> * Adhemerval Zanella Netto:
> >>>>>
> >>>>>>> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> >>>>>>> new file mode 100644
> >>>>>>> index 0000000000..8437cf4346
> >>>>>>> --- /dev/null
> >>>>>>> +++ b/elf/malloc-for-test.map
> >>>>>>> @@ -0,0 +1,6 @@
> >>>>>>> +GLIBC_2.0 {
> >>>>>>
> >>>>>> You need to use the correct version to override the malloc:
> >>>>>>
> >>>>>> $ gdb --args tst-gnu2-tls2 --direct
> >>>>>> [...]
> >>>>>> (gdb) b apply_tls
> >>>>>> (gdb) r
> >>>>>> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> >>>>>> 25 {
> >>>>>> (gdb) b malloc
> >>>>>> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> >>>>>> (gdb) c
> >>>>>> Continuing.
> >>>>>
> >>>>> Why do we need to set a symbol version here?  I think this can be
> >>>>> removed.
> >>>>
> >>>> malloc-for-test.so provides a fake malloc to clobber caller-save
> >>>> registers.  Since malloc in ld.so has a symbol version,
> >>>>
> >>>>   struct r_found_version version;
> >>>>   version.name = symbol_version_string (libc, GLIBC_2_0);
> >>>>   version.hidden = 0;
> >>>>   version.hash = _dl_elf_hash (version.name);
> >>>>   version.filename = NULL;
> >>>>
> >>>>   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
> >>>>   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
> >>>>   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
> >>>>   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
> >>>>
> >>>> malloc in malloc-for-test.so must have the same version.  Otherwise,
> >>>> it won't be used in the test.
> >>>
> >>> I thought that unversioned symbols interpose all versioned symbols.  Has
> >>> this changed in the dynamic linker?
> >>
> >> Only for this case.
> >>
> >
> > I was wrong.  Version isn't needed.  Will fix it.
> >
>
> Indeed, but I think we also improve the register check testing since just clobbering
> on malloc does not really provide much coverage.

Why?  The slow path will always call malloc in malloc-for-test.so.
We can clobber any caller-save registers we need.  Without
malloc-for-test.so, the test doesn't fail for i386 since malloc in libc.so
doesn't use any vector registers.
  
Florian Weimer Feb. 16, 2024, 1:06 p.m. UTC | #14
* H. J. Lu:

>> > I thought that unversioned symbols interpose all versioned symbols.  Has
>> > this changed in the dynamic linker?
>>
>> Only for this case.
>>
>
> I was wrong.  Version isn't needed.  Will fix it.

Ahh.  You don't need malloc-for-test.so, either.  I think you could use
-Wl,-E (--export-dynamic) to trigger interposition from the main
program.

It would make sense to check using a counter that the interposed malloc
is in fact called.

Thanks,
Florian
  
Adhemerval Zanella Netto Feb. 16, 2024, 1:24 p.m. UTC | #15
On 16/02/24 09:58, H.J. Lu wrote:
> On Fri, Feb 16, 2024 at 4:47 AM Adhemerval Zanella Netto
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 16/02/24 09:37, H.J. Lu wrote:
>>> On Fri, Feb 16, 2024 at 4:20 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>
>>>> On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
>>>>>
>>>>> * H. J. Lu:
>>>>>
>>>>>> On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
>>>>>>>
>>>>>>> * Adhemerval Zanella Netto:
>>>>>>>
>>>>>>>>> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
>>>>>>>>> new file mode 100644
>>>>>>>>> index 0000000000..8437cf4346
>>>>>>>>> --- /dev/null
>>>>>>>>> +++ b/elf/malloc-for-test.map
>>>>>>>>> @@ -0,0 +1,6 @@
>>>>>>>>> +GLIBC_2.0 {
>>>>>>>>
>>>>>>>> You need to use the correct version to override the malloc:
>>>>>>>>
>>>>>>>> $ gdb --args tst-gnu2-tls2 --direct
>>>>>>>> [...]
>>>>>>>> (gdb) b apply_tls
>>>>>>>> (gdb) r
>>>>>>>> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
>>>>>>>> 25 {
>>>>>>>> (gdb) b malloc
>>>>>>>> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
>>>>>>>> (gdb) c
>>>>>>>> Continuing.
>>>>>>>
>>>>>>> Why do we need to set a symbol version here?  I think this can be
>>>>>>> removed.
>>>>>>
>>>>>> malloc-for-test.so provides a fake malloc to clobber caller-save
>>>>>> registers.  Since malloc in ld.so has a symbol version,
>>>>>>
>>>>>>   struct r_found_version version;
>>>>>>   version.name = symbol_version_string (libc, GLIBC_2_0);
>>>>>>   version.hidden = 0;
>>>>>>   version.hash = _dl_elf_hash (version.name);
>>>>>>   version.filename = NULL;
>>>>>>
>>>>>>   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
>>>>>>   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
>>>>>>   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
>>>>>>   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
>>>>>>
>>>>>> malloc in malloc-for-test.so must have the same version.  Otherwise,
>>>>>> it won't be used in the test.
>>>>>
>>>>> I thought that unversioned symbols interpose all versioned symbols.  Has
>>>>> this changed in the dynamic linker?
>>>>
>>>> Only for this case.
>>>>
>>>
>>> I was wrong.  Version isn't needed.  Will fix it.
>>>
>>
>> Indeed, but I think we also improve the register check testing since just clobbering
>> on malloc does not really provide much coverage.
> 
> Why?  The slow path will always call malloc in malloc-for-test.so.
> We can clobber any caller-save registers we need.  Without
> malloc-for-test.so, the test doesn't fail for i386 since malloc in libc.so
> doesn't use any vector registers.
> 

The idea is to check whether the _dl_tlsdesc_dynamic call preserves every
possible caller-saved register.  Since this test is generic and, afaik,
all affected ABIs follow the same idea (the compiler won't save/restore
such registers), clobbering the registers will only trigger a possible
issue iff the thread that actually issues the TLS access also actually
uses one of those registers.

That's why I think better coverage would be to also clobber the registers
before the TLS access, and check that their values do not change over
the TLS access.
  
H.J. Lu Feb. 16, 2024, 1:24 p.m. UTC | #16
On Fri, Feb 16, 2024 at 5:06 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> >> > I thought that unversioned symbols interpose all versioned symbols.  Has
> >> > this changed in the dynamic linker?
> >>
> >> Only for this case.
> >>
> >
> > I was wrong.  Version isn't needed.  Will fix it.
>
> Ahh.  You don't need malloc-for-test.so, either.  I think you could use
> -Wl,-E (--export-dynamic) to trigger interposition from the main
> program.

No need for --export-dynamic since malloc is exported from libc.so.

> It would make sense to check using a counter that the interposed malloc
> is in fact called.

Will do.

> Thanks,
> Florian
>

Thanks.
  
H.J. Lu Feb. 16, 2024, 2:25 p.m. UTC | #17
On Fri, Feb 16, 2024 at 5:24 AM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 16/02/24 09:58, H.J. Lu wrote:
> > On Fri, Feb 16, 2024 at 4:47 AM Adhemerval Zanella Netto
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 16/02/24 09:37, H.J. Lu wrote:
> >>> On Fri, Feb 16, 2024 at 4:20 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>>>
> >>>> On Fri, Feb 16, 2024 at 4:18 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>>>>
> >>>>> * H. J. Lu:
> >>>>>
> >>>>>> On Thu, Feb 15, 2024 at 10:23 PM Florian Weimer <fweimer@redhat.com> wrote:
> >>>>>>>
> >>>>>>> * Adhemerval Zanella Netto:
> >>>>>>>
> >>>>>>>>> diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
> >>>>>>>>> new file mode 100644
> >>>>>>>>> index 0000000000..8437cf4346
> >>>>>>>>> --- /dev/null
> >>>>>>>>> +++ b/elf/malloc-for-test.map
> >>>>>>>>> @@ -0,0 +1,6 @@
> >>>>>>>>> +GLIBC_2.0 {
> >>>>>>>>
> >>>>>>>> You need to use the correct version to override the malloc:
> >>>>>>>>
> >>>>>>>> $ gdb --args tst-gnu2-tls2 --direct
> >>>>>>>> [...]
> >>>>>>>> (gdb) b apply_tls
> >>>>>>>> (gdb) r
> >>>>>>>> Thread 2 "tst-gnu2-tls2" hit Breakpoint 1, apply_tls (p=0x7ffff7bfee80) at tst-gnu2-tls2mod1.c:25
> >>>>>>>> 25 {
> >>>>>>>> (gdb) b malloc
> >>>>>>>> Breakpoint 2 at 0x7ffff7ca8ad0: malloc. (3 locations)
> >>>>>>>> (gdb) c
> >>>>>>>> Continuing.
> >>>>>>>
> >>>>>>> Why do we need to set a symbol version here?  I think this can be
> >>>>>>> removed.
> >>>>>>
> >>>>>> malloc-for-test.so provides a fake malloc to clobber caller-save
> >>>>>> registers.  Since malloc in ld.so has a symbol version,
> >>>>>>
> >>>>>>   struct r_found_version version;
> >>>>>>   version.name = symbol_version_string (libc, GLIBC_2_0);
> >>>>>>   version.hidden = 0;
> >>>>>>   version.hash = _dl_elf_hash (version.name);
> >>>>>>   version.filename = NULL;
> >>>>>>
> >>>>>>   void *new_calloc = lookup_malloc_symbol (main_map, "calloc", &version);
> >>>>>>   void *new_free = lookup_malloc_symbol (main_map, "free", &version);
> >>>>>>   void *new_malloc = lookup_malloc_symbol (main_map, "malloc", &version);
> >>>>>>   void *new_realloc = lookup_malloc_symbol (main_map, "realloc", &version);
> >>>>>>
> >>>>>> malloc in malloc-for-test.so must have the same version.  Otherwise,
> >>>>>> it won't be used in the test.
> >>>>>
> >>>>> I thought that unversioned symbols interpose all versioned symbols.  Has
> >>>>> this changed in the dynamic linker?
> >>>>
> >>>> Only for this case.
> >>>>
> >>>
> >>> I was wrong.  Version isn't needed.  Will fix it.
> >>>
> >>
> >> Indeed, but I think we also improve the register check testing since just clobbering
> >> on malloc does not really provide much coverage.
> >
> > Why?  The slow path will always call malloc in malloc-for-test.so.
> > We can clobber any caller-save registers we need.  Without
> > malloc-for-test.so, the test doesn't fail for i386 since malloc in libc.so
> > doesn't use any vector registers.
> >
>
> The idea is to check whether _dl_tlsdesc_dynamic call does preserve any
> possible caller-saved register and since this test is generic and afaik
> all affected ABIs follow the same idea (compiler won't save/restore such
> register), clobbering the register will only trigger a possible issue
> iff the thread that actually issue the TLS usage does actually use any
> possible register.
>
> That's why I think a better coverage would to also clobber the register
> before the TLS access, and check if their values does not change over
> the TLS access.

How about this

struct tls *
apply_tls (struct tls *p)
{
  tls_var1[1] = *p;
  BEFORE_TLSDESC_CALL ();
  struct tls *ret = &tls_var1[1];
  AFTER_TLSDESC_CALL ();
  return ret;
}

An architecture can define BEFORE_TLSDESC_CALL and
AFTER_TLSDESC_CALL to verify that caller-saved
registers aren't clobbered by the implicit TLSDESC call.


H.J.
  

Patch

diff --git a/elf/Makefile b/elf/Makefile
index 5d78b659ce..e0665d2007 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -424,6 +424,7 @@  tests += \
   tst-glibc-hwcaps-prepend \
   tst-global1 \
   tst-global2 \
+  tst-gnu2-tls2 \
   tst-initfinilazyfail \
   tst-initorder \
   tst-initorder2 \
@@ -699,6 +700,7 @@  modules-names += \
   libtracemod5-1 \
   ltglobmod1 \
   ltglobmod2 \
+  malloc-for-test \
   neededobj1 \
   neededobj2 \
   neededobj3 \
@@ -846,6 +848,9 @@  modules-names += \
   tst-filterobj-flt \
   tst-finilazyfailmod \
   tst-globalmod2 \
+  tst-gnu2-tls2mod0 \
+  tst-gnu2-tls2mod1 \
+  tst-gnu2-tls2mod2 \
   tst-initlazyfailmod \
   tst-initorder2a \
   tst-initorder2b \
@@ -3044,8 +3049,22 @@  $(objpfx)tst-tlsgap.out: \
   $(objpfx)tst-tlsgap-mod0.so \
   $(objpfx)tst-tlsgap-mod1.so \
   $(objpfx)tst-tlsgap-mod2.so
+
+$(objpfx)tst-gnu2-tls2: \
+  $(shared-thread-library) \
+  $(objpfx)malloc-for-test.so
+$(objpfx)tst-gnu2-tls2.out: \
+  $(objpfx)tst-gnu2-tls2mod0.so \
+  $(objpfx)tst-gnu2-tls2mod1.so \
+  $(objpfx)tst-gnu2-tls2mod2.so
+
+LDFLAGS-malloc-for-test.so += -Wl,--version-script=malloc-for-test.map
+
 ifeq (yes,$(have-mtls-dialect-gnu2))
 CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
 CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
+CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
 endif
diff --git a/elf/malloc-for-test.c b/elf/malloc-for-test.c
new file mode 100644
index 0000000000..1bec69eda7
--- /dev/null
+++ b/elf/malloc-for-test.c
@@ -0,0 +1,32 @@ 
+/* A malloc for intercept test.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern void * __libc_malloc (size_t);
+
+#ifndef PREPARE_MALLOC
+# define PREPARE_MALLOC()
+#endif
+
+void *
+malloc (size_t n)
+{
+  PREPARE_MALLOC ();
+  return __libc_malloc (n);
+}
diff --git a/elf/malloc-for-test.map b/elf/malloc-for-test.map
new file mode 100644
index 0000000000..8437cf4346
--- /dev/null
+++ b/elf/malloc-for-test.map
@@ -0,0 +1,6 @@ 
+GLIBC_2.0 {
+  global:
+    malloc;
+  local:
+    *;
+};
diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
new file mode 100644
index 0000000000..34427f9a0f
--- /dev/null
+++ b/elf/tst-gnu2-tls2.c
@@ -0,0 +1,97 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <pthread.h>
+#include <support/xdlfcn.h>
+#include <support/xthread.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+#include "tst-gnu2-tls2.h"
+
+#ifndef IS_SUPPORTED
+# define IS_SUPPORTED() true
+#endif
+
+static void *mod[3];
+#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
+static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
+#undef MOD
+
+static void
+open_mod (int i)
+{
+  mod[i] = xdlopen (modname[i], RTLD_LAZY);
+  printf ("open %s\n", modname[i]);
+}
+
+static void
+close_mod (int i)
+{
+  xdlclose (mod[i]);
+  mod[i] = NULL;
+  printf ("close %s\n", modname[i]);
+}
+
+static void
+access_mod (int i, const char *sym)
+{
+  struct tls var = { -1, -1, -1, -1 };
+  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
+  struct tls *p = f (&var);
+  printf ("access %s: %s() = %p\n", modname[i], sym, p);
+  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
+  ++(p->a);
+}
+
+static void *
+start (void *arg)
+{
+  /* The DTV generation is at the last dlopen of mod0 and the
+     entry for mod1 is NULL.  */
+
+  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
+
+  /* Force the slow path in GNU2 TLS descriptor call.  */
+  access_mod (1, "apply_tls");
+
+  return arg;
+}
+
+static int
+do_test (void)
+{
+  if (!IS_SUPPORTED ())
+    return EXIT_UNSUPPORTED;
+
+  open_mod (0);
+  open_mod (1);
+  open_mod (2);
+  close_mod (0);
+  close_mod (1); /* Create modid gap at mod1.  */
+  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
+
+  /* Create a thread where DTV of mod1 is NULL.  */
+  pthread_t t = xpthread_create (NULL, start, NULL);
+  xpthread_join (t);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
new file mode 100644
index 0000000000..e33f4dbe27
--- /dev/null
+++ b/elf/tst-gnu2-tls2.h
@@ -0,0 +1,26 @@ 
+/* Test TLSDESC relocation.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+struct tls
+{
+  int64_t a, b, c, d;
+};
+
+extern struct tls *apply_tls (struct tls *);
diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
new file mode 100644
index 0000000000..67dc0d464d
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod0.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var0 = *p;
+  return &tls_var0;
+}
diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
new file mode 100644
index 0000000000..a4ae6db24f
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod1.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var1[1] = *p;
+  return &tls_var1[1];
+}
diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
new file mode 100644
index 0000000000..2d13921717
--- /dev/null
+++ b/elf/tst-gnu2-tls2mod2.c
@@ -0,0 +1,28 @@ 
+/* DSO used by tst-gnu2-tls2.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gnu2-tls2.h"
+
+__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
+
+struct tls *
+apply_tls (struct tls *p)
+{
+  tls_var2 = *p;
+  return &tls_var2;
+}
diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index fc1ef96587..50d74fe6e9 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -347,7 +347,7 @@  and creates an unsatisfiable circular dependency.\n",
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
 		      (sym_map, sym->st_value + (ElfW(Word))td->arg);
-		    td->entry = _dl_tlsdesc_dynamic;
+		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
 		  }
 		else
 #  endif
diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h
new file mode 100644
index 0000000000..675e56d32d
--- /dev/null
+++ b/sysdeps/i386/dl-tlsdesc-dynamic.h
@@ -0,0 +1,187 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  i386 version.
+   Copyright (C) 2004-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#undef REGISTER_SAVE_AREA
+
+#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
+#endif
+
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+# ifdef USE_FNSAVE
+#  error USE_FNSAVE shouldn't be defined
+# endif
+# ifdef USE_FXSAVE
+/* Use fxsave to save all registers.  */
+#  define REGISTER_SAVE_AREA	512
+# endif
+#else
+# ifdef USE_FNSAVE
+/* Use fnsave to save x87 FPU stack registers.  */
+#  define REGISTER_SAVE_AREA	108
+# else
+#  ifndef USE_FXSAVE
+#   error USE_FXSAVE must be defined
+#  endif
+/* Use fxsave to save all registers.  Add 12 bytes to align the stack
+   to 16 bytes.  */
+#  define REGISTER_SAVE_AREA	(512 + 12)
+# endif
+#endif
+
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* This function is used for symbols that need dynamic TLS.
+
+	%eax points to the TLS descriptor, such that 0(%eax) points to
+	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %eax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+__attribute__ ((__regparm__ (1)))
+_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	/* Like all TLS resolvers, preserve call-clobbered registers.
+	   We need two scratch regs anyway.  */
+	subl	$32, %esp
+	cfi_adjust_cfa_offset (32)
+	movl	%ecx, 20(%esp)
+	movl	%edx, 24(%esp)
+	movl	TLSDESC_ARG(%eax), %eax
+	movl	%gs:DTV_OFFSET, %edx
+	movl	TLSDESC_GEN_COUNT(%eax), %ecx
+	cmpl	(%edx), %ecx
+	ja	2f
+	movl	TLSDESC_MODID(%eax), %ecx
+	movl	(%edx,%ecx,8), %edx
+	cmpl	$-1, %edx
+	je	2f
+	movl	TLSDESC_MODOFF(%eax), %eax
+	addl	%edx, %eax
+1:
+	movl	20(%esp), %ecx
+	subl	%gs:0, %eax
+	movl	24(%esp), %edx
+	addl	$32, %esp
+	cfi_adjust_cfa_offset (-32)
+	ret
+	.p2align 4,,7
+2:
+	cfi_adjust_cfa_offset (32)
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	movl	%ebx, -28(%esp)
+	movl	%esp, %ebx
+	cfi_def_cfa_register(%ebx)
+	and	$-STATE_SAVE_ALIGNMENT, %esp
+#endif
+#ifdef REGISTER_SAVE_AREA
+	subl	$REGISTER_SAVE_AREA, %esp
+# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+# endif
+#else
+	# Allocate stack space of the required size to save the state.
+	LOAD_PIC_REG (cx)
+	subl	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp
+#endif
+#ifdef USE_FNSAVE
+	fnsave	(%esp)
+#elif defined USE_FXSAVE
+	fxsave	(%esp)
+#else
+	# Save the argument for ___tls_get_addr in EAX.
+	movl	%eax, %ecx
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	# Clear the XSAVE Header.
+# ifdef USE_XSAVE
+	movl	%edx, (512)(%esp)
+	movl	%edx, (512 + 4 * 1)(%esp)
+	movl	%edx, (512 + 4 * 2)(%esp)
+	movl	%edx, (512 + 4 * 3)(%esp)
+# endif
+	movl	%edx, (512 + 4 * 4)(%esp)
+	movl	%edx, (512 + 4 * 5)(%esp)
+	movl	%edx, (512 + 4 * 6)(%esp)
+	movl	%edx, (512 + 4 * 7)(%esp)
+	movl	%edx, (512 + 4 * 8)(%esp)
+	movl	%edx, (512 + 4 * 9)(%esp)
+	movl	%edx, (512 + 4 * 10)(%esp)
+	movl	%edx, (512 + 4 * 11)(%esp)
+	movl	%edx, (512 + 4 * 12)(%esp)
+	movl	%edx, (512 + 4 * 13)(%esp)
+	movl	%edx, (512 + 4 * 14)(%esp)
+	movl	%edx, (512 + 4 * 15)(%esp)
+# ifdef USE_XSAVE
+	xsave	(%esp)
+# else
+	xsavec	(%esp)
+# endif
+	# Restore the argument for ___tls_get_addr in EAX.
+	movl	%ecx, %eax
+#endif
+	call	HIDDEN_JUMPTARGET (___tls_get_addr)
+	# Get register content back.
+#ifdef USE_FNSAVE
+	frstor	(%esp)
+#elif defined USE_FXSAVE
+	fxrstor	(%esp)
+#else
+	/* Save and restore ___tls_get_addr return value stored in EAX.  */
+	movl	%eax, %ecx
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	xrstor	(%esp)
+	movl	%ecx, %eax
+#endif
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov	%ebx, %esp
+	cfi_def_cfa_register(%esp)
+	movl	-28(%esp), %ebx
+	cfi_restore(%ebx)
+#else
+	addl	$REGISTER_SAVE_AREA, %esp
+	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
+#endif
+	jmp	1b
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S
index 90d93caa0c..f002feee56 100644
--- a/sysdeps/i386/dl-tlsdesc.S
+++ b/sysdeps/i386/dl-tlsdesc.S
@@ -18,8 +18,27 @@ 
 
 #include <sysdep.h>
 #include <tls.h>
+#include <cpu-features-offsets.h>
+#include <features-offsets.h>
 #include "tlsdesc.h"
 
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 4-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that the stack will always be aligned to 16 bytes.  */
+# define DL_STACK_ALIGNMENT 4
+#endif
+
+/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align
+   stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
+
 	.text
 
      /* This function is used to compute the TP offset for symbols in
@@ -65,69 +84,35 @@  _dl_tlsdesc_undefweak:
 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
 
 #ifdef SHARED
-	.hidden _dl_tlsdesc_dynamic
-	.global	_dl_tlsdesc_dynamic
-	.type	_dl_tlsdesc_dynamic,@function
-
-     /* This function is used for symbols that need dynamic TLS.
-
-	%eax points to the TLS descriptor, such that 0(%eax) points to
-	_dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct
-	tlsdesc_dynamic_arg object.  It must return in %eax the offset
-	between the thread pointer and the object denoted by the
-	argument, without clobbering any registers.
-
-	The assembly code that follows is a rendition of the following
-	C code, hand-optimized a little bit.
-
-ptrdiff_t
-__attribute__ ((__regparm__ (1)))
-_dl_tlsdesc_dynamic (struct tlsdesc *tdp)
-{
-  struct tlsdesc_dynamic_arg *td = tdp->arg;
-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
-  if (__builtin_expect (td->gen_count <= dtv[0].counter
-			&& (dtv[td->tlsinfo.ti_module].pointer.val
-			    != TLS_DTV_UNALLOCATED),
-			1))
-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
-      - __thread_pointer;
-
-  return ___tls_get_addr (&td->tlsinfo) - __thread_pointer;
-}
-*/
-	cfi_startproc
-	.align 16
-_dl_tlsdesc_dynamic:
-	/* Like all TLS resolvers, preserve call-clobbered registers.
-	   We need two scratch regs anyway.  */
-	subl	$28, %esp
-	cfi_adjust_cfa_offset (28)
-	movl	%ecx, 20(%esp)
-	movl	%edx, 24(%esp)
-	movl	TLSDESC_ARG(%eax), %eax
-	movl	%gs:DTV_OFFSET, %edx
-	movl	TLSDESC_GEN_COUNT(%eax), %ecx
-	cmpl	(%edx), %ecx
-	ja	.Lslow
-	movl	TLSDESC_MODID(%eax), %ecx
-	movl	(%edx,%ecx,8), %edx
-	cmpl	$-1, %edx
-	je	.Lslow
-	movl	TLSDESC_MODOFF(%eax), %eax
-	addl	%edx, %eax
-.Lret:
-	movl	20(%esp), %ecx
-	subl	%gs:0, %eax
-	movl	24(%esp), %edx
-	addl	$28, %esp
-	cfi_adjust_cfa_offset (-28)
-	ret
-	.p2align 4,,7
-.Lslow:
-	cfi_adjust_cfa_offset (28)
-	call	HIDDEN_JUMPTARGET (___tls_get_addr)
-	jmp	.Lret
-	cfi_endproc
-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+# define USE_FNSAVE
+# define MINIMUM_ALIGNMENT	4
+# define STATE_SAVE_ALIGNMENT	4
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fnsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef MINIMUM_ALIGNMENT
+# undef USE_FNSAVE
+
+# define MINIMUM_ALIGNMENT	16
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_FXSAVE
+
+# define USE_XSAVE
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVE
+
+# define USE_XSAVEC
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVEC
 #endif /* SHARED */
diff --git a/sysdeps/i386/tst-gnu2-tls2.c b/sysdeps/i386/tst-gnu2-tls2.c
new file mode 100644
index 0000000000..92e7fbff89
--- /dev/null
+++ b/sysdeps/i386/tst-gnu2-tls2.c
@@ -0,0 +1,5 @@ 
+#include <sys/platform/x86.h>
+
+#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
+
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 4d50b327b5..bc357f0099 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,5 +1,5 @@ 
 ifeq ($(subdir),csu)
-gen-as-const-headers += cpu-features-offsets.sym
+gen-as-const-headers += cpu-features-offsets.sym features-offsets.sym
 endif
 
 ifeq ($(subdir),elf)
@@ -86,6 +86,11 @@  endif
 tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512F
 tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
 tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
+
+CFLAGS-malloc-for-test.c += -msse2
+CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
+CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
+CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
 endif
 
 ifeq ($(subdir),math)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 25e6622a79..835113b42f 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -27,8 +27,13 @@ 
 extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
   attribute_hidden;
 
-#if defined SHARED && defined __x86_64__
-# include <dl-plt-rewrite.h>
+#if defined SHARED
+extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
+extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
+
+# ifdef __x86_64__
+#  include <dl-plt-rewrite.h>
 
 static void
 TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
@@ -47,6 +52,15 @@  TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
 		 : plt_rewrite_jmp);
     }
 }
+# else
+extern void _dl_tlsdesc_dynamic_fnsave (void) attribute_hidden;
+# endif
+#endif
+
+#ifdef __x86_64__
+extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
+extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
 #endif
 
 #ifdef __LP64__
@@ -1130,6 +1144,44 @@  no_cpuid:
 	       TUNABLE_CALLBACK (set_x86_shstk));
 #endif
 
+  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
+	{
+#ifdef __x86_64__
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
+#endif
+#ifdef SHARED
+	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
+#endif
+	}
+      else
+	{
+#ifdef __x86_64__
+	  GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
+#endif
+#ifdef SHARED
+	  GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
+#endif
+	}
+    }
+  else
+    {
+#ifdef __x86_64__
+      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
+# ifdef SHARED
+      GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
+# endif
+#else
+# ifdef SHARED
+      if (CPU_FEATURE_USABLE_P (cpu_features, FXSR))
+	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
+      else
+	GLRO(dl_x86_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fnsave;
+# endif
+#endif
+    }
+
 #ifdef SHARED
 # ifdef __x86_64__
   TUNABLE_GET (plt_rewrite, tunable_val_t *,
diff --git a/sysdeps/x86/dl-procinfo.c b/sysdeps/x86/dl-procinfo.c
index ee957b4d70..5920d4b320 100644
--- a/sysdeps/x86/dl-procinfo.c
+++ b/sysdeps/x86/dl-procinfo.c
@@ -86,3 +86,19 @@  PROCINFO_CLASS const char _dl_x86_platforms[4][9]
 #else
 ,
 #endif
+
+#if defined SHARED && !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL
+  ._dl_x86_tlsdesc_dynamic
+# else
+PROCINFO_CLASS void * _dl_x86_tlsdesc_dynamic
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# ifdef PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
diff --git a/sysdeps/x86_64/features-offsets.sym b/sysdeps/x86/features-offsets.sym
similarity index 89%
rename from sysdeps/x86_64/features-offsets.sym
rename to sysdeps/x86/features-offsets.sym
index 9e4be3393a..77e990c705 100644
--- a/sysdeps/x86_64/features-offsets.sym
+++ b/sysdeps/x86/features-offsets.sym
@@ -3,4 +3,6 @@ 
 #include <ldsodefs.h>
 
 RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET offsetof (struct rtld_global_ro, _dl_x86_cpu_features)
+#ifdef __x86_64__
 RTLD_GLOBAL_DL_X86_FEATURE_1_OFFSET offsetof (struct rtld_global, _dl_x86_feature_1)
+#endif
diff --git a/sysdeps/x86/malloc-for-test.c b/sysdeps/x86/malloc-for-test.c
new file mode 100644
index 0000000000..02f4dead5d
--- /dev/null
+++ b/sysdeps/x86/malloc-for-test.c
@@ -0,0 +1,33 @@ 
+/*  A malloc for intercept test.  x86 version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+/* Clear XMM0...XMM7  */
+#define PREPARE_MALLOC()				\
+{							\
+  asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" );	\
+  asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" );	\
+  asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" );	\
+  asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" );	\
+  asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" );	\
+  asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" );	\
+  asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" );	\
+  asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" );	\
+}
+
+#include <elf/malloc-for-test.c>
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 5c1f0bcf53..792e2ea5ed 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -68,6 +68,12 @@ 
    | (1 << X86_XSTATE_ZMM_H_ID))
 #endif
 
+/* States which should be saved for TLSDESC_CALL and TLS_DESC_CALL.
+   Compiler assumes that all registers, including x87 FPU stack registers,
+   are unchanged after CALL, except for EFLAGS and RAX/EAX.  */
+#define TLSDESC_CALL_STATE_SAVE_MASK	\
+  (STATE_SAVE_MASK | (1 << X86_XSTATE_X87_ID))
+
 /* Constants for bits in __x86_string_control:  */
 
 /* Avoid short distance REP MOVSB.  */
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 90f4ecfd26..e8babc9a4e 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -10,7 +10,7 @@  LDFLAGS-rtld += -Wl,-z,nomark-plt
 endif
 
 ifeq ($(subdir),csu)
-gen-as-const-headers += features-offsets.sym link-defines.sym
+gen-as-const-headers += link-defines.sym
 endif
 
 ifeq ($(subdir),gmon)
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 6d605d0d32..ff5d45f7cb 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -71,9 +71,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 			   int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
   extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
@@ -96,8 +93,6 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
-      const struct cpu_features* cpu_features = __get_cpu_features ();
-
 #ifdef SHARED
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
@@ -107,6 +102,7 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
+	  const struct cpu_features* cpu_features = __get_cpu_features ();
 	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
 	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
@@ -126,15 +122,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
-	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
-	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
-		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
-		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
-	  else
-	    *(ElfW(Addr) *) (got + 2)
-	      = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
+	  *(ElfW(Addr) *) (got + 2)
+	    = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
 	}
     }
 
@@ -383,7 +372,7 @@  and creates an unsatisfiable circular dependency.\n",
 		  {
 		    td->arg = _dl_make_tlsdesc_dynamic
 		      (sym_map, sym->st_value + reloc->r_addend);
-		    td->entry = _dl_tlsdesc_dynamic;
+		    td->entry = GLRO(dl_x86_tlsdesc_dynamic);
 		  }
 		else
 #  endif
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
index 4d1d790fbb..06637a8154 100644
--- a/sysdeps/x86_64/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -41,5 +41,21 @@ 
 
 #include <sysdeps/x86/dl-procinfo.c>
 
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_64_runtime_resolve
+# else
+PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
+# endif
+# ifndef PROCINFO_DECL
+= NULL
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #undef PROCINFO_DECL
 #undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
new file mode 100644
index 0000000000..ce0bc094ec
--- /dev/null
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -0,0 +1,166 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
+   Copyright (C) 2004-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef SECTION
+# define SECTION(p)	p
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+
+#include "dl-trampoline-state.h"
+
+	.section SECTION(.text),"ax",@progbits
+
+	.hidden _dl_tlsdesc_dynamic
+	.global	_dl_tlsdesc_dynamic
+	.type	_dl_tlsdesc_dynamic,@function
+
+     /* %rax points to the TLS descriptor, such that 0(%rax) points to
+	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
+	tlsdesc_dynamic_arg object.  It must return in %rax the offset
+	between the thread pointer and the object denoted by the
+	argument, without clobbering any registers.
+
+	The assembly code that follows is a rendition of the following
+	C code, hand-optimized a little bit.
+
+ptrdiff_t
+_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
+{
+  struct tlsdesc_dynamic_arg *td = tdp->arg;
+  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
+  if (__builtin_expect (td->gen_count <= dtv[0].counter
+			&& (dtv[td->tlsinfo.ti_module].pointer.val
+			    != TLS_DTV_UNALLOCATED),
+			1))
+    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
+      - __thread_pointer;
+
+  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
+}
+*/
+	cfi_startproc
+	.align 16
+_dl_tlsdesc_dynamic:
+	_CET_ENDBR
+	/* Preserve call-clobbered registers that we modify.
+	   We need two scratch regs anyway.  */
+	movq	%rsi, -16(%rsp)
+	mov	%fs:DTV_OFFSET, %RSI_LP
+	movq	%rdi, -8(%rsp)
+	movq	TLSDESC_ARG(%rax), %rdi
+	movq	(%rsi), %rax
+	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
+	ja	2f
+	movq	TLSDESC_MODID(%rdi), %rax
+	salq	$4, %rax
+	movq	(%rax,%rsi), %rax
+	cmpq	$-1, %rax
+	je	2f
+	addq	TLSDESC_MODOFF(%rdi), %rax
+1:
+	movq	-16(%rsp), %rsi
+	sub	%fs:0, %RAX_LP
+	movq	-8(%rsp), %rdi
+	ret
+2:
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	movq	%rbx, -24(%rsp)
+	mov	%RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and	$-STATE_SAVE_ALIGNMENT, %RSP_LP
+#endif
+#ifdef REGISTER_SAVE_AREA
+# if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	# STATE_SAVE_OFFSET has space for 8 integer registers.  But we
+	# need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
+	# RBX above.
+	sub	$(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
+# else
+	sub	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+# endif
+#else
+	# Allocate stack space of the required size to save the state.
+	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
+#endif
+	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
+	   r10 and r11.  */
+	movq	%rcx, REGISTER_SAVE_RCX(%rsp)
+	movq	%rdx, REGISTER_SAVE_RDX(%rsp)
+	movq	%r8, REGISTER_SAVE_R8(%rsp)
+	movq	%r9, REGISTER_SAVE_R9(%rsp)
+	movq	%r10, REGISTER_SAVE_R10(%rsp)
+	movq	%r11, REGISTER_SAVE_R11(%rsp)
+#ifdef USE_FXSAVE
+	fxsave	STATE_SAVE_OFFSET(%rsp)
+#else
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	# Clear the XSAVE Header.
+# ifdef USE_XSAVE
+	movq	%rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
+# endif
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
+	movq	%rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
+# ifdef USE_XSAVE
+	xsave	STATE_SAVE_OFFSET(%rsp)
+# else
+	xsavec	STATE_SAVE_OFFSET(%rsp)
+# endif
+#endif
+	/* %rdi already points to the tlsinfo data structure.  */
+	call	HIDDEN_JUMPTARGET (__tls_get_addr)
+	# Get register content back.
+#ifdef USE_FXSAVE
+	fxrstor	STATE_SAVE_OFFSET(%rsp)
+#else
+	/* Save and restore __tls_get_addr return value stored in RAX.  */
+	mov	%RAX_LP, %RCX_LP
+	movl	$TLSDESC_CALL_STATE_SAVE_MASK, %eax
+	xorl	%edx, %edx
+	xrstor	STATE_SAVE_OFFSET(%rsp)
+	mov	%RCX_LP, %RAX_LP
+#endif
+	movq	REGISTER_SAVE_R11(%rsp), %r11
+	movq	REGISTER_SAVE_R10(%rsp), %r10
+	movq	REGISTER_SAVE_R9(%rsp), %r9
+	movq	REGISTER_SAVE_R8(%rsp), %r8
+	movq	REGISTER_SAVE_RDX(%rsp), %rdx
+	movq	REGISTER_SAVE_RCX(%rsp), %rcx
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+	mov	%RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq	-24(%rsp), %rbx
+	cfi_restore(%rbx)
+#else
+	add	$REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
+#endif
+	jmp	1b
+	cfi_endproc
+	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+
+#undef STATE_SAVE_ALIGNMENT
diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
index f748af2ece..ea69f5223a 100644
--- a/sysdeps/x86_64/dl-tlsdesc.S
+++ b/sysdeps/x86_64/dl-tlsdesc.S
@@ -18,7 +18,19 @@ 
 
 #include <sysdep.h>
 #include <tls.h>
+#include <cpu-features-offsets.h>
+#include <features-offsets.h>
 #include "tlsdesc.h"
+#include "dl-trampoline-save.h"
+
+/* Area on stack to save and restore registers used for parameter
+   passing when calling _dl_tlsdesc_dynamic.  */
+#define REGISTER_SAVE_RCX	0
+#define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
+#define REGISTER_SAVE_R8	(REGISTER_SAVE_RDX + 8)
+#define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
+#define REGISTER_SAVE_R10	(REGISTER_SAVE_R9 + 8)
+#define REGISTER_SAVE_R11	(REGISTER_SAVE_R10 + 8)
 
 	.text
 
@@ -67,80 +79,24 @@  _dl_tlsdesc_undefweak:
 	.size	_dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
 
 #ifdef SHARED
-	.hidden _dl_tlsdesc_dynamic
-	.global	_dl_tlsdesc_dynamic
-	.type	_dl_tlsdesc_dynamic,@function
-
-     /* %rax points to the TLS descriptor, such that 0(%rax) points to
-	_dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
-	tlsdesc_dynamic_arg object.  It must return in %rax the offset
-	between the thread pointer and the object denoted by the
-	argument, without clobbering any registers.
-
-	The assembly code that follows is a rendition of the following
-	C code, hand-optimized a little bit.
-
-ptrdiff_t
-_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
-{
-  struct tlsdesc_dynamic_arg *td = tdp->arg;
-  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
-  if (__builtin_expect (td->gen_count <= dtv[0].counter
-			&& (dtv[td->tlsinfo.ti_module].pointer.val
-			    != TLS_DTV_UNALLOCATED),
-			1))
-    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
-      - __thread_pointer;
-
-  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
-}
-*/
-	cfi_startproc
-	.align 16
-_dl_tlsdesc_dynamic:
-	_CET_ENDBR
-	/* Preserve call-clobbered registers that we modify.
-	   We need two scratch regs anyway.  */
-	movq	%rsi, -16(%rsp)
-	mov	%fs:DTV_OFFSET, %RSI_LP
-	movq	%rdi, -8(%rsp)
-	movq	TLSDESC_ARG(%rax), %rdi
-	movq	(%rsi), %rax
-	cmpq	%rax, TLSDESC_GEN_COUNT(%rdi)
-	ja	.Lslow
-	movq	TLSDESC_MODID(%rdi), %rax
-	salq	$4, %rax
-	movq	(%rax,%rsi), %rax
-	cmpq	$-1, %rax
-	je	.Lslow
-	addq	TLSDESC_MODOFF(%rdi), %rax
-.Lret:
-	movq	-16(%rsp), %rsi
-	sub	%fs:0, %RAX_LP
-	movq	-8(%rsp), %rdi
-	ret
-.Lslow:
-	/* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
-	   r10 and r11.  Also, align the stack, that's off by 8 bytes.	*/
-	subq	$72, %rsp
-	cfi_adjust_cfa_offset (72)
-	movq	%rdx, 8(%rsp)
-	movq	%rcx, 16(%rsp)
-	movq	%r8, 24(%rsp)
-	movq	%r9, 32(%rsp)
-	movq	%r10, 40(%rsp)
-	movq	%r11, 48(%rsp)
-	/* %rdi already points to the tlsinfo data structure.  */
-	call	HIDDEN_JUMPTARGET (__tls_get_addr)
-	movq	8(%rsp), %rdx
-	movq	16(%rsp), %rcx
-	movq	24(%rsp), %r8
-	movq	32(%rsp), %r9
-	movq	40(%rsp), %r10
-	movq	48(%rsp), %r11
-	addq	$72, %rsp
-	cfi_adjust_cfa_offset (-72)
-	jmp	.Lret
-	cfi_endproc
-	.size	_dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_fxsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_FXSAVE
+
+# define USE_XSAVE
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsave
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVE
+
+# define USE_XSAVEC
+# define STATE_SAVE_ALIGNMENT	64
+# define _dl_tlsdesc_dynamic	_dl_tlsdesc_dynamic_xsavec
+# include "dl-tlsdesc-dynamic.h"
+# undef _dl_tlsdesc_dynamic
+# undef USE_XSAVEC
 #endif /* SHARED */
diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
new file mode 100644
index 0000000000..84eac4a8ac
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-save.h
@@ -0,0 +1,34 @@ 
+/* x86-64 PLT trampoline register save macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that the stack will always be aligned to 16 bytes.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
+   stack to 16 bytes before calling _dl_fixup.  */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+   || 16 > DL_STACK_ALIGNMENT)
diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
new file mode 100644
index 0000000000..575f120797
--- /dev/null
+++ b/sysdeps/x86_64/dl-trampoline-state.h
@@ -0,0 +1,51 @@ 
+/* x86-64 PLT dl-trampoline state macros.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if (STATE_SAVE_ALIGNMENT % 16) != 0
+# error STATE_SAVE_ALIGNMENT must be multiple of 16
+#endif
+
+#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
+# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
+#endif
+
+#if DL_RUNTIME_RESOLVE_REALIGN_STACK
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# ifdef USE_FXSAVE
+/* Use fxsave to save XMM registers.  */
+#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
+#  if (REGISTER_SAVE_AREA % 16) != 0
+#   error REGISTER_SAVE_AREA must be multiple of 16
+#  endif
+# endif
+#else
+# ifndef USE_FXSAVE
+#  error USE_FXSAVE must be defined
+# endif
+/* Use fxsave to save XMM registers.  */
+# define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multiple of 8
+# endif
+#endif
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index b2e7e0f69b..87c5137837 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -22,25 +22,7 @@ 
 #include <features-offsets.h>
 #include <link-defines.h>
 #include <isa-level.h>
-
-#ifndef DL_STACK_ALIGNMENT
-/* Due to GCC bug:
-
-   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
-
-   __tls_get_addr may be called with 8-byte stack alignment.  Although
-   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
-   that stack will be always aligned at 16 bytes.  We use unaligned
-   16-byte move to load and store SSE registers, which has no penalty
-   on modern processors if stack is 16-byte aligned.  */
-# define DL_STACK_ALIGNMENT 8
-#endif
-
-/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
-   stack to 16 bytes before calling _dl_fixup.  */
-#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
-  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
-   || 16 > DL_STACK_ALIGNMENT)
+#include "dl-trampoline-save.h"
 
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index f55c6ea040..d9ccfb40d4 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -27,39 +27,7 @@ 
 # undef LOCAL_STORAGE_AREA
 # undef BASE
 
-# if (STATE_SAVE_ALIGNMENT % 16) != 0
-#  error STATE_SAVE_ALIGNMENT must be multiple of 16
-# endif
-
-# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
-#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
-# endif
-
-# if DL_RUNTIME_RESOLVE_REALIGN_STACK
-/* Local stack area before jumping to function address: RBX.  */
-#  define LOCAL_STORAGE_AREA	8
-#  define BASE			rbx
-#  ifdef USE_FXSAVE
-/* Use fxsave to save XMM registers.  */
-#   define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET)
-#   if (REGISTER_SAVE_AREA % 16) != 0
-#    error REGISTER_SAVE_AREA must be multiple of 16
-#   endif
-#  endif
-# else
-#  ifndef USE_FXSAVE
-#   error USE_FXSAVE must be defined
-#  endif
-/* Use fxsave to save XMM registers.  */
-#  define REGISTER_SAVE_AREA	(512 + STATE_SAVE_OFFSET + 8)
-/* Local stack area before jumping to function address:  All saved
-   registers.  */
-#  define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
-#  define BASE			rsp
-#  if (REGISTER_SAVE_AREA % 16) != 8
-#   error REGISTER_SAVE_AREA must be odd multiple of 8
-#  endif
-# endif
+# include "dl-trampoline-state.h"
 
 	.globl _dl_runtime_resolve
 	.hidden _dl_runtime_resolve