x86-64: Align the stack in __tls_get_addr [BZ #21609]
Commit Message
On Mon, Jul 3, 2017 at 9:47 PM, Florian Weimer <fweimer@redhat.com> wrote:
> On 07/03/2017 10:25 PM, H.J. Lu wrote:
>> I prefer to let GCC realign the stack for us. What do you think?
>
> I tried that first, but it does not work with all the GCC versions we
That is true.
> support. We would have to add a configure check and hope that older GCC
> versions never generate code which needs an aligned stack.
>
> Furthermore, the code GCC generates for stack realignment is really bad,
GCC generates very good code for stack realignment when
-maccumulate-outgoing-args is used.
> and __tls_get_addr appears to be quite performance-critical because not
> all callers cache the result.
>
Some comments on the patch at
https://sourceware.org/ml/libc-alpha/2017-06/msg00922.html
# define __tls_get_addr __tls_get_addr_default
+# include <elf/dl-tls.c>
+
+# undef __tls_get_addr_default
^^^^^^^ Shouldn't it be __tls_get_addr?
-#include <stdint.h>
-
#ifndef _X86_64_DL_TLSDESC_H
-# define _X86_64_DL_TLSDESC_H 1
+#define _X86_64_DL_TLSDESC_H
+
+#include <stdint.h>
+#include <dl-tls.h>
/* Type used to represent a TLS descriptor in the GOT. */
struct tlsdesc
@@ -39,12 +40,6 @@ struct tlsdesc
};
};
-typedef struct dl_tls_index
-{
- uint64_t ti_module;
- uint64_t ti_offset;
-} tls_index;
-
/* Type used as the argument in a TLS descriptor for a symbol that
needs dynamic TLS offsets. */
struct tlsdesc_dynamic_arg
@@ -59,12 +54,12 @@ extern ptrdiff_t attribute_hidden
_dl_tlsdesc_resolve_rela(struct tlsdesc *on_rax),
_dl_tlsdesc_resolve_hold(struct tlsdesc *on_rax);
-# ifdef SHARED
+#ifdef SHARED
extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
size_t ti_offset)
internal_function attribute_hidden;
extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *);
-# endif
-
#endif
+
+#endif /* _X86_64_DL_TLSDESC_H */
Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
__tls_get_addr_compat:
+ .type __tls_get_addr_compat,@function
+ .global __tls_get_addr_compat
+ strong_alias (__tls_get_addr_compat, __tls_get_addr)
We can use ENTRY/END here. Why do we need __tls_get_addr_compat?
Can we just have __tls_get_addr?
Since we are talking performance here, we should add __tls_get_addr_slow
to only handle slow paths.
Here is the patch which implements those. It is tested on x86-64
and x32.
Comments
On 07/04/2017 05:16 PM, H.J. Lu wrote:
>> Furthermore, the code GCC generates for stack realignment is really bad,
>
> GCC generates very good code for stack realignment when
> -maccumulate-outgoing-args is used.
I wonder why that isn't enabled by the force attribute.
>> # define __tls_get_addr __tls_get_addr_default
>> +# include <elf/dl-tls.c>
>> +
>> +# undef __tls_get_addr_default
> ^^^^^^^ Shouldn't it be __tls_get_addr?
Looks this way. I'd have to re-test and see if it makes a difference.
>> -typedef struct dl_tls_index
>> -{
>> - uint64_t ti_module;
>> - uint64_t ti_offset;
>> -} tls_index;
> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
the same file, causing a multiple definition error without the removal
of the conflicting definition.
> __tls_get_addr_compat:
> + .type __tls_get_addr_compat,@function
> + .global __tls_get_addr_compat
> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
>
> We can use ENTRY/END here. Why do we need __tls_get_addr_compat?
> Can we just have __tls_get_addr?
That confuses the linker once we add the .symver directive.
> Since we are talking performance here, we should add __tls_get_addr_slow
> to only handle slow paths.
I'd prefer something that adds a new symbol version for the non-aligning
implementation, so that we eventually move away from the aligning one.
> Here is the patch which implements those. It is tested on x86-64
> and x32.
Is this really needed on x32, too? Did GCC have the same bug?
Thanks,
Florian
On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 07/04/2017 05:16 PM, H.J. Lu wrote:
>
>>> Furthermore, the code GCC generates for stack realignment is really bad,
>>
>> GCC generates very good code for stack realignment when
>> -maccumulate-outgoing-args is used.
>
> I wonder why that isn't enabled by the force attribute.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313
>>> # define __tls_get_addr __tls_get_addr_default
>>> +# include <elf/dl-tls.c>
>>> +
>>> +# undef __tls_get_addr_default
>> ^^^^^^^ Shouldn't it be __tls_get_addr?
>
> Looks this way. I'd have to re-test and see if it makes a difference.
>
>>> -typedef struct dl_tls_index
>>> -{
>>> - uint64_t ti_module;
>>> - uint64_t ti_offset;
>>> -} tls_index;
>
>> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
>
> It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
> the same file, causing a multiple definition error without the removal
> of the conflicting definition.
It is a cleanup. But I don't believe it is required here.
>> __tls_get_addr_compat:
>> + .type __tls_get_addr_compat,@function
>> + .global __tls_get_addr_compat
>> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
>>
>> We can use ENTRY/END here. Why do we need __tls_get_addr_compat?
>> Can we just have __tls_get_addr?
>
> That confuses the linker once we add the .symver directive.
I don't think it is a case. We just define a different __tls_get_addr for
x86-64.
>> Since we are talking performance here, we should add __tls_get_addr_slow
>> to only handle slow paths.
>
> I'd prefer something that adds a new symbol version for the non-aligning
> implementation, so that we eventually move away from the aligning one.
I thought about it. There is no easy way to do it without linker help. We
can add ___tls_get_addr, similar to i386, which will take an aligned
stack. Linker must support ___tls_get_addr. Then we can use weakref
to redirect __tls_get_addr to ___tls_get_addr if linker supports
___tls_get_addr and GCC doesn't.
>> Here is the patch which implements those. It is tested on x86-64
>> and x32.
>
> Is this really needed on x32, too? Did GCC have the same bug?
>
Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch
and above.
On Tue, Jul 4, 2017 at 8:55 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
>> On 07/04/2017 05:16 PM, H.J. Lu wrote:
>>
>>>> Furthermore, the code GCC generates for stack realignment is really bad,
>>>
>>> GCC generates very good code for stack realignment when
>>> -maccumulate-outgoing-args is used.
>>
>> I wonder why that isn't enabled by the force attribute.
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313
>
>>>> # define __tls_get_addr __tls_get_addr_default
>>>> +# include <elf/dl-tls.c>
>>>> +
>>>> +# undef __tls_get_addr_default
>>> ^^^^^^^ Shouldn't it be __tls_get_addr?
>>
>> Looks this way. I'd have to re-test and see if it makes a difference.
>>
>>>> -typedef struct dl_tls_index
>>>> -{
>>>> - uint64_t ti_module;
>>>> - uint64_t ti_offset;
>>>> -} tls_index;
>>
>>> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
>>
>> It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
>> the same file, causing a multiple definition error without the removal
>> of the conflicting definition.
>
> It is a cleanup. But I don't believe it is required here.
>
>>> __tls_get_addr_compat:
>>> + .type __tls_get_addr_compat,@function
>>> + .global __tls_get_addr_compat
>>> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
>>>
>>> We can use ENTRY/END here. Why do we need __tls_get_addr_compat?
>>> Can we just have __tls_get_addr?
>>
>> That confuses the linker once we add the .symver directive.
>
> I don't think it is a case. We just define a different __tls_get_addr for
> x86-64.
>
>>> Since we are talking performance here, we should add __tls_get_addr_slow
>>> to only handle slow paths.
>>
>> I'd prefer something that adds a new symbol version for the non-aligning
>> implementation, so that we eventually move away from the aligning one.
>
> I thought about it. There is no easy way to do it without linker help. We
> can add ___tls_get_addr, similar to i386, which will take an aligned
> stack. Linker must support ___tls_get_addr. Then we can use weakref
> to redirect __tls_get_addr to ___tls_get_addr if linker supports
> ___tls_get_addr and GCC doesn't.
>
>>> Here is the patch which implements those. It is tested on x86-64
>>> and x32.
>>
>> Is this really needed on x32, too? Did GCC have the same bug?
>>
>
> Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch
> and above.
>
> --
> H.J.
From eb4dd17f2f8222278567dc085071bd85c6669cc4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 3 Jul 2017 13:16:57 -0700
Subject: [PATCH] x86-64: Align the stack in __tls_get_addr [BZ #21609]
This change forces realignment of the stack pointer in __tls_get_addr, so
that binaries compiled by GCCs older than GCC 4.9:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
continue to work even if vector instructions are used in glibc which
require the ABI stack realignment.
__tls_get_addr_slow is added to handle the slow paths in the default
implementation of__tls_get_addr in elf/dl-tls.c. The new __tls_get_addr
calls __tls_get_addr_slow after realigning the stack. Internal calls
within ld.so go directly to the default implementation of __tls_get_addr
because they do not need stack realignment.
2017-07-04 Florian Weimer <fweimer@redhat.com>
H.J. Lu <hongjiu.lu@intel.com>
[BZ #21609]
* sysdeps/x86_64/Makefile (sysdep-dl-routines): Add tls_get_addr.
(gen-as-const-headers): Add rtld-offsets.sym.
* sysdeps/x86_64/dl-tls.c: New file.
* sysdeps/x86_64/rtld-offsets.sym: Likwise.
* sysdeps/x86_64/tls_get_addr.S: Likewise.
* sysdeps/x86_64/dl-tls.h: Add multiple inclusion guards.
* sysdeps/x86_64/tlsdesc.sym (TI_MODULE_OFFSET): New.
(TI_OFFSET_OFFSET): Likwise.
---
sysdeps/x86_64/Makefile | 4 +--
sysdeps/x86_64/dl-tls.c | 53 +++++++++++++++++++++++++++++++++++
sysdeps/x86_64/dl-tls.h | 5 ++++
sysdeps/x86_64/rtld-offsets.sym | 6 ++++
sysdeps/x86_64/tls_get_addr.S | 61 +++++++++++++++++++++++++++++++++++++++++
sysdeps/x86_64/tlsdesc.sym | 3 ++
6 files changed, 130 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/dl-tls.c
create mode 100644 sysdeps/x86_64/rtld-offsets.sym
create mode 100644 sysdeps/x86_64/tls_get_addr.S
@@ -27,7 +27,7 @@ ifeq ($(subdir),elf)
CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
-mno-mmx)
-sysdep-dl-routines += tlsdesc dl-tlsdesc
+sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr
tests += ifuncmain8
modules-names += ifuncmod8
@@ -120,5 +120,5 @@ endif
endif
ifeq ($(subdir),csu)
-gen-as-const-headers += tlsdesc.sym
+gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
endif
new file mode 100644
@@ -0,0 +1,53 @@
+/* Thread-local storage handling in the ELF dynamic linker. x86-64 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef SHARED
+/* Work around GCC PR58066, due to which __tls_get_addr may be called
+ with an unaligned stack. The compat implementation is in
+ tls_get_addr-compat.S. */
+
+# include <dl-tls.h>
+
+/* Define __tls_get_addr within elf/dl-tls.c under a different
+ name. */
+extern __typeof__ (__tls_get_addr) __tls_get_addr_default;
+
+# define __tls_get_addr __tls_get_addr_default
+# include <elf/dl-tls.c>
+# undef __tls_get_addr
+
+hidden_ver (__tls_get_addr_default, __tls_get_addr)
+
+/* Only handle slow paths for __tls_get_addr. */
+attribute_hidden
+void *
+__tls_get_addr_slow (GET_ADDR_ARGS)
+{
+ dtv_t *dtv = THREAD_DTV ();
+
+ if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+ return update_get_addr (GET_ADDR_PARAM);
+
+ return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
+}
+#else
+
+/* No compatibility symbol needed. */
+# include <elf/dl-tls.c>
+
+#endif
@@ -16,6 +16,9 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#ifndef _X86_64_DL_TLS_H
+#define _X86_64_DL_TLS_H
+
#include <stdint.h>
/* Type used for the representation of TLS information in the GOT. */
@@ -27,3 +30,5 @@ typedef struct dl_tls_index
extern void *__tls_get_addr (tls_index *ti);
+
+#endif /* _X86_64_DL_TLS_H */
new file mode 100644
@@ -0,0 +1,6 @@
+#define SHARED
+#include <ldsodefs.h>
+
+--
+
+GL_TLS_GENERATION_OFFSET offsetof (struct rtld_global, _dl_tls_generation)
new file mode 100644
@@ -0,0 +1,61 @@
+/* Stack-aligning implementation of __tls_get_addr. x86-64 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef SHARED
+
+# include <sysdep.h>
+# include "tlsdesc.h"
+# include "rtld-offsets.h"
+
+/* See __tls_get_addr and __tls_get_addr_slow in dl-tls.c. This function
+ call __tls_get_addr_slow on both slow paths. It realigns the stack
+ before the call to work around GCC PR58066. */
+
+ENTRY (__tls_get_addr)
+ mov %fs:DTV_OFFSET, %RDX_LP
+ mov GL_TLS_GENERATION_OFFSET+_rtld_local(%rip), %RAX_LP
+ /* GL(dl_tls_generation) == dtv[0].counter */
+ cmp %RAX_LP, (%rdx)
+ jne 1f
+ mov TI_MODULE_OFFSET(%rdi), %RAX_LP
+ /* dtv[ti->ti_module] */
+# ifdef __LP64__
+ salq $4, %rax
+ movq (%rdx,%rax), %rax
+# else
+ movl (%rdx,%rax, 8), %eax
+# endif
+ cmp $-1, %RAX_LP
+ je 1f
+ add TI_OFFSET_OFFSET(%rdi), %RAX_LP
+ ret
+1:
+ /* On the slow path, align the stack. */
+ pushq %rbp
+ cfi_def_cfa_offset (16)
+ cfi_offset (%rbp, -16)
+ mov %RSP_LP, %RBP_LP
+ cfi_def_cfa_register (%rbp)
+ and $-16, %RSP_LP
+ call __tls_get_addr_slow
+ mov %RBP_LP, %RSP_LP
+ popq %rbp
+ cfi_def_cfa (%rsp, 8)
+ ret
+END (__tls_get_addr)
+#endif /* SHARED */
@@ -15,3 +15,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg)
TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count)
TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+
+TI_MODULE_OFFSET offsetof(tls_index, ti_module)
+TI_OFFSET_OFFSET offsetof(tls_index, ti_offset)
--
2.9.4