x86-64: Align the stack in __tls_get_addr [BZ #21609]

Message ID CAMe9rOp15N_5xrbx4uCdKY7v9iMiN7gQ0ZBOvaoEJOO2ZLM+TA@mail.gmail.com
State New, archived
Headers

Commit Message

H.J. Lu July 4, 2017, 3:16 p.m. UTC
  On Mon, Jul 3, 2017 at 9:47 PM, Florian Weimer <fweimer@redhat.com> wrote:
> On 07/03/2017 10:25 PM, H.J. Lu wrote:
>> I prefer to let GCC realign the stack for us.  What do you think?
>
> I tried that first, but it does not work with all the GCC versions we

That is true.

> support.  We would have to add a configure check and hope that older GCC
> versions never generate code which needs an aligned stack.
>
> Furthermore, the code GCC generates for stack realignment is really bad,

GCC generates very good code for stack realignment when
-maccumulate-outgoing-args is used.

> and __tls_get_addr appears to be quite performance-critical because not
> all callers cache the result.
>

Some comments on the patch at

https://sourceware.org/ml/libc-alpha/2017-06/msg00922.html

# define __tls_get_addr __tls_get_addr_default
+# include <elf/dl-tls.c>
+
+# undef __tls_get_addr_default
              ^^^^^^^ Shouldn't it be __tls_get_addr?

-#include <stdint.h>
-
 #ifndef _X86_64_DL_TLSDESC_H
-# define _X86_64_DL_TLSDESC_H 1
+#define _X86_64_DL_TLSDESC_H
+
+#include <stdint.h>
+#include <dl-tls.h>

 /* Type used to represent a TLS descriptor in the GOT.  */
 struct tlsdesc
@@ -39,12 +40,6 @@ struct tlsdesc
     };
 };

-typedef struct dl_tls_index
-{
-  uint64_t ti_module;
-  uint64_t ti_offset;
-} tls_index;
-
 /* Type used as the argument in a TLS descriptor for a symbol that
    needs dynamic TLS offsets.  */
 struct tlsdesc_dynamic_arg
@@ -59,12 +54,12 @@ extern ptrdiff_t attribute_hidden
   _dl_tlsdesc_resolve_rela(struct tlsdesc *on_rax),
   _dl_tlsdesc_resolve_hold(struct tlsdesc *on_rax);

-# ifdef SHARED
+#ifdef SHARED
 extern void *_dl_make_tlsdesc_dynamic (struct link_map *map,
        size_t ti_offset)
   internal_function attribute_hidden;

 extern ptrdiff_t attribute_hidden _dl_tlsdesc_dynamic(struct tlsdesc *);
-# endif
-
 #endif
+
+#endif /* _X86_64_DL_TLSDESC_H */

Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?

__tls_get_addr_compat:
+ .type __tls_get_addr_compat,@function
+ .global __tls_get_addr_compat
+ strong_alias (__tls_get_addr_compat, __tls_get_addr)

We can use ENTRY/END here.  Why do we need __tls_get_addr_compat?
Can we just have __tls_get_addr?

Since we are talking performance here, we should add __tls_get_addr_slow
to only handle slow paths.

Here is the patch which implements those.  It is tested on x86-64
and x32.
  

Comments

Florian Weimer July 4, 2017, 3:47 p.m. UTC | #1
On 07/04/2017 05:16 PM, H.J. Lu wrote:

>> Furthermore, the code GCC generates for stack realignment is really bad,
> 
> GCC generates very good code for stack realignment when
> -maccumulate-outgoing-args is used.

I wonder why that isn't enabled by the force attribute.

>> # define __tls_get_addr __tls_get_addr_default
>> +# include <elf/dl-tls.c>
>> +
>> +# undef __tls_get_addr_default
>              ^^^^^^^ Shouldn't it be __tls_get_addr?

Looks this way.  I'd have to re-test and see if it makes a difference.

>> -typedef struct dl_tls_index
>> -{
>> -  uint64_t ti_module;
>> -  uint64_t ti_offset;
>> -} tls_index;

> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?

It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
the same file, causing a multiple definition error without the removal
of the conflicting definition.

> __tls_get_addr_compat:
> + .type __tls_get_addr_compat,@function
> + .global __tls_get_addr_compat
> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
> 
> We can use ENTRY/END here.  Why do we need __tls_get_addr_compat?
> Can we just have __tls_get_addr?

That confuses the linker once we add the .symver directive.

> Since we are talking performance here, we should add __tls_get_addr_slow
> to only handle slow paths.

I'd prefer something that adds a new symbol version for the non-aligning
implementation, so that we eventually move away from the aligning one.

> Here is the patch which implements those.  It is tested on x86-64
> and x32.

Is this really needed on x32, too?  Did GCC have the same bug?

Thanks,
Florian
  
H.J. Lu July 4, 2017, 3:55 p.m. UTC | #2
On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 07/04/2017 05:16 PM, H.J. Lu wrote:
>
>>> Furthermore, the code GCC generates for stack realignment is really bad,
>>
>> GCC generates very good code for stack realignment when
>> -maccumulate-outgoing-args is used.
>
> I wonder why that isn't enabled by the force attribute.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313

>>> # define __tls_get_addr __tls_get_addr_default
>>> +# include <elf/dl-tls.c>
>>> +
>>> +# undef __tls_get_addr_default
>>              ^^^^^^^ Shouldn't it be __tls_get_addr?
>
> Looks this way.  I'd have to re-test and see if it makes a difference.
>
>>> -typedef struct dl_tls_index
>>> -{
>>> -  uint64_t ti_module;
>>> -  uint64_t ti_offset;
>>> -} tls_index;
>
>> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
>
> It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
> the same file, causing a multiple definition error without the removal
> of the conflicting definition.

It is a cleanup.  But I don't believe it is required here.

>> __tls_get_addr_compat:
>> + .type __tls_get_addr_compat,@function
>> + .global __tls_get_addr_compat
>> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
>>
>> We can use ENTRY/END here.  Why do we need __tls_get_addr_compat?
>> Can we just have __tls_get_addr?
>
> That confuses the linker once we add the .symver directive.

I don't think it is a case.  We just define a different __tls_get_addr for
x86-64.

>> Since we are talking performance here, we should add __tls_get_addr_slow
>> to only handle slow paths.
>
> I'd prefer something that adds a new symbol version for the non-aligning
> implementation, so that we eventually move away from the aligning one.

I thought about it.  There is no easy way to do it without linker help.  We
can add ___tls_get_addr, similar to i386, which will take an aligned
stack.  Linker must support ___tls_get_addr.   Then we can use weakref
to redirect __tls_get_addr to ___tls_get_addr if linker supports
___tls_get_addr and GCC doesn't.

>> Here is the patch which implements those.  It is tested on x86-64
>> and x32.
>
> Is this really needed on x32, too?  Did GCC have the same bug?
>

Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch
and above.
  
H.J. Lu July 4, 2017, 4:24 p.m. UTC | #3
On Tue, Jul 4, 2017 at 8:55 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote:
>> On 07/04/2017 05:16 PM, H.J. Lu wrote:
>>
>>>> Furthermore, the code GCC generates for stack realignment is really bad,
>>>
>>> GCC generates very good code for stack realignment when
>>> -maccumulate-outgoing-args is used.
>>
>> I wonder why that isn't enabled by the force attribute.
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313
>
>>>> # define __tls_get_addr __tls_get_addr_default
>>>> +# include <elf/dl-tls.c>
>>>> +
>>>> +# undef __tls_get_addr_default
>>>              ^^^^^^^ Shouldn't it be __tls_get_addr?
>>
>> Looks this way.  I'd have to re-test and see if it makes a difference.
>>
>>>> -typedef struct dl_tls_index
>>>> -{
>>>> -  uint64_t ti_module;
>>>> -  uint64_t ti_offset;
>>>> -} tls_index;
>>
>>> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this?
>>
>> It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from
>> the same file, causing a multiple definition error without the removal
>> of the conflicting definition.
>
> It is a cleanup.  But I don't believe it is required here.
>
>>> __tls_get_addr_compat:
>>> + .type __tls_get_addr_compat,@function
>>> + .global __tls_get_addr_compat
>>> + strong_alias (__tls_get_addr_compat, __tls_get_addr)
>>>
>>> We can use ENTRY/END here.  Why do we need __tls_get_addr_compat?
>>> Can we just have __tls_get_addr?
>>
>> That confuses the linker once we add the .symver directive.
>
> I don't think it is a case.  We just define a different __tls_get_addr for
> x86-64.
>
>>> Since we are talking performance here, we should add __tls_get_addr_slow
>>> to only handle slow paths.
>>
>> I'd prefer something that adds a new symbol version for the non-aligning
>> implementation, so that we eventually move away from the aligning one.
>
> I thought about it.  There is no easy way to do it without linker help.  We
> can add ___tls_get_addr, similar to i386, which will take an aligned
> stack.  Linker must support ___tls_get_addr.   Then we can use weakref
> to redirect __tls_get_addr to ___tls_get_addr if linker supports
> ___tls_get_addr and GCC doesn't.
>
>>> Here is the patch which implements those.  It is tested on x86-64
>>> and x32.
>>
>> Is this really needed on x32, too?  Did GCC have the same bug?
>>
>
> Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch
> and above.
>
> --
> H.J.
  

Patch

From eb4dd17f2f8222278567dc085071bd85c6669cc4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 3 Jul 2017 13:16:57 -0700
Subject: [PATCH] x86-64: Align the stack in __tls_get_addr [BZ #21609]

This change forces realignment of the stack pointer in __tls_get_addr, so
that binaries compiled by GCCs older than GCC 4.9:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066

continue to work even if vector instructions are used in glibc which
require the ABI stack realignment.

__tls_get_addr_slow is added to handle the slow paths in the default
implementation of__tls_get_addr in elf/dl-tls.c.  The new __tls_get_addr
calls __tls_get_addr_slow after realigning the stack.  Internal calls
within ld.so go directly to the default implementation of __tls_get_addr
because they do not need stack realignment.

2017-07-04  Florian Weimer  <fweimer@redhat.com>
	    H.J. Lu  <hongjiu.lu@intel.com>

	[BZ #21609]
	* sysdeps/x86_64/Makefile (sysdep-dl-routines): Add tls_get_addr.
	(gen-as-const-headers): Add rtld-offsets.sym.
	* sysdeps/x86_64/dl-tls.c: New file.
	* sysdeps/x86_64/rtld-offsets.sym: Likwise.
	* sysdeps/x86_64/tls_get_addr.S: Likewise.
	* sysdeps/x86_64/dl-tls.h: Add multiple inclusion guards.
	* sysdeps/x86_64/tlsdesc.sym (TI_MODULE_OFFSET): New.
	(TI_OFFSET_OFFSET): Likwise.
---
 sysdeps/x86_64/Makefile         |  4 +--
 sysdeps/x86_64/dl-tls.c         | 53 +++++++++++++++++++++++++++++++++++
 sysdeps/x86_64/dl-tls.h         |  5 ++++
 sysdeps/x86_64/rtld-offsets.sym |  6 ++++
 sysdeps/x86_64/tls_get_addr.S   | 61 +++++++++++++++++++++++++++++++++++++++++
 sysdeps/x86_64/tlsdesc.sym      |  3 ++
 6 files changed, 130 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/x86_64/dl-tls.c
 create mode 100644 sysdeps/x86_64/rtld-offsets.sym
 create mode 100644 sysdeps/x86_64/tls_get_addr.S

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 5075c91..132470d 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -27,7 +27,7 @@  ifeq ($(subdir),elf)
 CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
 		   -mno-mmx)
 
-sysdep-dl-routines += tlsdesc dl-tlsdesc
+sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr
 
 tests += ifuncmain8
 modules-names += ifuncmod8
@@ -120,5 +120,5 @@  endif
 endif
 
 ifeq ($(subdir),csu)
-gen-as-const-headers += tlsdesc.sym
+gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
 endif
diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
new file mode 100644
index 0000000..567224e
--- /dev/null
+++ b/sysdeps/x86_64/dl-tls.c
@@ -0,0 +1,53 @@ 
+/* Thread-local storage handling in the ELF dynamic linker.  x86-64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef SHARED
+/* Work around GCC PR58066, due to which __tls_get_addr may be called
+   with an unaligned stack.  The compat implementation is in
+   tls_get_addr-compat.S.  */
+
+# include <dl-tls.h>
+
+/* Define __tls_get_addr within elf/dl-tls.c under a different
+   name.  */
+extern __typeof__ (__tls_get_addr) __tls_get_addr_default;
+
+# define __tls_get_addr __tls_get_addr_default
+# include <elf/dl-tls.c>
+# undef __tls_get_addr
+
+hidden_ver (__tls_get_addr_default, __tls_get_addr)
+
+/* Only handle slow paths for __tls_get_addr.  */
+attribute_hidden
+void *
+__tls_get_addr_slow (GET_ADDR_ARGS)
+{
+  dtv_t *dtv = THREAD_DTV ();
+
+  if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation)))
+    return update_get_addr (GET_ADDR_PARAM);
+
+  return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
+}
+#else
+
+/* No compatibility symbol needed.  */
+# include <elf/dl-tls.c>
+
+#endif
diff --git a/sysdeps/x86_64/dl-tls.h b/sysdeps/x86_64/dl-tls.h
index 4a59d2a..c2fb56c 100644
--- a/sysdeps/x86_64/dl-tls.h
+++ b/sysdeps/x86_64/dl-tls.h
@@ -16,6 +16,9 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef _X86_64_DL_TLS_H
+#define _X86_64_DL_TLS_H
+
 #include <stdint.h>
 
 /* Type used for the representation of TLS information in the GOT.  */
@@ -27,3 +30,5 @@  typedef struct dl_tls_index
 
 
 extern void *__tls_get_addr (tls_index *ti);
+
+#endif /* _X86_64_DL_TLS_H */
diff --git a/sysdeps/x86_64/rtld-offsets.sym b/sysdeps/x86_64/rtld-offsets.sym
new file mode 100644
index 0000000..fd41b51
--- /dev/null
+++ b/sysdeps/x86_64/rtld-offsets.sym
@@ -0,0 +1,6 @@ 
+#define SHARED
+#include <ldsodefs.h>
+
+--
+
+GL_TLS_GENERATION_OFFSET        offsetof (struct rtld_global, _dl_tls_generation)
diff --git a/sysdeps/x86_64/tls_get_addr.S b/sysdeps/x86_64/tls_get_addr.S
new file mode 100644
index 0000000..9d38fb3
--- /dev/null
+++ b/sysdeps/x86_64/tls_get_addr.S
@@ -0,0 +1,61 @@ 
+/* Stack-aligning implementation of __tls_get_addr.  x86-64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef SHARED
+
+# include <sysdep.h>
+# include "tlsdesc.h"
+# include "rtld-offsets.h"
+
+/* See __tls_get_addr and __tls_get_addr_slow in dl-tls.c.  This function
+   call __tls_get_addr_slow on both slow paths.  It realigns the stack
+   before the call to work around GCC PR58066.  */
+
+ENTRY (__tls_get_addr)
+	mov 	%fs:DTV_OFFSET, %RDX_LP
+	mov	GL_TLS_GENERATION_OFFSET+_rtld_local(%rip), %RAX_LP
+	/* GL(dl_tls_generation) == dtv[0].counter */
+	cmp	%RAX_LP, (%rdx)
+	jne	1f
+	mov	TI_MODULE_OFFSET(%rdi), %RAX_LP
+	/* dtv[ti->ti_module] */
+# ifdef __LP64__
+	salq	$4, %rax
+	movq	(%rdx,%rax), %rax
+# else
+	movl	(%rdx,%rax, 8), %eax
+# endif
+	cmp	$-1, %RAX_LP
+	je	1f
+	add	TI_OFFSET_OFFSET(%rdi), %RAX_LP
+	ret
+1:
+	/* On the slow path, align the stack.  */
+	pushq	%rbp
+	cfi_def_cfa_offset (16)
+	cfi_offset (%rbp, -16)
+	mov	%RSP_LP, %RBP_LP
+	cfi_def_cfa_register (%rbp)
+	and	$-16, %RSP_LP
+	call	__tls_get_addr_slow
+	mov	%RBP_LP, %RSP_LP
+	popq	%rbp
+	cfi_def_cfa (%rsp, 8)
+	ret
+END (__tls_get_addr)
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/tlsdesc.sym b/sysdeps/x86_64/tlsdesc.sym
index 3385497..fc897ab 100644
--- a/sysdeps/x86_64/tlsdesc.sym
+++ b/sysdeps/x86_64/tlsdesc.sym
@@ -15,3 +15,6 @@  TLSDESC_ARG			offsetof(struct tlsdesc, arg)
 TLSDESC_GEN_COUNT		offsetof(struct tlsdesc_dynamic_arg, gen_count)
 TLSDESC_MODID			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module)
 TLSDESC_MODOFF			offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset)
+
+TI_MODULE_OFFSET 		offsetof(tls_index, ti_module)
+TI_OFFSET_OFFSET 		offsetof(tls_index, ti_offset)
-- 
2.9.4