x86_64: Simplify elf_machine_{load_address,dynamic}

Message ID 20210817060532.1210408-1-maskray@google.com (mailing list archive)
State Committed
Headers
Series x86_64: Simplify elf_machine_{load_address,dynamic} |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Fangrui Song Aug. 17, 2021, 6:05 a.m. UTC
  and drop reliance on _GLOBAL_OFFSET_TABLE_[0].

&__ehdr_start is a better way to get the load address.
---
 sysdeps/x86_64/dl-machine.h | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)
  

Comments

H.J. Lu Aug. 17, 2021, 12:14 p.m. UTC | #1
On Mon, Aug 16, 2021 at 11:24 PM Fangrui Song via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> and drop reliance on _GLOBAL_OFFSET_TABLE_[0].
>
> &__ehdr_start is a better way to get the load address.
> ---
>  sysdeps/x86_64/dl-machine.h | 25 +++++++++----------------
>  1 file changed, 9 insertions(+), 16 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index a8596aa3fa..ceee50734e 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -35,27 +35,20 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
>  }
>
>
> -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
> -   first element of the GOT.  This must be inlined in a function which
> -   uses global data.  */
> -static inline ElfW(Addr) __attribute__ ((unused))
> -elf_machine_dynamic (void)
> -{
> -  /* This produces an IP-relative reloc which is resolved at link time. */
> -  extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
> -  return _GLOBAL_OFFSET_TABLE_[0];
> -}
> -
> -
>  /* Return the run-time load address of the shared object.  */
>  static inline ElfW(Addr) __attribute__ ((unused))
>  elf_machine_load_address (void)
>  {
> -  /* Compute the difference between the runtime address of _DYNAMIC as seen
> -     by an IP-relative reference, and the link-time address found in the
> -     special unrelocated first GOT entry.  */
> +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
> +  return (ElfW(Addr)) &__ehdr_start;
> +}
> +
> +/* Return the link-time address of _DYNAMIC.  */
> +static inline ElfW(Addr) __attribute__ ((unused))
> +elf_machine_dynamic (void)
> +{
>    extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
> -  return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
> +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
>  }
>
>  /* Set up the loaded object described by L so its unrelocated PLT
> --
> 2.33.0.rc1.237.g0d66db33f3-goog
>

Please provide comparison of ld.so with and without the change, in
terms of code size, code sequence as well as dynamic relocation.

Thanks.
  
Fangrui Song Aug. 17, 2021, 5:43 p.m. UTC | #2
On 2021-08-17, H.J. Lu wrote:
>On Mon, Aug 16, 2021 at 11:24 PM Fangrui Song via Libc-alpha
><libc-alpha@sourceware.org> wrote:
>>
>> and drop reliance on _GLOBAL_OFFSET_TABLE_[0].
>>
>> &__ehdr_start is a better way to get the load address.
>> ---
>>  sysdeps/x86_64/dl-machine.h | 25 +++++++++----------------
>>  1 file changed, 9 insertions(+), 16 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
>> index a8596aa3fa..ceee50734e 100644
>> --- a/sysdeps/x86_64/dl-machine.h
>> +++ b/sysdeps/x86_64/dl-machine.h
>> @@ -35,27 +35,20 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
>>  }
>>
>>
>> -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
>> -   first element of the GOT.  This must be inlined in a function which
>> -   uses global data.  */
>> -static inline ElfW(Addr) __attribute__ ((unused))
>> -elf_machine_dynamic (void)
>> -{
>> -  /* This produces an IP-relative reloc which is resolved at link time. */
>> -  extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
>> -  return _GLOBAL_OFFSET_TABLE_[0];
>> -}
>> -
>> -
>>  /* Return the run-time load address of the shared object.  */
>>  static inline ElfW(Addr) __attribute__ ((unused))
>>  elf_machine_load_address (void)
>>  {
>> -  /* Compute the difference between the runtime address of _DYNAMIC as seen
>> -     by an IP-relative reference, and the link-time address found in the
>> -     special unrelocated first GOT entry.  */
>> +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
>> +  return (ElfW(Addr)) &__ehdr_start;
>> +}
>> +
>> +/* Return the link-time address of _DYNAMIC.  */
>> +static inline ElfW(Addr) __attribute__ ((unused))
>> +elf_machine_dynamic (void)
>> +{
>>    extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
>> -  return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
>> +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
>>  }
>>
>>  /* Set up the loaded object described by L so its unrelocated PLT
>> --
>> 2.33.0.rc1.237.g0d66db33f3-goog
>>
>
>Please provide comparison of ld.so with and without the change, in
>terms of code size, code sequence as well as dynamic relocation.
>
>Thanks.

Neither form has dynamic relocations.


If both elf_machine_{load_address,dynamic} are emitted out of line, the new
form is three bytes longer but no longer performs any GOT loads.

0000000000000050 <elf_machine_dynamic>:
       50:      	movq	(%rip), %rax            # 0x57 <elf_machine_dynamic+0x7>
		0000000000000053:  R_X86_64_GOTPC32	_GLOBAL_OFFSET_TABLE_-0x4
       57:      	retq
       58:      	nopl	(%rax,%rax)

0000000000000060 <elf_machine_load_address>:
       60:      	leaq	(%rip), %rax            # 0x67 <elf_machine_load_address+0x7>
		0000000000000063:  R_X86_64_PC32	_DYNAMIC-0x4
       67:      	subq	(%rip), %rax            # 0x6e <elf_machine_load_address+0xe>
		000000000000006a:  R_X86_64_GOTPC32	_GLOBAL_OFFSET_TABLE_-0x4
       6e:      	retq

---

0000000000000050 <elf_machine_load_address>:
       50:      	leaq	(%rip), %rax            # 0x57 <elf_machine_load_address+0x7>
		0000000000000053:  R_X86_64_PC32	__ehdr_start-0x4
       57:      	retq
       58:      	nopl	(%rax,%rax)

0000000000000060 <elf_machine_dynamic>:
       60:      	leaq	(%rip), %rdx            # 0x67 <elf_machine_dynamic+0x7>
		0000000000000063:  R_X86_64_PC32	__ehdr_start-0x4
       67:      	leaq	(%rip), %rax            # 0x6e <elf_machine_dynamic+0xe>
		000000000000006a:  R_X86_64_PC32	_DYNAMIC-0x4
       6e:      	subq	%rdx, %rax
       71:      	retq


If inlined, which is what actually happens when GCC generates elf/rtld.os,
the new form is 3 bytes shorter. The difference all but disappears
once an align directive is added.


      cf6:      	shlq	$32, %rdx
      cfa:      	orq	%rdx, %rax
      cfd:      	leaq	(%rip), %rdx            # 0xd04 <_dl_start+0x24>
		0000000000000d00:  R_X86_64_PC32	_DYNAMIC-0x4
      d04:      	movq	%rax, (%rip)            # 0xd0b <_dl_start+0x2b>
		0000000000000d07:  R_X86_64_PC32	.data.rel.ro+0x7c
      d0b:      	movq	(%rip), %rax            # 0xd12 <_dl_start+0x32>
		0000000000000d0e:  R_X86_64_PC32	_DYNAMIC-0x4
      d12:      	movq	%rdx, %r12
      d15:      	subq	(%rip), %r12            # 0xd1c <_dl_start+0x3c>
		0000000000000d18:  R_X86_64_GOTPC32	_GLOBAL_OFFSET_TABLE_-0x4
      d1c:      	movq	%rdx, (%rip)            # 0xd23 <_dl_start+0x43>
		0000000000000d1f:  R_X86_64_PC32	_rtld_local+0xa14
      d23:      	movq	%r12, (%rip)            # 0xd2a <_dl_start+0x4a>
		0000000000000d26:  R_X86_64_PC32	_rtld_local+0xa04
      d2a:      	testq	%rax, %rax
      d2d:      	je	0xdd0 <_dl_start+0xf0>
      d33:      	movl	$1879048191, %edi       # imm = 0x6FFFFFFF
      d38:      	movl	$1879047679, %r10d      # imm = 0x6FFFFDFF
      d3e:      	movl	$1879047935, %ebx       # imm = 0x6FFFFEFF
      d43:      	movl	$1879048001, %r13d      # imm = 0x6FFFFF41
      d49:      	leaq	(%rip), %rcx            # 0xd50 <_dl_start+0x70>
		0000000000000d4c:  R_X86_64_PC32	_rtld_local+0xa44
      d50:      	movl	$1879047733, %r11d      # imm = 0x6FFFFE35
      d56:      	movl	$50, %r9d
      d5c:      	movl	$1879048226, %r8d       # imm = 0x70000022
      d62:      	jmp	0xd82 <_dl_start+0xa2>

---

      cf6:      	leaq	(%rip), %r13            # 0xcfd <_dl_start+0x1d>
		0000000000000cf9:  R_X86_64_PC32	__ehdr_start-0x4
      cfd:      	movq	%r13, (%rip)            # 0xd04 <_dl_start+0x24>
		0000000000000d00:  R_X86_64_PC32	_rtld_local+0xa04
      d04:      	shlq	$32, %rdx
      d08:      	orq	%rdx, %rax
      d0b:      	leaq	(%rip), %rdx            # 0xd12 <_dl_start+0x32>
		0000000000000d0e:  R_X86_64_PC32	_DYNAMIC-0x4
      d12:      	movq	%rax, (%rip)            # 0xd19 <_dl_start+0x39>
		0000000000000d15:  R_X86_64_PC32	.data.rel.ro+0x7c
      d19:      	movq	(%rip), %rax            # 0xd20 <_dl_start+0x40>
		0000000000000d1c:  R_X86_64_PC32	_DYNAMIC-0x4
      d20:      	movq	%rdx, (%rip)            # 0xd27 <_dl_start+0x47>
		0000000000000d23:  R_X86_64_PC32	_rtld_local+0xa14
      d27:      	testq	%rax, %rax
      d2a:      	je	0xdd0 <_dl_start+0xf0>
      d30:      	movl	$1879048191, %edi       # imm = 0x6FFFFFFF
      d35:      	movl	$1879047679, %r10d      # imm = 0x6FFFFDFF
      d3b:      	movl	$1879047935, %ebx       # imm = 0x6FFFFEFF
      d40:      	movl	$1879048001, %r14d      # imm = 0x6FFFFF41
      d46:      	leaq	(%rip), %rcx            # 0xd4d <_dl_start+0x6d>
		0000000000000d49:  R_X86_64_PC32	_rtld_local+0xa44
      d4d:      	movl	$1879047733, %r11d      # imm = 0x6FFFFE35
      d53:      	movl	$50, %r9d
      d59:      	movl	$1879048226, %r8d       # imm = 0x70000022
      d5f:      	jmp	0xd82 <_dl_start+0xa2>


I think the main merit is dropping the reliance on _GLOBAL_OFFSET_TABLE_[0].
(Newer ports can learn from the existing x86-64/aarch64 ports that _GLOBAL_OFFSET_TABLE_[0] does not need any special treatment.)
  
H.J. Lu Aug. 17, 2021, 5:43 p.m. UTC | #3
On Tue, Aug 17, 2021 at 5:14 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Aug 16, 2021 at 11:24 PM Fangrui Song via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > and drop reliance on _GLOBAL_OFFSET_TABLE_[0].
> >
> > &__ehdr_start is a better way to get the load address.
> > ---
> >  sysdeps/x86_64/dl-machine.h | 25 +++++++++----------------
> >  1 file changed, 9 insertions(+), 16 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index a8596aa3fa..ceee50734e 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -35,27 +35,20 @@ elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
> >  }
> >
> >
> > -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
> > -   first element of the GOT.  This must be inlined in a function which
> > -   uses global data.  */
> > -static inline ElfW(Addr) __attribute__ ((unused))
> > -elf_machine_dynamic (void)
> > -{
> > -  /* This produces an IP-relative reloc which is resolved at link time. */
> > -  extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
> > -  return _GLOBAL_OFFSET_TABLE_[0];
> > -}
> > -
> > -
> >  /* Return the run-time load address of the shared object.  */
> >  static inline ElfW(Addr) __attribute__ ((unused))
> >  elf_machine_load_address (void)
> >  {
> > -  /* Compute the difference between the runtime address of _DYNAMIC as seen
> > -     by an IP-relative reference, and the link-time address found in the
> > -     special unrelocated first GOT entry.  */
> > +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
> > +  return (ElfW(Addr)) &__ehdr_start;
> > +}
> > +
> > +/* Return the link-time address of _DYNAMIC.  */
> > +static inline ElfW(Addr) __attribute__ ((unused))
> > +elf_machine_dynamic (void)
> > +{
> >    extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
> > -  return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
> > +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
> >  }
> >
> >  /* Set up the loaded object described by L so its unrelocated PLT
> > --
> > 2.33.0.rc1.237.g0d66db33f3-goog
> >
>
> Please provide comparison of ld.so with and without the change, in
> terms of code size, code sequence as well as dynamic relocation.

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  

Patch

diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index a8596aa3fa..ceee50734e 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -35,27 +35,20 @@  elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
 }
 
 
-/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
-   first element of the GOT.  This must be inlined in a function which
-   uses global data.  */
-static inline ElfW(Addr) __attribute__ ((unused))
-elf_machine_dynamic (void)
-{
-  /* This produces an IP-relative reloc which is resolved at link time. */
-  extern const ElfW(Addr) _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
-  return _GLOBAL_OFFSET_TABLE_[0];
-}
-
-
 /* Return the run-time load address of the shared object.  */
 static inline ElfW(Addr) __attribute__ ((unused))
 elf_machine_load_address (void)
 {
-  /* Compute the difference between the runtime address of _DYNAMIC as seen
-     by an IP-relative reference, and the link-time address found in the
-     special unrelocated first GOT entry.  */
+  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
+  return (ElfW(Addr)) &__ehdr_start;
+}
+
+/* Return the link-time address of _DYNAMIC.  */
+static inline ElfW(Addr) __attribute__ ((unused))
+elf_machine_dynamic (void)
+{
   extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
-  return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
+  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
 }
 
 /* Set up the loaded object described by L so its unrelocated PLT