i386: Port elf_machine_{load_address,dynamic} from x86-64

Message ID 20210924042623.3899762-1-maskray@google.com
State Superseded
Headers
Series i386: Port elf_machine_{load_address,dynamic} from x86-64 |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Fangrui Song Sept. 24, 2021, 4:26 a.m. UTC
  This drops reliance on _GLOBAL_OFFSET_TABLE_[0] being the link-time
address of _DYNAMIC.

The code sequence length does not change.
---
 sysdeps/i386/dl-machine.h | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)
  

Comments

H.J. Lu Sept. 24, 2021, 6:20 a.m. UTC | #1
On Thu, Sep 23, 2021 at 9:26 PM Fangrui Song <maskray@google.com> wrote:
>
> This drops reliance on _GLOBAL_OFFSET_TABLE_[0] being the link-time
> address of _DYNAMIC.
>
> The code sequence length does not change.
> ---
>  sysdeps/i386/dl-machine.h | 29 +++++++++++------------------
>  1 file changed, 11 insertions(+), 18 deletions(-)
>
> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> index 590b41d8d7..9f0eeaf66e 100644
> --- a/sysdeps/i386/dl-machine.h
> +++ b/sysdeps/i386/dl-machine.h
> @@ -34,27 +34,20 @@ elf_machine_matches_host (const Elf32_Ehdr *ehdr)
>  }
>
>
> -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
> -   first element of the GOT, a special entry that is never relocated.  */
> -static inline Elf32_Addr __attribute__ ((unused, const))
> -elf_machine_dynamic (void)
> -{
> -  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
> -     fact just loads from the GOT register directly.  By doing it without
> -     an asm we can let the compiler choose any register.  */
> -  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
> -  return _GLOBAL_OFFSET_TABLE_[0];
> -}
> -
>  /* Return the run-time load address of the shared object.  */
> -static inline Elf32_Addr __attribute__ ((unused))
> +static inline ElfW(Addr) __attribute__ ((unused))
>  elf_machine_load_address (void)
>  {
> -  /* Compute the difference between the runtime address of _DYNAMIC as seen
> -     by a GOTOFF reference, and the link-time address found in the special
> -     unrelocated first GOT entry.  */
> -  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
> -  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
> +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
> +  return (ElfW(Addr)) &__ehdr_start;
> +}
> +
> +/* Return the link-time address of _DYNAMIC.  */
> +static inline ElfW(Addr) __attribute__ ((unused))
> +elf_machine_dynamic (void)
> +{
> +  extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
> +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
>  }
>
>  /* Set up the loaded object described by L so its unrelocated PLT
> --
> 2.33.0.685.g46640cef36-goog
>

What are the code differences before and after?
  
Fangrui Song Sept. 24, 2021, 6:55 a.m. UTC | #2
On 2021-09-23, H.J. Lu wrote:
>On Thu, Sep 23, 2021 at 9:26 PM Fangrui Song <maskray@google.com> wrote:
>>
>> This drops reliance on _GLOBAL_OFFSET_TABLE_[0] being the link-time
>> address of _DYNAMIC.
>>
>> The code sequence length does not change.
>> ---
>>  sysdeps/i386/dl-machine.h | 29 +++++++++++------------------
>>  1 file changed, 11 insertions(+), 18 deletions(-)
>>
>> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
>> index 590b41d8d7..9f0eeaf66e 100644
>> --- a/sysdeps/i386/dl-machine.h
>> +++ b/sysdeps/i386/dl-machine.h
>> @@ -34,27 +34,20 @@ elf_machine_matches_host (const Elf32_Ehdr *ehdr)
>>  }
>>
>>
>> -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
>> -   first element of the GOT, a special entry that is never relocated.  */
>> -static inline Elf32_Addr __attribute__ ((unused, const))
>> -elf_machine_dynamic (void)
>> -{
>> -  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
>> -     fact just loads from the GOT register directly.  By doing it without
>> -     an asm we can let the compiler choose any register.  */
>> -  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
>> -  return _GLOBAL_OFFSET_TABLE_[0];
>> -}
>> -
>>  /* Return the run-time load address of the shared object.  */
>> -static inline Elf32_Addr __attribute__ ((unused))
>> +static inline ElfW(Addr) __attribute__ ((unused))
>>  elf_machine_load_address (void)
>>  {
>> -  /* Compute the difference between the runtime address of _DYNAMIC as seen
>> -     by a GOTOFF reference, and the link-time address found in the special
>> -     unrelocated first GOT entry.  */
>> -  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
>> -  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
>> +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
>> +  return (ElfW(Addr)) &__ehdr_start;
>> +}
>> +
>> +/* Return the link-time address of _DYNAMIC.  */
>> +static inline ElfW(Addr) __attribute__ ((unused))
>> +elf_machine_dynamic (void)
>> +{
>> +  extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
>> +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
>>  }
>>
>>  /* Set up the loaded object described by L so its unrelocated PLT
>> --
>> 2.33.0.685.g46640cef36-goog
>>
>
>what are the code differences before and after?

/* Return the address of the symbol __ehdr_start, converted to long.
   The symbol is declared locally with hidden visibility; in the i386
   listing that accompanies this snippet the compiler forms the address
   with a single GOT-relative `leal', i.e. without a memory load.  */
long
ehdr (void)
{
  extern char __ehdr_start[] __attribute__ ((visibility ("hidden")));
  const char *header_start = __ehdr_start;

  return (long) header_start;
}

/* Return the value stored in the first slot of _GLOBAL_OFFSET_TABLE_.
   Unlike taking a symbol's address, this reads the slot's contents, so
   the compiler has to emit a memory load (the `movl' in the i386
   listing that accompanies this snippet).  */
long
got (void)
{
  extern long _GLOBAL_OFFSET_TABLE_[] __attribute__ ((visibility ("hidden")));

  return *_GLOBAL_OFFSET_TABLE_;
}


ehdr:
         call    __x86.get_pc_thunk.ax
         addl    $_GLOBAL_OFFSET_TABLE_, %eax
         leal    __ehdr_start@GOTOFF(%eax), %eax
         ret

got:
         call    __x86.get_pc_thunk.ax
         addl    $_GLOBAL_OFFSET_TABLE_, %eax
         movl    _GLOBAL_OFFSET_TABLE_@GOTOFF(%eax), %eax
         ret


In the GCC-generated elf/rtld.os, the local code sequence that references __ehdr_start/_GLOBAL_OFFSET_TABLE_
does not change in size, but globally the new code triggers some code motion
that ends up making the file smaller.

FWIW ld.so:.text is 48 bytes smaller.

The new code has no memory load. I guess that may allow GCC to optimize more.
  
H.J. Lu Sept. 24, 2021, 4:05 p.m. UTC | #3
On Thu, Sep 23, 2021 at 9:26 PM Fangrui Song <maskray@google.com> wrote:
>
> This drops reliance on _GLOBAL_OFFSET_TABLE_[0] being the link-time
> address of _DYNAMIC.
>
> The code sequence length does not change.
> ---
>  sysdeps/i386/dl-machine.h | 29 +++++++++++------------------
>  1 file changed, 11 insertions(+), 18 deletions(-)
>
> diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
> index 590b41d8d7..9f0eeaf66e 100644
> --- a/sysdeps/i386/dl-machine.h
> +++ b/sysdeps/i386/dl-machine.h
> @@ -34,27 +34,20 @@ elf_machine_matches_host (const Elf32_Ehdr *ehdr)
>  }
>
>
> -/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
> -   first element of the GOT, a special entry that is never relocated.  */
> -static inline Elf32_Addr __attribute__ ((unused, const))
> -elf_machine_dynamic (void)
> -{
> -  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
> -     fact just loads from the GOT register directly.  By doing it without
> -     an asm we can let the compiler choose any register.  */
> -  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
> -  return _GLOBAL_OFFSET_TABLE_[0];
> -}
> -
>  /* Return the run-time load address of the shared object.  */
> -static inline Elf32_Addr __attribute__ ((unused))
> +static inline ElfW(Addr) __attribute__ ((unused))
>  elf_machine_load_address (void)
>  {
> -  /* Compute the difference between the runtime address of _DYNAMIC as seen
> -     by a GOTOFF reference, and the link-time address found in the special
> -     unrelocated first GOT entry.  */
> -  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
> -  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
> +  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
> +  return (ElfW(Addr)) &__ehdr_start;
> +}
> +
> +/* Return the link-time address of _DYNAMIC.  */
> +static inline ElfW(Addr) __attribute__ ((unused))
> +elf_machine_dynamic (void)
> +{
> +  extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
> +  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
>  }

Please use Elf32 instead of ElfW.

>  /* Set up the loaded object described by L so its unrelocated PLT
> --
> 2.33.0.685.g46640cef36-goog
>
  

Patch

diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index 590b41d8d7..9f0eeaf66e 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -34,27 +34,20 @@  elf_machine_matches_host (const Elf32_Ehdr *ehdr)
 }
 
 
-/* Return the link-time address of _DYNAMIC.  Conveniently, this is the
-   first element of the GOT, a special entry that is never relocated.  */
-static inline Elf32_Addr __attribute__ ((unused, const))
-elf_machine_dynamic (void)
-{
-  /* This produces a GOTOFF reloc that resolves to zero at link time, so in
-     fact just loads from the GOT register directly.  By doing it without
-     an asm we can let the compiler choose any register.  */
-  extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden;
-  return _GLOBAL_OFFSET_TABLE_[0];
-}
-
 /* Return the run-time load address of the shared object.  */
-static inline Elf32_Addr __attribute__ ((unused))
+static inline ElfW(Addr) __attribute__ ((unused))
 elf_machine_load_address (void)
 {
-  /* Compute the difference between the runtime address of _DYNAMIC as seen
-     by a GOTOFF reference, and the link-time address found in the special
-     unrelocated first GOT entry.  */
-  extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden;
-  return (Elf32_Addr) &bygotoff - elf_machine_dynamic ();
+  extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
+  return (ElfW(Addr)) &__ehdr_start;
+}
+
+/* Return the link-time address of _DYNAMIC.  */
+static inline ElfW(Addr) __attribute__ ((unused))
+elf_machine_dynamic (void)
+{
+  extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
+  return (ElfW(Addr)) _DYNAMIC - elf_machine_load_address ();
 }
 
 /* Set up the loaded object described by L so its unrelocated PLT