Always indirect branch to __libc_start_main via GOT

Message ID 20160609003757.GA19143@intel.com
State New, archived
Headers

Commit Message

Lu, Hongjiu June 9, 2016, 12:37 a.m. UTC
  Since __libc_start_main in libc.so is called very early, lazy binding
isn't relevant.  Always call __libc_start_main with an indirect branch
via the GOT to avoid an extra branch to the PLT slot.  For a static
executable, ld in binutils 2.26 or above can convert the indirect branch
into a direct branch:

0000000000400a80 <_start>:
  400a80:       31 ed                   xor    %ebp,%ebp
  400a82:       49 89 d1                mov    %rdx,%r9
  400a85:       5e                      pop    %rsi
  400a86:       48 89 e2                mov    %rsp,%rdx
  400a89:       48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
  400a8d:       50                      push   %rax
  400a8e:       54                      push   %rsp
  400a8f:       49 c7 c0 20 1b 40 00    mov    $0x401b20,%r8
  400a96:       48 c7 c1 90 1a 40 00    mov    $0x401a90,%rcx
  400a9d:       48 c7 c7 c0 03 40 00    mov    $0x4003c0,%rdi
  400aa4:       67 e8 96 09 00 00       addr32 callq 401440 <__libc_start_main>
  400aaa:       f4                      hlt

Tested on x86-64.  OK for master?


H.J.
---
	* sysdeps/x86_64/start.S (_start): Always indirect branch to
	__libc_start_main via GOT.
---
 sysdeps/x86_64/start.S | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)
  

Comments

Carlos O'Donell June 9, 2016, 3:03 a.m. UTC | #1
On 06/08/2016 08:37 PM, H.J. Lu wrote:
> Since __libc_start_main in libc.so is called very early, lazy binding
> isn't relevant.  Always call __libc_start_main with indirect branch via
> GOT to avoid extra branch to PLT slot.  In case of static executable,
> ld in binutils 2.26 or above can convert indirect branch into direct
> branch:
> 
> 0000000000400a80 <_start>:
>   400a80:       31 ed                   xor    %ebp,%ebp
>   400a82:       49 89 d1                mov    %rdx,%r9
>   400a85:       5e                      pop    %rsi
>   400a86:       48 89 e2                mov    %rsp,%rdx
>   400a89:       48 83 e4 f0             and    $0xfffffffffffffff0,%rsp
>   400a8d:       50                      push   %rax
>   400a8e:       54                      push   %rsp
>   400a8f:       49 c7 c0 20 1b 40 00    mov    $0x401b20,%r8
>   400a96:       48 c7 c1 90 1a 40 00    mov    $0x401a90,%rcx
>   400a9d:       48 c7 c7 c0 03 40 00    mov    $0x4003c0,%rdi
>   400aa4:       67 e8 96 09 00 00       addr32 callq 401440 <__libc_start_main>
>   400aaa:       f4                      hlt
> 
> Tested on x86-64.  OK for master?
 
Looks good to me.

I saw this particular instance while reviewing your binutils patches
to enable the same optimization.
 
> H.J.
> ---
> 	* sysdeps/x86_64/start.S (_start): Always indirect branch to
> 	__libc_start_main via GOT.
> ---
>  sysdeps/x86_64/start.S | 17 ++++++++---------
>  1 file changed, 8 insertions(+), 9 deletions(-)
> 
> diff --git a/sysdeps/x86_64/start.S b/sysdeps/x86_64/start.S
> index 2369b69..f1b961f 100644
> --- a/sysdeps/x86_64/start.S
> +++ b/sysdeps/x86_64/start.S
> @@ -102,23 +102,22 @@ ENTRY (_start)
>  	mov __libc_csu_init@GOTPCREL(%rip), %RCX_LP
>  
>  	mov main@GOTPCREL(%rip), %RDI_LP
> -
> -	/* Call the user's main function, and exit with its value.
> -	   But let the libc call main.  Since __libc_start_main is
> -	   called very early, lazy binding isn't relevant here.  Use
> -	   indirect branch via GOT to avoid extra branch to PLT slot.  */
> -	call *__libc_start_main@GOTPCREL(%rip)
>  #else
>  	/* Pass address of our own entry points to .fini and .init.  */
>  	mov $__libc_csu_fini, %R8_LP
>  	mov $__libc_csu_init, %RCX_LP
>  
>  	mov $main, %RDI_LP
> +#endif
>  
>  	/* Call the user's main function, and exit with its value.
> -	   But let the libc call main.	  */
> -	call __libc_start_main
> -#endif
> +	   But let the libc call main.  Since __libc_start_main in
> +	   libc.so is called very early, lazy binding isn't relevant
> +	   here.  Use indirect branch via GOT to avoid extra branch
> +	   to PLT slot.  In case of static executable, ld in binutils
> +	   2.26 or above can convert indirect branch into direct
> +	   branch.  */
> +	call *__libc_start_main@GOTPCREL(%rip)
>  
>  	hlt			/* Crash if somehow `exit' does return.	 */
>  END (_start)
>
  

Patch

diff --git a/sysdeps/x86_64/start.S b/sysdeps/x86_64/start.S
index 2369b69..f1b961f 100644
--- a/sysdeps/x86_64/start.S
+++ b/sysdeps/x86_64/start.S
@@ -102,23 +102,22 @@  ENTRY (_start)
 	mov __libc_csu_init@GOTPCREL(%rip), %RCX_LP
 
 	mov main@GOTPCREL(%rip), %RDI_LP
-
-	/* Call the user's main function, and exit with its value.
-	   But let the libc call main.  Since __libc_start_main is
-	   called very early, lazy binding isn't relevant here.  Use
-	   indirect branch via GOT to avoid extra branch to PLT slot.  */
-	call *__libc_start_main@GOTPCREL(%rip)
 #else
 	/* Pass address of our own entry points to .fini and .init.  */
 	mov $__libc_csu_fini, %R8_LP
 	mov $__libc_csu_init, %RCX_LP
 
 	mov $main, %RDI_LP
+#endif
 
 	/* Call the user's main function, and exit with its value.
-	   But let the libc call main.	  */
-	call __libc_start_main
-#endif
+	   But let the libc call main.  Since __libc_start_main in
+	   libc.so is called very early, lazy binding isn't relevant
+	   here.  Use indirect branch via GOT to avoid extra branch
+	   to PLT slot.  In case of static executable, ld in binutils
+	   2.26 or above can convert indirect branch into direct
+	   branch.  */
+	call *__libc_start_main@GOTPCREL(%rip)
 
 	hlt			/* Crash if somehow `exit' does return.	 */
 END (_start)