[PowerPC64] Use medium model toc accesses throughout

Message ID Ye1NJQwEpe7AyNYD@squeak.grove.modra.org
State Superseded
Headers
Series [PowerPC64] Use medium model toc accesses throughout |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Alan Modra Jan. 23, 2022, 12:42 p.m. UTC
  The PowerPC64 linker edits medium model toc-indirect code to toc-pointer
relative:
	addis r9,r2,tc_entry_for_var@toc@ha
	ld r9,tc_entry_for_var@toc@l(r9)
becomes
	addis r9,r2,(var-.TOC.)@ha
	addi r9,r9,(var-.TOC.)@l
when "var" is known to be local to the binary.  This isn't done for
small-model toc-indirect code, because "var" is almost guaranteed to
be too far away from .TOC. for a 16-bit signed offset.  And, because
the analysis of which .toc entry can be removed becomes much more
complicated in objects that mix code models, they aren't removed if
any small-model toc sequence appears in an object file.

Unfortunately glibc's build of ld.so smashes the needed objects
together in a ld -r linking stage.  This means the GOT/TOC is left
with a whole lot of relative relocations which is untidy, but in
itself is not a serious problem.  However, static-pie on powerpc64
bombs due to a segfault caused by one of the small-model accesses
before _dl_relocate_static_pie.  (The very first one in rcrt1.o
passing start_addresses in r8 to __libc_start_main.)

So this patch makes all the toc/got accesses in assembly medium code
model, and a couple of functions hidden.  By itself this is *not*
enough to give us working static-pie, but it is useful anyway to
enable better linker optimisation.

There's a serious problem in libgcc too.  libgcc ifuncs access the
AT_HWCAP words stored in the tcb with an offset from the thread
pointer (r13), but r13 isn't set at the time _dl_relocate_static_pie
runs, and I'm loathe to try calling init_tls early.  A better approach
that might work is to fake r13 so that _dl_hwcap is at the expected
offset where we'd normally find the tcb hwcap words.

Tested for regressions with a powerpc64le-linux build and test run.
OK to apply?
  

Comments

Paul E Murphy Jan. 28, 2022, 5:48 p.m. UTC | #1
On 1/23/22 6:42 AM, Alan Modra via Libc-alpha wrote:
> The PowerPC64 linker edits medium model toc-indirect code to toc-pointer
> relative:
> 	addis r9,r2,tc_entry_for_var@toc@ha
> 	ld r9,tc_entry_for_var@toc@l(r9)
> becomes
> 	addis r9,r2,(var-.TOC.)@ha
> 	addi r9,r9,(var-.TOC.)@l
> when "var" is known to be local to the binary.  This isn't done for
> small-model toc-indirect code, because "var" is almost guaranteed to
> be too far away from .TOC. for a 16-bit signed offset.  And, because
> the analysis of which .toc entry can be removed becomes much more
> complicated in objects that mix code models, they aren't removed if
> any small-model toc sequence appears in an object file.
> 
> Unfortunately glibc's build of ld.so smashes the needed objects
> together in a ld -r linking stage.  This means the GOT/TOC is left
> with a whole lot of relative relocations which is untidy, but in
> itself is not a serious problem.  However, static-pie on powerpc64
> bombs due to a segfault caused by one of the small-model accesses
> before _dl_relocate_static_pie.  (The very first one in rcrt1.o
> passing start_addresses in r8 to __libc_start_main.)
> 
> So this patch makes all the toc/got accesses in assembly medium code
> model, and a couple of functions hidden.  By itself this is *not*
> enough to give us working static-pie, but it is useful anyway to
> enable better linker optimisation.
> 
> There's a serious problem in libgcc too.  libgcc ifuncs access the
> AT_HWCAP words stored in the tcb with an offset from the thread
> pointer (r13), but r13 isn't set at the time _dl_relocate_static_pie
> runs, and I'm loathe to try calling init_tls early.  A better approach
> that might work is to fake r13 so that _dl_hwcap is at the expected
> offset where we'd normally find the tcb hwcap words.
> 
> Tested for regressions with a powerpc64le-linux build and test run.
> OK to apply?
> 

> diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
> index 23debc2faf..45b821607b 100644
> --- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
> +++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
> @@ -32,6 +32,7 @@
>      because gcc as of 2010/05 doesn't allocate a proper stack frame for
>      a function that makes no calls except for __tls_get_addr and we
>      might be here resolving the __tls_get_addr call.  */
> +	.hidden _dl_runtime_resolve
>   #define INT_PARMS FRAME_MIN_SIZE
>   ENTRY (_dl_runtime_resolve, 4)
>   	stdu	r1,-FRAME_SIZE(r1)
> @@ -195,6 +196,7 @@ END(_dl_runtime_resolve)
>      parm1 (r3) and the index (r0) needs to be converted to an offset
>      (index * 24) in parm2 (r4).  */
>   #ifndef PROF
> +	.hidden _dl_profile_resolve
>   ENTRY (_dl_profile_resolve, 4)
>   /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we
>      need to call _dl_audit_pltexit.  */

LGTM (this should wait until after the freeze), though these two changes 
seem orthogonal to the commit title.

For my education, why are are small model accesses used in these files?
  
Alan Modra Jan. 29, 2022, 1:24 a.m. UTC | #2
On Fri, Jan 28, 2022 at 11:48:44AM -0600, Paul E Murphy wrote:
> > --- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
> > +++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
> > @@ -32,6 +32,7 @@
> >      because gcc as of 2010/05 doesn't allocate a proper stack frame for
> >      a function that makes no calls except for __tls_get_addr and we
> >      might be here resolving the __tls_get_addr call.  */
> > +	.hidden _dl_runtime_resolve
> >   #define INT_PARMS FRAME_MIN_SIZE
> >   ENTRY (_dl_runtime_resolve, 4)
> >   	stdu	r1,-FRAME_SIZE(r1)
> > @@ -195,6 +196,7 @@ END(_dl_runtime_resolve)
> >      parm1 (r3) and the index (r0) needs to be converted to an offset
> >      (index * 24) in parm2 (r4).  */
> >   #ifndef PROF
> > +	.hidden _dl_profile_resolve
> >   ENTRY (_dl_profile_resolve, 4)
> >   /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we
> >      need to call _dl_audit_pltexit.  */
> 
> LGTM (this should wait until after the freeze), though these two changes
> seem orthogonal to the commit title.

True.  It could have been a separate patch.

> For my education, why are are small model accesses used in these files?

dl-trampoline.S is an old file, from before the linker edited
toc-indirect code to toc-pointer relative.  Without the link editing
small model is more efficient if the toc is small enough.
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/__longjmp-common.S b/sysdeps/powerpc/powerpc64/__longjmp-common.S
index 4d71b9e102..5f629e1e0f 100644
--- a/sysdeps/powerpc/powerpc64/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/__longjmp-common.S
@@ -47,12 +47,14 @@ 
 ENTRY (__longjmp)
 	CALL_MCOUNT 2
 #ifndef __NO_VMX__
-	ld    r5,.LC__dl_hwcap@toc(r2)
+	addis	r5,r2,.LC__dl_hwcap@toc@ha
+	ld	r5,.LC__dl_hwcap@toc@l(r5)
 # ifdef SHARED
 	/* Load _rtld-global._dl_hwcap.  */
-	ld    r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r5)
+	ld	r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r5)
 # else
-	ld    r5,0(r5) /* Load extern _dl_hwcap.  */
+	/* Load extern _dl_hwcap.  */
+	ld	r5,0(r5)
 # endif
 	andis.  r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index a505998351..eb41c85280 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -175,9 +175,12 @@  BODY_PREFIX "_dl_start_user:\n"						\
 /* the address of _start in r30.  */					\
 "	mr	30,3\n"							\
 /* &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28.  */		\
-"	ld	28,.LC__rtld_local@toc(2)\n"				\
-"	ld	29,.LC__dl_argc@toc(2)\n"				\
-"	ld	27,.LC__dl_argv@toc(2)\n"				\
+"	addis	28,2,.LC__rtld_local@toc@ha\n"				\
+"	ld	28,.LC__rtld_local@toc@l(28)\n"				\
+"	addis	29,2,.LC__dl_argc@toc@ha\n"				\
+"	ld	29,.LC__dl_argc@toc@l(29)\n"				\
+"	addis	27,2,.LC__dl_argv@toc@ha\n"				\
+"	ld	27,.LC__dl_argv@toc@l(27)\n"				\
 /* _dl_init (_dl_loaded, _dl_argc, _dl_argv, _dl_argv+_dl_argc+1).  */	\
 "	ld	3,0(28)\n"						\
 "	lwa	4,0(29)\n"						\
@@ -204,7 +207,8 @@  BODY_PREFIX "_dl_start_user:\n"						\
 "	addi	6,6,8\n"						\
 /* Pass a termination function pointer (in this case _dl_fini) in	\
    r7.  */								\
-"	ld	7,.LC__dl_fini@toc(2)\n"				\
+"	addis	7,2,.LC__dl_fini@toc@ha\n"				\
+"	ld	7,.LC__dl_fini@toc@l(7)\n"				\
 /* Pass the stack pointer in r1 (so far so good), pointing to a NULL	\
    value.  This lets our startup code distinguish between a program	\
    linked statically, which linux will call with argc on top of the	\
diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
index 23debc2faf..45b821607b 100644
--- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
+++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
@@ -32,6 +32,7 @@ 
    because gcc as of 2010/05 doesn't allocate a proper stack frame for
    a function that makes no calls except for __tls_get_addr and we
    might be here resolving the __tls_get_addr call.  */
+	.hidden _dl_runtime_resolve
 #define INT_PARMS FRAME_MIN_SIZE
 ENTRY (_dl_runtime_resolve, 4)
 	stdu	r1,-FRAME_SIZE(r1)
@@ -195,6 +196,7 @@  END(_dl_runtime_resolve)
    parm1 (r3) and the index (r0) needs to be converted to an offset
    (index * 24) in parm2 (r4).  */
 #ifndef PROF
+	.hidden _dl_profile_resolve
 ENTRY (_dl_profile_resolve, 4)
 /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we
    need to call _dl_audit_pltexit.  */
@@ -225,12 +227,14 @@  ENTRY (_dl_profile_resolve, 4)
 	std	r9,INT_PARMS+48(r1)
 	std	r10,INT_PARMS+56(r1)
 	std	r8,CALLING_SP(r1)
-	ld	r12,.LC__dl_hwcap@toc(r2)
+	addis   r12,r2,.LC__dl_hwcap@toc@ha
+	ld	r12,.LC__dl_hwcap@toc@l(r12)
 #ifdef SHARED
 	/* Load _rtld_local_ro._dl_hwcap.  */
 	ld	r12,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r12)
 #else
-	ld	r12,0(r12) /* Load extern _dl_hwcap.  */
+	/* Load extern _dl_hwcap.  */
+	ld	r12,0(r12)
 #endif
 	andis.  r0,r12,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(saveFP)
diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S
index 41812e3427..19e76d59ee 100644
--- a/sysdeps/powerpc/powerpc64/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/setjmp-common.S
@@ -132,12 +132,14 @@  JUMPTARGET(GLUE(__sigsetjmp_symbol,_ent)):
 	std  r31,((JB_GPRS+17)*8)(3)
 	stfd fp31,((JB_FPRS+17)*8)(3)
 #ifndef __NO_VMX__
-	ld    r6,.LC__dl_hwcap@toc(r2)
+	addis	r6,r2,.LC__dl_hwcap@toc@ha
+	ld	r6,.LC__dl_hwcap@toc@l(r6)
 # ifdef SHARED
 	/* Load _rtld-global._dl_hwcap.  */
-	ld    r6,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r6)
+	ld	r6,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r6)
 # else
-	ld    r6,0(r6) /* Load extern _dl_hwcap.  */
+	/* Load extern _dl_hwcap.  */
+	ld	r6,0(r6)
 # endif
 	andis.  r6,r6,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
diff --git a/sysdeps/powerpc/powerpc64/start.S b/sysdeps/powerpc/powerpc64/start.S
index 4319dc8d3e..244d9da07b 100644
--- a/sysdeps/powerpc/powerpc64/start.S
+++ b/sysdeps/powerpc/powerpc64/start.S
@@ -74,7 +74,8 @@  ENTRY (_start)
 
  /* put the address of start_addresses in r8...  **
 ** PPC64 ABI uses R13 for thread local, so we leave it alone */
-	ld	r8,.L01@toc(r2)
+	addis	r8,r2,.L01@toc@ha
+	ld	r8,.L01@toc@l(r8)
 
  /* and continue in libc-start, in glibc.  */
 	b	JUMPTARGET(__libc_start_main)
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index 3fec06e0df..011068b290 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -469,14 +469,16 @@  LT_LABELSUFFIX(name,_name_end): ; \
 	.tc _rtld_global_ro[TC],_rtld_global_ro
 # endif
 # define __GLRO(rOUT, var, offset)		\
-	ld	rOUT,.LC__ ## var@toc(r2);	\
+	addis	rOUT,r2,.LC__ ## var@toc@ha;	\
+	ld	rOUT,.LC__ ## var@toc@l(rOUT);	\
 	lwz	rOUT,offset(rOUT)
 #else
 # define __GLRO_DEF(var)			\
 .LC__ ## var:					\
 	.tc _ ## var[TC],_ ## var
 # define __GLRO(rOUT, var, offset)		\
-	ld	rOUT,.LC__ ## var@toc(r2);	\
+	addis	rOUT,r2,.LC__ ## var@toc@ha;	\
+	ld	rOUT,.LC__ ## var@toc@l(rOUT);	\
 	lwz	rOUT,0(rOUT)
 #endif