Patchwork [2/2] aarch64: handle STO_AARCH64_VARIANT_PCS

login
register
mail settings
Submitter Szabolcs Nagy
Date May 23, 2019, 11:21 a.m.
Message ID <35229f84-57b4-5ca3-badb-82926bb6bb74@arm.com>
Download mbox | patch
Permalink /patch/32827/
State New
Headers show

Comments

Szabolcs Nagy - May 23, 2019, 11:21 a.m.
Avoid lazy binding of symbols that may follow a variant PCS with different
register usage convention from the base PCS.

Currently the lazy binding entry code does not preserve all the registers
required for AdvSIMD and SVE vector calls.  Saving and restoring all
registers unconditionally may break existing binaries, even if they never
use vector calls, because of the larger stack requirement for lazy
resolution, which can be significant on an SVE system.

The solution is to mark all symbols in the symbol table that may follow
a variant PCS so the dynamic linker can handle them specially.  In this
patch such symbols are always resolved at load time, not lazily.

So currently LD_AUDIT for variant PCS symbols is not supported; for that
the _dl_runtime_profile entry needs to be changed e.g. to unconditionally
save/restore all registers (but pass down arg and retval registers to
pltentry/exit callbacks according to the base PCS).

This patch also removes a __builtin_expect from the modified code because
the branch prediction hint did not seem useful.

2019-05-22  Szabolcs Nagy  <szabolcs.nagy@arm.com>

	* sysdeps/aarch64/dl-dtprocnum.h: New file.
	* sysdeps/aarch64/dl-machine.h (DT_AARCH64): Define.
	(elf_machine_runtime_setup): Handle DT_AARCH64_VARIANT_PCS.
	(elf_machine_lazy_rel): Check STO_AARCH64_VARIANT_PCS and bind such
	symbols at load time.
	* sysdeps/aarch64/linkmap.h (struct link_map_machine): Add variant_pcs.
Florian Weimer - May 23, 2019, 11:35 a.m.
* Szabolcs Nagy:

>    /* Check for unexpected PLT reloc type.  */
>    if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
>      {
> -      if (__builtin_expect (map->l_mach.plt, 0) == 0)
> -	*reloc_addr += l_addr;
> -      else
> -	*reloc_addr = map->l_mach.plt;
> +      if (map->l_mach.plt == 0)
> +	{
> +	  /* Prelinking.  */
> +	  *reloc_addr += l_addr;
> +	  return;
> +	}
> +
> +      if (__glibc_unlikely (map->l_mach.variant_pcs))
> +	{
> +	  /* Check the symbol table for variant PCS symbols.  */
> +	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);
> +	  const ElfW (Sym) *symtab =
> +	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);
> +	  const ElfW (Sym) *sym = &symtab[symndx];
> +	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))
> +	    {
> +	      /* Avoid lazy resolution of variant PCS symbols.  */
> +	      const struct r_found_version *version = NULL;
> +	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)
> +		{
> +		  const ElfW (Half) *vernum =
> +		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);
> +		  version = &map->l_versions[vernum[symndx] & 0x7fff];
> +		}
> +	      elf_machine_rela (map, reloc, sym, version, reloc_addr,
> +				skip_ifunc);
> +	      return;
> +	    }
> +	}
> +
> +      *reloc_addr = map->l_mach.plt;

Looking at this, I wonder if it would be more natural to handle this as
a new relocation type.  This way, you will get an error from old dynamic
linkers.

Thanks,
Florian
Szabolcs Nagy - May 23, 2019, 12:31 p.m.
On 23/05/2019 12:35, Florian Weimer wrote:
> * Szabolcs Nagy:

> 

>>    /* Check for unexpected PLT reloc type.  */

>>    if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))

>>      {

>> -      if (__builtin_expect (map->l_mach.plt, 0) == 0)

>> -	*reloc_addr += l_addr;

>> -      else

>> -	*reloc_addr = map->l_mach.plt;

>> +      if (map->l_mach.plt == 0)

>> +	{

>> +	  /* Prelinking.  */

>> +	  *reloc_addr += l_addr;

>> +	  return;

>> +	}

>> +

>> +      if (__glibc_unlikely (map->l_mach.variant_pcs))

>> +	{

>> +	  /* Check the symbol table for variant PCS symbols.  */

>> +	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);

>> +	  const ElfW (Sym) *symtab =

>> +	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);

>> +	  const ElfW (Sym) *sym = &symtab[symndx];

>> +	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))

>> +	    {

>> +	      /* Avoid lazy resolution of variant PCS symbols.  */

>> +	      const struct r_found_version *version = NULL;

>> +	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)

>> +		{

>> +		  const ElfW (Half) *vernum =

>> +		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);

>> +		  version = &map->l_versions[vernum[symndx] & 0x7fff];

>> +		}

>> +	      elf_machine_rela (map, reloc, sym, version, reloc_addr,

>> +				skip_ifunc);

>> +	      return;

>> +	    }

>> +	}

>> +

>> +      *reloc_addr = map->l_mach.plt;

> 

> Looking at this, I wonder if it would be more natural to handle this as

> a new relocation type.  This way, you will get an error from old dynamic

> linkers.


yes, but that's a more intrusive change.

R_*_JUMP_SLOT is handled specially at various places
which require changes if we introduce a new dynamic
reloc.

the symbol marking is still required: static linker
needs to know which symbols are special (could be
done with static relocations, but then almost all
relocations would need a 'variant pcs' alternative
which is a lot of changes in static linkers)

so it seemed that DT_ tag + st_other marking is
minimally intrusive and sufficient, but i agree
that an R_* dynamic reloc + st_other marking
would be another feasible option (i didnt prototype
that so i cant tell how much more work that is).
Szabolcs Nagy - June 4, 2019, 11:39 a.m.
On 23/05/2019 13:31, Szabolcs Nagy wrote:
> On 23/05/2019 12:35, Florian Weimer wrote:

>> * Szabolcs Nagy:

>>

>>>    /* Check for unexpected PLT reloc type.  */

>>>    if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))

>>>      {

>>> -      if (__builtin_expect (map->l_mach.plt, 0) == 0)

>>> -	*reloc_addr += l_addr;

>>> -      else

>>> -	*reloc_addr = map->l_mach.plt;

>>> +      if (map->l_mach.plt == 0)

>>> +	{

>>> +	  /* Prelinking.  */

>>> +	  *reloc_addr += l_addr;

>>> +	  return;

>>> +	}

>>> +

>>> +      if (__glibc_unlikely (map->l_mach.variant_pcs))

>>> +	{

>>> +	  /* Check the symbol table for variant PCS symbols.  */

>>> +	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);

>>> +	  const ElfW (Sym) *symtab =

>>> +	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);

>>> +	  const ElfW (Sym) *sym = &symtab[symndx];

>>> +	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))

>>> +	    {

>>> +	      /* Avoid lazy resolution of variant PCS symbols.  */

>>> +	      const struct r_found_version *version = NULL;

>>> +	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)

>>> +		{

>>> +		  const ElfW (Half) *vernum =

>>> +		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);

>>> +		  version = &map->l_versions[vernum[symndx] & 0x7fff];

>>> +		}

>>> +	      elf_machine_rela (map, reloc, sym, version, reloc_addr,

>>> +				skip_ifunc);

>>> +	      return;

>>> +	    }

>>> +	}

>>> +

>>> +      *reloc_addr = map->l_mach.plt;

>>

>> Looking at this, I wonder if it would be more natural to handle this as

>> a new relocation type.  This way, you will get an error from old dynamic

>> linkers.

> 

> yes, but that's a more intrusive change.


to expand a bit: i want old dynamic linkers to
continue to work if they do not use lazy binding.

currently only glibc supports lazy binding as far as
i can tell and even there you can force it to be off
(in which case a new binary with variant pcs calls works
with an old dyn linker).

unfortunately i don't see an easy way to error out in
an old dynamic linker just in case of lazy binding.

a new dyn reloc seems to be more in spirit of the elf
spec and means nicer code (the reloc type is enough
instead of using the reloc sym index to check the symtab),
but i think that is not good enough argument to break
bw compat.

i think the abi can be still changed if it is important to
error out when variant pcs calls are not supported, but
that means it's impossible to run new code on an old system
even with adequate workarounds in place (-Wl,-z,now linking
or no lazy binding support in libc).

do you see other problems with not using a separate dyn reloc?

> 

> R_*_JUMP_SLOT is handled specially at various places

> which require changes if we introduce a new dynamic

> reloc.

> 

> the symbol marking is still required: static linker

> needs to know which symbols are special (could be

> done with static relocations, but then almost all

> relocations would need a 'variant pcs' alternative

> which is a lot of changes in static linkers)

> 

> so it seemed that DT_ tag + st_other marking is

> minimally intrusive and sufficient, but i agree

> that and R_* dynamic reloc + st_other marking

> would be another feasible option (i didnt prototype

> that so i cant tell how much more work that is).

>
Szabolcs Nagy - June 4, 2019, 5:17 p.m.
On 04/06/2019 12:39, Szabolcs Nagy wrote:
> On 23/05/2019 13:31, Szabolcs Nagy wrote:

>> On 23/05/2019 12:35, Florian Weimer wrote:

>>> * Szabolcs Nagy:

>>>

>>>>    /* Check for unexpected PLT reloc type.  */

>>>>    if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))

>>>>      {

>>>> -      if (__builtin_expect (map->l_mach.plt, 0) == 0)

>>>> -	*reloc_addr += l_addr;

>>>> -      else

>>>> -	*reloc_addr = map->l_mach.plt;

>>>> +      if (map->l_mach.plt == 0)

>>>> +	{

>>>> +	  /* Prelinking.  */

>>>> +	  *reloc_addr += l_addr;

>>>> +	  return;

>>>> +	}

>>>> +

>>>> +      if (__glibc_unlikely (map->l_mach.variant_pcs))

>>>> +	{

>>>> +	  /* Check the symbol table for variant PCS symbols.  */

>>>> +	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);

>>>> +	  const ElfW (Sym) *symtab =

>>>> +	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);

>>>> +	  const ElfW (Sym) *sym = &symtab[symndx];

>>>> +	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))

>>>> +	    {

>>>> +	      /* Avoid lazy resolution of variant PCS symbols.  */

>>>> +	      const struct r_found_version *version = NULL;

>>>> +	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)

>>>> +		{

>>>> +		  const ElfW (Half) *vernum =

>>>> +		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);

>>>> +		  version = &map->l_versions[vernum[symndx] & 0x7fff];

>>>> +		}

>>>> +	      elf_machine_rela (map, reloc, sym, version, reloc_addr,

>>>> +				skip_ifunc);

>>>> +	      return;

>>>> +	    }

>>>> +	}

>>>> +

>>>> +      *reloc_addr = map->l_mach.plt;

>>>

>>> Looking at this, I wonder if it would be more natural to handle this as

>>> a new relocation type.  This way, you will get an error from old dynamic

>>> linkers.

>>

>> yes, but that's a more intrusive change.

> 

> to expand a bit: i want old dynamic linkers to

> continue to work if they do not use lazy binding.


Rich Felker commented that it's possible to force bind now
semantics for marked symbols in the static linker.
PLT entries may still be needed (e.g. in the main exe for
pointer equality), but those can use R_*_GLOB_DAT relocs
and appear outside of the normal PLT entries that are
associated with R_*_JUMP_SLOT relocs.

then no dynamic linker change is needed, but lazy binding
(and ld_audit) cannot be supported later for vector calls.

no dynamic linker change sounds preferable to me, but
i have to think about the details.
Szabolcs Nagy - June 5, 2019, 2:41 p.m.
On 04/06/2019 18:17, Szabolcs Nagy wrote:
> On 04/06/2019 12:39, Szabolcs Nagy wrote:

>> On 23/05/2019 13:31, Szabolcs Nagy wrote:

>>> On 23/05/2019 12:35, Florian Weimer wrote:

>>>> * Szabolcs Nagy:

>>>>

>>>>>    /* Check for unexpected PLT reloc type.  */

>>>>>    if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))

>>>>>      {

>>>>> -      if (__builtin_expect (map->l_mach.plt, 0) == 0)

>>>>> -	*reloc_addr += l_addr;

>>>>> -      else

>>>>> -	*reloc_addr = map->l_mach.plt;

>>>>> +      if (map->l_mach.plt == 0)

>>>>> +	{

>>>>> +	  /* Prelinking.  */

>>>>> +	  *reloc_addr += l_addr;

>>>>> +	  return;

>>>>> +	}

>>>>> +

>>>>> +      if (__glibc_unlikely (map->l_mach.variant_pcs))

>>>>> +	{

>>>>> +	  /* Check the symbol table for variant PCS symbols.  */

>>>>> +	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);

>>>>> +	  const ElfW (Sym) *symtab =

>>>>> +	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);

>>>>> +	  const ElfW (Sym) *sym = &symtab[symndx];

>>>>> +	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))

>>>>> +	    {

>>>>> +	      /* Avoid lazy resolution of variant PCS symbols.  */

>>>>> +	      const struct r_found_version *version = NULL;

>>>>> +	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)

>>>>> +		{

>>>>> +		  const ElfW (Half) *vernum =

>>>>> +		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);

>>>>> +		  version = &map->l_versions[vernum[symndx] & 0x7fff];

>>>>> +		}

>>>>> +	      elf_machine_rela (map, reloc, sym, version, reloc_addr,

>>>>> +				skip_ifunc);

>>>>> +	      return;

>>>>> +	    }

>>>>> +	}

>>>>> +

>>>>> +      *reloc_addr = map->l_mach.plt;

>>>>

>>>> Looking at this, I wonder if it would be more natural to handle this as

>>>> a new relocation type.  This way, you will get an error from old dynamic

>>>> linkers.

>>>

>>> yes, but that's a more intrusive change.

>>

>> to expand a bit: i want old dynamic linkers to

>> continue to work if they do not use lazy binding.

> 

> Rich Felker commented that it's possible to force bind now

> semantics for marked symbols in the static linker.

> PLT entries may still be needed (e.g. in the main exe for

> pointer equality), but those can use R_*_GLOB_DAT relocs

> and appear outside of the normal PLT entries that are

> associated with R_*_JUMP_SLOT relocs.

> 

> then no dynamic linker change is needed, but lazy binding

> (and ld_audit) cannot be supported later for vector calls.

> 

> no dynamic linker change sounds preferable to me, but

> i have to think about the details.


i think this approach (forcing marked symbols to bind now
without dynamic linker change) might work.

it's similar to what i described in
https://groups.google.com/d/msg/generic-abi/Bfb2CwX-u4M/2qbM5fDbDQAJ

i dropped that idea because it required two dynamic relocs,
one of which is unused (and the interaction with local ifuncs
was a bit messy).

but the same can be done with one dyn reloc if marked plt
entries are sorted to the end of the plt section (which
requires another pass over the symbols in the static linker,
but i think that's fine).

what i can do is revert the binutils patch, write a new
abi draft which uses similar symbol marking, but require
static linkers not to emit lazy JUMP_SLOT relocs (i.e. in
DT_JMPREL) for marked symbols.

then a new binary would work with an old dynamic linker
(and an old toolchain can build libmvec.so other than the
symbol markings, but those are not critical for interop).

if this plan fails i will go back to the previous plan.
Szabolcs Nagy - June 6, 2019, 8:56 a.m.
On 05/06/2019 15:41, Szabolcs Nagy wrote:
> On 04/06/2019 18:17, Szabolcs Nagy wrote:

>> On 04/06/2019 12:39, Szabolcs Nagy wrote:

>>> On 23/05/2019 13:31, Szabolcs Nagy wrote:

>>>> On 23/05/2019 12:35, Florian Weimer wrote:

>>>>>

>>>>> Looking at this, I wonder if it would be more natural to handle this as

>>>>> a new relocation type.  This way, you will get an error from old dynamic

>>>>> linkers.

>>>>

>>>> yes, but that's a more intrusive change.

>>>

>>> to expand a bit: i want old dynamic linkers to

>>> continue to work if they do not use lazy binding.

>>

>> Rich Felker commented that it's possible to force bind now

>> semantics for marked symbols in the static linker.

>> PLT entries may still be needed (e.g. in the main exe for

>> pointer equality), but those can use R_*_GLOB_DAT relocs

>> and appear outside of the normal PLT entries that are

>> associated with R_*_JUMP_SLOT relocs.

>>

>> then no dynamic linker change is needed, but lazy binding

>> (and ld_audit) cannot be supported later for vector calls.

>>

>> no dynamic linker change sounds preferable to me, but

>> i have to think about the details.

> 

> i think this approach (forcing marked symbols to bind now

> without dynamic linker change) might work.


it does not work in general, but it can be made to
work for the existing glibc implementation.

in the main exe a PLT must use a JUMP_SLOT reloc
(other relocs would resolve to the pseudo
definition provided by the PLT itself instead of
the real symbol definition) and a JUMP_SLOT reloc
may be lazy resolved.

in glibc relocs are not lazy resolved if they are
outside of the DT_JMPREL entries, but i don't think
that's a generic ELF guarantee. (glibc handles
JUMP_SLOT relocs outside of DT_JMPREL, that may
not be valid ELF depending on interpretation of
DT_JMPREL.)

the new abi text allows an implementation where
JUMP_SLOT relocs for marked symbols are not in
DT_JMPREL and thus bw compatible with old glibc,
if the abi required a new reloc type then this
would not work.

i'll try to implement the bw compat solution in
binutils that follows the current abi specs.
(then the abi does not guarantee bw compat but
the implementation in practice does)
is that acceptable?

(may be the DT_ tag can be changed to indicate that
JUMP_SLOT relocs are in DT_JMPREL and glibc can
then just reject such binaries as unsupported?)


> 

> it's similar to what i described in

> https://groups.google.com/d/msg/generic-abi/Bfb2CwX-u4M/2qbM5fDbDQAJ

> 

> i dropped that idea because it required two dynamic relocs,

> one of which is unused (and the interaction with local ifuncs

> was a bit messy).

> 

> but the same can be done with one dyn reloc if marked plt

> entries are sorted to the end of the plt section (which

> requires another pass over the symbols in the static linker,

> but i think that's fine).

> 

> what i can do is revert the binutils patch, write a new

> abi draft which uses similar symbol marking, but require

> static linkers not to emit lazy JUMP_SLOT relocs (i.e. in

> DT_JMPREL) for marked symbols.

> 

> then a new binary would work with an old dynamic linker

> (and an old toolchain can build libmvec.so other than the

> symbol markings, but those are not critical for interop).

> 

> if this plan fails i will go back to the previous plan.

>
Szabolcs Nagy - June 7, 2019, 11:54 a.m.
Florian: please explain if you still prefer a new dynamic reloc.

alternatives:

(1) stick to proposed abi text and the posted patches.
    (check st_other flag and force bind now if necessary)

(2) use a new dynamic relocation type for variant pcs
    symbols with PLT entries. (and always force that
    to bind now)

(3) keep the abi text but in static linkers for linux
    reorder PLTs such that variant pcs PLTs come at the
    end and associated relocations and GOT entries are
    moved out of the .rela.plt and .got.plt area.
    (this is backward compatible with existing libcs)

i have a prototype implementation for (3): it is nasty
because of ifunc PLT handling (many corner cases) and
it adds an ABI requirement: PLT relocs outside of the
DT_JMPREL area must not be lazy resolved. (some tools
may assume all PLT relocs are in the DT_JMPREL area,
e.g. objdump will not display @plt labels correctly,
but i don't think anything would be broken by this)

currently my preference is (1) and i'd like to hear some
feedback if that is still considered problematic.
Szabolcs Nagy - June 11, 2019, 10:31 a.m.
On 07/06/2019 12:54, Szabolcs Nagy wrote:
> Florian: please explain if you still prefer a new dynamic reloc.


ping.

if there are no further comments, i'll go
ahead with the original plan (1), i cannot
hold the abi back indefinitely.

> 

> alternatives:

> 

> (1) stick to proposed abi text and the posted patches.

>     (check st_other flag and force bind now if necessary)

> 

> (2) use a new dynamic relocation type for variant pcs

>     symbols with PLT entries. (and always force that

>     to bind now)

> 

> (3) keep the abi text but in static linkers for linux

>     reorder PLTs such that variant pcs PLTs come at the

>     end and associated relocations and GOT entries are

>     moved out of the .rela.plt and .got.plt area.

>     (this is backward compatible with existing libcs)

> 

> i have a prototype implementation for (3): it is nasty

> because of ifunc PLT handling (many corner cases) and

> it adds an ABI requirement: PLT relocs outside of the

> DT_JMPREL area must not be lazy resolved. (some tools

> may assume all PLT relocs are in the DT_JMPREL area,

> e.g. objdump will not display @plt labels correctly,

> but i don't think anything would be broken by this)

> 

> currently my preference is (1) and i'd like to hear some

> feedback if that is still considered problematic.

>
Florian Weimer - June 11, 2019, 9:15 p.m.
* Szabolcs Nagy:

> Florian: please explain if you still prefer a new dynamic reloc.
>
> alternatives:
>
> (1) stick to proposed abi text and the posted patches.
>     (check st_other flag and force bind now if necessary)
>
> (2) use a new dynamic relocation type for variant pcs
>     symbols with PLT entries. (and always force that
>     to bind now)
>
> (3) keep the abi text but in static linkers for linux
>     reorder PLTs such that variant pcs PLTs come at the
>     end and associated relocations and GOT entries are
>     moved out of the .rela.plt and .got.plt area.
>     (this is backward compatible with existing libcs)
>
> i have a prototype implementation for (3): it is nasty
> because of ifunc PLT handling (many corner cases) and
> it adds an ABI requirement: PLT relocs outside of the
> DT_JMPREL area must not be lazy resolved. (some tools
> may assume all PLT relocs are in the DT_JMPREL area,
> e.g. objdump will not display @plt labels correctly,
> but i don't think anything would be broken by this)
>
> currently my preference is (1) and i'd like to hear some
> feedback if that is still considered problematic.

If I actually had to support this for end users, I would *strongly*
prefer (2).  But for various reasons, it does not appear to be likely
for the foreseeable future that I will be personally confronted with
fallout from picking (1) over (2), so I really don't care all that much.

I think we (at Red Hat) have sufficient time to backport the loader
changes for either (1) or (2), so that our own downstream users would
never notice the loader issue.  We should put the change on the upstream
stable branches, too, so that distributions which synchronize with those
pick it up, too.  (One caveat: we have very limited backporting
capability for CentOS because there is no EUS for CentOS.)

The link editor changes are more concerning.  I expect us to backport
them because old linkers will silently create broken binaries.  That's
true for all three approaches, I think.  But given the long-standing
recommendation that you should build on the oldest distribution you want
to support, I'm not sure how successful such backporting efforts will be
in preventing broken binaries which only work superficially.  (Again,
CentOS could be a problem here.)

Perhaps we should combine the loader backport of (1) or (2) with an SVE
trampoline that zaps the first vector argument register?  Just to make
the breakage obvious immediately, and not when we change vector register
usage in the dynamic linker five years from now, exposing a substantial
number of binaries as broken.  If that ever happens, we *will* have to
save the entire register file in the trampoline at that point, like we
do now on x86-64.

Thanks,
Florian
Szabolcs Nagy - June 12, 2019, 9:50 a.m.
On 11/06/2019 22:15, Florian Weimer wrote:
> * Szabolcs Nagy:

> 

>> Florian: please explain if you still prefer a new dynamic reloc.

>>

>> alternatives:

>>

>> (1) stick to proposed abi text and the posted patches.

>>     (check st_other flag and force bind now if necessary)

>>

>> (2) use a new dynamic relocation type for variant pcs

>>     symbols with PLT entries. (and always force that

>>     to bind now)

>>

>> (3) keep the abi text but in static linkers for linux

>>     reorder PLTs such that variant pcs PLTs come at the

>>     end and associated relocations and GOT entries are

>>     moved out of the .rela.plt and .got.plt area.

>>     (this is backward compatible with existing libcs)

>>

>> i have a prototype implementation for (3): it is nasty

>> because of ifunc PLT handling (many corner cases) and

>> it adds an ABI requirement: PLT relocs outside of the

>> DT_JMPREL area must not be lazy resolved. (some tools

>> may assume all PLT relocs are in the DT_JMPREL area,

>> e.g. objdump will not display @plt labels correctly,

>> but i don't think anything would be broken by this)

>>

>> currently my preference is (1) and i'd like to hear some

>> feedback if that is still considered problematic.

> 

> If I actually had to support this for end users, I would *strongly*

> prefer (2).  But for various reasons, it does not appear to be likely

> for the foreseeable future that I will be personally confronted with

> fallout from picking (1) over (2), so I really don't care all that much.

> 

> I think we (at Red Hat) have sufficient time to backport the loader

> changes for either (1) or (2), so that our own downstream users would

> never notice the loader issue.  We should put the change on the upstream

> stable branches, too, so that distributions which synchronize with those

> pick it up, too.  (One caveat: we have very limited backporting

> capability for CentOS because there is no EUS for CentOS.)

> 

> The link editor changes are more concerning.  I expect us to backport

> them because old linkers will silently create broken binaries.  That's

> true for all three approaches, I think.  But given the long-standing

> recommendation that you should build on the oldest distribution you want

> to support, I'm not sure how successful such backporting efforts will be

> in preventing broken binaries which only work superficially.  (Again,

> CentOS could be a problem here.)


old linker can silently create broken binaries
(with unmarked SVE symbols), but the assembler
would fail when it sees the .variant_pcs directive
(which is emitted by the compiler to mark SVE
functions)

since in binutils the linker and assembler are
in one package, i think it's less likely to get
silent breakage: if you use a new compiler
with old binutils you see an assembler error
immediately.

if your toolchain has its own new assembler but
uses an old linker, then using -z now linking is
a way to avoid breakage (assuming such situation
can be detected, i didnt try any detection in gcc
because of the asm issue).

(i think new dynamic relocation would not fix
the silent breakage with old linker unless new
static relocs are introduced too for relocatable
objects, which means many new relocs, potentially
with non-trivial new asm syntax etc)

> Perhaps we should combine the loader backport of (1) or (2) with an SVE

> trampoline that zaps the first vector argument register?  Just to make

> the breakage obvious immediately, and not when we change vector register

> usage in the dynamic linker five years from now, exposing a substantial

> number of binaries as broken.  If that ever happens, we *will* have to

> save the entire register file in the trampoline at that point, like we

> do now on x86-64.


note: for SVE the breakage is quite obvious: dynamic
linkers save/restore q0-q7 registers which overlap
with z0-z7 SVE argument registers and this zeros
out the top bits before the call is entered (if the
SVE registers >128bit).

> Thanks,

> Florian


thanks for your comments, based on this i will still
go with (1) because in the long term i bet on -z now
linkage to be common/acceptable which makes the
problem go away, and (2) would affect more systems
and i don't immediately have a solution for that.
Florian Weimer - June 14, 2019, 5:09 p.m.
* Szabolcs Nagy:

>> The link editor changes are more concerning.  I expect us to backport
>> them because old linkers will silently create broken binaries.  That's
>> true for all three approaches, I think.  But given the long-standing
>> recommendation that you should build on the oldest distribution you want
>> to support, I'm not sure how successful such backporting efforts will be
>> in preventing broken binaries which only work superficially.  (Again,
>> CentOS could be a problem here.)
>
> old linker can silently create broken binaries
> (with unmarked SVE symbols), but the assembler
> would fail when it sees the .variant_pcs directive
> (which is emitted by the compiler to mark SVE
> functions)
>
> since in binutils the linker and assembler are
> in one package, i think it's less likely to get
> silent breakage: if you use a new compiler
> with old binutils you see an assembler error
> immediately.

We have three linkers today (ld.gold, ld.bfd, LLVM lld), and it is not
unusual to have four of them installed (system version plus toolchain
module/software collection version for binutils).  So I'm afraid that
it's common that software will be linked with a linker from a different
binutils version than the assembler that was used.

With LLVM, there is also the matter of the assembler built into the
compiler (not sure if this is matters for aarch64 yet).

> if your toolchain has its own new assembler but
> uses an old linker, then using -z now linking is
> a way to avoid breakage (assuming such situation
> can be detected, i didnt try any detection in gcc
> because of the asm issue).

I'm not concerned about the breakage and how end users can work around
it.  What worries me are broken binaries that appear to run correctly
today and break with a future dynamic linker update.  I consider them
time bombs.

> (i think new dynamic relocation would not fix
> the silent breakage with old linker unless new
> static relocs are introduced too for relocatable
> objects, which means many new relocs, potentially
> with non-trivial new asm syntax etc)

Yeah, this is how we have dealt with similar situations on other
architectures.  It was still not good enough because it turned out that
binutils strip had a bug, where it would replace unknown relocations
with 0 and *still* generate an output file. 8-(

>> Perhaps we should combine the loader backport of (1) or (2) with an SVE
>> trampoline that zaps the first vector argument register?  Just to make
>> the breakage obvious immediately, and not when we change vector register
>> usage in the dynamic linker five years from now, exposing a substantial
>> number of binaries as broken.  If that ever happens, we *will* have to
>> save the entire register file in the trampoline at that point, like we
>> do now on x86-64.
>
> note: for SVE the breakage is quite obvious: dynamic
> linkers save/restore q0-q7 registers which overlap
> with z0-z7 SVE argument registers and this zeros
> out the top bits before the call is entered (if the
> SVE registers >128bit).

So incorrectly linked binaries will fail if they ever call the function?
I think that's not too bad then.  You still have to test on an SVE
machine, of course, but that's probably a good idea anyway if you are
building for SVE.

If you get a failure with glibc master for an incorrectly linked program
*today*, that's good news.

Thanks,
Florian

Patch

diff --git a/sysdeps/aarch64/dl-dtprocnum.h b/sysdeps/aarch64/dl-dtprocnum.h

new file mode 100644
index 0000000000..4ac2adf234

--- /dev/null

+++ b/sysdeps/aarch64/dl-dtprocnum.h

@@ -0,0 +1,21 @@ 

+/* Configuration of lookup functions.  AArch64 version.

+   Copyright (C) 2019 Free Software Foundation, Inc.

+   This file is part of the GNU C Library.

+

+   The GNU C Library is free software; you can redistribute it and/or

+   modify it under the terms of the GNU Lesser General Public

+   License as published by the Free Software Foundation; either

+   version 2.1 of the License, or (at your option) any later version.

+

+   The GNU C Library is distributed in the hope that it will be useful,

+   but WITHOUT ANY WARRANTY; without even the implied warranty of

+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

+   Lesser General Public License for more details.

+

+   You should have received a copy of the GNU Lesser General Public

+   License along with the GNU C Library.  If not, see

+   <http://www.gnu.org/licenses/>.  */

+

+/* Number of extra dynamic section entries for this architecture.  By

+   default there are none.  */

+#define DT_THISPROCNUM	DT_AARCH64_NUM

diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h

index 823eefba46..4f27637b20 100644

--- a/sysdeps/aarch64/dl-machine.h

+++ b/sysdeps/aarch64/dl-machine.h

@@ -27,6 +27,9 @@ 

 #include <dl-irel.h>
 #include <cpu-features.c>
 
+/* Translate a processor specific dynamic tag to the index in l_info array.  */

+#define DT_AARCH64(x) (DT_AARCH64_##x - DT_LOPROC + DT_NUM)

+

 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
 elf_machine_matches_host (const ElfW(Ehdr) *ehdr)
@@ -102,6 +105,10 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)

 	}
     }
 
+  /* Check if STO_AARCH64_VARIANT_PCS needs to be handled.  */

+  if (l->l_info[DT_AARCH64 (VARIANT_PCS)])

+    l->l_mach.variant_pcs = 1;

+

   return lazy;
 }
 
@@ -388,10 +395,37 @@  elf_machine_lazy_rel (struct link_map *map,

   /* Check for unexpected PLT reloc type.  */
   if (__builtin_expect (r_type == AARCH64_R(JUMP_SLOT), 1))
     {
-      if (__builtin_expect (map->l_mach.plt, 0) == 0)

-	*reloc_addr += l_addr;

-      else

-	*reloc_addr = map->l_mach.plt;

+      if (map->l_mach.plt == 0)

+	{

+	  /* Prelinking.  */

+	  *reloc_addr += l_addr;

+	  return;

+	}

+

+      if (__glibc_unlikely (map->l_mach.variant_pcs))

+	{

+	  /* Check the symbol table for variant PCS symbols.  */

+	  const Elf_Symndx symndx = ELFW (R_SYM) (reloc->r_info);

+	  const ElfW (Sym) *symtab =

+	    (const void *)D_PTR (map, l_info[DT_SYMTAB]);

+	  const ElfW (Sym) *sym = &symtab[symndx];

+	  if (__glibc_unlikely (sym->st_other & STO_AARCH64_VARIANT_PCS))

+	    {

+	      /* Avoid lazy resolution of variant PCS symbols.  */

+	      const struct r_found_version *version = NULL;

+	      if (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL)

+		{

+		  const ElfW (Half) *vernum =

+		    (const void *)D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]);

+		  version = &map->l_versions[vernum[symndx] & 0x7fff];

+		}

+	      elf_machine_rela (map, reloc, sym, version, reloc_addr,

+				skip_ifunc);

+	      return;

+	    }

+	}

+

+      *reloc_addr = map->l_mach.plt;

     }
   else if (__builtin_expect (r_type == AARCH64_R(TLSDESC), 1))
     {
diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h

index ba74fe10e1..7f801b14db 100644

--- a/sysdeps/aarch64/linkmap.h

+++ b/sysdeps/aarch64/linkmap.h

@@ -20,4 +20,5 @@  struct link_map_machine

 {
   ElfW(Addr) plt;	  /* Address of .plt */
   void *tlsdesc_table;	  /* Address of TLS descriptor hash table.  */
+  int variant_pcs;	  /* If set, PLT calls may follow a variant PCS.  */

 };