elf: Do not read hwcaps from the vDSO in ld.so

Message ID 87wo57etws.fsf@oldenburg2.str.redhat.com
State Committed
Headers
Series elf: Do not read hwcaps from the vDSO in ld.so |

Commit Message

Florian Weimer May 19, 2020, 5:06 p.m. UTC
  This was only ever used for the "nosegneg" flag.  This approach for
passing hardware capability information creates a subtle dependency
between the kernel and userspace, and ld.so.cache contents.  It seems
inappropriate for toady, where people expect to be able to run
system images which very different kernel versions.

---
 elf/dl-hwcaps.c | 110 --------------------------------------------------------
 1 file changed, 110 deletions(-)
  

Comments

Adhemerval Zanella May 26, 2020, 1:17 p.m. UTC | #1
On 19/05/2020 14:06, Florian Weimer via Libc-alpha wrote:
> This was only ever used for the "nosegneg" flag.  This approach for
> passing hardware capability information creates a subtle dependency
> between the kernel and userspace, and ld.so.cache contents.  It seems
> inappropriate for toady, where people expect to be able to run
> system images which very different kernel versions.

I agree that such dependency is not suitable for current practices and
environment where glibc is used. From digging the history of "nosegneg" 
flag it seemed a hack around some issue or limitation from XEN kernels. 
Do you have more information of what exactly nosegneg tried to prevent?

LGTM, thanks.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> 
> ---
>  elf/dl-hwcaps.c | 110 --------------------------------------------------------
>  1 file changed, 110 deletions(-)
> 
> diff --git a/elf/dl-hwcaps.c b/elf/dl-hwcaps.c
> index 71aea86700..6df9efb255 100644
> --- a/elf/dl-hwcaps.c
> +++ b/elf/dl-hwcaps.c
> @@ -26,12 +26,6 @@
>  #include <dl-procinfo.h>
>  #include <dl-hwcaps.h>
>  
> -#ifdef _DL_FIRST_PLATFORM
> -# define _DL_FIRST_EXTRA (_DL_FIRST_PLATFORM + _DL_PLATFORMS_COUNT)
> -#else
> -# define _DL_FIRST_EXTRA _DL_HWCAP_COUNT
> -#endif
> -
>  /* Return an array of useful/necessary hardware capability names.  */
>  const struct r_strlenpair *
>  _dl_important_hwcaps (const char *platform, size_t platform_len, size_t *sz,

Ok.

> @@ -52,116 +46,12 @@ _dl_important_hwcaps (const char *platform, size_t platform_len, size_t *sz,
>      if ((masked & (1ULL << n)) != 0)
>        ++cnt;
>  
> -#ifdef NEED_DL_SYSINFO_DSO
> -  /* The system-supplied DSO can contain a note of type 2, vendor "GNU".
> -     This gives us a list of names to treat as fake hwcap bits.  */
> -
> -  const char *dsocaps = NULL;
> -  size_t dsocapslen = 0;
> -  if (GLRO(dl_sysinfo_map) != NULL)
> -    {
> -      const ElfW(Phdr) *const phdr = GLRO(dl_sysinfo_map)->l_phdr;
> -      const ElfW(Word) phnum = GLRO(dl_sysinfo_map)->l_phnum;
> -      for (uint_fast16_t i = 0; i < phnum; ++i)
> -	if (phdr[i].p_type == PT_NOTE)
> -	  {
> -	    const ElfW(Addr) start = (phdr[i].p_vaddr
> -				      + GLRO(dl_sysinfo_map)->l_addr);
> -	    /* NB: Some PT_NOTE segment may have alignment value of 0
> -	       or 1.  gABI specifies that PT_NOTE segments should be
> -	       aligned to 4 bytes in 32-bit objects and to 8 bytes in
> -	       64-bit objects.  As a Linux extension, we also support
> -	       4 byte alignment in 64-bit objects.  If p_align is less
> -	       than 4, we treate alignment as 4 bytes since some note
> -	       segments have 0 or 1 byte alignment.   */
> -	    ElfW(Addr) align = phdr[i].p_align;
> -	    if (align < 4)
> -	      align = 4;
> -	    else if (align != 4 && align != 8)
> -	      continue;
> -	    /* The standard ELF note layout is exactly as the anonymous struct.
> -	       The next element is a variable length vendor name of length
> -	       VENDORLEN (with a real length rounded to ElfW(Word)), followed
> -	       by the data of length DATALEN (with a real length rounded to
> -	       ElfW(Word)).  */
> -	    const struct
> -	    {
> -	      ElfW(Word) vendorlen;
> -	      ElfW(Word) datalen;
> -	      ElfW(Word) type;
> -	    } *note = (const void *) start;
> -	    while ((ElfW(Addr)) (note + 1) - start < phdr[i].p_memsz)
> -	      {
> -		/* The layout of the type 2, vendor "GNU" note is as follows:
> -		   .long <Number of capabilities enabled by this note>
> -		   .long <Capabilities mask> (as mask >> _DL_FIRST_EXTRA).
> -		   .byte <The bit number for the next capability>
> -		   .asciz <The name of the capability>.  */
> -		if (note->type == NT_GNU_HWCAP
> -		    && note->vendorlen == sizeof "GNU"
> -		    && !memcmp ((note + 1), "GNU", sizeof "GNU")
> -		    && note->datalen > 2 * sizeof (ElfW(Word)) + 2)
> -		  {
> -		    const ElfW(Word) *p
> -		      = ((const void *) note
> -			 + ELF_NOTE_DESC_OFFSET (sizeof "GNU", align));
> -		    cnt += *p++;
> -		    ++p;	/* Skip mask word.  */
> -		    dsocaps = (const char *) p; /* Pseudo-string "<b>name"  */
> -		    dsocapslen = note->datalen - sizeof *p * 2;
> -		    break;
> -		  }
> -		note = ((const void *) note
> -			+ ELF_NOTE_NEXT_OFFSET (note->vendorlen,
> -						note->datalen, align));
> -	      }
> -	    if (dsocaps != NULL)
> -	      break;
> -	  }
> -    }
> -#endif
> -
>    /* For TLS enabled builds always add 'tls'.  */
>    ++cnt;
>  
>    /* Create temporary data structure to generate result table.  */
>    struct r_strlenpair temp[cnt];
>    m = 0;
> -#ifdef NEED_DL_SYSINFO_DSO
> -  if (dsocaps != NULL)
> -    {
> -      /* dsocaps points to the .asciz string, and -1 points to the mask
> -         .long just before the string.  */
> -      const ElfW(Word) mask = ((const ElfW(Word) *) dsocaps)[-1];
> -      GLRO(dl_hwcap) |= (uint64_t) mask << _DL_FIRST_EXTRA;
> -      /* Note that we add the dsocaps to the set already chosen by the
> -	 LD_HWCAP_MASK environment variable (or default HWCAP_IMPORTANT).
> -	 So there is no way to request ignoring an OS-supplied dsocap
> -	 string and bit like you can ignore an OS-supplied HWCAP bit.  */
> -      hwcap_mask |= (uint64_t) mask << _DL_FIRST_EXTRA;
> -#if HAVE_TUNABLES
> -      TUNABLE_SET (glibc, cpu, hwcap_mask, uint64_t, hwcap_mask);
> -#else
> -      GLRO(dl_hwcap_mask) = hwcap_mask;
> -#endif
> -      size_t len;
> -      for (const char *p = dsocaps; p < dsocaps + dsocapslen; p += len + 1)
> -	{
> -	  uint_fast8_t bit = *p++;
> -	  len = strlen (p);
> -
> -	  /* Skip entries that are not enabled in the mask word.  */
> -	  if (__glibc_likely (mask & ((ElfW(Word)) 1 << bit)))
> -	    {
> -	      temp[m].str = p;
> -	      temp[m].len = len;
> -	      ++m;
> -	    }
> -	  else
> -	    --cnt;
> -	}
> -    }
> -#endif
>    for (n = 0; masked != 0; ++n)
>      if ((masked & (1ULL << n)) != 0)
>        {
> 

Ok.
  
Florian Weimer May 26, 2020, 1:47 p.m. UTC | #2
* Adhemerval Zanella via Libc-alpha:

> On 19/05/2020 14:06, Florian Weimer via Libc-alpha wrote:
>> This was only ever used for the "nosegneg" flag.  This approach for
>> passing hardware capability information creates a subtle dependency
>> between the kernel and userspace, and ld.so.cache contents.  It seems
>> inappropriate for toady, where people expect to be able to run
>> system images which very different kernel versions.
>
> I agree that such dependency is not suitable for current practices and
> environment where glibc is used. From digging the history of "nosegneg" 
> flag it seemed a hack around some issue or limitation from XEN kernels. 
> Do you have more information of what exactly nosegneg tried to prevent?

It used to select a glibc built with -mno-tls-direct-seg-refs.  Not
doing complicated address calculations in segment-based accesses somehow
simplified things for Xen with paravirtualization.  Our notes indicate
that this was caused by negative segment-relative references, so
-mno-tls-direct-seg-refs was likely too big a hammer.

In practice, this matters only in very few places in glibc where we
access TLS variables that are *not* in the TCB or the thread descriptor,
because those are at positive offsets.

An example would look like this:

Dump of assembler code for function __syscall_error:
   0x0001f090 <+0>:	endbr32 
   0x0001f094 <+4>:	call   0x132b41 <__x86.get_pc_thunk.dx>
   0x0001f099 <+9>:	add    $0x18cd8b,%edx
   0x0001f09f <+15>:	neg    %eax
   0x0001f0a1 <+17>:	mov    0xdc(%edx),%edx
   0x0001f0a7 <+23>:	mov    %eax,%gs:(%edx)
   0x0001f0aa <+26>:	mov    $0xffffffff,%eax
   0x0001f0af <+31>:	ret    

At the point of the %gs-relative load, %edx is negative because that's
where the TLS variables are.

With a little bit of care, the performance benefits would likely have
been achievable without a separate glibc build.

> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Thanks.

Florian
  
Adhemerval Zanella May 26, 2020, 1:54 p.m. UTC | #3
On 26/05/2020 10:47, Florian Weimer wrote:
> * Adhemerval Zanella via Libc-alpha:
> 
>> On 19/05/2020 14:06, Florian Weimer via Libc-alpha wrote:
>>> This was only ever used for the "nosegneg" flag.  This approach for
>>> passing hardware capability information creates a subtle dependency
>>> between the kernel and userspace, and ld.so.cache contents.  It seems
>>> inappropriate for toady, where people expect to be able to run
>>> system images which very different kernel versions.
>>
>> I agree that such dependency is not suitable for current practices and
>> environment where glibc is used. From digging the history of "nosegneg" 
>> flag it seemed a hack around some issue or limitation from XEN kernels. 
>> Do you have more information of what exactly nosegneg tried to prevent?
> 
> It used to select a glibc built with -mno-tls-direct-seg-refs.  Not
> doing complicated address calculations in segment-based accesses somehow
> simplified things for Xen with paravirtualization.  Our notes indicate
> that this was caused by negative segment-relative references, so
> -mno-tls-direct-seg-refs was likely too big a hammer.

Is it still a valid build option for either glibc or programs targetting
glibc?

> 
> In practice, this matters only in very few places in glibc where we
> access TLS variables that are *not* in the TCB or the thread descriptor,
> because those are at positive offsets.
> 
> An example would look like this:
> 
> Dump of assembler code for function __syscall_error:
>    0x0001f090 <+0>:	endbr32 
>    0x0001f094 <+4>:	call   0x132b41 <__x86.get_pc_thunk.dx>
>    0x0001f099 <+9>:	add    $0x18cd8b,%edx
>    0x0001f09f <+15>:	neg    %eax
>    0x0001f0a1 <+17>:	mov    0xdc(%edx),%edx
>    0x0001f0a7 <+23>:	mov    %eax,%gs:(%edx)
>    0x0001f0aa <+26>:	mov    $0xffffffff,%eax
>    0x0001f0af <+31>:	ret    
> 
> At the point of the %gs-relative load, %edx is negative because that's
> where the TLS variables are.
> 
> With a little bit of care, the performance benefits would likely have
> been achievable without a separate glibc build.

Thanks for the explanation, I tried to dig this rationale without
much success.

> 
>> LGTM, thanks.
>>
>> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> 
> Thanks.
> 
> Florian
>
  
Florian Weimer May 26, 2020, 2:19 p.m. UTC | #4
* Adhemerval Zanella:

> On 26/05/2020 10:47, Florian Weimer wrote:
>> * Adhemerval Zanella via Libc-alpha:
>> 
>>> On 19/05/2020 14:06, Florian Weimer via Libc-alpha wrote:
>>>> This was only ever used for the "nosegneg" flag.  This approach for
>>>> passing hardware capability information creates a subtle dependency
>>>> between the kernel and userspace, and ld.so.cache contents.  It seems
>>>> inappropriate for toady, where people expect to be able to run
>>>> system images which very different kernel versions.
>>>
>>> I agree that such dependency is not suitable for current practices and
>>> environment where glibc is used. From digging the history of "nosegneg" 
>>> flag it seemed a hack around some issue or limitation from XEN kernels. 
>>> Do you have more information of what exactly nosegneg tried to prevent?
>> 
>> It used to select a glibc built with -mno-tls-direct-seg-refs.  Not
>> doing complicated address calculations in segment-based accesses somehow
>> simplified things for Xen with paravirtualization.  Our notes indicate
>> that this was caused by negative segment-relative references, so
>> -mno-tls-direct-seg-refs was likely too big a hammer.
>
> Is it still a valid build option for either glibc or programs targetting
> glibc?

It's still a valid build option.  There is no ABI impact, it's like a
workaround for the FDIV bug.  It turns this:

__syscall_error:
	call	__x86.get_pc_thunk.dx
	addl	$_GLOBAL_OFFSET_TABLE_, %edx
	negl	%eax
	movl	errno@gotntpoff(%edx), %edx
	movl	%eax, %gs:(%edx)
	movl	$-1, %eax
	ret

into this:

__syscall_error:
	call	__x86.get_pc_thunk.dx
	addl	$_GLOBAL_OFFSET_TABLE_, %edx
	movl	%gs:0, %ecx
	negl	%eax
	movl	errno@gotntpoff(%edx), %edx
	movl	%eax, (%ecx,%edx)
	movl	$-1, %eax
	ret

The thread pointer is at %gs:0, so it is really quite similar.

I see we still have dependencies on NO_TLS_DIRECT_SEG_REFS, I think we
can remove those, too.

> Thanks for the explanation, I tried to dig this rationale without
> much success.

Yeah, at one point we realized that we kept carrying forward the
nosegneg builds, even though we didn't even ship 32-bit kernels
anymore. 8-/

Thanks,
Florian
  

Patch

diff --git a/elf/dl-hwcaps.c b/elf/dl-hwcaps.c
index 71aea86700..6df9efb255 100644
--- a/elf/dl-hwcaps.c
+++ b/elf/dl-hwcaps.c
@@ -26,12 +26,6 @@ 
 #include <dl-procinfo.h>
 #include <dl-hwcaps.h>
 
-#ifdef _DL_FIRST_PLATFORM
-# define _DL_FIRST_EXTRA (_DL_FIRST_PLATFORM + _DL_PLATFORMS_COUNT)
-#else
-# define _DL_FIRST_EXTRA _DL_HWCAP_COUNT
-#endif
-
 /* Return an array of useful/necessary hardware capability names.  */
 const struct r_strlenpair *
 _dl_important_hwcaps (const char *platform, size_t platform_len, size_t *sz,
@@ -52,116 +46,12 @@  _dl_important_hwcaps (const char *platform, size_t platform_len, size_t *sz,
     if ((masked & (1ULL << n)) != 0)
       ++cnt;
 
-#ifdef NEED_DL_SYSINFO_DSO
-  /* The system-supplied DSO can contain a note of type 2, vendor "GNU".
-     This gives us a list of names to treat as fake hwcap bits.  */
-
-  const char *dsocaps = NULL;
-  size_t dsocapslen = 0;
-  if (GLRO(dl_sysinfo_map) != NULL)
-    {
-      const ElfW(Phdr) *const phdr = GLRO(dl_sysinfo_map)->l_phdr;
-      const ElfW(Word) phnum = GLRO(dl_sysinfo_map)->l_phnum;
-      for (uint_fast16_t i = 0; i < phnum; ++i)
-	if (phdr[i].p_type == PT_NOTE)
-	  {
-	    const ElfW(Addr) start = (phdr[i].p_vaddr
-				      + GLRO(dl_sysinfo_map)->l_addr);
-	    /* NB: Some PT_NOTE segment may have alignment value of 0
-	       or 1.  gABI specifies that PT_NOTE segments should be
-	       aligned to 4 bytes in 32-bit objects and to 8 bytes in
-	       64-bit objects.  As a Linux extension, we also support
-	       4 byte alignment in 64-bit objects.  If p_align is less
-	       than 4, we treate alignment as 4 bytes since some note
-	       segments have 0 or 1 byte alignment.   */
-	    ElfW(Addr) align = phdr[i].p_align;
-	    if (align < 4)
-	      align = 4;
-	    else if (align != 4 && align != 8)
-	      continue;
-	    /* The standard ELF note layout is exactly as the anonymous struct.
-	       The next element is a variable length vendor name of length
-	       VENDORLEN (with a real length rounded to ElfW(Word)), followed
-	       by the data of length DATALEN (with a real length rounded to
-	       ElfW(Word)).  */
-	    const struct
-	    {
-	      ElfW(Word) vendorlen;
-	      ElfW(Word) datalen;
-	      ElfW(Word) type;
-	    } *note = (const void *) start;
-	    while ((ElfW(Addr)) (note + 1) - start < phdr[i].p_memsz)
-	      {
-		/* The layout of the type 2, vendor "GNU" note is as follows:
-		   .long <Number of capabilities enabled by this note>
-		   .long <Capabilities mask> (as mask >> _DL_FIRST_EXTRA).
-		   .byte <The bit number for the next capability>
-		   .asciz <The name of the capability>.  */
-		if (note->type == NT_GNU_HWCAP
-		    && note->vendorlen == sizeof "GNU"
-		    && !memcmp ((note + 1), "GNU", sizeof "GNU")
-		    && note->datalen > 2 * sizeof (ElfW(Word)) + 2)
-		  {
-		    const ElfW(Word) *p
-		      = ((const void *) note
-			 + ELF_NOTE_DESC_OFFSET (sizeof "GNU", align));
-		    cnt += *p++;
-		    ++p;	/* Skip mask word.  */
-		    dsocaps = (const char *) p; /* Pseudo-string "<b>name"  */
-		    dsocapslen = note->datalen - sizeof *p * 2;
-		    break;
-		  }
-		note = ((const void *) note
-			+ ELF_NOTE_NEXT_OFFSET (note->vendorlen,
-						note->datalen, align));
-	      }
-	    if (dsocaps != NULL)
-	      break;
-	  }
-    }
-#endif
-
   /* For TLS enabled builds always add 'tls'.  */
   ++cnt;
 
   /* Create temporary data structure to generate result table.  */
   struct r_strlenpair temp[cnt];
   m = 0;
-#ifdef NEED_DL_SYSINFO_DSO
-  if (dsocaps != NULL)
-    {
-      /* dsocaps points to the .asciz string, and -1 points to the mask
-         .long just before the string.  */
-      const ElfW(Word) mask = ((const ElfW(Word) *) dsocaps)[-1];
-      GLRO(dl_hwcap) |= (uint64_t) mask << _DL_FIRST_EXTRA;
-      /* Note that we add the dsocaps to the set already chosen by the
-	 LD_HWCAP_MASK environment variable (or default HWCAP_IMPORTANT).
-	 So there is no way to request ignoring an OS-supplied dsocap
-	 string and bit like you can ignore an OS-supplied HWCAP bit.  */
-      hwcap_mask |= (uint64_t) mask << _DL_FIRST_EXTRA;
-#if HAVE_TUNABLES
-      TUNABLE_SET (glibc, cpu, hwcap_mask, uint64_t, hwcap_mask);
-#else
-      GLRO(dl_hwcap_mask) = hwcap_mask;
-#endif
-      size_t len;
-      for (const char *p = dsocaps; p < dsocaps + dsocapslen; p += len + 1)
-	{
-	  uint_fast8_t bit = *p++;
-	  len = strlen (p);
-
-	  /* Skip entries that are not enabled in the mask word.  */
-	  if (__glibc_likely (mask & ((ElfW(Word)) 1 << bit)))
-	    {
-	      temp[m].str = p;
-	      temp[m].len = len;
-	      ++m;
-	    }
-	  else
-	    --cnt;
-	}
-    }
-#endif
   for (n = 0; masked != 0; ++n)
     if ((masked & (1ULL << n)) != 0)
       {