arm: Use _dl_find_object on __gnu_Unwind_Find_exidx (BZ 31405)
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
linaro-tcwg-bot/tcwg_glibc_build--master-arm |
success
|
Testing passed
|
redhat-pt-bot/TryBot-32bit |
success
|
Build for i686
|
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-arm |
success
|
Testing passed
|
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 |
success
|
Testing passed
|
Commit Message
Instead of __dl_iterate_phdr. On ARM dlfo_eh_frame/dlfo_eh_count
maps to PT_ARM_EXIDX vaddr start / length.
On a Neoverse N1 machine with 160 cores, the following program:
$ cat test.c
#include <stdlib.h>
#include <pthread.h>
#include <assert.h>
enum {
niter = 1024,
ntimes = 128,
};
static void *
tf (void *arg)
{
int a = (int) arg;
for (int i = 0; i < niter; i++)
{
void *p[ntimes];
for (int j = 0; j < ntimes; j++)
p[j] = malloc (a * 128);
for (int j = 0; j < ntimes; j++)
free (p[j]);
}
return NULL;
}
int main (int argc, char *argv[])
{
enum { nthreads = 16 };
pthread_t t[nthreads];
for (int i = 0; i < nthreads; i ++)
assert (pthread_create (&t[i], NULL, tf, (void *) i) == 0);
for (int i = 0; i < nthreads; i++)
{
void *r;
assert (pthread_join (t[i], &r) == 0);
assert (r == NULL);
}
return 0;
}
$ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
Improves from ~15s to 0.5s.
Checked on arm-linux-gnueabihf.
---
elf/Makefile | 2 +-
elf/dl-find_object.c | 5 ++--
include/dlfcn.h | 3 ++-
sysdeps/arm/find_exidx.c | 57 +++-------------------------------------
4 files changed, 10 insertions(+), 57 deletions(-)
Comments
* Adhemerval Zanella:
> On a Neoverse N1 machine with 160 cores, the following program:
> for (int i = 0; i < niter; i++)
> {
> void *p[ntimes];
> for (int j = 0; j < ntimes; j++)
> p[j] = malloc (a * 128);
> for (int j = 0; j < ntimes; j++)
> free (p[j]);
> }
>
> return NULL;
> }
> $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>
> Improves from ~15s to 0.5s.
Is this specific to 32-bit Arm, or does _dl_find_object provide a
similar performance benefit to multi-threaded Address Sanitizer on
GCC 12 with glibc 2.35, too? I didn't expect that.
On 22/02/24 11:54, Florian Weimer wrote:
> * Adhemerval Zanella:
>
>> On a Neoverse N1 machine with 160 cores, the following program:
>
>> for (int i = 0; i < niter; i++)
>> {
>> void *p[ntimes];
>> for (int j = 0; j < ntimes; j++)
>> p[j] = malloc (a * 128);
>> for (int j = 0; j < ntimes; j++)
>> free (p[j]);
>> }
>>
>> return NULL;
>> }
>
>> $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>
>> Improves from ~15s to 0.5s.
>
> Is this specific to 32-bit Arm, or does _dl_find_object provide a
> similar performance benefit to multi-threaded Address Sanitizer on
> GCC 12 with glibc 2.35, too? I didn't expect that.
This is ARM specific afaik, libgcc/unwind-arm-common.inc calls
__gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
call (pretty much like BZ 31405 first comment). And this has
not been changed for a while, so both gcc 12 and master uses the
same code.
* Adhemerval Zanella Netto:
> On 22/02/24 11:54, Florian Weimer wrote:
>> * Adhemerval Zanella:
>>
>>> On a Neoverse N1 machine with 160 cores, the following program:
>>
>>> for (int i = 0; i < niter; i++)
>>> {
>>> void *p[ntimes];
>>> for (int j = 0; j < ntimes; j++)
>>> p[j] = malloc (a * 128);
>>> for (int j = 0; j < ntimes; j++)
>>> free (p[j]);
>>> }
>>>
>>> return NULL;
>>> }
>>
>>> $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>>
>>> Improves from ~15s to 0.5s.
>>
>> Is this specific to 32-bit Arm, or does _dl_find_object provide a
>> similar performance benefit to multi-threaded Address Sanitizer on
>> GCC 12 with glibc 2.35, too? I didn't expect that.
>
> This is ARM specific afaik, libgcc/unwind-arm-common.inc calls
> __gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
> call (pretty much like BZ 31405 first comment). And this has
> not been changed for a while, so both gcc 12 and master uses the
> same code.
If Address Sanitizer calls _Unwind_Backtrace (perhaps via
libbacktrace) in malloc, this should impact DWARF targets as well: the
glibc 2.35/GCC 12 combination is more similar to Arm because the
dl_iterate_phdr call in the _Unwind_Backtrace implementation has been
replaced with _dl_find_object.
The patch is looks okay to me, by the way. I had this in mind when
working on _dl_find_object, but couldn't test it at the time.
On 22/02/24 14:25, Florian Weimer wrote:
> * Adhemerval Zanella Netto:
>
>> On 22/02/24 11:54, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>> On a Neoverse N1 machine with 160 cores, the following program:
>>>
>>>> for (int i = 0; i < niter; i++)
>>>> {
>>>> void *p[ntimes];
>>>> for (int j = 0; j < ntimes; j++)
>>>> p[j] = malloc (a * 128);
>>>> for (int j = 0; j < ntimes; j++)
>>>> free (p[j]);
>>>> }
>>>>
>>>> return NULL;
>>>> }
>>>
>>>> $ arm-linux-gnueabihf-gcc -fsanitize=address test.c -o test
>>>>
>>>> Improves from ~15s to 0.5s.
>>>
>>> Is this specific to 32-bit Arm, or does _dl_find_object provide a
>>> similar performance benefit to multi-threaded Address Sanitizer on
>>> GCC 12 with glibc 2.35, too? I didn't expect that.
>>
>> This is ARM specific afaik, libgcc/unwind-arm-common.inc calls
>> __gnu_Unwind_Find_exidx (if available) on _Unwind_Backtrace
>> call (pretty much like BZ 31405 first comment). And this has
>> not been changed for a while, so both gcc 12 and master uses the
>> same code.
>
> If Address Sanitizer calls _Unwind_Backtrace (perhaps via
> libbacktrace) in malloc, this should impact DWARF targets as well: the
> glibc 2.35/GCC 12 combination is more similar to Arm because the
> dl_iterate_phdr call in the _Unwind_Backtrace implementation has been
> replaced with _dl_find_object.
The _Unwind_Backtrace depends on then default unwind strategy (Fast/Slow)
which can be overridden by a environment variable
(fast_unwind_on_malloc={0,1}).
The fast unwinder strategy (which tries to work the callstack by parsing
the frame pointer) usually works if you used only one compiler/mode
building everything (for instance if you build everything with clang -marm);
but it start to hit some corner cases when you mix compilers and modes:
compiler-rt/lib/sanitizer_common/sanitizer_flags.inc
43 // ARM thumb/thumb2 frame pointer is inconsistent on GCC and Clang [1]
44 // and fast-unwider is also unreliable with mixing arm and thumb code [2].
45 // [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92172
46 // [2] https://bugs.llvm.org/show_bug.cgi?id=44158
47 COMMON_FLAG(bool, fast_unwind_on_malloc,
48 !(SANITIZER_LINUX && !SANITIZER_ANDROID && SANITIZER_ARM),
49 "If available, use the fast frame-pointer-based unwinder on "
50 "malloc/free.")
>
> The patch is looks okay to me, by the way. I had this in mind when
> working on _dl_find_object, but couldn't test it at the time.
Thanks, I will install it.
@@ -34,6 +34,7 @@ routines = \
dl-addr \
dl-addr-obj \
dl-early_allocate \
+ dl-find_object \
dl-iteratephdr \
dl-libc \
dl-origin \
@@ -60,7 +61,6 @@ dl-routines = \
dl-deps \
dl-exception \
dl-execstack \
- dl-find_object \
dl-fini \
dl-init \
dl-load \
@@ -356,7 +356,7 @@ _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size)
}
int
-_dl_find_object (void *pc1, struct dl_find_object *result)
+__dl_find_object (void *pc1, struct dl_find_object *result)
{
uintptr_t pc = (uintptr_t) pc1;
@@ -463,7 +463,8 @@ _dl_find_object (void *pc1, struct dl_find_object *result)
return -1;
} /* Transaction retry loop. */
}
-rtld_hidden_def (_dl_find_object)
+hidden_def (__dl_find_object)
+weak_alias (__dl_find_object, _dl_find_object)
/* _dlfo_process_initial is called twice. First to compute the array
sizes from the initial loaded mappings. Second to fill in the
@@ -4,7 +4,8 @@
#include <link.h> /* For ElfW. */
#include <stdbool.h>
-rtld_hidden_proto (_dl_find_object)
+extern __typeof (_dl_find_object) __dl_find_object;
+hidden_proto (__dl_find_object)
/* Internally used flag. */
#define __RTLD_DLOPEN 0x80000000
@@ -16,64 +16,15 @@
<https://www.gnu.org/licenses/>. */
#include <link.h>
-#include <unwind.h>
-
-struct unw_eh_callback_data
-{
- _Unwind_Ptr pc;
- _Unwind_Ptr exidx_start;
- int exidx_len;
-};
-
-
-/* Callback to determines if the PC lies within an object, and remember the
- location of the exception index table if it does. */
-
-static int
-find_exidx_callback (struct dl_phdr_info * info, size_t size, void * ptr)
-{
- struct unw_eh_callback_data * data;
- const ElfW(Phdr) *phdr;
- int i;
- int match;
- _Unwind_Ptr load_base;
-
- data = (struct unw_eh_callback_data *) ptr;
- load_base = info->dlpi_addr;
- phdr = info->dlpi_phdr;
-
- match = 0;
- for (i = info->dlpi_phnum; i > 0; i--, phdr++)
- {
- if (phdr->p_type == PT_LOAD)
- {
- _Unwind_Ptr vaddr = phdr->p_vaddr + load_base;
- if (data->pc >= vaddr && data->pc < vaddr + phdr->p_memsz)
- match = 1;
- }
- else if (phdr->p_type == PT_ARM_EXIDX)
- {
- data->exidx_start = (_Unwind_Ptr) (phdr->p_vaddr + load_base);
- data->exidx_len = phdr->p_memsz;
- }
- }
-
- return match;
-}
-
/* Find the exception index table containing PC. */
_Unwind_Ptr
__gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount)
{
- struct unw_eh_callback_data data;
-
- data.pc = pc;
- data.exidx_start = 0;
- if (__dl_iterate_phdr (find_exidx_callback, &data) <= 0)
+ struct dl_find_object data;
+ if (__dl_find_object ((void *) pc, &data) < 0)
return 0;
-
- *pcount = data.exidx_len / 8;
- return data.exidx_start;
+ *pcount = data.dlfo_eh_count;
+ return (_Unwind_Ptr) data.dlfo_eh_frame;
}