[v7,09/23] aarch64: Try to free the GCS of makecontext
Checks
Context |
Check |
Description |
redhat-pt-bot/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
From: Szabolcs Nagy <szabolcs.nagy@arm.com>
Free GCS after a makecontext start func returns and at thread exit, so
assume makecontext cannot outlive the thread where it was created.
This is an attempt to bound the lifetime of the GCS allocated for
makecontext, but it is still possible to have significant GCS leaks,
new GCS aware APIs could solve that, but that would not allow using
GCS with existing code transparently.
---
include/set-freeres.h | 2 +
malloc/thread-freeres.c | 9 +++
sysdeps/unix/sysv/linux/aarch64/makecontext.c | 65 +++++++++++++++++++
sysdeps/unix/sysv/linux/aarch64/setcontext.S | 19 +++++-
sysdeps/unix/sysv/linux/aarch64/sysdep.h | 6 +-
5 files changed, 97 insertions(+), 4 deletions(-)
Comments
On 03/01/25 12:41, Yury Khrustalev wrote:
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
> Free GCS after a makecontext start func returns and at thread exit, so
> assume makecontext cannot outlive the thread where it was created.
>
> This is an attempt to bound the lifetime of the GCS allocated for
> makecontext, but it is still possible to have significant GCS leaks,
> new GCS aware APIs could solve that, but that would not allow using
> GCS with existing code transparently.
> ---
> include/set-freeres.h | 2 +
> malloc/thread-freeres.c | 9 +++
> sysdeps/unix/sysv/linux/aarch64/makecontext.c | 65 +++++++++++++++++++
> sysdeps/unix/sysv/linux/aarch64/setcontext.S | 19 +++++-
> sysdeps/unix/sysv/linux/aarch64/sysdep.h | 6 +-
> 5 files changed, 97 insertions(+), 4 deletions(-)
>
> diff --git a/include/set-freeres.h b/include/set-freeres.h
> index bb1d0a8f72..0f81344e43 100644
> --- a/include/set-freeres.h
> +++ b/include/set-freeres.h
> @@ -78,6 +78,8 @@ extern void __nss_database_freeres (void) attribute_hidden;
> extern int _IO_cleanup (void) attribute_hidden;;
> /* From dlfcn/dlerror.c */
> extern void __libc_dlerror_result_free (void) attribute_hidden;
> +/* From libc.so, arch specific. */
> +extern void ARCH_THREAD_FREERES (void) attribute_hidden;
>
This seems strange if ARCH_THREAD_FREERES is not defined, although I am
not sure if this is a problem.
> /* From either libc.so or libpthread.so */
> extern void __libpthread_freeres (void) attribute_hidden;
> diff --git a/malloc/thread-freeres.c b/malloc/thread-freeres.c
> index c7ec9f99e9..5ed5df1a3f 100644
> --- a/malloc/thread-freeres.c
> +++ b/malloc/thread-freeres.c
> @@ -22,6 +22,14 @@
> #include <shlib-compat.h>
> #include <tls-internal.h>
>
> +/* Define empty function if no arch-specific clean-up
> + function has been defined. */
> +#ifndef ARCH_THREAD_FREERES
> +void __always_inline
> +__libc_arch_thread_freeres (void) {}
> +#define ARCH_THREAD_FREERES __libc_arch_thread_freeres
> +#endif
> +
Do we need the __always_inline to force compiler to optimize it away?
> /* Thread shutdown function. Note that this function must be called
> for threads during shutdown for correctness reasons. Unlike
> __libc_freeres, skipping calls to it is not a valid optimization.
> @@ -29,6 +37,7 @@
> void
> __libc_thread_freeres (void)
> {
> + call_function_static_weak (ARCH_THREAD_FREERES);
> #if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_32)
> __rpc_thread_destroy ();
> #endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/makecontext.c b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> index 99ba5b3f7c..f43fc45c76 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
> @@ -20,7 +20,9 @@
> #include <sysdep.h>
> #include <stdarg.h>
> #include <stdint.h>
> +#include <stdlib.h>
> #include <ucontext.h>
> +#include <sys/mman.h>
>
> #define GCS_MAGIC 0x47435300
>
> @@ -29,6 +31,47 @@ static struct _aarch64_ctx *extension (void *p)
> return p;
> }
>
> +struct gcs_list {
> + struct gcs_list *next;
> + void *base;
> + size_t size;
> +};
> +
> +static __thread struct gcs_list *gcs_list_head = NULL;
> +
> +static void
> +record_gcs (void *base, size_t size)
> +{
> + struct gcs_list *p = malloc (sizeof *p);
> + if (p == NULL)
> + abort ();
> + p->base = base;
> + p->size = size;
> + p->next = gcs_list_head;
> + gcs_list_head = p;
> +}
> +
> +static void
> +free_gcs_list (void)
> +{
> + for (;;)
> + {
> + struct gcs_list *p = gcs_list_head;
> + if (p == NULL)
> + break;
> + gcs_list_head = p->next;
> + __munmap (p->base, p->size);
> + free (p);
> + }
> +}
> +
> +/* Called during thread shutdown to free resources. */
> +void
> +__libc_aarch64_thread_freeres (void)
> +{
> + free_gcs_list ();
> +}
> +
> #ifndef __NR_map_shadow_stack
> # define __NR_map_shadow_stack 453
> #endif
> @@ -58,6 +101,9 @@ alloc_makecontext_gcs (size_t stack_size)
> if (base == (void *) -1)
> /* ENOSYS, bad size or OOM. */
> abort ();
> +
> + record_gcs (base, size);
> +
I think this will make makecontext non async-signal-safe when GCS is used,
when we explict document it as AS-Safe and AC-Safe. I think using
internal_signal_block_all/internal_signal_restore_set would be suffice,
but it is also a performance regression.
> uint64_t *gcsp = (uint64_t *) ((char *) base + size);
> /* Skip end of GCS token. */
> gcsp--;
> @@ -69,6 +115,25 @@ alloc_makecontext_gcs (size_t stack_size)
> return gcsp + 1;
> }
>
> +void
> +__free_makecontext_gcs (void *gcs)
> +{
> + struct gcs_list *p = gcs_list_head;
> + struct gcs_list **q = &gcs_list_head;
> + for (;;)
> + {
> + if (p == NULL)
> + abort ();
> + if (gcs == p->base + p->size - 8)
> + break;
> + q = &p->next;
> + p = p->next;
> + }
> + *q = p->next;
> + __munmap (p->base, p->size);
> + free (p);
> +}
> +
> /* makecontext sets up a stack and the registers for the
> user context. The stack looks like this:
>
> diff --git a/sysdeps/unix/sysv/linux/aarch64/setcontext.S b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
> index 695fc5b9b5..f926a5dd84 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/setcontext.S
> +++ b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
> @@ -34,6 +34,9 @@
> .text
>
> ENTRY (__setcontext)
> + /* If x10 is set then old GCS is freed. */
> + mov x10, 0
> +__setcontext_internal:
> PTR_ARG (0)
> /* Save a copy of UCP. */
> mov x9, x0
> @@ -145,7 +148,8 @@ ENTRY (__setcontext)
> ldr x3, [x2, #oGCSPR]
> MRS_GCSPR (x2)
> mov x4, x3
> - /* x2: GCSPR now. x3, x4: target GCSPR. x5, x6: tmp regs. */
> + mov x1, x2
> + /* x1, x2: GCSPR now. x3, x4: target GCSPR. x5, x6: tmp regs. */
> L(gcs_scan):
> cmp x2, x4
> b.eq L(gcs_pop)
> @@ -162,10 +166,18 @@ L(gcs_switch):
> GCSSS2 (xzr)
> L(gcs_pop):
> cmp x2, x3
> - b.eq L(gcs_done)
> + b.eq L(gcs_free_old)
> GCSPOPM (xzr)
> add x2, x2, 8
> b L(gcs_pop)
> +L(gcs_free_old):
> + cbz x10, L(gcs_done)
> + mov x28, x0
> + mov x0, x1
> + bl __free_makecontext_gcs
> + mov x0, x28
> + ldp x28, x29, [x0, oX0 + 28 * SZREG]
> + ldr x30, [x0, oX0 + 30 * SZREG]
> L(gcs_done):
>
> 2:
> @@ -186,6 +198,7 @@ ENTRY (__startcontext)
> cfi_undefined (x30)
> blr x20
> mov x0, x19
> - cbnz x0, __setcontext
> + mov x10, 1
> + cbnz x0, __setcontext_internal
> 1: b HIDDEN_JUMPTARGET (exit)
> END (__startcontext)
> diff --git a/sysdeps/unix/sysv/linux/aarch64/sysdep.h b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
> index b813805931..e2fc7e21a6 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/sysdep.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
> @@ -29,8 +29,12 @@
>
> #include <tls.h>
>
> -/* In order to get __set_errno() definition in INLINE_SYSCALL. */
> #ifndef __ASSEMBLER__
> +/* Thread cleanup function. */
> +#define ARCH_THREAD_FREERES __libc_aarch64_thread_freeres
> +void __libc_aarch64_thread_freeres (void) attribute_hidden;
> +
> +/* In order to get __set_errno() definition in INLINE_SYSCALL. */
> #include <errno.h>
> #endif
>
On Tue, Jan 07, 2025 at 01:36:39PM -0300, Adhemerval Zanella Netto wrote:
>
>
> > +/* From libc.so, arch specific. */
> > +extern void ARCH_THREAD_FREERES (void) attribute_hidden;
> >
>
> This seems strange if ARCH_THREAD_FREERES is not defined, although I am
> not sure if this is a problem.
I've been waiting for feedback from Carlos on this change. There is probably
a better way to do this without #ifdef but it's not obvious to me.
> > +/* Define empty function if no arch-specific clean-up
> > + function has been defined. */
> > +#ifndef ARCH_THREAD_FREERES
> > +void __always_inline
> > +__libc_arch_thread_freeres (void) {}
> > +#define ARCH_THREAD_FREERES __libc_arch_thread_freeres
> > +#endif
> > +
>
> Do we need the __always_inline to force compiler to optimize it away?
Yes
> > @@ -58,6 +101,9 @@ alloc_makecontext_gcs (size_t stack_size)
> > if (base == (void *) -1)
> > /* ENOSYS, bad size or OOM. */
> > abort ();
> > +
> > + record_gcs (base, size);
> > +
>
> I think this will make makecontext non async-signal-safe when GCS is used,
> when we explict document it as AS-Safe and AC-Safe. I think using
> internal_signal_block_all/internal_signal_restore_set would be suffice,
> but it is also a performance regression.
Do you refer to the call to malloc() in record_gcs()?
We need to keep track of shadow stacks allocated via map_shadow_stack()
when a new context is created with makecontext() so that we can munmap
it when it is no longer required.
I think this is not on a performance critical path, so we can add the
internal_signal_block_all / internal_signal_restore_set pair. Could you
recommend where it would be best to use it?
Thanks,
Yury
On 09/01/25 11:39, Yury Khrustalev wrote:
> On Tue, Jan 07, 2025 at 01:36:39PM -0300, Adhemerval Zanella Netto wrote:
>>
>>
>>> +/* From libc.so, arch specific. */
>>> +extern void ARCH_THREAD_FREERES (void) attribute_hidden;
>>>
>>
>> This seems strange if ARCH_THREAD_FREERES is not defined, although I am
>> not sure if this is a problem.
>
> I've been waiting for feedback from Carlos on this change. There is probably
> a better way to do this without #ifdef but it's not obvious to me.
>
>>> +/* Define empty function if no arch-specific clean-up
>>> + function has been defined. */
>>> +#ifndef ARCH_THREAD_FREERES
>>> +void __always_inline
>>> +__libc_arch_thread_freeres (void) {}
>>> +#define ARCH_THREAD_FREERES __libc_arch_thread_freeres
>>> +#endif
>>> +
>>
>> Do we need the __always_inline to force compiler to optimize it away?
>
> Yes
>
>>> @@ -58,6 +101,9 @@ alloc_makecontext_gcs (size_t stack_size)
>>> if (base == (void *) -1)
>>> /* ENOSYS, bad size or OOM. */
>>> abort ();
>>> +
>>> + record_gcs (base, size);
>>> +
>>
>> I think this will make makecontext non async-signal-safe when GCS is used,
>> when we explict document it as AS-Safe and AC-Safe. I think using
>> internal_signal_block_all/internal_signal_restore_set would be suffice,
>> but it is also a performance regression.
>
> Do you refer to the call to malloc() in record_gcs()?
>
> We need to keep track of shadow stacks allocated via map_shadow_stack()
> when a new context is created with makecontext() so that we can munmap
> it when it is no longer required.
>
> I think this is not on a performance critical path, so we can add the
> internal_signal_block_all / internal_signal_restore_set pair. Could you
> recommend where it would be best to use it?
Yes, glibc malloc is not async-signal-safe and thus calling on makecontext
make it async-signal-unsafe (similar for setcontext, which ends up calling
free).
I am not sure if you can really use malloc here, since a makecontext call
potentially interrupt malloc itself. We had the same issue on the getrandom
vDSO call; where we ended up using mmap directly instead.
You will still need to block/unblock signal to avoid reentrant signal
handlers; and there is the consideration of extra the mmap overhead per thread.
Another possibility, which I think would be simpler, is to add a TCB buffer
of N entries and use instead of a linked-list. It would add an small
memory overhead on each thread, and it would limit the number of in-flight
makecontext a thread can make; but at least makecontext can to return
ENOMEM if it can not create a new context, and it way simpler than adding
the mmap-allocator.
On Thu, Jan 09, 2025 at 03:53:14PM -0300, Adhemerval Zanella Netto wrote:
>
> >> I think this will make makecontext non async-signal-safe when GCS is used,
> >> when we explict document it as AS-Safe and AC-Safe. I think using
> >> internal_signal_block_all/internal_signal_restore_set would be suffice,
> >> but it is also a performance regression.
> >
> > Do you refer to the call to malloc() in record_gcs()?
> >
> > We need to keep track of shadow stacks allocated via map_shadow_stack()
> > when a new context is created with makecontext() so that we can munmap
> > it when it is no longer required.
> >
> > I think this is not on a performance critical path, so we can add the
> > internal_signal_block_all / internal_signal_restore_set pair. Could you
> > recommend where it would be best to use it?
>
> Yes, glibc malloc is not async-signal-safe and thus calling on makecontext
> make it async-signal-unsafe (similar for setcontext, which ends up calling
> free).
>
> I am not sure if you can really use malloc here, since a makecontext call
> potentially interrupt malloc itself. We had the same issue on the getrandom
> vDSO call; where we ended up using mmap directly instead.
>
> You will still need to block/unblock signal to avoid reentrant signal
> handlers; and there is the consideration of extra the mmap overhead per thread.
>
> Another possibility, which I think would be simpler, is to add a TCB buffer
> of N entries and use instead of a linked-list. It would add an small
> memory overhead on each thread, and it would limit the number of in-flight
> makecontext a thread can make; but at least makecontext can to return
> ENOMEM if it can not create a new context, and it way simpler than adding
> the mmap-allocator.
makecontext() does not return a value and doesn't set errno, so we probably
cannot "return" ENOMEM. In existing code, when makecontext() failes, it calls
abort(). Would the same be appropriate when we reach max number of context
instances per thread?
Another option is to silently stop freeing shadow stacks after reaching max
number. This will allow code that needs more makecontext's to continue with
memory leak being downside (potentially leading to ENOMEM).
Kind regards,
Yury
On 10/01/25 10:01, Yury Khrustalev wrote:
> On Thu, Jan 09, 2025 at 03:53:14PM -0300, Adhemerval Zanella Netto wrote:
>>
>>>> I think this will make makecontext non async-signal-safe when GCS is used,
>>>> when we explict document it as AS-Safe and AC-Safe. I think using
>>>> internal_signal_block_all/internal_signal_restore_set would be suffice,
>>>> but it is also a performance regression.
>>>
>>> Do you refer to the call to malloc() in record_gcs()?
>>>
>>> We need to keep track of shadow stacks allocated via map_shadow_stack()
>>> when a new context is created with makecontext() so that we can munmap
>>> it when it is no longer required.
>>>
>>> I think this is not on a performance critical path, so we can add the
>>> internal_signal_block_all / internal_signal_restore_set pair. Could you
>>> recommend where it would be best to use it?
>>
>> Yes, glibc malloc is not async-signal-safe and thus calling on makecontext
>> make it async-signal-unsafe (similar for setcontext, which ends up calling
>> free).
>>
>> I am not sure if you can really use malloc here, since a makecontext call
>> potentially interrupt malloc itself. We had the same issue on the getrandom
>> vDSO call; where we ended up using mmap directly instead.
>>
>> You will still need to block/unblock signal to avoid reentrant signal
>> handlers; and there is the consideration of extra the mmap overhead per thread.
>>
>> Another possibility, which I think would be simpler, is to add a TCB buffer
>> of N entries and use instead of a linked-list. It would add an small
>> memory overhead on each thread, and it would limit the number of in-flight
>> makecontext a thread can make; but at least makecontext can to return
>> ENOMEM if it can not create a new context, and it way simpler than adding
>> the mmap-allocator.
>
> makecontext() does not return a value and doesn't set errno, so we probably
> cannot "return" ENOMEM. In existing code, when makecontext() failes, it calls
> abort(). Would the same be appropriate when we reach max number of context
> instances per thread?
Sigh, you are right (I confused it with setcontext for some reason). I think
it would be the best option and x86_64 already does this if
__allocate_shadow_stack fails.
>
> Another option is to silently stop freeing shadow stacks after reaching max
> number. This will allow code that needs more makecontext's to continue with
> memory leak being downside (potentially leading to ENOMEM).
I don't think a silent leak is the best option, specially one that user can
not take some course of action without a lot of hackery.
Hi Adhemerval,
On Fri, Jan 10, 2025 at 10:09:24AM -0300, Adhemerval Zanella Netto wrote:
>
>
> On 10/01/25 10:01, Yury Khrustalev wrote:
> > On Thu, Jan 09, 2025 at 03:53:14PM -0300, Adhemerval Zanella Netto wrote:
> >>
> >>>> I think this will make makecontext non async-signal-safe when GCS is used,
> >>>> when we explict document it as AS-Safe and AC-Safe. I think using
> >>>> internal_signal_block_all/internal_signal_restore_set would be suffice,
> >>>> but it is also a performance regression.
> >>>
> >>> Do you refer to the call to malloc() in record_gcs()?
> >>>
> >>> We need to keep track of shadow stacks allocated via map_shadow_stack()
> >>> when a new context is created with makecontext() so that we can munmap
> >>> it when it is no longer required.
> >>>
> >>> I think this is not on a performance critical path, so we can add the
> >>> internal_signal_block_all / internal_signal_restore_set pair. Could you
> >>> recommend where it would be best to use it?
> >>
> >> Yes, glibc malloc is not async-signal-safe and thus calling on makecontext
> >> make it async-signal-unsafe (similar for setcontext, which ends up calling
> >> free).
> >>
> >> I am not sure if you can really use malloc here, since a makecontext call
> >> potentially interrupt malloc itself. We had the same issue on the getrandom
> >> vDSO call; where we ended up using mmap directly instead.
> >>
> >> You will still need to block/unblock signal to avoid reentrant signal
> >> handlers; and there is the consideration of extra the mmap overhead per thread.
> >>
> >> Another possibility, which I think would be simpler, is to add a TCB buffer
> >> of N entries and use instead of a linked-list. It would add an small
> >> memory overhead on each thread, and it would limit the number of in-flight
> >> makecontext a thread can make; but at least makecontext can to return
> >> ENOMEM if it can not create a new context, and it way simpler than adding
> >> the mmap-allocator.
> >
> > makecontext() does not return a value and doesn't set errno, so we probably
> > cannot "return" ENOMEM. In existing code, when makecontext() failes, it calls
> > abort(). Would the same be appropriate when we reach max number of context
> > instances per thread?
>
> Sigh, you are right (I confused it with setcontext for some reason). I think
> it would be the best option and x86_64 already does this if
> __allocate_shadow_stack fails.
>
> >
> > Another option is to silently stop freeing shadow stacks after reaching max
> > number. This will allow code that needs more makecontext's to continue with
> > memory leak being downside (potentially leading to ENOMEM).
>
> I don't think a silent leak is the best option, specially one that user can
> not take some course of action without a lot of hackery.
>
Looking deeper into this, I realise that a proper solution might take some time
to implement and test and it will likley be quite intrusive to the existing code
and may potentially affect common code used by other targets.
As an option, would you think it might be acceptable not to deallocate shadow
stacks allocated by makecontext? We can address this after the upcoming Glibc
release with a proper solution.
Thanks,
Yury
On 13/01/25 12:52, Yury Khrustalev wrote:
> Hi Adhemerval,
>
> On Fri, Jan 10, 2025 at 10:09:24AM -0300, Adhemerval Zanella Netto wrote:
>>
>>
>> On 10/01/25 10:01, Yury Khrustalev wrote:
>>> On Thu, Jan 09, 2025 at 03:53:14PM -0300, Adhemerval Zanella Netto wrote:
>>>>
>>>>>> I think this will make makecontext non async-signal-safe when GCS is used,
>>>>>> when we explict document it as AS-Safe and AC-Safe. I think using
>>>>>> internal_signal_block_all/internal_signal_restore_set would be suffice,
>>>>>> but it is also a performance regression.
>>>>>
>>>>> Do you refer to the call to malloc() in record_gcs()?
>>>>>
>>>>> We need to keep track of shadow stacks allocated via map_shadow_stack()
>>>>> when a new context is created with makecontext() so that we can munmap
>>>>> it when it is no longer required.
>>>>>
>>>>> I think this is not on a performance critical path, so we can add the
>>>>> internal_signal_block_all / internal_signal_restore_set pair. Could you
>>>>> recommend where it would be best to use it?
>>>>
>>>> Yes, glibc malloc is not async-signal-safe and thus calling on makecontext
>>>> make it async-signal-unsafe (similar for setcontext, which ends up calling
>>>> free).
>>>>
>>>> I am not sure if you can really use malloc here, since a makecontext call
>>>> potentially interrupt malloc itself. We had the same issue on the getrandom
>>>> vDSO call; where we ended up using mmap directly instead.
>>>>
>>>> You will still need to block/unblock signal to avoid reentrant signal
>>>> handlers; and there is the consideration of extra the mmap overhead per thread.
>>>>
>>>> Another possibility, which I think would be simpler, is to add a TCB buffer
>>>> of N entries and use instead of a linked-list. It would add an small
>>>> memory overhead on each thread, and it would limit the number of in-flight
>>>> makecontext a thread can make; but at least makecontext can to return
>>>> ENOMEM if it can not create a new context, and it way simpler than adding
>>>> the mmap-allocator.
>>>
>>> makecontext() does not return a value and doesn't set errno, so we probably
>>> cannot "return" ENOMEM. In existing code, when makecontext() failes, it calls
>>> abort(). Would the same be appropriate when we reach max number of context
>>> instances per thread?
>>
>> Sigh, you are right (I confused it with setcontext for some reason). I think
>> it would be the best option and x86_64 already does this if
>> __allocate_shadow_stack fails.
>>
>>>
>>> Another option is to silently stop freeing shadow stacks after reaching max
>>> number. This will allow code that needs more makecontext's to continue with
>>> memory leak being downside (potentially leading to ENOMEM).
>>
>> I don't think a silent leak is the best option, specially one that user can
>> not take some course of action without a lot of hackery.
>>
>
> Looking deeper into this, I realise that a proper solution might take some time
> to implement and test and it will likley be quite intrusive to the existing code
> and may potentially affect common code used by other targets.
>
> As an option, would you think it might be acceptable not to deallocate shadow
> stacks allocated by makecontext? We can address this after the upcoming Glibc
> release with a proper solution.
The mmap leaking is a QoI problem, but I think it should not add security issue.
And it should not happen on any current deployment, and since you are the aarch64
maintainer I won't block it.
>
> Thanks,
> Yury
>
Maybe something as below (based on top of the patchset and completely
untested on a GCS enabled kernel).
Ideally the tls-internal-struct.h should be refactored to use the common
definition, but copying for now should be ok. Since __libc_aarch64_thread_freeres
is called on every thread deallocation, there is no need to reset the TCB state.
I think it might require some atfork handling, but I am not sure.
---
diff --git a/sysdeps/aarch64/tls-internal-struct.h b/sysdeps/aarch64/tls-internal-struct.h
new file mode 100644
index 0000000000..fd82adf30e
--- /dev/null
+++ b/sysdeps/aarch64/tls-internal-struct.h
@@ -0,0 +1,36 @@
+/* Per-thread state. Linux/AArch64 version.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _TLS_INTERNAL_STRUCT_H
+#define _TLS_INTERNAL_STRUCT_H 1
+
+struct gcs_record_t
+{
+ void *base;
+ size_t size;
+};
+#define MAX_GCS_RECORD 8
+
+struct tls_internal_t
+{
+ char *strsignal_buf;
+ char *strerror_l_buf;
+ struct gcs_record_t gcs_record[MAX_GCS_RECORD];
+};
+
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/makecontext.c b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
index 8a91fed3fc..f3fd7515e2 100644
--- a/sysdeps/unix/sysv/linux/aarch64/makecontext.c
+++ b/sysdeps/unix/sysv/linux/aarch64/makecontext.c
@@ -23,6 +23,7 @@
#include <stdlib.h>
#include <ucontext.h>
#include <sys/mman.h>
+#include <tls-internal.h>
#include "aarch64-gcs.h"
#define GCS_MAGIC 0x47435300
@@ -32,45 +33,21 @@ static struct _aarch64_ctx *extension (void *p)
return p;
}
-struct gcs_list {
- struct gcs_list *next;
- void *base;
- size_t size;
-};
-
-static __thread struct gcs_list *gcs_list_head = NULL;
-
-static void
-record_gcs (void *base, size_t size)
+static inline void
+free_record_gcs (struct gcs_record_t *r)
{
- struct gcs_list *p = malloc (sizeof *p);
- if (p == NULL)
- abort ();
- p->base = base;
- p->size = size;
- p->next = gcs_list_head;
- gcs_list_head = p;
-}
-
-static void
-free_gcs_list (void)
-{
- for (;;)
- {
- struct gcs_list *p = gcs_list_head;
- if (p == NULL)
- break;
- gcs_list_head = p->next;
- __munmap (p->base, p->size);
- free (p);
- }
+ __munmap (r->base, r->size);
+ r->base = NULL;
+ r->size = 0;
}
/* Called during thread shutdown to free resources. */
void
__libc_aarch64_thread_freeres (void)
{
- free_gcs_list ();
+ struct tls_internal_t *tls_internal = __glibc_tls_internal ();
+ for (int i = 0; i < MAX_GCS_RECORD; i++)
+ free_record_gcs (&tls_internal->gcs_record[i]);
}
static void *
@@ -82,27 +59,51 @@ alloc_makecontext_gcs (size_t stack_size)
if (gcsp == NULL)
/* ENOSYS, bad size or OOM. */
abort ();
- record_gcs (base, size);
- return gcsp;
+
+ void *r = NULL;
+ internal_sigset_t set;
+ internal_signal_block_all (&set);
+ struct tls_internal_t *tls_internal = __glibc_tls_internal ();
+ for (int i = 0; i < MAX_GCS_RECORD; i++)
+ if (tls_internal->gcs_record[i].base != NULL)
+ {
+ tls_internal->gcs_record[i].base = base;
+ tls_internal->gcs_record[i].size = size;
+ r = gcsp;
+ break;
+ }
+ internal_signal_restore_set (&set);
+ /* No free record space available. */
+ if (r == NULL)
+ abort ();
+ return r;
+}
+
+static inline bool
+is_gcs_record (void *gcs, const struct gcs_record_t *r)
+{
+ return gcs == r->base + r->size - 8;
}
void
__free_makecontext_gcs (void *gcs)
{
- struct gcs_list *p = gcs_list_head;
- struct gcs_list **q = &gcs_list_head;
- for (;;)
- {
- if (p == NULL)
- abort ();
- if (gcs == p->base + p->size - 8)
- break;
- q = &p->next;
- p = p->next;
- }
- *q = p->next;
- __munmap (p->base, p->size);
- free (p);
+ bool ok = false;
+ internal_sigset_t set;
+ internal_signal_block_all (&set);
+ struct tls_internal_t *tls_internal = __glibc_tls_internal ();
+ for (int i = 0; i < MAX_GCS_RECORD; i++)
+ if (is_gcs_record (gcs, &tls_internal->gcs_record[i]))
+ {
+ free_record_gcs (&tls_internal->gcs_record[i]);
+ ok = true;
+ return;
+ }
+ internal_signal_restore_set (&set);
+
+ /* Record not found. */
+ if (!ok)
+ abort ();
}
/* makecontext sets up a stack and the registers for the
Hi Adhemerval,
On Mon, Jan 13, 2025 at 02:45:03PM -0300, Adhemerval Zanella Netto wrote:
>
> > Looking deeper into this, I realise that a proper solution might take some time
> > to implement and test and it will likley be quite intrusive to the existing code
> > and may potentially affect common code used by other targets.
> >
> > As an option, would you think it might be acceptable not to deallocate shadow
> > stacks allocated by makecontext? We can address this after the upcoming Glibc
> > release with a proper solution.
>
> The mmap leaking is a QoI problem, but I think it should not add security issue.
> And it should not happen on any current deployment, and since you are the aarch64
> maintainer I won't block it.
I will send updated patch series shortly addressing all the comments and removing
this patch "[PATCH v7 09/23] aarch64: Try to free the GCS of makecontext" from the
series.
I will add a TODO comment in the code. However, I'd like to document this properly.
Would you recommend how we can describe the current state of makecontext() when GCS
is enabled (meaning that allocated shadow stacks will remain mapped even after a
thread using them exits)?
AFAIK, other targets don't munmap() this memory too, at least this is what I have
found in the code.
Thanks,
Yury
@@ -78,6 +78,8 @@ extern void __nss_database_freeres (void) attribute_hidden;
extern int _IO_cleanup (void) attribute_hidden;;
/* From dlfcn/dlerror.c */
extern void __libc_dlerror_result_free (void) attribute_hidden;
+/* From libc.so, arch specific. */
+extern void ARCH_THREAD_FREERES (void) attribute_hidden;
/* From either libc.so or libpthread.so */
extern void __libpthread_freeres (void) attribute_hidden;
@@ -22,6 +22,14 @@
#include <shlib-compat.h>
#include <tls-internal.h>
+/* Define empty function if no arch-specific clean-up
+ function has been defined. */
+#ifndef ARCH_THREAD_FREERES
+void __always_inline
+__libc_arch_thread_freeres (void) {}
+#define ARCH_THREAD_FREERES __libc_arch_thread_freeres
+#endif
+
/* Thread shutdown function. Note that this function must be called
for threads during shutdown for correctness reasons. Unlike
__libc_freeres, skipping calls to it is not a valid optimization.
@@ -29,6 +37,7 @@
void
__libc_thread_freeres (void)
{
+ call_function_static_weak (ARCH_THREAD_FREERES);
#if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_32)
__rpc_thread_destroy ();
#endif
@@ -20,7 +20,9 @@
#include <sysdep.h>
#include <stdarg.h>
#include <stdint.h>
+#include <stdlib.h>
#include <ucontext.h>
+#include <sys/mman.h>
#define GCS_MAGIC 0x47435300
@@ -29,6 +31,47 @@ static struct _aarch64_ctx *extension (void *p)
return p;
}
+struct gcs_list {
+ struct gcs_list *next;
+ void *base;
+ size_t size;
+};
+
+static __thread struct gcs_list *gcs_list_head = NULL;
+
+static void
+record_gcs (void *base, size_t size)
+{
+ struct gcs_list *p = malloc (sizeof *p);
+ if (p == NULL)
+ abort ();
+ p->base = base;
+ p->size = size;
+ p->next = gcs_list_head;
+ gcs_list_head = p;
+}
+
+static void
+free_gcs_list (void)
+{
+ for (;;)
+ {
+ struct gcs_list *p = gcs_list_head;
+ if (p == NULL)
+ break;
+ gcs_list_head = p->next;
+ __munmap (p->base, p->size);
+ free (p);
+ }
+}
+
+/* Called during thread shutdown to free resources. */
+void
+__libc_aarch64_thread_freeres (void)
+{
+ free_gcs_list ();
+}
+
#ifndef __NR_map_shadow_stack
# define __NR_map_shadow_stack 453
#endif
@@ -58,6 +101,9 @@ alloc_makecontext_gcs (size_t stack_size)
if (base == (void *) -1)
/* ENOSYS, bad size or OOM. */
abort ();
+
+ record_gcs (base, size);
+
uint64_t *gcsp = (uint64_t *) ((char *) base + size);
/* Skip end of GCS token. */
gcsp--;
@@ -69,6 +115,25 @@ alloc_makecontext_gcs (size_t stack_size)
return gcsp + 1;
}
+void
+__free_makecontext_gcs (void *gcs)
+{
+ struct gcs_list *p = gcs_list_head;
+ struct gcs_list **q = &gcs_list_head;
+ for (;;)
+ {
+ if (p == NULL)
+ abort ();
+ if (gcs == p->base + p->size - 8)
+ break;
+ q = &p->next;
+ p = p->next;
+ }
+ *q = p->next;
+ __munmap (p->base, p->size);
+ free (p);
+}
+
/* makecontext sets up a stack and the registers for the
user context. The stack looks like this:
@@ -34,6 +34,9 @@
.text
ENTRY (__setcontext)
+ /* If x10 is set then old GCS is freed. */
+ mov x10, 0
+__setcontext_internal:
PTR_ARG (0)
/* Save a copy of UCP. */
mov x9, x0
@@ -145,7 +148,8 @@ ENTRY (__setcontext)
ldr x3, [x2, #oGCSPR]
MRS_GCSPR (x2)
mov x4, x3
- /* x2: GCSPR now. x3, x4: target GCSPR. x5, x6: tmp regs. */
+ mov x1, x2
+ /* x1, x2: GCSPR now. x3, x4: target GCSPR. x5, x6: tmp regs. */
L(gcs_scan):
cmp x2, x4
b.eq L(gcs_pop)
@@ -162,10 +166,18 @@ L(gcs_switch):
GCSSS2 (xzr)
L(gcs_pop):
cmp x2, x3
- b.eq L(gcs_done)
+ b.eq L(gcs_free_old)
GCSPOPM (xzr)
add x2, x2, 8
b L(gcs_pop)
+L(gcs_free_old):
+ cbz x10, L(gcs_done)
+ mov x28, x0
+ mov x0, x1
+ bl __free_makecontext_gcs
+ mov x0, x28
+ ldp x28, x29, [x0, oX0 + 28 * SZREG]
+ ldr x30, [x0, oX0 + 30 * SZREG]
L(gcs_done):
2:
@@ -186,6 +198,7 @@ ENTRY (__startcontext)
cfi_undefined (x30)
blr x20
mov x0, x19
- cbnz x0, __setcontext
+ mov x10, 1
+ cbnz x0, __setcontext_internal
1: b HIDDEN_JUMPTARGET (exit)
END (__startcontext)
@@ -29,8 +29,12 @@
#include <tls.h>
-/* In order to get __set_errno() definition in INLINE_SYSCALL. */
#ifndef __ASSEMBLER__
+/* Thread cleanup function. */
+#define ARCH_THREAD_FREERES __libc_aarch64_thread_freeres
+void __libc_aarch64_thread_freeres (void) attribute_hidden;
+
+/* In order to get __set_errno() definition in INLINE_SYSCALL. */
#include <errno.h>
#endif