[v7,13/23] aarch64: Enable GCS in dynamic linked exe

Message ID 20250103154141.47731-14-yury.khrustalev@arm.com (mailing list archive)
State Superseded
Headers
Series aarch64: Add support for Guarded Control Stack extension |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Yury Khrustalev Jan. 3, 2025, 3:41 p.m. UTC
  From: Szabolcs Nagy <szabolcs.nagy@arm.com>

Use the dynamic linker start code to enable GCS in the dynamic linked
case after _dl_start returns and before _dl_start_user which marks
the point after which user code may run.

Like in the static linked case this ensures that GCS is enabled on a
top level stack frame.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
---
 sysdeps/aarch64/Makefile                |  4 +++-
 sysdeps/aarch64/dl-start.S              | 25 +++++++++++++++++++++++--
 sysdeps/aarch64/rtld-global-offsets.sym |  5 +++++
 3 files changed, 31 insertions(+), 3 deletions(-)
  

Comments

Adhemerval Zanella Netto Jan. 7, 2025, 4:50 p.m. UTC | #1
On 03/01/25 12:41, Yury Khrustalev wrote:
> From: Szabolcs Nagy <szabolcs.nagy@arm.com>
> 
> Use the dynamic linker start code to enable GCS in the dynamic linked
> case after _dl_start returns and before _dl_start_user which marks
> the point after which user code may run.
> 
> Like in the static linked case this ensures that GCS is enabled on a
> top level stack frame.
> 
> Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> ---
>  sysdeps/aarch64/Makefile                |  4 +++-
>  sysdeps/aarch64/dl-start.S              | 25 +++++++++++++++++++++++--
>  sysdeps/aarch64/rtld-global-offsets.sym |  5 +++++
>  3 files changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
> index 141d7d9cc2..ca8b96f550 100644
> --- a/sysdeps/aarch64/Makefile
> +++ b/sysdeps/aarch64/Makefile
> @@ -35,7 +35,9 @@ endif
>  ifeq ($(subdir),elf)
>  sysdep-rtld-routines += dl-start
>  sysdep-dl-routines += tlsdesc dl-tlsdesc
> -gen-as-const-headers += dl-link.sym
> +gen-as-const-headers += \
> +  dl-link.sym \
> +  rtld-global-offsets.sym
>  
>  tests-internal += tst-ifunc-arg-1 tst-ifunc-arg-2
>  
> diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S
> index 3b8eff0acd..3f9faf9d57 100644
> --- a/sysdeps/aarch64/dl-start.S
> +++ b/sysdeps/aarch64/dl-start.S
> @@ -18,6 +18,7 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <sysdep.h>
> +#include <rtld-global-offsets.h>
>  
>  ENTRY (_start)
>  	/* Create an initial frame with 0 LR and FP */
> @@ -25,11 +26,32 @@ ENTRY (_start)
>  	mov	x29, #0
>  	mov	x30, #0
>  
> +	/* Load and relocate all library dependencies.  */
>  	mov	x0, sp
>  	PTR_ARG (0)
>  	bl	_dl_start
>  	/* Returns user entry point in x0.  */
>  	mov	PTR_REG (21), PTR_REG (0)
> +
> +	/* Use GL(dl_aarch64_gcs) to set the shadow stack status.  */
> +	adrp	x16, _rtld_local
> +	add	PTR_REG (16), PTR_REG (16), :lo12:_rtld_local
> +	ldr	x1, [x16, GL_DL_AARCH64_GCS_OFFSET]
> +	cbz	x1, L(skip_gcs_enable)
> +
> +	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
> +	   LD_AUDIT hooks may run before, but should not create threads.  */
> +#define PR_SET_SHADOW_STACK_STATUS  75
> +#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
> +	mov	x0, PR_SET_SHADOW_STACK_STATUS
> +	mov	x1, PR_SHADOW_STACK_ENABLE
> +	mov	x2, 0
> +	mov	x3, 0
> +	mov	x4, 0
> +	mov	x8, #SYS_ify(prctl)
> +	svc	0x0

Should it handle errors, for the case prctl is filtered or blocked?

> +L(skip_gcs_enable):
> +
>  .globl _dl_start_user
>  .type _dl_start_user, %function
>  _dl_start_user:
> @@ -40,8 +62,7 @@ _dl_start_user:
>  	/* Compute envp.  */
>  	add	PTR_REG (3), PTR_REG (2), PTR_REG (1), lsl PTR_LOG_SIZE
>  	add	PTR_REG (3), PTR_REG (3), PTR_SIZE
> -	adrp	x16, _rtld_local
> -	add	PTR_REG (16), PTR_REG (16), :lo12:_rtld_local
> +	/* Run the init functions of the loaded modules.  */
>  	ldr	PTR_REG (0), [x16]
>  	bl	_dl_init
>  	/* Load the finalizer function.  */
> diff --git a/sysdeps/aarch64/rtld-global-offsets.sym b/sysdeps/aarch64/rtld-global-offsets.sym
> index 23cdaf7d9e..6c0690bb95 100644
> --- a/sysdeps/aarch64/rtld-global-offsets.sym
> +++ b/sysdeps/aarch64/rtld-global-offsets.sym
> @@ -3,8 +3,13 @@
>  #include <ldsodefs.h>
>  
>  #define GLRO_offsetof(name) offsetof (struct rtld_global_ro, _##name)
> +#define GL_offsetof(name) offsetof (struct rtld_global, _##name)
>  
>  -- Offsets of _rtld_global_ro in libc.so
>  
>  GLRO_DL_HWCAP_OFFSET	GLRO_offsetof (dl_hwcap)
>  GLRO_DL_HWCAP2_OFFSET	GLRO_offsetof (dl_hwcap2)
> +
> +-- Offsets of _rtld_global in libc.so
> +
> +GL_DL_AARCH64_GCS_OFFSET	GL_offsetof (dl_aarch64_gcs)
  
Yury Khrustalev Jan. 8, 2025, 1:52 p.m. UTC | #2
On Tue, Jan 07, 2025 at 01:50:41PM -0300, Adhemerval Zanella Netto wrote:
> 
> > +	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
> > +	   LD_AUDIT hooks may run before, but should not create threads.  */
> > +#define PR_SET_SHADOW_STACK_STATUS  75
> > +#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
> > +	mov	x0, PR_SET_SHADOW_STACK_STATUS
> > +	mov	x1, PR_SHADOW_STACK_ENABLE
> > +	mov	x2, 0
> > +	mov	x3, 0
> > +	mov	x4, 0
> > +	mov	x8, #SYS_ify(prctl)
> > +	svc	0x0
> 
> Should it handle errors, for the case prctl is filtered or blocked?

There is no need to handle errors here. If this syscall doesn't succeed, it
would be the same as if we didn't make it at all: GCS will remain disabled.

Having a check for error here would be an unnecessary overhead.

Thanks,
Yury
  
Adhemerval Zanella Netto Jan. 8, 2025, 2:04 p.m. UTC | #3
On 08/01/25 10:52, Yury Khrustalev wrote:
> On Tue, Jan 07, 2025 at 01:50:41PM -0300, Adhemerval Zanella Netto wrote:
>>
>>> +	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
>>> +	   LD_AUDIT hooks may run before, but should not create threads.  */
>>> +#define PR_SET_SHADOW_STACK_STATUS  75
>>> +#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
>>> +	mov	x0, PR_SET_SHADOW_STACK_STATUS
>>> +	mov	x1, PR_SHADOW_STACK_ENABLE
>>> +	mov	x2, 0
>>> +	mov	x3, 0
>>> +	mov	x4, 0
>>> +	mov	x8, #SYS_ify(prctl)
>>> +	svc	0x0
>>
>> Should it handle errors, for the case prctl is filtered or blocked?
> 
> There is no need to handle errors here. If this syscall doesn't succeed, it
> would be the same as if we didn't make it at all: GCS will remain disabled.
> 
> Having a check for error here would be an unnecessary overhead.

But this is aaik not how other security hardening work, like BTI where 
_dl_bti_check will _dl_fatal_printf if bti can not be enabled.  Without a 
check, GCS is taking as a hint, not as enforcement
  
Yury Khrustalev Jan. 8, 2025, 4:54 p.m. UTC | #4
On Wed, Jan 08, 2025 at 11:04:48AM -0300, Adhemerval Zanella Netto wrote:
> 
> 
> On 08/01/25 10:52, Yury Khrustalev wrote:
> > On Tue, Jan 07, 2025 at 01:50:41PM -0300, Adhemerval Zanella Netto wrote:
> >>
> >>> +	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
> >>> +	   LD_AUDIT hooks may run before, but should not create threads.  */
> >>> +#define PR_SET_SHADOW_STACK_STATUS  75
> >>> +#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
> >>> +	mov	x0, PR_SET_SHADOW_STACK_STATUS
> >>> +	mov	x1, PR_SHADOW_STACK_ENABLE
> >>> +	mov	x2, 0
> >>> +	mov	x3, 0
> >>> +	mov	x4, 0
> >>> +	mov	x8, #SYS_ify(prctl)
> >>> +	svc	0x0
> >>
> >> Should it handle errors, for the case prctl is filtered or blocked?
> > 
> > There is no need to handle errors here. If this syscall doesn't succeed, it
> > would be the same as if we didn't make it at all: GCS will remain disabled.
> > 
> > Having a check for error here would be an unnecessary overhead.
> 
> But this is aaik not how other security hardening work, like BTI where 
> _dl_bti_check will _dl_fatal_printf if bti can not be enabled.  Without a 
> check, GCS is taking as a hint, not as enforcement

The _dl_bti_check and _dl_gcs_check functions do something different: they check
consistency of markings in the binaries and dependencies. Here we are trying to
enable GCS support (in case of BTI there is no need for this as this is handled
by kernel).

We could check the value returned from syscall and call a function that would
print error message and exit using _dl_fatal_printf.

I suppose the same should be done for static executables? We'd need to check
return value of the prctl syscall in aarch64_libc_setup_tls. Is it OK to call
_dl_fatal_printf in aarch64_libc_setup_tls?

Thanks,
Yury
  
Adhemerval Zanella Netto Jan. 8, 2025, 5:11 p.m. UTC | #5
On 08/01/25 13:54, Yury Khrustalev wrote:
> On Wed, Jan 08, 2025 at 11:04:48AM -0300, Adhemerval Zanella Netto wrote:
>>
>>
>> On 08/01/25 10:52, Yury Khrustalev wrote:
>>> On Tue, Jan 07, 2025 at 01:50:41PM -0300, Adhemerval Zanella Netto wrote:
>>>>
>>>>> +	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
>>>>> +	   LD_AUDIT hooks may run before, but should not create threads.  */
>>>>> +#define PR_SET_SHADOW_STACK_STATUS  75
>>>>> +#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
>>>>> +	mov	x0, PR_SET_SHADOW_STACK_STATUS
>>>>> +	mov	x1, PR_SHADOW_STACK_ENABLE
>>>>> +	mov	x2, 0
>>>>> +	mov	x3, 0
>>>>> +	mov	x4, 0
>>>>> +	mov	x8, #SYS_ify(prctl)
>>>>> +	svc	0x0
>>>>
>>>> Should it handle errors, for the case prctl is filtered or blocked?
>>>
>>> There is no need to handle errors here. If this syscall doesn't succeed, it
>>> would be the same as if we didn't make it at all: GCS will remain disabled.
>>>
>>> Having a check for error here would be an unnecessary overhead.
>>
>> But this is aaik not how other security hardening work, like BTI where 
>> _dl_bti_check will _dl_fatal_printf if bti can not be enabled.  Without a 
>> check, GCS is taking as a hint, not as enforcement
> 
> The _dl_bti_check and _dl_gcs_check functions do something different: they check
> consistency of markings in the binaries and dependencies. Here we are trying to
> enable GCS support (in case of BTI there is no need for this as this is handled
> by kernel).

Yes, my point is kernel enforces BTI/PAC during process execution; while for GCS
this should be enforced by the program loader.  I am not sure if this is really
a security issue, since to force a prctl failure here would require a syscall
interception; but it fells strange that a misconfigured environment (such as
old container runtime or alike) can disable GCS usage.

> 
> We could check the value returned from syscall and call a function that would
> print error message and exit using _dl_fatal_printf.
> 
> I suppose the same should be done for static executables? We'd need to check
> return value of the prctl syscall in aarch64_libc_setup_tls. Is it OK to call
> _dl_fatal_printf in aarch64_libc_setup_tls?

I think it should be, at least with a rapid test by issuing _dl_fatal_printf
unconditionally, static tests does print the message.

> 
> Thanks,
> Yury
>
  

Patch

diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile
index 141d7d9cc2..ca8b96f550 100644
--- a/sysdeps/aarch64/Makefile
+++ b/sysdeps/aarch64/Makefile
@@ -35,7 +35,9 @@  endif
 ifeq ($(subdir),elf)
 sysdep-rtld-routines += dl-start
 sysdep-dl-routines += tlsdesc dl-tlsdesc
-gen-as-const-headers += dl-link.sym
+gen-as-const-headers += \
+  dl-link.sym \
+  rtld-global-offsets.sym
 
 tests-internal += tst-ifunc-arg-1 tst-ifunc-arg-2
 
diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S
index 3b8eff0acd..3f9faf9d57 100644
--- a/sysdeps/aarch64/dl-start.S
+++ b/sysdeps/aarch64/dl-start.S
@@ -18,6 +18,7 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <rtld-global-offsets.h>
 
 ENTRY (_start)
 	/* Create an initial frame with 0 LR and FP */
@@ -25,11 +26,32 @@  ENTRY (_start)
 	mov	x29, #0
 	mov	x30, #0
 
+	/* Load and relocate all library dependencies.  */
 	mov	x0, sp
 	PTR_ARG (0)
 	bl	_dl_start
 	/* Returns user entry point in x0.  */
 	mov	PTR_REG (21), PTR_REG (0)
+
+	/* Use GL(dl_aarch64_gcs) to set the shadow stack status.  */
+	adrp	x16, _rtld_local
+	add	PTR_REG (16), PTR_REG (16), :lo12:_rtld_local
+	ldr	x1, [x16, GL_DL_AARCH64_GCS_OFFSET]
+	cbz	x1, L(skip_gcs_enable)
+
+	/* Enable GCS before user code runs.  Note that IFUNC resolvers and
+	   LD_AUDIT hooks may run before, but should not create threads.  */
+#define PR_SET_SHADOW_STACK_STATUS  75
+#define PR_SHADOW_STACK_ENABLE      (1UL << 0)
+	mov	x0, PR_SET_SHADOW_STACK_STATUS
+	mov	x1, PR_SHADOW_STACK_ENABLE
+	mov	x2, 0
+	mov	x3, 0
+	mov	x4, 0
+	mov	x8, #SYS_ify(prctl)
+	svc	0x0
+L(skip_gcs_enable):
+
 .globl _dl_start_user
 .type _dl_start_user, %function
 _dl_start_user:
@@ -40,8 +62,7 @@  _dl_start_user:
 	/* Compute envp.  */
 	add	PTR_REG (3), PTR_REG (2), PTR_REG (1), lsl PTR_LOG_SIZE
 	add	PTR_REG (3), PTR_REG (3), PTR_SIZE
-	adrp	x16, _rtld_local
-	add	PTR_REG (16), PTR_REG (16), :lo12:_rtld_local
+	/* Run the init functions of the loaded modules.  */
 	ldr	PTR_REG (0), [x16]
 	bl	_dl_init
 	/* Load the finalizer function.  */
diff --git a/sysdeps/aarch64/rtld-global-offsets.sym b/sysdeps/aarch64/rtld-global-offsets.sym
index 23cdaf7d9e..6c0690bb95 100644
--- a/sysdeps/aarch64/rtld-global-offsets.sym
+++ b/sysdeps/aarch64/rtld-global-offsets.sym
@@ -3,8 +3,13 @@ 
 #include <ldsodefs.h>
 
 #define GLRO_offsetof(name) offsetof (struct rtld_global_ro, _##name)
+#define GL_offsetof(name) offsetof (struct rtld_global, _##name)
 
 -- Offsets of _rtld_global_ro in libc.so
 
 GLRO_DL_HWCAP_OFFSET	GLRO_offsetof (dl_hwcap)
 GLRO_DL_HWCAP2_OFFSET	GLRO_offsetof (dl_hwcap2)
+
+-- Offsets of _rtld_global in libc.so
+
+GL_DL_AARCH64_GCS_OFFSET	GL_offsetof (dl_aarch64_gcs)