Partial ILP32 support for aarch64

Message ID 1479761404.14643.10.camel@caviumnetworks.com
State New, archived
Headers

Commit Message

Steve Ellcey Nov. 21, 2016, 8:50 p.m. UTC
  Here is an updated version with the indentation fixed.

Steve Ellcey
sellcey@caviumnetworks.com


2016-11-21  Andrew Pinski  <andrew.pinski@caviumnetworks.com>
	    Yury Norov  <ynorov@caviumnetworks.com>
	    Steve Ellcey  <sellcey@caviumnetworks.com>

	* sysdeps/aarch64/crti.S: Add include of sysdep.h.
	(call_weak_fn): Use PTR_REG to get correct reg name in ILP32.
	* sysdeps/aarch64/dl-irel.h: Add include of sysdep.h.
	(elf_irela): Use AARCH64_R macro to get correct relocation in ILP32.
	* sysdeps/aarch64/dl-machine.h: Add include of sysdep.h.
	(elf_machine_load_address, RTLD_START, RTLD_START_1, RTLD_START,
	elf_machine_type_class, ELF_MACHINE_JMP_SLOT, elf_machine_rela,
	elf_machine_lazy_rel): Add ifdef's for ILP32 support.
	* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return,
	_dl_tlsdesc_return_lazy, _dl_tlsdesc_dynamic,
	_dl_tlsdesc_resolve_hold): Extend pointers in ILP32, use PTR_REG
	to get correct reg name for ILP32.
	* sysdeps/aarch64/dl-trampoline.S (ip01): New Macro.
	(RELA_SIZE): New Macro.
	(_dl_runtime_resolve, _dl_runtime_profile): Use new macros and PTR_REG
	to support ILP32.
	* sysdeps/aarch64/jmpbuf-unwind.h (_JMPBUF_CFA_UNWINDS_ADJ): Add
	cast for ILP32 mode.
	* sysdeps/aarch64/memcmp.S (memcmp): Extend arg pointers for ILP32 mode.
	* sysdeps/aarch64/memcpy.S (memmove, memcpy): Ditto.
	* sysdeps/aarch64/memset.S (__memset): Ditto.
	* sysdeps/aarch64/strchr.S (strchr): Ditto.
	* sysdeps/aarch64/strchrnul.S (__strchrnul): Ditto.
	* sysdeps/aarch64/strcmp.S (strcmp): Ditto.
	* sysdeps/aarch64/strcpy.S (strcpy): Ditto.
	* sysdeps/aarch64/strlen.S (__strlen): Ditto.
	* sysdeps/aarch64/strncmp.S (strncmp): Ditto.
	* sysdeps/aarch64/strnlen.S (strnlen): Ditto.
	* sysdeps/aarch64/strrchr.S (strrchr): Ditto.
	* sysdeps/unix/sysv/linux/aarch64/clone.S: Ditto.
	* sysdeps/unix/sysv/linux/aarch64/setcontext.S (__setcontext): Ditto.
	* sysdeps/unix/sysv/linux/aarch64/swapcontext.S (__swapcontext): Ditto.
	* sysdeps/aarch64/__longjmp.S (__longjmp): Extend pointers in ILP32,
	change PTR_MANGLE call to use register numbers instead of names.
	* sysdeps/unix/sysv/linux/aarch64/getcontext.S (__getcontext): Ditto.
	* sysdeps/aarch64/setjmp.S (__sigsetjmp): Extend arg pointers for
	ILP32 mode, change PTR_MANGLE calls to use register numbers.
	* sysdeps/aarch64/start.S (_start): Ditto.
	* sysdeps/aarch64/nptl/bits/pthreadtypes.h
	(__PTHREAD_RWLOCK_INT_FLAGS_SHARED): New define.
	* sysdeps/aarch64/nptl/bits/semaphore.h (__SIZEOF_SEM_T): Change define.
	* sysdeps/aarch64/sysdep.h (AARCH64_R, PTR_REG, PTR_LOG_SIZE, DELOUSE,
	PTR_SIZE): New Macros.
	(LDST_PCREL, LDST_GLOBAL) Update to use PTR_REG.
	* sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h (O_LARGEFILE):
	Set when in ILP32 mode.
	(F_GETLK64, F_SETLK64, F_SETLKW64): Only set in LP64 mode.
	* sysdeps/unix/sysv/linux/aarch64/dl-cache.h (DL_CACHE_DEFAULT_ID):
	Set elf flags for ILP32.
	(add_system_dir): Set ILP32 library directories.
	* sysdeps/unix/sysv/linux/aarch64/init-first.c
	(_libc_vdso_platform_setup): Set minimum kernel version for ILP32.
	* sysdeps/unix/sysv/linux/aarch64/ldconfig.h
	(SYSDEP_KNOWN_INTERPRETER_NAMES): Add ILP32 names.
	* sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h (GET_PC, SET_PC):
	New Macros.
	* sysdeps/unix/sysv/linux/aarch64/sysdep.h: Handle ILP32 pointers.
  

Comments

Andreas Schwab Nov. 23, 2016, 9:43 a.m. UTC | #1
On Nov 21 2016, Steve Ellcey <sellcey@caviumnetworks.com> wrote:

> diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
> index 63a8e50..2effca4 100644
> --- a/sysdeps/aarch64/dl-irel.h
> +++ b/sysdeps/aarch64/dl-irel.h
> @@ -23,6 +23,7 @@
>  #include <stdio.h>
>  #include <unistd.h>
>  #include <ldsodefs.h>
> +#include <sysdep.h>
>  
>  #define ELF_MACHINE_IRELA	1
>  
> @@ -40,7 +41,7 @@ elf_irela (const ElfW(Rela) *reloc)
>    ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
>    const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
>  
> -  if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
> +  if (__glibc_likely (r_type == AARCH64_R (IRELATIVE)))

While we generally favor a space before parens, for macros that produce
an identifier we don't write one, similar to ElfW.

>  #define elf_machine_type_class(type)					\
> -  ((((type) == R_AARCH64_JUMP_SLOT ||					\
> -     (type) == R_AARCH64_TLS_DTPMOD ||					\
> -     (type) == R_AARCH64_TLS_DTPREL ||					\
> -     (type) == R_AARCH64_TLS_TPREL ||					\
> -     (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT)		\
> -   | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY)		\
> -   | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
> +  ((((type) == AARCH64_R (JUMP_SLOT)					\
> +  || (type) == AARCH64_R (TLS_DTPMOD)					\
> +  || (type) == AARCH64_R (TLS_DTPREL)					\
> +  || (type) == AARCH64_R (TLS_TPREL)					\
> +  || (type) == AARCH64_R (TLSDESC)) * ELF_RTYPE_CLASS_PLT)		\

Indentation is off.

> diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h
> index 3e0a37d..11ace17 100644
> --- a/sysdeps/aarch64/jmpbuf-unwind.h
> +++ b/sysdeps/aarch64/jmpbuf-unwind.h
> @@ -27,7 +27,7 @@
>    ((void *) (address) < (void *) demangle (jmpbuf[JB_SP]))
>  
>  #define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \
> -  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj)
> +  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) (size_t) _Unwind_GetCFA (context), adj)

That should be uintptr_t.

> diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes.h b/sysdeps/aarch64/nptl/bits/pthreadtypes.h
> index c376e64..9dcf8d9 100644
> --- a/sysdeps/aarch64/nptl/bits/pthreadtypes.h
> +++ b/sysdeps/aarch64/nptl/bits/pthreadtypes.h
> @@ -32,6 +32,8 @@
>  #define __SIZEOF_PTHREAD_BARRIER_T     32
>  #define __SIZEOF_PTHREAD_BARRIERATTR_T  8
>  
> +#define __PTHREAD_RWLOCK_INT_FLAGS_SHARED 1
> +
>  
>  /* Thread identifiers.  The structure of the attribute type is not
>     exposed on purpose.  */

The pthread type sizes should be adjusted, similar to x32, to avoid
excessive padding:

#ifdef __ILP32__
# define __SIZEOF_PTHREAD_ATTR_T        32
# define __SIZEOF_PTHREAD_MUTEX_T       32
# define __SIZEOF_PTHREAD_MUTEXATTR_T    4
# define __SIZEOF_PTHREAD_COND_T        48
# define __SIZEOF_PTHREAD_CONDATTR_T     4
# define __SIZEOF_PTHREAD_RWLOCK_T      44
# define __SIZEOF_PTHREAD_RWLOCKATTR_T   8
# define __SIZEOF_PTHREAD_BARRIER_T     20
# define __SIZEOF_PTHREAD_BARRIERATTR_T  4
#else
# define __SIZEOF_PTHREAD_ATTR_T        64
# define __SIZEOF_PTHREAD_MUTEX_T       48
# define __SIZEOF_PTHREAD_MUTEXATTR_T    8
# define __SIZEOF_PTHREAD_COND_T        48
# define __SIZEOF_PTHREAD_CONDATTR_T     8
# define __SIZEOF_PTHREAD_RWLOCK_T      56
# define __SIZEOF_PTHREAD_RWLOCKATTR_T   8
# define __SIZEOF_PTHREAD_BARRIER_T     32
# define __SIZEOF_PTHREAD_BARRIERATTR_T  8
#endif

> diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h
> index 3cc5b37..05333ea 100644
> --- a/sysdeps/aarch64/nptl/bits/semaphore.h
> +++ b/sysdeps/aarch64/nptl/bits/semaphore.h
> @@ -21,7 +21,11 @@
>  #endif
>  
>  
> -#define __SIZEOF_SEM_T	32
> +#ifdef __ILP32__
> +# define __SIZEOF_SEM_T	16
> +#else
> +# define __SIZEOF_SEM_T	32
> +#endif

sem_t needs to be 64-bit aligned, to match struct new_sem.

 typedef union
 {
   char __size[__SIZEOF_SEM_T];
-  long int __align;
+  long long int __align;
 } sem_t;

> diff --git a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
> index a579501..ee54222 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
> @@ -19,7 +19,7 @@
>  #include <sys/ucontext.h>
>  
>  #define SIGCONTEXT siginfo_t *_si, struct ucontext *
> -#define GET_PC(ctx) ((void *) (ctx)->uc_mcontext.pc)
> +#define GET_PC(ctx) ((void *) (size_t) (ctx)->uc_mcontext.pc)

uintptr_t

Andreas.
  

Patch

diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S
index 65116be..4d411fe 100644
--- a/sysdeps/aarch64/__longjmp.S
+++ b/sysdeps/aarch64/__longjmp.S
@@ -46,6 +46,8 @@  ENTRY (__longjmp)
 	cfi_offset(d14, JB_D14<<3)
 	cfi_offset(d15, JB_D15<<3)
 
+	DELOUSE (0)
+
 	ldp	x19, x20, [x0, #JB_X19<<3]
 	ldp	x21, x22, [x0, #JB_X21<<3]
 	ldp	x23, x24, [x0, #JB_X23<<3]
@@ -53,7 +55,7 @@  ENTRY (__longjmp)
 	ldp	x27, x28, [x0, #JB_X27<<3]
 #ifdef PTR_DEMANGLE
 	ldp	x29,  x4, [x0, #JB_X29<<3]
-	PTR_DEMANGLE (x30, x4, x3, x2)
+	PTR_DEMANGLE (30, 4, 3, 2)
 #else
 	ldp	x29, x30, [x0, #JB_X29<<3]
 #endif
@@ -98,7 +100,7 @@  ENTRY (__longjmp)
 	cfi_same_value(d15)
 #ifdef PTR_DEMANGLE
 	ldr	x4, [x0, #JB_SP<<3]
-	PTR_DEMANGLE (x5, x4, x3, x2)
+	PTR_DEMANGLE (5, 4, 3, 2)
 #else
 	ldr	x5, [x0, #JB_SP<<3]
 #endif
diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S
index 53ccb42..5c42fd5 100644
--- a/sysdeps/aarch64/crti.S
+++ b/sysdeps/aarch64/crti.S
@@ -39,6 +39,7 @@ 
    they can be called as functions.  The symbols _init and _fini are
    magic and cause the linker to emit DT_INIT and DT_FINI.  */
 
+#include <sysdep.h>
 #include <libc-symbols.h>
 
 #ifndef PREINIT_FUNCTION
@@ -60,7 +61,7 @@ 
 	.type	call_weak_fn, %function
 call_weak_fn:
 	adrp	x0, :got:PREINIT_FUNCTION
-	ldr	x0, [x0, #:got_lo12:PREINIT_FUNCTION]
+	ldr	PTR_REG (0), [x0, #:got_lo12:PREINIT_FUNCTION]
 	cbz	x0, 1f
 	b	PREINIT_FUNCTION
 1:
diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h
index 63a8e50..2effca4 100644
--- a/sysdeps/aarch64/dl-irel.h
+++ b/sysdeps/aarch64/dl-irel.h
@@ -23,6 +23,7 @@ 
 #include <stdio.h>
 #include <unistd.h>
 #include <ldsodefs.h>
+#include <sysdep.h>
 
 #define ELF_MACHINE_IRELA	1
 
@@ -40,7 +41,7 @@  elf_irela (const ElfW(Rela) *reloc)
   ElfW(Addr) *const reloc_addr = (void *) reloc->r_offset;
   const unsigned long int r_type = ELFW(R_TYPE) (reloc->r_info);
 
-  if (__glibc_likely (r_type == R_AARCH64_IRELATIVE))
+  if (__glibc_likely (r_type == AARCH64_R (IRELATIVE)))
     {
       ElfW(Addr) value = elf_ifunc_invoke (reloc->r_addend);
       *reloc_addr = value;
diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 282805e..b5ea7a8 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -21,6 +21,7 @@ 
 
 #define ELF_MACHINE_NAME "aarch64"
 
+#include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
 #include <dl-irel.h>
@@ -53,19 +54,33 @@  elf_machine_load_address (void)
      by constructing a non GOT reference to the symbol, the dynamic
      address of the symbol we compute using adrp/add to compute the
      symbol's address relative to the PC.
-     This depends on 32bit relocations being resolved at link time
-     and that the static address fits in the 32bits.  */
+     This depends on 32/16bit relocations being resolved at link time
+     and that the static address fits in the 32/16 bits.  */
 
   ElfW(Addr) static_addr;
   ElfW(Addr) dynamic_addr;
 
   asm ("					\n"
 "	adrp	%1, _dl_start;			\n"
+#ifdef __LP64__
 "	add	%1, %1, #:lo12:_dl_start	\n"
+#else
+"	add	%w1, %w1, #:lo12:_dl_start	\n"
+#endif
 "	ldr	%w0, 1f				\n"
 "	b	2f				\n"
 "1:						\n"
+#ifdef __LP64__
 "	.word	_dl_start			\n"
+#else
+# ifdef __AARCH64EB__
+"	.short  0                               \n"
+# endif
+"	.short  _dl_start                       \n"
+# ifndef __AARCH64EB__
+"	.short  0                               \n"
+# endif
+#endif
 "2:						\n"
     : "=r" (static_addr),  "=r" (dynamic_addr));
   return dynamic_addr - static_addr;
@@ -125,80 +140,86 @@  elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 /* Initial entry point for the dynamic linker. The C function
    _dl_start is the real entry point, its return value is the user
    program's entry point */
+#ifdef __LP64__
+# define RTLD_START RTLD_START_1 ("x", "3", "sp")
+#else
+# define RTLD_START RTLD_START_1 ("w", "2", "wsp")
+#endif
 
-#define RTLD_START asm ("\
-.text								\n\
-.globl _start							\n\
-.type _start, %function						\n\
-.globl _dl_start_user						\n\
-.type _dl_start_user, %function					\n\
-_start:								\n\
-	mov	x0,	sp					\n\
-	bl	_dl_start					\n\
-	// returns user entry point in x0			\n\
-	mov	x21, x0						\n\
-_dl_start_user:							\n\
-	// get the original arg count				\n\
-	ldr	x1, [sp]					\n\
-	// get the argv address					\n\
-	add	x2, sp, #8					\n\
-	// get _dl_skip_args to see if we were			\n\
-	// invoked as an executable				\n\
-	adrp	x4, _dl_skip_args				\n\
-        ldr	w4, [x4, #:lo12:_dl_skip_args]			\n\
-	// do we need to adjust argc/argv			\n\
-        cmp	w4, 0						\n\
-	beq	.L_done_stack_adjust				\n\
-	// subtract _dl_skip_args from original arg count	\n\
-	sub	x1, x1, x4					\n\
-	// store adjusted argc back to stack			\n\
-	str	x1, [sp]					\n\
-	// find the first unskipped argument			\n\
-	mov	x3, x2						\n\
-	add	x4, x2, x4, lsl #3				\n\
-	// shuffle argv down					\n\
-1:	ldr	x5, [x4], #8					\n\
-	str	x5, [x3], #8					\n\
-	cmp	x5, #0						\n\
-	bne	1b						\n\
-	// shuffle envp down					\n\
-1:	ldr	x5, [x4], #8					\n\
-	str	x5, [x3], #8					\n\
-	cmp	x5, #0						\n\
-	bne	1b						\n\
-	// shuffle auxv down					\n\
-1:	ldp	x0, x5, [x4, #16]!				\n\
-	stp	x0, x5, [x3], #16				\n\
-	cmp	x0, #0						\n\
-	bne	1b						\n\
-	// Update _dl_argv					\n\
-	adrp	x3, _dl_argv					\n\
-	str	x2, [x3, #:lo12:_dl_argv]			\n\
-.L_done_stack_adjust:						\n\
-	// compute envp						\n\
-	add	x3, x2, x1, lsl #3				\n\
-	add	x3, x3, #8					\n\
-	adrp	x16, _rtld_local				\n\
-        add	x16, x16, #:lo12:_rtld_local			\n\
-        ldr	x0, [x16]					\n\
-	bl	_dl_init					\n\
-	// load the finalizer function				\n\
-	adrp	x0, _dl_fini					\n\
-	add	x0, x0, #:lo12:_dl_fini				\n\
-	// jump to the user_s entry point			\n\
-	br      x21						\n\
+
+#define RTLD_START_1(PTR, PTR_SIZE_LOG, PTR_SP) asm ("\
+.text									\n\
+.globl _start								\n\
+.type _start, %function							\n\
+.globl _dl_start_user							\n\
+.type _dl_start_user, %function						\n\
+_start:									\n\
+	mov	" PTR "0, " PTR_SP "					\n\
+	bl	_dl_start						\n\
+	// returns user entry point in x0				\n\
+	mov	x21, x0							\n\
+_dl_start_user:								\n\
+	// get the original arg count					\n\
+	ldr	" PTR "1, [sp]						\n\
+	// get the argv address						\n\
+	add	" PTR "2, " PTR_SP ", #(1<<"  PTR_SIZE_LOG ")		\n\
+	// get _dl_skip_args to see if we were				\n\
+	// invoked as an executable					\n\
+	adrp	x4, _dl_skip_args					\n\
+        ldr	w4, [x4, #:lo12:_dl_skip_args]				\n\
+	// do we need to adjust argc/argv				\n\
+        cmp	w4, 0							\n\
+	beq	.L_done_stack_adjust					\n\
+	// subtract _dl_skip_args from original arg count		\n\
+	sub	" PTR "1, " PTR "1, " PTR "4				\n\
+	// store adjusted argc back to stack				\n\
+	str	" PTR "1, [sp]						\n\
+	// find the first unskipped argument				\n\
+	mov	" PTR "3, " PTR "2					\n\
+	add	" PTR "4, " PTR "2, " PTR "4, lsl #" PTR_SIZE_LOG "	\n\
+	// shuffle argv down						\n\
+1:	ldr	" PTR "5, [x4], #(1<<"  PTR_SIZE_LOG ")			\n\
+	str	" PTR "5, [x3], #(1<<"  PTR_SIZE_LOG ")			\n\
+	cmp	" PTR "5, #0						\n\
+	bne	1b							\n\
+	// shuffle envp down						\n\
+1:	ldr	" PTR "5, [x4], #(1<<"  PTR_SIZE_LOG ")			\n\
+	str	" PTR "5, [x3], #(1<<"  PTR_SIZE_LOG ")			\n\
+	cmp	" PTR "5, #0						\n\
+	bne	1b							\n\
+	// shuffle auxv down						\n\
+1:	ldp	" PTR "0, " PTR "5, [x4, #(2<<"  PTR_SIZE_LOG ")]!	\n\
+	stp	" PTR "0, " PTR "5, [x3], #(2<<"  PTR_SIZE_LOG ")	\n\
+	cmp	" PTR "0, #0						\n\
+	bne	1b							\n\
+	// Update _dl_argv						\n\
+	adrp	x3, _dl_argv						\n\
+	str	" PTR "2, [x3, #:lo12:_dl_argv]				\n\
+.L_done_stack_adjust:							\n\
+	// compute envp							\n\
+	add	" PTR "3, " PTR "2, " PTR "1, lsl #" PTR_SIZE_LOG "	\n\
+	add	" PTR "3, " PTR "3, #(1<<"  PTR_SIZE_LOG ")		\n\
+	adrp	x16, _rtld_local					\n\
+        add	" PTR "16, " PTR "16, #:lo12:_rtld_local		\n\
+        ldr	" PTR "0, [x16]						\n\
+	bl	_dl_init						\n\
+	// load the finalizer function					\n\
+	adrp	x0, _dl_fini						\n\
+	add	" PTR "0, " PTR "0, #:lo12:_dl_fini			\n\
+	// jump to the user_s entry point				\n\
+	br      x21							\n\
 ");
 
 #define elf_machine_type_class(type)					\
-  ((((type) == R_AARCH64_JUMP_SLOT ||					\
-     (type) == R_AARCH64_TLS_DTPMOD ||					\
-     (type) == R_AARCH64_TLS_DTPREL ||					\
-     (type) == R_AARCH64_TLS_TPREL ||					\
-     (type) == R_AARCH64_TLSDESC) * ELF_RTYPE_CLASS_PLT)		\
-   | (((type) == R_AARCH64_COPY) * ELF_RTYPE_CLASS_COPY)		\
-   | (((type) == R_AARCH64_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
+  ((((type) == AARCH64_R (JUMP_SLOT)					\
+  || (type) == AARCH64_R (TLS_DTPMOD)					\
+  || (type) == AARCH64_R (TLS_DTPREL)					\
+  || (type) == AARCH64_R (TLS_TPREL)					\
+  || (type) == AARCH64_R (TLSDESC)) * ELF_RTYPE_CLASS_PLT)		\
+   | (((type) == AARCH64_R (COPY)) * ELF_RTYPE_CLASS_COPY)		\
+   | (((type) == AARCH64_R (GLOB_DAT)) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA))
 
-#define ELF_MACHINE_JMP_SLOT	R_AARCH64_JUMP_SLOT
+#define ELF_MACHINE_JMP_SLOT	AARCH64_R (JUMP_SLOT)
 
 /* AArch64 uses RELA not REL */
 #define ELF_MACHINE_NO_REL 1
@@ -237,9 +258,9 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 		  void *const reloc_addr_arg, int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = reloc_addr_arg;
-  const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+  const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
 
-  if (__builtin_expect (r_type == R_AARCH64_RELATIVE, 0))
+  if (__builtin_expect (r_type == AARCH64_R (RELATIVE), 0))
       *reloc_addr = map->l_addr + reloc->r_addend;
   else if (__builtin_expect (r_type == R_AARCH64_NONE, 0))
       return;
@@ -257,7 +278,7 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 
       switch (r_type)
 	{
-	case R_AARCH64_COPY:
+	case AARCH64_R (COPY):
 	  if (sym == NULL)
 	      break;
 
@@ -275,15 +296,17 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 		  MIN (sym->st_size, refsym->st_size));
 	  break;
 
-	case R_AARCH64_RELATIVE:
-	case R_AARCH64_GLOB_DAT:
-	case R_AARCH64_JUMP_SLOT:
-	case R_AARCH64_ABS32:
-	case R_AARCH64_ABS64:
+	case AARCH64_R (RELATIVE):
+	case AARCH64_R (GLOB_DAT):
+	case AARCH64_R (JUMP_SLOT):
+	case AARCH64_R (ABS32):
+#ifdef __LP64__
+	case AARCH64_R (ABS64):
+#endif
 	  *reloc_addr = value + reloc->r_addend;
 	  break;
 
-	case R_AARCH64_TLSDESC:
+	case AARCH64_R (TLSDESC):
 	  {
 	    struct tlsdesc volatile *td =
 	      (struct tlsdesc volatile *)reloc_addr;
@@ -318,7 +341,7 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 	    break;
 	  }
 
-	case R_AARCH64_TLS_DTPMOD:
+	case AARCH64_R (TLS_DTPMOD):
 #ifdef RTLD_BOOTSTRAP
 	  *reloc_addr = 1;
 #else
@@ -329,12 +352,12 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 #endif
 	  break;
 
-	case R_AARCH64_TLS_DTPREL:
+	case AARCH64_R (TLS_DTPREL):
 	  if (sym)
 	    *reloc_addr = sym->st_value + reloc->r_addend;
 	  break;
 
-	case R_AARCH64_TLS_TPREL:
+	case AARCH64_R (TLS_TPREL):
 	  if (sym)
 	    {
 	      CHECK_STATIC_TLS (map, sym_map);
@@ -343,7 +366,7 @@  elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
 	    }
 	  break;
 
-	case R_AARCH64_IRELATIVE:
+	case AARCH64_R (IRELATIVE):
 	  value = map->l_addr + reloc->r_addend;
 	  value = elf_ifunc_invoke (value);
 	  *reloc_addr = value;
@@ -374,16 +397,16 @@  elf_machine_lazy_rel (struct link_map *map,
 		      int skip_ifunc)
 {
   ElfW(Addr) *const reloc_addr = (void *) (l_addr + reloc->r_offset);
-  const unsigned int r_type = ELF64_R_TYPE (reloc->r_info);
+  const unsigned int r_type = ELFW (R_TYPE) (reloc->r_info);
   /* Check for unexpected PLT reloc type.  */
-  if (__builtin_expect (r_type == R_AARCH64_JUMP_SLOT, 1))
+  if (__builtin_expect (r_type == AARCH64_R (JUMP_SLOT), 1))
     {
       if (__builtin_expect (map->l_mach.plt, 0) == 0)
 	*reloc_addr += l_addr;
       else
 	*reloc_addr = map->l_mach.plt;
     }
-  else if (__builtin_expect (r_type == R_AARCH64_TLSDESC, 1))
+  else if (__builtin_expect (r_type == AARCH64_R (TLSDESC), 1))
     {
       struct tlsdesc volatile *td =
 	(struct tlsdesc volatile *)reloc_addr;
@@ -392,7 +415,7 @@  elf_machine_lazy_rel (struct link_map *map,
       td->entry = (void*)(D_PTR (map, l_info[ADDRIDX (DT_TLSDESC_PLT)])
 			  + map->l_addr);
     }
-  else if (__glibc_unlikely (r_type == R_AARCH64_IRELATIVE))
+  else if (__glibc_unlikely (r_type == AARCH64_R (IRELATIVE)))
     {
       ElfW(Addr) value = map->l_addr + reloc->r_addend;
       if (__glibc_likely (!skip_ifunc))
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index 05be370..42fa943 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -74,7 +74,8 @@ 
 	cfi_startproc
 	.align 2
 _dl_tlsdesc_return:
-	ldr	x0, [x0, #8]
+	DELOUSE (0)
+	ldr	PTR_REG (0), [x0, #PTR_SIZE]
 	RET
 	cfi_endproc
 	.size	_dl_tlsdesc_return, .-_dl_tlsdesc_return
@@ -95,9 +96,10 @@  _dl_tlsdesc_return_lazy:
 	   so it reads the same value (this function is the final value of
 	   td->entry) and thus it synchronizes with the release store to
 	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
-	   from [x0,#8] here happens after the initialization of td->arg.  */
-	ldar	xzr, [x0]
-	ldr	x0, [x0, #8]
+	   from [x0,#PTR_SIZE] here happens after the initialization of td->arg. */
+	DELOUSE (0)
+	ldar	PTR_REG (zr), [x0]
+	ldr	PTR_REG (0), [x0, #PTR_SIZE]
 	RET
 	cfi_endproc
 	.size	_dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
@@ -125,10 +127,11 @@  _dl_tlsdesc_undefweak:
 	   td->entry) and thus it synchronizes with the release store to
 	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
 	   from [x0,#8] here happens after the initialization of td->arg.  */
-	ldar	xzr, [x0]
-	ldr	x0, [x0, #8]
+	DELOUSE (0)
+	ldar	PTR_REG (zr), [x0]
+	ldr	PTR_REG (0), [x0, #PTR_SIZE]
 	mrs	x1, tpidr_el0
-	sub	x0, x0, x1
+	sub	PTR_REG (0), PTR_REG (0), PTR_REG (1)
 	ldr	x1, [sp], #16
 	cfi_adjust_cfa_offset (-16)
 	RET
@@ -174,6 +177,7 @@  _dl_tlsdesc_dynamic:
 	stp	x29, x30, [sp,#-(32+16*NSAVEXREGPAIRS)]!
 	cfi_adjust_cfa_offset (32+16*NSAVEXREGPAIRS)
 	mov	x29, sp
+	DELOUSE (0)
 
 	/* Save just enough registers to support fast path, if we fall
 	   into slow path we will save additional registers.  */
@@ -187,22 +191,22 @@  _dl_tlsdesc_dynamic:
 	   so it reads the same value (this function is the final value of
 	   td->entry) and thus it synchronizes with the release store to
 	   td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
-	   from [x0,#8] here happens after the initialization of td->arg.  */
-	ldar	xzr, [x0]
-	ldr	x1, [x0,#8]
-	ldr	x0, [x4]
-	ldr	x3, [x1,#16]
-	ldr	x2, [x0]
-	cmp	x3, x2
+	   from [x0,#PTR_SIZE] here happens after the initialization of td->arg.  */
+	ldar	PTR_REG (zr), [x0]
+	ldr	PTR_REG (1), [x0,#PTR_SIZE]
+	ldr	PTR_REG (0), [x4]
+	ldr	PTR_REG (3), [x1,#(PTR_SIZE * 2)]
+	ldr	PTR_REG (2), [x0]
+	cmp	PTR_REG (3), PTR_REG (2)
 	b.hi	2f
-	ldr	x2, [x1]
-	add	x0, x0, x2, lsl #4
-	ldr	x0, [x0]
+	ldr	PTR_REG (2), [x1]
+	add	PTR_REG (0), PTR_REG (0), PTR_REG (2), lsl #(PTR_LOG_SIZE + 1)
+	ldr	PTR_REG (0), [x0]
 	cmn	x0, #0x1
 	b.eq	2f
-	ldr	x1, [x1,#8]
-	add	x0, x0, x1
-	sub	x0, x0, x4
+	ldr	PTR_REG (1), [x1,#(PTR_SIZE * 2)]
+	add	PTR_REG (0), PTR_REG (0), PTR_REG (1)
+	sub	PTR_REG (0), PTR_REG (0), PTR_REG (4)
 1:
 	ldp	 x1,  x2, [sp, #32+16*0]
 	ldp	 x3,  x4, [sp, #32+16*1]
@@ -233,7 +237,7 @@  _dl_tlsdesc_dynamic:
 	bl	__tls_get_addr
 
 	mrs	x1, tpidr_el0
-	sub	x0, x0, x1
+	sub	PTR_REG (0), PTR_REG (0), PTR_REG (1)
 
 	RESTORE_Q_REGISTERS
 
@@ -279,13 +283,15 @@  _dl_tlsdesc_resolve_rela:
 
 	SAVE_Q_REGISTERS
 
-	ldr	x1, [x3, #8]
+	DELOUSE (3)
+	ldr	PTR_REG (1), [x3, #PTR_SIZE]
 	bl	_dl_tlsdesc_resolve_rela_fixup
 
 	RESTORE_Q_REGISTERS
 
 	ldr	x0, [sp, #32+16*8]
-	ldr	x1, [x0]
+	DELOUSE (0)
+	ldr	PTR_REG (1), [x0]
 	blr	x1
 
 	ldp	 x1,  x4, [sp, #32+16*0]
@@ -346,7 +352,8 @@  _dl_tlsdesc_resolve_hold:
 	RESTORE_Q_REGISTERS
 
 	ldr	x0, [sp, #32+16*9]
-	ldr	x1, [x0]
+	DELOUSE (0)
+	ldr	PTR_REG (1), [x0]
 	blr	x1
 
 	ldp	 x1,  x2, [sp, #32+16*0]
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 947a515..63ef6f7 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -22,9 +22,13 @@ 
 #include "dl-link.h"
 
 #define ip0 x16
+#define ip0l PTR_REG (16)
 #define ip1 x17
 #define lr  x30
 
+/* RELA relocatons are 3 pointers */
+#define RELA_SIZE (PTR_SIZE * 3)
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, #function
@@ -79,7 +83,7 @@  _dl_runtime_resolve:
 	cfi_rel_offset (q1, 80+7*16)
 
 	/* Get pointer to linker struct.  */
-	ldr	x0, [ip0, #-8]
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
 
 	/* Prepare to call _dl_fixup().  */
 	ldr	x1, [sp, 80+8*16]	/* Recover &PLTGOT[n] */
@@ -87,7 +91,7 @@  _dl_runtime_resolve:
 	sub     x1, x1, ip0
 	add     x1, x1, x1, lsl #1
 	lsl     x1, x1, #3
-	sub     x1, x1, #192
+	sub     x1, x1, #(RELA_SIZE<<3)
 	lsr     x1, x1, #3
 
 	/* Call fixup routine.  */
@@ -191,7 +195,7 @@  _dl_runtime_profile:
 	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
 
 	/* Get pointer to linker struct.  */
-	ldr	x0, [ip0, #-8]
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
 
 	/* Prepare to call _dl_profile_fixup().  */
 	ldr	x1, [x29, OFFSET_PLTGOTN]	/* Recover &PLTGOT[n] */
@@ -199,7 +203,7 @@  _dl_runtime_profile:
 	sub     x1, x1, ip0
 	add     x1, x1, x1, lsl #1
 	lsl     x1, x1, #3
-	sub     x1, x1, #192
+	sub     x1, x1, #(RELA_SIZE<<3)
 	lsr     x1, x1, #3
 
 	stp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
@@ -210,8 +214,8 @@  _dl_runtime_profile:
 	add	x4, x29, #OFFSET_FS		/* address of framesize */
 	bl	_dl_profile_fixup
 
-	ldr	ip0, [x29, #OFFSET_FS]		/* framesize == 0 */
-	cmp	ip0, #0
+	ldr	ip0l, [x29, #OFFSET_FS]		/* framesize == 0 */
+	cmp	ip0l, #0
 	bge	1f
 	cfi_remember_state
 
@@ -243,7 +247,7 @@  _dl_runtime_profile:
 1:
 	/* The new frame size is in ip0.  */
 
-	sub	x1, x29, ip0
+	sub	PTR_REG (1), PTR_REG (29), ip0l
 	and	sp, x1, #0xfffffffffffffff0
 
 	str	x0, [x29, #OFFSET_T1]
diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h
index 3e0a37d..11ace17 100644
--- a/sysdeps/aarch64/jmpbuf-unwind.h
+++ b/sysdeps/aarch64/jmpbuf-unwind.h
@@ -27,7 +27,7 @@ 
   ((void *) (address) < (void *) demangle (jmpbuf[JB_SP]))
 
 #define _JMPBUF_CFA_UNWINDS_ADJ(jmpbuf, context, adj) \
-  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) _Unwind_GetCFA (context), adj)
+  _JMPBUF_UNWINDS_ADJ (jmpbuf, (void *) (size_t) _Unwind_GetCFA (context), adj)
 
 #define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \
   ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj))
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ae2d997..8b87e9b 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -47,6 +47,9 @@ 
 #define mask		x13
 
 ENTRY_ALIGN (memcmp, 6)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	tst	tmp1, #7
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index de73f0f..b269316 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -61,6 +61,10 @@ 
 
 ENTRY_ALIGN (memmove, 6)
 
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
 	sub	tmp1, dstin, src
 	cmp	count, 96
 	ccmp	tmp1, count, 2, hi
@@ -71,6 +75,10 @@  END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)
 
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
 	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 4d222c5..7bad29a 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -39,6 +39,9 @@ 
 
 ENTRY_ALIGN (__memset, 6)
 
+	DELOUSE (0)
+	DELOUSE (2)
+
 	dup	v0.16B, valw
 	add	dstend, dstin, count
 
diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes.h b/sysdeps/aarch64/nptl/bits/pthreadtypes.h
index c376e64..9dcf8d9 100644
--- a/sysdeps/aarch64/nptl/bits/pthreadtypes.h
+++ b/sysdeps/aarch64/nptl/bits/pthreadtypes.h
@@ -32,6 +32,8 @@ 
 #define __SIZEOF_PTHREAD_BARRIER_T     32
 #define __SIZEOF_PTHREAD_BARRIERATTR_T  8
 
+#define __PTHREAD_RWLOCK_INT_FLAGS_SHARED 1
+
 
 /* Thread identifiers.  The structure of the attribute type is not
    exposed on purpose.  */
diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h
index 3cc5b37..05333ea 100644
--- a/sysdeps/aarch64/nptl/bits/semaphore.h
+++ b/sysdeps/aarch64/nptl/bits/semaphore.h
@@ -21,7 +21,11 @@ 
 #endif
 
 
-#define __SIZEOF_SEM_T	32
+#ifdef __ILP32__
+# define __SIZEOF_SEM_T	16
+#else
+# define __SIZEOF_SEM_T	32
+#endif
 
 
 /* Value returned if `sem_open' failed.  */
diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S
index 22f4368..e03b3b5 100644
--- a/sysdeps/aarch64/setjmp.S
+++ b/sysdeps/aarch64/setjmp.S
@@ -33,6 +33,7 @@  END (_setjmp)
 libc_hidden_def (_setjmp)
 
 ENTRY (__sigsetjmp)
+	DELOUSE (0)
 
 1:
 	stp	x19, x20, [x0, #JB_X19<<3]
@@ -42,7 +43,7 @@  ENTRY (__sigsetjmp)
 	stp	x27, x28, [x0, #JB_X27<<3]
 
 #ifdef PTR_MANGLE
-	PTR_MANGLE (x4, x30, x3, x2)
+	PTR_MANGLE (4, 30, 3, 2)
 	stp	x29,  x4, [x0, #JB_X29<<3]
 #else
 	stp	x29, x30, [x0, #JB_X29<<3]
@@ -57,7 +58,7 @@  ENTRY (__sigsetjmp)
 	stp	d14, d15, [x0, #JB_D14<<3]
 #ifdef PTR_MANGLE
 	mov	x4, sp
-	PTR_MANGLE (x5, x4, x3, x2)
+	PTR_MANGLE (5, 4, 3, 2)
 	str	x5, [x0, #JB_SP<<3]
 #else
 	mov	x2,  sp
diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S
index efe2474..9198c57 100644
--- a/sysdeps/aarch64/start.S
+++ b/sysdeps/aarch64/start.S
@@ -16,6 +16,8 @@ 
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <sysdep.h>
+
 /* This is the canonical entry point, usually the first thing in the text
    segment.
 
@@ -25,7 +27,7 @@ 
 
    At this entry point, most registers' values are unspecified, except:
 
-   x0		Contains a function pointer to be registered with `atexit'.
+   x0/w0	Contains a function pointer to be registered with `atexit'.
 		This is how the dynamic linker arranges to have DT_FINI
 		functions called for shared libraries that have been loaded
 		before this code runs.
@@ -52,26 +54,26 @@  _start:
 	mov	x5, x0
 
 	/* Load argc and a pointer to argv */
-	ldr	x1, [sp, #0]
-	add	x2, sp, #8
+	ldr	PTR_REG (1), [sp, #0]
+	add	x2, sp, #PTR_SIZE
 
 	/* Setup stack limit in argument register */
 	mov	x6, sp
 
 #ifdef SHARED
         adrp    x0, :got:main
-	ldr     x0, [x0, #:got_lo12:main]
+	ldr     PTR_REG (0), [x0, #:got_lo12:main]
 
         adrp    x3, :got:__libc_csu_init
-	ldr     x3, [x3, #:got_lo12:__libc_csu_init]
+	ldr     PTR_REG (3), [x3, #:got_lo12:__libc_csu_init]
 
         adrp    x4, :got:__libc_csu_fini
-	ldr     x4, [x4, #:got_lo12:__libc_csu_fini]
+	ldr     PTR_REG (4), [x4, #:got_lo12:__libc_csu_fini]
 #else
 	/* Set up the other arguments in registers */
-	ldr	x0, =main
-	ldr	x3, =__libc_csu_init
-	ldr	x4, =__libc_csu_fini
+	ldr	PTR_REG (0), =main
+	ldr	PTR_REG (3), =__libc_csu_init
+	ldr	PTR_REG (4), =__libc_csu_fini
 #endif
 
 	/* __libc_start_main (main, argc, argv, init, fini, rtld_fini,
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 5e3aecf..c66fea3 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -62,6 +62,7 @@ 
 /* Locals and temporaries.  */
 
 ENTRY (strchr)
+	DELOUSE (0)
 	mov	wtmp2, #0x0401
 	movk	wtmp2, #0x4010, lsl #16
 	dup	vrepchr.16b, chrin
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index a624c8d..c2cc47e 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -60,6 +60,7 @@ 
    identify exactly which byte is causing the termination.  */
 
 ENTRY (__strchrnul)
+	DELOUSE (0)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the termination condition.  */
 	mov	wtmp2, #0x0401
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S
index ba0ccb4..49e528b 100644
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -49,6 +49,8 @@ 
 	/* Start of performance-critical section  -- one 64B cache line.  */
 ENTRY_ALIGN(strcmp, 6)
 
+	DELOUSE (0)
+	DELOUSE (1)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
 	tst	tmp1, #7
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index 0694199..45809e8 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -91,6 +91,8 @@ 
 #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
 
 ENTRY_ALIGN (STRCPY, 6)
+	DELOUSE (0)
+	DELOUSE (1)
 	/* For moderately short strings, the fastest way to do the copy is to
 	   calculate the length of the string in the same way as strlen, then
 	   essentially do a memcpy of the result.  This avoids the need for
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a07834b..5fb653a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -85,6 +85,8 @@ 
 	   boundary.  */
 
 ENTRY_ALIGN (__strlen, 6)
+	DELOUSE (0)
+	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
 	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index f6a17fd..02de93c 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -51,6 +51,9 @@ 
 #define endloop		x15
 
 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
 	cbz	limit, L(ret0)
 	eor	tmp1, src1, src2
 	mov	zeroones, #REP8_01
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 4cce45f..af765f1 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -50,6 +50,9 @@ 
 #define REP8_80 0x8080808080808080
 
 ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
 	cbz	limit, L(hit_limit)
 	mov	zeroones, #REP8_01
 	bic	src, srcin, #15
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index 44c1917..ea37968 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -68,6 +68,7 @@ 
    identify exactly which byte is causing the termination, and why.  */
 
 ENTRY(strrchr)
+	DELOUSE (0)
 	cbz	x1, L(null_search)
 	/* Magic constant 0x40100401 to allow us to identify which lane
 	   matches the requested byte.  Magic constant 0x80200802 used
diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
index e045759..0a7dccb 100644
--- a/sysdeps/aarch64/sysdep.h
+++ b/sysdeps/aarch64/sysdep.h
@@ -16,8 +16,25 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef _AARCH64_SYSDEP_H
+#define _AARCH64_SYSDEP_H
+
 #include <sysdeps/generic/sysdep.h>
 
+#ifdef __LP64__
+# define AARCH64_R(NAME)	R_AARCH64_ ## NAME
+# define PTR_REG(n)		x##n
+# define PTR_LOG_SIZE		3
+# define DELOUSE(n)
+#else
+# define AARCH64_R(NAME)	R_AARCH64_P32_ ## NAME
+# define PTR_REG(n)		w##n
+# define PTR_LOG_SIZE		2
+# define DELOUSE(n)		mov     w##n, w##n
+#endif
+
+#define PTR_SIZE	(1<<PTR_LOG_SIZE)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
@@ -107,16 +124,18 @@ 
 # define L(name)         .L##name
 #endif
 
-/* Load or store to/from a pc-relative EXPR into/from R, using T.  */
-#define LDST_PCREL(OP, R, T, EXPR)  \
-	adrp	T, EXPR;	    \
-	OP	R, [T, #:lo12:EXPR];\
+/* Load or store to/from a pc-relative EXPR into/from R, using T.
+   Note R and T are register numbers and not register names.  */
+#define LDST_PCREL(OP, R, T, EXPR)			\
+	adrp	x##T, EXPR;				\
+	OP	PTR_REG (R), [x##T, #:lo12:EXPR];	\
 
-/* Load or store to/from a got-relative EXPR into/from R, using T.  */
-#define LDST_GLOBAL(OP, R, T, EXPR)     \
-	adrp	T, :got:EXPR;		\
-	ldr	T, [T, #:got_lo12:EXPR];\
-	OP	R, [T];
+/* Load or store to/from a got-relative EXPR into/from R, using T.
+   Note R and T are register numbers and not register names.  */
+#define LDST_GLOBAL(OP, R, T,  EXPR)			\
+	adrp	x##T, :got:EXPR;			\
+	ldr	PTR_REG (T), [x##T, #:got_lo12:EXPR];	\
+	OP	PTR_REG (R), [x##T];
 
 /* Since C identifiers are not normally prefixed with an underscore
    on this system, the asm identifier `syscall_error' intrudes on the
@@ -125,3 +144,5 @@ 
 #define mcount		_mcount
 
 #endif	/* __ASSEMBLER__ */
+
+#endif  /* _AARCH64_SYSDEP_H */
diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h b/sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h
index 658f696..1717c35 100644
--- a/sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h
+++ b/sysdeps/unix/sysv/linux/aarch64/bits/fcntl.h
@@ -25,11 +25,17 @@ 
 #define __O_NOFOLLOW	0100000
 #define __O_DIRECT	0200000
 
-#define __O_LARGEFILE	0
+#ifdef __ILP32__
+# define __O_LARGEFILE	0400000
+#else
+# define __O_LARGEFILE	0
+#endif
 
+#ifdef __LP64__
 # define F_GETLK64	5
 # define F_SETLK64	6
 # define F_SETLKW64	7
+#endif
 
 struct flock
   {
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone.S b/sysdeps/unix/sysv/linux/aarch64/clone.S
index 76baa7a..220ec11 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone.S
@@ -39,6 +39,13 @@ 
  */
         .text
 ENTRY(__clone)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+	DELOUSE (3)
+	DELOUSE (4)
+	DELOUSE (5)
+	DELOUSE (6)
 	/* Save args for the child.  */
 	mov	x10, x0
 	mov	x11, x2
diff --git a/sysdeps/unix/sysv/linux/aarch64/dl-cache.h b/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
index 9c7b271..86905ed 100644
--- a/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
+++ b/sysdeps/unix/sysv/linux/aarch64/dl-cache.h
@@ -18,7 +18,11 @@ 
 
 #include <ldconfig.h>
 
-#define _DL_CACHE_DEFAULT_ID    (FLAG_AARCH64_LIB64 | FLAG_ELF_LIBC6)
+#ifdef __LP64__
+# define _DL_CACHE_DEFAULT_ID    (FLAG_AARCH64_LIB64 | FLAG_ELF_LIBC6)
+#else
+# define _DL_CACHE_DEFAULT_ID    (FLAG_AARCH64_LIB32 | FLAG_ELF_LIBC6)
+#endif
 
 #define _dl_cache_check_flags(flags)                    \
   ((flags) == _DL_CACHE_DEFAULT_ID)
@@ -27,18 +31,25 @@ 
   do								\
     {								\
       size_t len = strlen (dir);				\
-      char path[len + 3];					\
+      char path[len + 6];					\
       memcpy (path, dir, len + 1);				\
       if (len >= 6 && ! memcmp (path + len - 6, "/lib64", 6))	\
 	{							\
 	  len -= 2;						\
 	  path[len] = '\0';					\
 	}							\
+      if (len >= 9 && ! memcmp (path + len - 9, "/libilp32", 9))\
+	{							\
+	  len -= 5;						\
+	  path[len] = '\0';					\
+	}							\
       add_dir (path);						\
       if (len >= 4 && ! memcmp (path + len - 4, "/lib", 4))	\
 	{							\
 	  memcpy (path + len, "64", 3);				\
 	  add_dir (path);					\
+	  memcpy (path + len, "ilp32", 6);			\
+	  add_dir (path);					\
 	}							\
     } while (0)
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/getcontext.S b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
index c2dd5b8..f6bf24f 100644
--- a/sysdeps/unix/sysv/linux/aarch64/getcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/getcontext.S
@@ -30,6 +30,7 @@ 
 	.text
 
 ENTRY(__getcontext)
+	DELOUSE (0)
 	/* The saved context will return to the getcontext() call point
 	   with a return value of 0 */
 	str	xzr,	  [x0, oX0 +  0 * SZREG]
@@ -90,7 +91,7 @@  ENTRY(__getcontext)
 
 	/* Grab the signal mask */
 	/* rt_sigprocmask (SIG_BLOCK, NULL, &ucp->uc_sigmask, _NSIG8) */
-	add	x2, x0, #UCONTEXT_SIGMASK
+	add	PTR_REG (2), PTR_REG (0), #UCONTEXT_SIGMASK
 	mov	x0, SIG_BLOCK
 	mov	x1, 0
 	mov	x3, _NSIG8
diff --git a/sysdeps/unix/sysv/linux/aarch64/init-first.c b/sysdeps/unix/sysv/linux/aarch64/init-first.c
index f7224a2..f7bfc4d 100644
--- a/sysdeps/unix/sysv/linux/aarch64/init-first.c
+++ b/sysdeps/unix/sysv/linux/aarch64/init-first.c
@@ -27,17 +27,21 @@  int (*VDSO_SYMBOL(clock_getres)) (clockid_t, struct timespec *);
 static inline void
 _libc_vdso_platform_setup (void)
 {
-  PREPARE_VERSION (linux2639, "LINUX_2.6.39", 123718537);
+#ifdef __LP64__
+  PREPARE_VERSION (linux_version, "LINUX_2.6.39", 123718537);
+#else
+  PREPARE_VERSION (linux_version, "LINUX_4.9", 61765625);
+#endif
 
-  void *p = _dl_vdso_vsym ("__kernel_gettimeofday", &linux2639);
+  void *p = _dl_vdso_vsym ("__kernel_gettimeofday", &linux_version);
   PTR_MANGLE (p);
   VDSO_SYMBOL(gettimeofday) = p;
 
-  p = _dl_vdso_vsym ("__kernel_clock_gettime", &linux2639);
+  p = _dl_vdso_vsym ("__kernel_clock_gettime", &linux_version);
   PTR_MANGLE (p);
   VDSO_SYMBOL(clock_gettime) = p;
 
-  p = _dl_vdso_vsym ("__kernel_clock_getres", &linux2639);
+  p = _dl_vdso_vsym ("__kernel_clock_getres", &linux_version);
   PTR_MANGLE (p);
   VDSO_SYMBOL(clock_getres) = p;
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/ldconfig.h b/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
index ee91ef8..ac84194 100644
--- a/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
+++ b/sysdeps/unix/sysv/linux/aarch64/ldconfig.h
@@ -21,6 +21,8 @@ 
 #define SYSDEP_KNOWN_INTERPRETER_NAMES \
   { "/lib/ld-linux-aarch64.so.1", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux-aarch64_be.so.1", FLAG_ELF_LIBC6 }, \
+  { "/lib/ld-linux-aarch64_ilp32.so.1", FLAG_ELF_LIBC6 }, \
+  { "/lib/ld-linux-aarch64_be_ilp32.so.1", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux.so.3", FLAG_ELF_LIBC6 }, \
   { "/lib/ld-linux-armhf.so.3", FLAG_ELF_LIBC6 },
 #define SYSDEP_KNOWN_LIBRARY_NAMES \
diff --git a/sysdeps/unix/sysv/linux/aarch64/setcontext.S b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
index d17f8c8..c2bca26 100644
--- a/sysdeps/unix/sysv/linux/aarch64/setcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/setcontext.S
@@ -34,6 +34,7 @@ 
 	.text
 
 ENTRY (__setcontext)
+	DELOUSE (0)
 	/* Save a copy of UCP.  */
 	mov	x9, x0
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
index a579501..ee54222 100644
--- a/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
+++ b/sysdeps/unix/sysv/linux/aarch64/sigcontextinfo.h
@@ -19,7 +19,7 @@ 
 #include <sys/ucontext.h>
 
 #define SIGCONTEXT siginfo_t *_si, struct ucontext *
-#define GET_PC(ctx) ((void *) (ctx)->uc_mcontext.pc)
+#define GET_PC(ctx) ((void *) (size_t) (ctx)->uc_mcontext.pc)
 
 /* There is no reliable way to get the sigcontext unless we use a
    three-argument signal handler.  */
diff --git a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
index c1a16f3..8e2cadd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
+++ b/sysdeps/unix/sysv/linux/aarch64/swapcontext.S
@@ -27,6 +27,7 @@ 
 
 	.text
 ENTRY(__swapcontext)
+	DELOUSE (0)
 	/* Set the value returned when swapcontext() returns in this context. */
 	str	xzr,      [x0, oX0 +  0 * SZREG]
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/sysdep.h b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
index a397e50..1ffabc2 100644
--- a/sysdeps/unix/sysv/linux/aarch64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/aarch64/sysdep.h
@@ -250,12 +250,14 @@ 
      (!defined SHARED && (IS_IN (libc) \
 			  || IS_IN (libpthread))))
 # ifdef __ASSEMBLER__
+/* Note, dst, src, guard, and tmp are all register numbers rather than
+   register names so they will work with both ILP32 and LP64. */
 #  define PTR_MANGLE(dst, src, guard, tmp)                                \
   LDST_PCREL (ldr, guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard_local)); \
   PTR_MANGLE2 (dst, src, guard)
 /* Use PTR_MANGLE2 for efficiency if guard is already loaded.  */
 #  define PTR_MANGLE2(dst, src, guard)\
-  eor dst, src, guard
+  eor x##dst, x##src, x##guard
 #  define PTR_DEMANGLE(dst, src, guard, tmp)\
   PTR_MANGLE (dst, src, guard, tmp)
 #  define PTR_DEMANGLE2(dst, src, guard)\
@@ -268,12 +270,14 @@  extern uintptr_t __pointer_chk_guard_local attribute_relro attribute_hidden;
 # endif
 #else
 # ifdef __ASSEMBLER__
+/* Note, dst, src, guard, and tmp are all register numbers rather than
+   register names so they will work with both ILP32 and LP64. */
 #  define PTR_MANGLE(dst, src, guard, tmp)                             \
   LDST_GLOBAL (ldr, guard, tmp, C_SYMBOL_NAME(__pointer_chk_guard));   \
   PTR_MANGLE2 (dst, src, guard)
 /* Use PTR_MANGLE2 for efficiency if guard is already loaded.  */
 #  define PTR_MANGLE2(dst, src, guard)\
-  eor dst, src, guard
+  eor x##dst, x##src, x##guard
 #  define PTR_DEMANGLE(dst, src, guard, tmp)\
   PTR_MANGLE (dst, src, guard, tmp)
 #  define PTR_DEMANGLE2(dst, src, guard)\