libatomic: Add support for LSE and LSE2

Message ID PAWPR08MB8982FD8B866FE4B8058A80B483009@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers
Series libatomic: Add support for LSE and LSE2 |

Commit Message

Wilco Dijkstra Nov. 11, 2022, 2:22 p.m. UTC
  Add support for AArch64 LSE and LSE2 to libatomic.  Disable outline atomics,
and use LSE ifuncs for 1-8 byte atomics and LSE2 ifuncs for 16-byte atomics.
On Neoverse V1, 16-byte atomics are ~4x faster due to avoiding locks.

Note this is safe since we swap all 16-byte atomics using the same ifunc,
so they either use locks or LSE2 atomics, but never a mix. This also improves
ABI compatibility with LLVM: its inlined 16-byte atomics are compatible with
the new libatomic if LSE2 is supported.

Passes regress, OK for commit?

libatomic/
        Makefile.in: Regenerated with automake 1.15.1.
        Makefile.am: Add atomic_16.S for AArch64.
        configure.tgt: Disable outline atomics in AArch64 build.
        config/linux/aarch64/atomic_16.S: New file - implementation of
        ifuncs for 128-bit atomics.
        config/linux/aarch64/host-config.h: Enable ifuncs, use LSE (HWCAP_ATOMICS)
        for 1-8-byte atomics and LSE2 (HWCAP_USCAT) for 16-byte atomics.

---
  

Comments

Richard Sandiford Nov. 14, 2022, 5:34 p.m. UTC | #1
Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Add support for AArch64 LSE and LSE2 to libatomic.  Disable outline atomics,
> and use LSE ifuncs for 1-8 byte atomics and LSE2 ifuncs for 16-byte atomics.
> On Neoverse V1, 16-byte atomics are ~4x faster due to avoiding locks.
>
> Note this is safe since we swap all 16-byte atomics using the same ifunc,
> so they either use locks or LSE2 atomics, but never a mix. This also improves
> ABI compatibility with LLVM: its inlined 16-byte atomics are compatible with
> the new libatomic if LSE2 is supported.
>
> Passes regress, OK for commit?
>
> libatomic/
>         Makefile.in: Regenerated with automake 1.15.1.
>         Makefile.am: Add atomic_16.S for AArch64.
>         configure.tgt: Disable outline atomics in AArch64 build.
>         config/linux/aarch64/atomic_16.S: New file - implementation of
>         ifuncs for 128-bit atomics.
>         config/linux/aarch64/host-config.h: Enable ifuncs, use LSE (HWCAP_ATOMICS)
>         for 1-8-byte atomics and LSE2 (HWCAP_USCAT) for 16-byte atomics.
>
> ---
> diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
> index d88515e4a03bd812334ae0b7bf4c0bba119455dc..41e5da28512150780a2018386e22b4e70afcfa3f 100644
> --- a/libatomic/Makefile.am
> +++ b/libatomic/Makefile.am
> @@ -127,6 +127,8 @@ if HAVE_IFUNC
>  if ARCH_AARCH64_LINUX
>  IFUNC_OPTIONS	     = -march=armv8-a+lse
>  libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
> +libatomic_la_SOURCES += atomic_16.S
> +
>  endif
>  if ARCH_ARM_LINUX
>  IFUNC_OPTIONS	     = -march=armv7-a+fp -DHAVE_KERNEL64
> diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
> index 80d25653dc75cca995c8b0b2107a55f1234a6d52..89e29fc60a7fb74341b2f0f805e461847073082c 100644
> --- a/libatomic/Makefile.in
> +++ b/libatomic/Makefile.in
> @@ -90,13 +90,14 @@ build_triplet = @build@
>  host_triplet = @host@
>  target_triplet = @target@
>  @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
> -@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
> +@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	s,$(SIZES),$(addsuffix \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	_$(s)_1_.lo,$(SIZEOBJS))) \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	$(addsuffix \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	_8_2_.lo,$(SIZEOBJS))
> -@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix _8_1_.lo,$(SIZEOBJS))
> -@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
> +@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _8_1_.lo,$(SIZEOBJS))
> +@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
>  @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@		       $(addsuffix _16_2_.lo,$(SIZEOBJS))
>  
>  subdir = .
> @@ -154,8 +155,11 @@ am__uninstall_files_from_dir = { \
>    }
>  am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
>  LTLIBRARIES = $(noinst_LTLIBRARIES) $(toolexeclib_LTLIBRARIES)
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__objects_1 =  \
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@	atomic_16.lo
>  am_libatomic_la_OBJECTS = gload.lo gstore.lo gcas.lo gexch.lo \
> -	glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo
> +	glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo \
> +	$(am__objects_1)
>  libatomic_la_OBJECTS = $(am_libatomic_la_OBJECTS)
>  AM_V_lt = $(am__v_lt_@AM_V@)
>  am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
> @@ -165,9 +169,9 @@ libatomic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
>  	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
>  	$(libatomic_la_LDFLAGS) $(LDFLAGS) -o $@
>  libatomic_convenience_la_DEPENDENCIES = $(libatomic_la_LIBADD)
> -am__objects_1 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
> -	init.lo fenv.lo fence.lo flag.lo
> -am_libatomic_convenience_la_OBJECTS = $(am__objects_1)
> +am__objects_2 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
> +	init.lo fenv.lo fence.lo flag.lo $(am__objects_1)
> +am_libatomic_convenience_la_OBJECTS = $(am__objects_2)
>  libatomic_convenience_la_OBJECTS =  \
>  	$(am_libatomic_convenience_la_OBJECTS)
>  AM_V_P = $(am__v_P_@AM_V@)
> @@ -185,6 +189,16 @@ am__v_at_1 =
>  depcomp = $(SHELL) $(top_srcdir)/../depcomp
>  am__depfiles_maybe = depfiles
>  am__mv = mv -f
> +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
> +	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
> +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
> +	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
> +	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
> +	$(AM_CCASFLAGS) $(CCASFLAGS)
> +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
> +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
> +am__v_CPPAS_0 = @echo "  CPPAS   " $@;
> +am__v_CPPAS_1 = 
>  COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
>  	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
>  LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
> @@ -369,6 +383,7 @@ pdfdir = @pdfdir@
>  prefix = @prefix@
>  program_transform_name = @program_transform_name@
>  psdir = @psdir@
> +runstatedir = @runstatedir@
>  sbindir = @sbindir@
>  sharedstatedir = @sharedstatedir@
>  srcdir = @srcdir@
> @@ -404,9 +419,8 @@ noinst_LTLIBRARIES = libatomic_convenience.la
>  @LIBAT_BUILD_VERSIONED_SHLIB_SUN_TRUE@@LIBAT_BUILD_VERSIONED_SHLIB_TRUE@libatomic_version_dep = libatomic.map-sun
>  libatomic_version_info = -version-info $(libtool_VERSION)
>  libatomic_la_LDFLAGS = $(libatomic_version_info) $(libatomic_version_script) $(lt_host_flags)
> -libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c init.c \
> -	fenv.c fence.c flag.c
> -
> +libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c \
> +	init.c fenv.c fence.c flag.c $(am__append_2)
>  SIZEOBJS = load store cas exch fadd fsub fand fior fxor fnand tas
>  EXTRA_libatomic_la_SOURCES = $(addsuffix _n.c,$(SIZEOBJS))
>  libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) $(libatomic_version_dep)
> @@ -432,8 +446,8 @@ all_c_files := $(foreach dir,$(search_path),$(wildcard $(dir)/*.c))
>  # Then sort through them to find the one we want, and select the first.
>  M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
>  libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
> -	_$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_2) \
> -	$(am__append_3) $(am__append_4)
> +	_$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
> +	$(am__append_4) $(am__append_5)
>  @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp -DHAVE_KERNEL64
>  @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
> @@ -450,7 +464,7 @@ all: auto-config.h
>  	$(MAKE) $(AM_MAKEFLAGS) all-recursive
>  
>  .SUFFIXES:
> -.SUFFIXES: .c .lo .o .obj
> +.SUFFIXES: .S .c .lo .o .obj
>  am--refresh: Makefile
>  	@:
>  $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/../multilib.am $(am__configure_deps)
> @@ -559,6 +573,7 @@ mostlyclean-compile:
>  distclean-compile:
>  	-rm -f *.tab.c
>  
> +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic_16.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fence.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fenv.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/flag.Plo@am__quote@
> @@ -570,6 +585,27 @@ distclean-compile:
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/init.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
>  
> +.S.o:
> +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
> +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $<
> +
> +.S.obj:
> +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
> +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
> +
> +.S.lo:
> +@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
> +@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $<
> +
>  .c.o:
>  @am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
>  @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..5f23dba4529528c39425221402323d07a14cc518
> --- /dev/null
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -0,0 +1,422 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU Atomic Library (libatomic).
> +
> +   Libatomic is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3 of the License, or
> +   (at your option) any later version.
> +
> +   Libatomic is distributed in the hope that it will be useful, but WITHOUT ANY
> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
> +   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +   more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +	.arch	armv8-a+lse
> +
> +#define ENTRY(name)		\
> +	.global name;		\
> +	.hidden name;		\
> +	.type name,%function;	\
> +	.p2align 4;		\
> +name:				\
> +	.cfi_startproc;		\
> +	hint	34	// bti c
> +
> +#define END(name)		\
> +	.cfi_endproc;		\
> +	.size name, .-name;
> +
> +#define res0 x0
> +#define res1 x1
> +#define in0  x2
> +#define in1  x3
> +#define tmp0 x6
> +#define tmp1 x7
> +#define exp0 x8
> +#define exp1 x9
> +
> +#ifdef __AARCH64EB__
> +# define reslo x1
> +# define reshi x0
> +# define inlo  x3
> +# define inhi  x2
> +# define tmplo x7
> +# define tmphi x6
> +#else
> +# define reslo x0
> +# define reshi x1
> +# define inlo  x2
> +# define inhi  x3
> +# define tmplo x6
> +# define tmphi x7
> +#endif
> +
> +#define RELAXED 0
> +#define CONSUME 1
> +#define ACQUIRE 2
> +#define RELEASE 3
> +#define ACQ_REL 4
> +#define SEQ_CST 5
> +
> +
> +ENTRY (libat_load_16_i1)
> +	cbnz	w1, 1f
> +	ldp	res0, res1, [x0]
> +	ret
> +1:
> +	cmp	w1, ACQUIRE
> +	b.hi	2f
> +	ldp	res0, res1, [x0]
> +	dmb	ishld
> +	ret
> +2:
> +	ldp	res0, res1, [x0]
> +	dmb	ish
> +	ret
> +END (libat_load_16_i1)
> +
> +
> +ENTRY (libat_store_16_i1)
> +	cbnz	w4, 1f
> +	stp	in0, in1, [x0]
> +	ret
> +1:
> +	dmb	ish
> +	stp	in0, in1, [x0]
> +	cmp	w4, SEQ_CST
> +	beq	2f
> +	ret
> +2:
> +	dmb	ish
> +	ret
> +END (libat_store_16_i1)
> +
> +
> +ENTRY (libat_exchange_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	stxp	w4, in0, in1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	cmp	w4, ACQUIRE
> +	b.hi	4f
> +3:
> +	ldaxp	res0, res1, [x5]
> +	stxp	w4, in0, in1, [x5]
> +	cbnz	w4, 3b
> +	ret
> +4:
> +	cmp	w4, RELEASE
> +	b.ne	6f
> +5:
> +	ldxp	res0, res1, [x5]
> +	stlxp	w4, in0, in1, [x5]
> +	cbnz	w4, 5b
> +	ret
> +6:
> +	ldaxp	res0, res1, [x5]
> +	stlxp	w4, in0, in1, [x5]
> +	cbnz	w4, 6b
> +	ret
> +END (libat_exchange_16_i1)
> +
> +
> +ENTRY (libat_compare_exchange_16_i1)
> +	ldp	exp0, exp1, [x1]
> +	mov	tmp0, exp0
> +	mov	tmp1, exp1
> +	cbz	w5, 2f
> +	cmp	w5, RELEASE
> +	b.hs	3f
> +	caspa	exp0, exp1, in0, in1, [x0]
> +0:
> +	cmp	exp0, tmp0
> +	ccmp	exp1, tmp1, 0, eq
> +	bne	1f
> +	mov	x0, 1
> +	ret
> +1:
> +	stp	exp0, exp1, [x1]
> +	mov	x0, 0
> +	ret
> +2:
> +	casp	exp0, exp1, in0, in1, [x0]
> +	b	0b
> +3:
> +	b.hi	4f
> +	caspl	exp0, exp1, in0, in1, [x0]
> +	b	0b
> +4:
> +	caspal	exp0, exp1, in0, in1, [x0]
> +	b	0b
> +END (libat_compare_exchange_16_i1)

As discussed off-list, it looks like this function
(libat_compare_exchange_16_i1) should take the memory order it checks
from w4 rather than w5 when selecting the CASP variant.  OK with that
change, thanks.

Obviously completely separate work, but it would be nice to teach gcc to
use ORN for the inline nand expansion.  Maybe that's not heavily used though.

Richard

> +ENTRY (libat_fetch_add_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	adds	tmplo, reslo, inlo
> +	adc	tmphi, reshi, inhi
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	adds	tmplo, reslo, inlo
> +	adc	tmphi, reshi, inhi
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_add_16_i1)
> +
> +
> +ENTRY (libat_add_fetch_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	adds	reslo, reslo, inlo
> +	adc	reshi, reshi, inhi
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	adds	reslo, reslo, inlo
> +	adc	reshi, reshi, inhi
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_add_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_sub_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	subs	tmplo, reslo, inlo
> +	sbc	tmphi, reshi, inhi
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	subs	tmplo, reslo, inlo
> +	sbc	tmphi, reshi, inhi
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_sub_16_i1)
> +
> +
> +ENTRY (libat_sub_fetch_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	subs	reslo, reslo, inlo
> +	sbc	reshi, reshi, inhi
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	subs	reslo, reslo, inlo
> +	sbc	reshi, reshi, inhi
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_sub_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_or_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	orr	tmp0, res0, in0
> +	orr	tmp1, res1, in1
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	orr	tmp0, res0, in0
> +	orr	tmp1, res1, in1
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_or_16_i1)
> +
> +
> +ENTRY (libat_or_fetch_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	orr	res0, res0, in0
> +	orr	res1, res1, in1
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	orr	res0, res0, in0
> +	orr	res1, res1, in1
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_or_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_and_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	and	tmp0, res0, in0
> +	and	tmp1, res1, in1
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	and	tmp0, res0, in0
> +	and	tmp1, res1, in1
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_and_16_i1)
> +
> +
> +ENTRY (libat_and_fetch_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	and	res0, res0, in0
> +	and	res1, res1, in1
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	and	res0, res0, in0
> +	and	res1, res1, in1
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_and_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_xor_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	eor	tmp0, res0, in0
> +	eor	tmp1, res1, in1
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	eor	tmp0, res0, in0
> +	eor	tmp1, res1, in1
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_xor_16_i1)
> +
> +
> +ENTRY (libat_xor_fetch_16_i1)
> +	mov	x5, x0
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	eor	res0, res0, in0
> +	eor	res1, res1, in1
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	eor	res0, res0, in0
> +	eor	res1, res1, in1
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_xor_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_nand_16_i1)
> +	mov	x5, x0
> +	mvn	in0, in0
> +	mvn	in1, in1
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	orn	tmp0, in0, res0
> +	orn	tmp1, in1, res1
> +	stxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	orn	tmp0, in0, res0
> +	orn	tmp1, in1, res1
> +	stlxp	w4, tmp0, tmp1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_fetch_nand_16_i1)
> +
> +
> +ENTRY (libat_nand_fetch_16_i1)
> +	mov	x5, x0
> +	mvn	in0, in0
> +	mvn	in1, in1
> +	cbnz	w4, 2f
> +1:
> +	ldxp	res0, res1, [x5]
> +	orn	res0, in0, res0
> +	orn	res1, in1, res1
> +	stxp	w4, res0, res1, [x5]
> +	cbnz	w4, 1b
> +	ret
> +2:
> +	ldaxp	res0, res1, [x5]
> +	orn	res0, in0, res0
> +	orn	res1, in1, res1
> +	stlxp	w4, res0, res1, [x5]
> +	cbnz	w4, 2b
> +	ret
> +END (libat_nand_fetch_16_i1)
> +
> +
> +ENTRY (libat_test_and_set_16_i1)
> +	mov	w2, 1
> +	cbnz	w1, 2f
> +	swpb	w0, w2, [x0]
> +	ret
> +
> +2:	swpalb	w0, w2, [x0]
> +	ret
> +END (libat_test_and_set_16_i1)
> +
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index 769ba6edc600099122b03af754cbbb079134596a..d9b5ab31bc85cfe1d5f3773c42442e408b174cbc 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -22,14 +22,22 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #if HAVE_IFUNC
> -#include <stdlib.h>
> +#include <sys/auxv.h>
>  
> -# ifdef HWCAP_ATOMICS
> -#  define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
> +#ifdef HWCAP_USCAT
> +# if N == 16
> +#  define IFUNC_COND_1	(hwcap & HWCAP_USCAT)
>  # else
> -#  define IFUNC_COND_1	(false)
> +#  define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
>  # endif
> -# define IFUNC_NCOND(N)	(1)
> +#else
> +#  define IFUNC_COND_1	(false)
> +#endif
> +#define IFUNC_NCOND(N)	(1)
> +
> +#if N == 16 && IFUNC_ALT != 0
> +# define DONE 1
> +#endif
>  
>  #endif /* HAVE_IFUNC */
>  
> diff --git a/libatomic/configure.tgt b/libatomic/configure.tgt
> index 33f8c91ce7718336b05e1077d3e91feb5b706730..113420f7beca143b5040fc9eb871461c2163ae44 100644
> --- a/libatomic/configure.tgt
> +++ b/libatomic/configure.tgt
> @@ -49,6 +49,7 @@ case "${target_cpu}" in
>  		fi
>  		;;
>  	esac
> +	XCFLAGS="${XCFLAGS} -mno-outline-atomics"
>  	;;
>    arm*)
>  	ARCH=arm
  

Patch

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index d88515e4a03bd812334ae0b7bf4c0bba119455dc..41e5da28512150780a2018386e22b4e70afcfa3f 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -127,6 +127,8 @@  if HAVE_IFUNC
 if ARCH_AARCH64_LINUX
 IFUNC_OPTIONS	     = -march=armv8-a+lse
 libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
+libatomic_la_SOURCES += atomic_16.S
+
 endif
 if ARCH_ARM_LINUX
 IFUNC_OPTIONS	     = -march=armv7-a+fp -DHAVE_KERNEL64
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index 80d25653dc75cca995c8b0b2107a55f1234a6d52..89e29fc60a7fb74341b2f0f805e461847073082c 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -90,13 +90,14 @@  build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
-@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
+@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
+@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	s,$(SIZES),$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	_$(s)_1_.lo,$(SIZEOBJS))) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@	_8_2_.lo,$(SIZEOBJS))
-@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix _8_1_.lo,$(SIZEOBJS))
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
+@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix _8_1_.lo,$(SIZEOBJS))
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
 @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@		       $(addsuffix _16_2_.lo,$(SIZEOBJS))
 
 subdir = .
@@ -154,8 +155,11 @@  am__uninstall_files_from_dir = { \
   }
 am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(noinst_LTLIBRARIES) $(toolexeclib_LTLIBRARIES)
+@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__objects_1 =  \
+@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@	atomic_16.lo
 am_libatomic_la_OBJECTS = gload.lo gstore.lo gcas.lo gexch.lo \
-	glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo
+	glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo \
+	$(am__objects_1)
 libatomic_la_OBJECTS = $(am_libatomic_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -165,9 +169,9 @@  libatomic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(libatomic_la_LDFLAGS) $(LDFLAGS) -o $@
 libatomic_convenience_la_DEPENDENCIES = $(libatomic_la_LIBADD)
-am__objects_1 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
-	init.lo fenv.lo fence.lo flag.lo
-am_libatomic_convenience_la_OBJECTS = $(am__objects_1)
+am__objects_2 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
+	init.lo fenv.lo fence.lo flag.lo $(am__objects_1)
+am_libatomic_convenience_la_OBJECTS = $(am__objects_2)
 libatomic_convenience_la_OBJECTS =  \
 	$(am_libatomic_convenience_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
@@ -185,6 +189,16 @@  am__v_at_1 =
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
+CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CCASFLAGS) $(CCASFLAGS)
+AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
+am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
+am__v_CPPAS_0 = @echo "  CPPAS   " $@;
+am__v_CPPAS_1 = 
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
 LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
@@ -369,6 +383,7 @@  pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -404,9 +419,8 @@  noinst_LTLIBRARIES = libatomic_convenience.la
 @LIBAT_BUILD_VERSIONED_SHLIB_SUN_TRUE@@LIBAT_BUILD_VERSIONED_SHLIB_TRUE@libatomic_version_dep = libatomic.map-sun
 libatomic_version_info = -version-info $(libtool_VERSION)
 libatomic_la_LDFLAGS = $(libatomic_version_info) $(libatomic_version_script) $(lt_host_flags)
-libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c init.c \
-	fenv.c fence.c flag.c
-
+libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c \
+	init.c fenv.c fence.c flag.c $(am__append_2)
 SIZEOBJS = load store cas exch fadd fsub fand fior fxor fnand tas
 EXTRA_libatomic_la_SOURCES = $(addsuffix _n.c,$(SIZEOBJS))
 libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) $(libatomic_version_dep)
@@ -432,8 +446,8 @@  all_c_files := $(foreach dir,$(search_path),$(wildcard $(dir)/*.c))
 # Then sort through them to find the one we want, and select the first.
 M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
 libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
-	_$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_2) \
-	$(am__append_3) $(am__append_4)
+	_$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
+	$(am__append_4) $(am__append_5)
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp -DHAVE_KERNEL64
 @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
@@ -450,7 +464,7 @@  all: auto-config.h
 	$(MAKE) $(AM_MAKEFLAGS) all-recursive
 
 .SUFFIXES:
-.SUFFIXES: .c .lo .o .obj
+.SUFFIXES: .S .c .lo .o .obj
 am--refresh: Makefile
 	@:
 $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/../multilib.am $(am__configure_deps)
@@ -559,6 +573,7 @@  mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic_16.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fence.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fenv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/flag.Plo@am__quote@
@@ -570,6 +585,27 @@  distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/init.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
 
+.S.o:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ $<
+
+.S.obj:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.S.lo:
+@am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $<
+
 .c.o:
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
new file mode 100644
index 0000000000000000000000000000000000000000..5f23dba4529528c39425221402323d07a14cc518
--- /dev/null
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -0,0 +1,422 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+
+   This file is part of the GNU Atomic Library (libatomic).
+
+   Libatomic is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   Libatomic is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+	.arch	armv8-a+lse
+
+/* ENTRY (name): open the function NAME.  The symbol is global but hidden,
+   typed as a function and aligned to 16 bytes (.p2align 4).  CFI is started
+   and the first instruction is HINT 34, i.e. a BTI C landing pad.  */
+#define ENTRY(name)		\
+	.global name;		\
+	.hidden name;		\
+	.type name,%function;	\
+	.p2align 4;		\
+name:				\
+	.cfi_startproc;		\
+	hint	34	// bti c
+
+/* END (name): close the CFI region opened by ENTRY (name) and record the
+   size of the symbol.  */
+#define END(name)		\
+	.cfi_endproc;		\
+	.size name, .-name;
+
+/* Register roles used by the routines in this file:
+     res0/res1  128-bit result (also the registers a value is loaded into)
+     in0/in1    128-bit operand / new value
+     tmp0/tmp1  scratch pair
+     exp0/exp1  expected value for compare-and-exchange.  */
+#define res0 x0
+#define res1 x1
+#define in0  x2
+#define in1  x3
+#define tmp0 x6
+#define tmp1 x7
+#define exp0 x8
+#define exp1 x9
+
+/* Names for the low and high 64-bit halves of the res/in/tmp pairs, used by
+   the add/sub routines to propagate carry.  Which register of a pair holds
+   the low half depends on endianness (__AARCH64EB__).  */
+#ifdef __AARCH64EB__
+# define reslo x1
+# define reshi x0
+# define inlo  x3
+# define inhi  x2
+# define tmplo x7
+# define tmphi x6
+#else
+# define reslo x0
+# define reshi x1
+# define inlo  x2
+# define inhi  x3
+# define tmplo x6
+# define tmphi x7
+#endif
+
+/* Values of the memory-order argument that the routines in this file test.
+   NOTE(review): presumably identical to the compiler's __ATOMIC_* constants;
+   confirm against the callers.  */
+#define RELAXED 0
+#define CONSUME 1
+#define ACQUIRE 2
+#define RELEASE 3
+#define ACQ_REL 4
+#define SEQ_CST 5
+
+
+/* 128-bit atomic load with a single LDP (selected through the ifunc only
+   when HWCAP_USCAT, i.e. LSE2, is present).
+   In:  x0 = address, w1 = memory order.
+   Out: res0/res1 = loaded value.
+   Clobbers: tmp0 (strongest-order path only).  */
+ENTRY (libat_load_16_i1)
+	cbnz	w1, 1f
+	/* RELAXED.  */
+	ldp	res0, res1, [x0]
+	ret
+1:
+	cmp	w1, ACQUIRE
+	b.hi	2f
+	/* CONSUME/ACQUIRE: keep later accesses after the load.  */
+	ldp	res0, res1, [x0]
+	dmb	ishld
+	ret
+2:
+	/* Orders above ACQUIRE (SEQ_CST).  A barrier after the LDP does not
+	   stop the LDP itself from being reordered before an earlier
+	   Store-Release (e.g. the STLXP used by the exchange/fetch-op
+	   routines).  A Load-Acquire is ordered after earlier Store-Release
+	   instructions, and everything after it is ordered after the
+	   Load-Acquire, so issue a dummy LDAR to the same address first.  */
+	ldar	tmp0, [x0]
+	ldp	res0, res1, [x0]
+	dmb	ish
+	ret
+END (libat_load_16_i1)
+
+
+/* 128-bit atomic store with a single STP.
+   In: x0 = address, in0/in1 = value to store, w4 = memory order.  */
+ENTRY (libat_store_16_i1)
+	cbnz	w4, 1f
+	/* RELAXED: plain store, no barriers.  */
+	stp	in0, in1, [x0]
+	ret
+1:
+	/* Every other order: full barrier before the store.  */
+	dmb	ish
+	stp	in0, in1, [x0]
+	cmp	w4, SEQ_CST
+	beq	2f
+	ret
+2:
+	/* SEQ_CST only: a second full barrier after the store.  */
+	dmb	ish
+	ret
+END (libat_store_16_i1)
+
+
+/* 128-bit atomic exchange using a load/store-exclusive pair loop.
+   In:  x0 = address, in0/in1 = new value, w4 = memory order.
+   Out: res0/res1 = value previously in memory.
+   Clobbers: x4 (reused as store-exclusive status), x5.  */
+ENTRY (libat_exchange_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  From here on w4 holds the STXP status; each loop branches
+	   back to its own label, so the order is never re-tested.  */
+1:
+	ldxp	res0, res1, [x5]
+	stxp	w4, in0, in1, [x5]
+	cbnz	w4, 1b
+	ret
+2:
+	cmp	w4, ACQUIRE
+	b.hi	4f
+	/* CONSUME/ACQUIRE: acquire on the load only.  */
+3:
+	ldaxp	res0, res1, [x5]
+	stxp	w4, in0, in1, [x5]
+	cbnz	w4, 3b
+	ret
+4:
+	cmp	w4, RELEASE
+	b.ne	6f
+	/* RELEASE: release on the store only.  */
+5:
+	ldxp	res0, res1, [x5]
+	stlxp	w4, in0, in1, [x5]
+	cbnz	w4, 5b
+	ret
+	/* Remaining orders (ACQ_REL/SEQ_CST): acquire load and release store.  */
+6:
+	ldaxp	res0, res1, [x5]
+	stlxp	w4, in0, in1, [x5]
+	cbnz	w4, 6b
+	ret
+END (libat_exchange_16_i1)
+
+
+/* 128-bit atomic compare-and-exchange using CASP.
+   In:  x0 = address, x1 = pointer to the expected value, in0/in1 = desired
+        value, w4 = success memory order, w5 = failure memory order.
+   Out: x0 = 1 if the exchange was performed; otherwise x0 = 0 and the value
+        found in memory is written back through x1.
+   The instruction variant is chosen from the success order in w4.  The
+   failure order in w5 is not consulted: testing it instead would drop the
+   ordering the caller requested for the successful exchange (for example
+   success = SEQ_CST with failure = RELAXED).
+   Clobbers: tmp0, tmp1, exp0, exp1, flags.  */
+ENTRY (libat_compare_exchange_16_i1)
+	ldp	exp0, exp1, [x1]
+	/* CASP overwrites exp0/exp1 with the value read from memory; keep a
+	   copy of the expected value for the comparison at 0: below.  */
+	mov	tmp0, exp0
+	mov	tmp1, exp1
+	cbz	w4, 2f
+	cmp	w4, RELEASE
+	b.hs	3f
+	/* CONSUME/ACQUIRE.  */
+	caspa	exp0, exp1, in0, in1, [x0]
+0:
+	/* Success iff both halves read from memory equal the expected value.  */
+	cmp	exp0, tmp0
+	ccmp	exp1, tmp1, 0, eq
+	bne	1f
+	mov	x0, 1
+	ret
+1:
+	/* Failure: report the value actually found through the expected ptr.  */
+	stp	exp0, exp1, [x1]
+	mov	x0, 0
+	ret
+2:
+	/* RELAXED.  */
+	casp	exp0, exp1, in0, in1, [x0]
+	b	0b
+3:
+	/* Flags still hold the compare of w4 with RELEASE.  */
+	b.hi	4f
+	/* RELEASE.  */
+	caspl	exp0, exp1, in0, in1, [x0]
+	b	0b
+4:
+	/* ACQ_REL/SEQ_CST.  */
+	caspal	exp0, exp1, in0, in1, [x0]
+	b	0b
+END (libat_compare_exchange_16_i1)
+
+
+/* 128-bit atomic fetch-and-add.
+   In:  x0 = address, in0/in1 = addend, w4 = memory order.
+   Out: res0/res1 = value in memory before the addition.
+   Clobbers: x4 (store-exclusive status), x5, tmp0, tmp1, flags.  */
+ENTRY (libat_fetch_add_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	/* 128-bit add: the carry out of the low half feeds the high half.  */
+	adds	tmplo, reslo, inlo
+	adc	tmphi, reshi, inhi
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	adds	tmplo, reslo, inlo
+	adc	tmphi, reshi, inhi
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_add_16_i1)
+
+
+/* 128-bit atomic add-and-fetch.
+   In:  x0 = address, in0/in1 = addend, w4 = memory order.
+   Out: res0/res1 = value stored to memory (the sum).
+   Clobbers: x4 (store-exclusive status), x5, flags.  */
+ENTRY (libat_add_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	/* 128-bit add in place: carry from the low half feeds the high half.  */
+	adds	reslo, reslo, inlo
+	adc	reshi, reshi, inhi
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	adds	reslo, reslo, inlo
+	adc	reshi, reshi, inhi
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_add_fetch_16_i1)
+
+
+/* 128-bit atomic fetch-and-subtract.
+   In:  x0 = address, in0/in1 = subtrahend, w4 = memory order.
+   Out: res0/res1 = value in memory before the subtraction.
+   Clobbers: x4 (store-exclusive status), x5, tmp0, tmp1, flags.  */
+ENTRY (libat_fetch_sub_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	/* 128-bit subtract: the borrow from the low half feeds the high half.  */
+	subs	tmplo, reslo, inlo
+	sbc	tmphi, reshi, inhi
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	subs	tmplo, reslo, inlo
+	sbc	tmphi, reshi, inhi
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_sub_16_i1)
+
+
+/* 128-bit atomic subtract-and-fetch.
+   In:  x0 = address, in0/in1 = subtrahend, w4 = memory order.
+   Out: res0/res1 = value stored to memory (the difference).
+   Clobbers: x4 (store-exclusive status), x5, flags.  */
+ENTRY (libat_sub_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	/* 128-bit subtract in place: borrow from the low half feeds the high.  */
+	subs	reslo, reslo, inlo
+	sbc	reshi, reshi, inhi
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	subs	reslo, reslo, inlo
+	sbc	reshi, reshi, inhi
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_sub_fetch_16_i1)
+
+
+/* 128-bit atomic fetch-and-OR.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value in memory before the OR.
+   Clobbers: x4 (store-exclusive status), x5, tmp0, tmp1.  */
+ENTRY (libat_fetch_or_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	orr	tmp0, res0, in0
+	orr	tmp1, res1, in1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	orr	tmp0, res0, in0
+	orr	tmp1, res1, in1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_or_16_i1)
+
+
+/* 128-bit atomic OR-and-fetch.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value stored to memory (old | operand).
+   Clobbers: x4 (store-exclusive status), x5.  */
+ENTRY (libat_or_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	orr	res0, res0, in0
+	orr	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	orr	res0, res0, in0
+	orr	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_or_fetch_16_i1)
+
+
+/* 128-bit atomic fetch-and-AND.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value in memory before the AND.
+   Clobbers: x4 (store-exclusive status), x5, tmp0, tmp1.  */
+ENTRY (libat_fetch_and_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	and	tmp0, res0, in0
+	and	tmp1, res1, in1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	and	tmp0, res0, in0
+	and	tmp1, res1, in1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_and_16_i1)
+
+
+/* 128-bit atomic AND-and-fetch.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value stored to memory (old & operand).
+   Clobbers: x4 (store-exclusive status), x5.  */
+ENTRY (libat_and_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	and	res0, res0, in0
+	and	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	and	res0, res0, in0
+	and	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_and_fetch_16_i1)
+
+
+/* 128-bit atomic fetch-and-XOR.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value in memory before the XOR.
+   Clobbers: x4 (store-exclusive status), x5, tmp0, tmp1.  */
+ENTRY (libat_fetch_xor_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	eor	tmp0, res0, in0
+	eor	tmp1, res1, in1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	eor	tmp0, res0, in0
+	eor	tmp1, res1, in1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_xor_16_i1)
+
+
+/* 128-bit atomic XOR-and-fetch.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value stored to memory (old ^ operand).
+   Clobbers: x4 (store-exclusive status), x5.  */
+ENTRY (libat_xor_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	eor	res0, res0, in0
+	eor	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	eor	res0, res0, in0
+	eor	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_xor_fetch_16_i1)
+
+
+/* 128-bit atomic fetch-and-NAND.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value in memory before the operation.
+   Clobbers: x4 (store-exclusive status), x5, in0, in1, tmp0, tmp1.  */
+ENTRY (libat_fetch_nand_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	/* Complement the operand once, outside the retry loop.  With in = ~op,
+	   ORN gives in | ~old = ~op | ~old = ~(old & op).  */
+	mvn	in0, in0
+	mvn	in1, in1
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	orn	tmp0, in0, res0
+	orn	tmp1, in1, res1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	orn	tmp0, in0, res0
+	orn	tmp1, in1, res1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_fetch_nand_16_i1)
+
+
+/* 128-bit atomic NAND-and-fetch.
+   In:  x0 = address, in0/in1 = operand, w4 = memory order.
+   Out: res0/res1 = value stored to memory, ~(old & operand).
+   Clobbers: x4 (store-exclusive status), x5, in0, in1.  */
+ENTRY (libat_nand_fetch_16_i1)
+	/* x0/x1 receive the result, so keep the address in x5.  */
+	mov	x5, x0
+	/* Complement the operand once, outside the retry loop.  With in = ~op,
+	   ORN gives in | ~old = ~op | ~old = ~(old & op).  */
+	mvn	in0, in0
+	mvn	in1, in1
+	cbnz	w4, 2f
+	/* RELAXED.  */
+1:
+	ldxp	res0, res1, [x5]
+	orn	res0, in0, res0
+	orn	res1, in1, res1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+	/* Every other order: acquire load and release store.  */
+2:
+	ldaxp	res0, res1, [x5]
+	orn	res0, in0, res0
+	orn	res1, in1, res1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_nand_fetch_16_i1)
+
+
+/* Atomic test-and-set of the byte at x0, using the LSE SWPB instruction.
+   In:  x0 = address, w1 = memory order.
+   Out: w0 = previous value of the byte (zero-extended).
+   Clobbers: w2.  */
+ENTRY (libat_test_and_set_16_i1)
+	mov	w2, 1
+	cbnz	w1, 2f
+	/* RELAXED.  SWPB Ws, Wt, [Xn] stores Ws and returns the old byte in
+	   Wt, so the constant 1 is the first operand and w0 is the second.  */
+	swpb	w2, w0, [x0]
+	ret
+
+	/* Every other order: acquire-release form of the swap.  */
+2:	swpalb	w2, w0, [x0]
+	ret
+END (libat_test_and_set_16_i1)
+
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index 769ba6edc600099122b03af754cbbb079134596a..d9b5ab31bc85cfe1d5f3773c42442e408b174cbc 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -22,14 +22,22 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #if HAVE_IFUNC
-#include <stdlib.h>
+#include <sys/auxv.h>
 
-# ifdef HWCAP_ATOMICS
-#  define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
+/* Condition for selecting the alternative implementation: 16-byte
+   operations require the HWCAP_USCAT bit, all other sizes require
+   HWCAP_ATOMICS.  When the system headers do not define HWCAP_USCAT the
+   alternative is never selected.  */
+#ifdef HWCAP_USCAT
+# if N == 16
+#  define IFUNC_COND_1	(hwcap & HWCAP_USCAT)
 # else
-#  define IFUNC_COND_1	(false)
+#  define IFUNC_COND_1	(hwcap & HWCAP_ATOMICS)
 # endif
-# define IFUNC_NCOND(N)	(1)
+#else
+#  define IFUNC_COND_1	(false)
+#endif
+#define IFUNC_NCOND(N)	(1)
+
+/* NOTE(review): DONE is defined for the 16-byte alternative; presumably it
+   makes the generic C sources emit nothing for that case because
+   atomic_16.S provides the code -- confirm against the users of DONE.  */
+#if N == 16 && IFUNC_ALT != 0
+# define DONE 1
+#endif
 
 #endif /* HAVE_IFUNC */
 
diff --git a/libatomic/configure.tgt b/libatomic/configure.tgt
index 33f8c91ce7718336b05e1077d3e91feb5b706730..113420f7beca143b5040fc9eb871461c2163ae44 100644
--- a/libatomic/configure.tgt
+++ b/libatomic/configure.tgt
@@ -49,6 +49,7 @@  case "${target_cpu}" in
 		fi
 		;;
 	esac
+	XCFLAGS="${XCFLAGS} -mno-outline-atomics"
 	;;
   arm*)
 	ARCH=arm