[ARM] Add support for fenv_private on ARM
Commit Message
Hi,
This patch improves performance of common math functions by avoiding unnecessary
writes to FPSCR. Add fenv_private.h with faster inline variants of fenv functions
which avoid no-change writes to FPSCR. The number of FPSCR reads/writes reduces
from 4/3 for a call to sin() to 3/1 with the inline fenv implementation, and 1/0
for the HAVE_RM_CTX implementation.
A summary of performance on Cortex-A15:
No fenv_private.h:
cos(): ITERS:2.07e+07: TOTAL:10.6831s, MAX:1519.12ns, MIN:231.833ns, 1.93763e+06 iter/s
exp(): ITERS:3.598e+06: TOTAL:10.6089s, MAX:11415.5ns, MIN:175.375ns, 339148 iter/s
pow(): ITERS:3.3712e+07: TOTAL:9.91444s, MAX:531.669ns, MIN:57.833ns, 3.40029e+06 iter/s
sin(): ITERS:1.96e+07: TOTAL:10.5283s, MAX:1498.83ns, MIN:224.166ns, 1.86165e+06 iter/s
sincos(): ITERS:1.8684e+07: TOTAL:9.84671s, MAX:1599.79ns, MIN:499.417ns, 1.89749e+06 iter/s
tan(): ITERS:2.2701e+07: TOTAL:11.0817s, MAX:1001.79ns, MIN:225.333ns, 2.04852e+06 iter/s
With fenv_private.h:
cos(): ITERS:2.99e+07: TOTAL:9.93882s, MAX:2341.34ns, MIN:43.875ns, 3.00841e+06 iter/s
exp(): ITERS:3.598e+06: TOTAL:10.0066s, MAX:10440.2ns, MIN:26.5ns, 359562 iter/s
pow(): ITERS:5.8093e+07: TOTAL:9.86581s, MAX:1102.29ns, MIN:63.042ns, 5.88832e+06 iter/s
sin(): ITERS:3.08e+07: TOTAL:10.8619s, MAX:3371.59ns, MIN:37.708ns, 2.8356e+06 iter/s
sincos(): ITERS:5.7708e+07: TOTAL:9.88083s, MAX:1348.21ns, MIN:148.875ns, 5.8404e+06 iter/s
tan(): ITERS:3.243e+07: TOTAL:10.1926s, MAX:1840.3ns, MIN:50.042ns, 3.18171e+06 iter/s
The glibc tests pass with the same number of failures with the new fenv_private.h
(both with and without HAVE_RM_CTX).
OK for commit?
Wilco
From ba7c978b428967ee8217f7edef88156a288c8014 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@localhost.localdomain>
Date: Tue, 4 Mar 2014 13:44:44 +0000
Subject: [PATCH 1/2] Add support for fenv_private on ARM.
---
sysdeps/arm/fenv_private.h | 250 ++++++++++++++++++++++++++++++++++++++++++++
sysdeps/arm/fpu_control.h | 7 +-
sysdeps/arm/math_private.h | 6 ++
3 files changed, 262 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/arm/fenv_private.h
create mode 100644 sysdeps/arm/math_private.h
Comments
On Thu, 6 Mar 2014, Wilco wrote:
> +#if ARM_HAVE_VFP
ARM_HAVE_VFP isn't suitable for use in #if; see the
sysdeps/unix/sysv/linux/arm/arm-features.h definition. You need either a
new macro meaning "VFP is known at compile time to be available", or to
move to checks with "if" inside the functions.
(The case of a soft-float build, VFP hardware available at runtime - the
one addressed by "if" conditionals - has other problems with exceptions
and rounding modes, that I think would best be addressed by use of IFUNCs
in libgcc for the relevant RTABI functions; see bug 10064. Anyway, the
present patch is purely an optimization, so there's certainly no need for
it to cover all cases as long as it doesn't break them.)
new file mode 100644
@@ -0,0 +1,250 @@
+/* Private floating point rounding and exceptions handling. ARM VFP version.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef FENV_PRIVATE_H
+#define FENV_PRIVATE_H 1
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <arm-features.h>
+
+#if ARM_HAVE_VFP /* NOTE(review): the review comment quoted earlier in this
+   message says ARM_HAVE_VFP is not suitable for use in #if and suggests
+   either a macro meaning "VFP is known at compile time to be available"
+   or "if" checks inside the functions -- confirm against
+   arm-features.h before relying on this guard.  */
+
+static __always_inline void
+libc_feholdexcept_vfp (fenv_t *envp)
+{ /* Save the current control word in ENVP->__cw, then write it back
+   with every bit of _FPU_MASK_EXCEPT cleared.  Unlike the "if
+   different" helpers in this file, the write here is unconditional.  */
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+ envp->__cw = fpscr;
+
+ /* Clear exception flags and set all exceptions to non-stop.  Only the
+   local copy is modified; ENVP keeps the value read above.  */
+ fpscr &= ~_FPU_MASK_EXCEPT;
+ _FPU_SETCW (fpscr);
+}
+
+static __always_inline void
+libc_fesetround_vfp (int round)
+{ /* Switch the rounding mode to ROUND, skipping the control-word write
+   when the current mode already equals ROUND.  FE_TOWARDZERO is used
+   as the mask for the rounding field (presumably it has every
+   rounding-mode bit set -- confirm against the ARM <fenv.h>).  ROUND
+   is ORed in unmasked, so callers are assumed to pass rounding-mode
+   bits only.  */
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+
+ /* Set new rounding mode if different. */
+ if (__glibc_unlikely ((fpscr & FE_TOWARDZERO) != round))
+ _FPU_SETCW ((fpscr & ~FE_TOWARDZERO) | round);
+}
+
+static __always_inline void
+libc_feholdexcept_setround_vfp (fenv_t *envp, int round)
+{ /* Save the current control word in ENVP->__cw, then write a new one
+   with the _FPU_MASK_EXCEPT bits cleared and the rounding field
+   replaced by ROUND.  The write is unconditional.  */
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+ envp->__cw = fpscr;
+
+ /* Clear exception flags, set all exceptions to non-stop,
+ and set new rounding mode.  The rounding field is cleared with the
+   FE_TOWARDZERO mask first so that ORing in ROUND yields exactly
+   ROUND in that field.  */
+ fpscr &= ~(_FPU_MASK_EXCEPT | FE_TOWARDZERO);
+ _FPU_SETCW (fpscr | round);
+}
+
+static __always_inline void
+libc_feholdsetround_vfp (fenv_t *envp, int round)
+{ /* Save the current control word in ENVP->__cw and switch the rounding
+   mode to ROUND.  Exception bits are left untouched, and the control
+   word is written only when the rounding mode actually changes.  */
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+ envp->__cw = fpscr;
+
+ /* Set new rounding mode if different. */
+ if (__glibc_unlikely ((fpscr & FE_TOWARDZERO) != round))
+ _FPU_SETCW ((fpscr & ~FE_TOWARDZERO) | round);
+}
+
+static __always_inline void
+libc_feresetround_vfp (fenv_t *envp)
+{ /* Restore the rounding mode saved in ENVP->__cw while keeping every
+   other bit of the current control word, and skip the write when the
+   rounding mode is already the saved one.  */
+ fpu_control_t fpscr, round;
+
+ _FPU_GETCW (fpscr);
+
+ /* Check whether rounding modes are different.  ROUND holds the
+   rounding-field bits in which the saved and current words differ.  */
+ round = (envp->__cw ^ fpscr) & FE_TOWARDZERO;
+
+ /* Restore the rounding mode if it was changed.  Flipping exactly the
+   differing bits turns the current rounding field into the saved
+   one.  */
+ if (__glibc_unlikely (round != 0))
+ _FPU_SETCW (fpscr ^ round);
+}
+
+static __always_inline int
+libc_fetestexcept_vfp (int ex)
+{ /* Return the bits of EX that are currently set in the control word,
+   restricted to FE_ALL_EXCEPT.  The control word is only read, never
+   written.  */
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+ return fpscr & ex & FE_ALL_EXCEPT;
+}
+
+static __always_inline void
+libc_fesetenv_vfp (fenv_t *envp)
+{ /* Install the control word saved in ENVP->__cw, skipping the write
+   when it matches the current one in every bit outside
+   _FPU_MASK_NZCV.  */
+ fpu_control_t fpscr, new_fpscr;
+
+ _FPU_GETCW (fpscr);
+ new_fpscr = envp->__cw;
+
+ /* Write new FPSCR if different (ignoring NZCV flags).  When a write
+   does happen, the saved word is written as is, including its NZCV
+   bits.  */
+ if (__glibc_unlikely (((fpscr ^ new_fpscr) & ~_FPU_MASK_NZCV) != 0))
+ _FPU_SETCW (new_fpscr);
+}
+
+static __always_inline int
+libc_feupdateenv_test_vfp (fenv_t *envp, int ex)
+{ /* Install the control word saved in ENVP->__cw with the currently
+   raised FE_ALL_EXCEPT flags merged in, call feraiseexcept for those
+   flags when the enable test below matches, and return the flags that
+   were raised on entry masked by EX.  */
+ fpu_control_t fpscr, new_fpscr;
+ int excepts;
+
+ _FPU_GETCW (fpscr);
+
+ /* Merge current exception flags with the saved fenv. */
+ excepts = fpscr & FE_ALL_EXCEPT;
+ new_fpscr = envp->__cw | excepts;
+
+ /* Write new FPSCR if different (ignoring NZCV flags). */
+ if (__glibc_unlikely (((fpscr ^ new_fpscr) & ~_FPU_MASK_NZCV) != 0))
+ _FPU_SETCW (new_fpscr);
+
+ /* Raise the exceptions if enabled in the new FP state.  The shift by
+   FE_EXCEPT_SHIFT presumably lines the enable bits up with the flag
+   bits so the two can be ANDed -- confirm against the ARM <fenv.h>.  */
+ if (__glibc_unlikely (excepts & (new_fpscr >> FE_EXCEPT_SHIFT)))
+ feraiseexcept (excepts);
+
+ return excepts & ex;
+}
+
+static __always_inline void
+libc_feupdateenv_vfp (fenv_t *envp)
+{ /* Same as libc_feupdateenv_test_vfp, but with an empty test mask and
+   the result discarded.  */
+ libc_feupdateenv_test_vfp (envp, 0);
+}
+
+#define libc_feholdexcept libc_feholdexcept_vfp /* In every group below the
+   unsuffixed, "f" and "l" names all map to the same _vfp helper.  */
+#define libc_feholdexceptf libc_feholdexcept_vfp
+#define libc_feholdexceptl libc_feholdexcept_vfp
+
+#define libc_fesetround libc_fesetround_vfp
+#define libc_fesetroundf libc_fesetround_vfp
+#define libc_fesetroundl libc_fesetround_vfp
+
+#define libc_feresetround libc_feresetround_vfp
+#define libc_feresetroundf libc_feresetround_vfp
+#define libc_feresetroundl libc_feresetround_vfp
+
+#define libc_feresetround_noex libc_fesetenv_vfp /* The _noex variants
+   reuse the fesetenv helper, so they reinstall the whole saved control
+   word rather than only the rounding field.  */
+#define libc_feresetround_noexf libc_fesetenv_vfp
+#define libc_feresetround_noexl libc_fesetenv_vfp
+
+#define libc_feholdexcept_setround libc_feholdexcept_setround_vfp
+#define libc_feholdexcept_setroundf libc_feholdexcept_setround_vfp
+#define libc_feholdexcept_setroundl libc_feholdexcept_setround_vfp
+
+#define libc_feholdsetround libc_feholdsetround_vfp
+#define libc_feholdsetroundf libc_feholdsetround_vfp
+#define libc_feholdsetroundl libc_feholdsetround_vfp
+
+#define libc_fetestexcept libc_fetestexcept_vfp
+#define libc_fetestexceptf libc_fetestexcept_vfp
+#define libc_fetestexceptl libc_fetestexcept_vfp
+
+#define libc_fesetenv libc_fesetenv_vfp
+#define libc_fesetenvf libc_fesetenv_vfp
+#define libc_fesetenvl libc_fesetenv_vfp
+
+#define libc_feupdateenv libc_feupdateenv_vfp
+#define libc_feupdateenvf libc_feupdateenv_vfp
+#define libc_feupdateenvl libc_feupdateenv_vfp
+
+#define libc_feupdateenv_test libc_feupdateenv_test_vfp
+#define libc_feupdateenv_testf libc_feupdateenv_test_vfp
+#define libc_feupdateenv_testl libc_feupdateenv_test_vfp
+
+/* We have support for rounding mode context.  The *_ctx helpers that
+   follow take a struct rm_ctx (they use its env and updated_status
+   members) instead of a bare fenv_t; presumably the generic
+   math_private.h tests HAVE_RM_CTX to select them -- confirm.  */
+#define HAVE_RM_CTX 1
+
+static __always_inline void
+libc_feholdsetround_vfp_ctx (struct rm_ctx *ctx, int r)
+{ /* Save the current control word in CTX->env.__cw and switch the
+   rounding mode to R.  CTX->updated_status records whether the control
+   word was actually written, so the matching reset helper can skip
+   its own read and write when nothing changed.  */
+ fpu_control_t fpscr, round;
+
+ _FPU_GETCW (fpscr);
+ ctx->updated_status = false;
+ ctx->env.__cw = fpscr;
+
+ /* Check whether rounding modes are different.  ROUND holds the
+   rounding-field bits in which R and the current word differ.  */
+ round = (fpscr ^ r) & FE_TOWARDZERO;
+
+ /* Set the rounding mode if changed.  Flipping the differing bits
+   turns the current rounding field into R's.  */
+ if (__glibc_unlikely (round != 0))
+ {
+ ctx->updated_status = true;
+ _FPU_SETCW (fpscr ^ round);
+ }
+}
+
+static __always_inline void
+libc_feresetround_vfp_ctx (struct rm_ctx *ctx)
+{ /* Put back the rounding mode saved in CTX->env.__cw, but only when
+   CTX->updated_status is set; otherwise the control word is neither
+   read nor written.  */
+ /* Restore the rounding mode if updated. */
+ if (__glibc_unlikely (ctx->updated_status))
+ {
+ fpu_control_t fpscr;
+
+ _FPU_GETCW (fpscr);
+ fpscr = (fpscr & ~FE_TOWARDZERO) | (ctx->env.__cw & FE_TOWARDZERO); /* Keep
+   all current bits except the rounding field, which is taken from the
+   saved word.  */
+ _FPU_SETCW (fpscr);
+ }
+}
+
+static __always_inline void
+libc_fesetenv_vfp_ctx (struct rm_ctx *ctx)
+{ /* Install the control word saved in CTX->env.__cw, skipping the write
+   when it matches the current one outside _FPU_MASK_NZCV.
+   CTX->updated_status is not consulted here.  */
+ fpu_control_t fpscr, new_fpscr;
+
+ _FPU_GETCW (fpscr);
+ new_fpscr = ctx->env.__cw;
+
+ /* Write new FPSCR if different (ignoring NZCV flags). */
+ if (__glibc_unlikely (((fpscr ^ new_fpscr) & ~_FPU_MASK_NZCV) != 0))
+ _FPU_SETCW (new_fpscr);
+}
+
+#define libc_feholdsetround_ctx libc_feholdsetround_vfp_ctx /* The unsuffixed,
+   "f" and "l" context hooks all share the same three _vfp_ctx helpers;
+   the _noex hooks map to the fesetenv variant.  */
+#define libc_feresetround_ctx libc_feresetround_vfp_ctx
+#define libc_feresetround_noex_ctx libc_fesetenv_vfp_ctx
+
+#define libc_feholdsetroundf_ctx libc_feholdsetround_vfp_ctx
+#define libc_feresetroundf_ctx libc_feresetround_vfp_ctx
+#define libc_feresetround_noexf_ctx libc_fesetenv_vfp_ctx
+
+#define libc_feholdsetroundl_ctx libc_feholdsetround_vfp_ctx
+#define libc_feresetroundl_ctx libc_feresetround_vfp_ctx
+#define libc_feresetround_noexl_ctx libc_fesetenv_vfp_ctx
+
+#endif /* ARM_HAVE_VFP */
+
+#endif /* FENV_PRIVATE_H */
@@ -37,11 +37,16 @@ extern fpu_control_t __fpu_control;
#define _FPU_MASK_UM 0x00000800 /* underflow */
#define _FPU_MASK_PM 0x00001000 /* inexact */
+#define _FPU_MASK_NZCV 0xF0000000 /* NZCV flags (top four bits) */
+
+#define _FPU_MASK_EXCEPT 0x00001f1f /* 0x1f00: the enable bits that _FPU_IEEE
+   sets; 0x001f: presumably the matching cumulative exception flags --
+   confirm against the FPSCR layout */
+
/* Some bits in the FPSCR are not yet defined. They must be preserved when
modifying the contents. */
#define _FPU_RESERVED 0x00086060
#define _FPU_DEFAULT 0x00000000
-/* Default + exceptions enabled. */
+
+/* Default + exceptions enabled. */
#define _FPU_IEEE (_FPU_DEFAULT | 0x00001f00)
/* Type of the control word. */
new file mode 100644
@@ -0,0 +1,6 @@
+#ifndef _MATH_PRIVATE_H /* No matching #define in this file; presumably the
+   header reached through #include_next defines _MATH_PRIVATE_H --
+   confirm.  */
+
+#include "fenv_private.h" /* Included first, presumably so that its libc_fe*
+   macros are visible before the next math_private.h supplies its own
+   defaults -- confirm.  */
+#include_next <math_private.h>
+
+#endif