[ARM] Add support for fenv_private on ARM

Message ID 000b01cf3949$7e3a0080$7aae0180$@com
State Superseded
Headers show

Commit Message

Wilco Dijkstra March 6, 2014, 2:36 p.m. UTC
Hi,

This patch improves the performance of common math functions by avoiding unnecessary
writes to FPSCR. It adds fenv_private.h with faster inline variants of the fenv
functions, which skip writes that would leave FPSCR unchanged. For a call to sin(),
the number of FPSCR reads/writes drops from 4/3 to 3/1 with the inline fenv
implementation, and to 1/0 with the HAVE_RM_CTX implementation.

A summary of performance on Cortex-A15:

No fenv_private.h:

cos(): ITERS:2.07e+07: TOTAL:10.6831s, MAX:1519.12ns, MIN:231.833ns, 1.93763e+06 iter/s
exp(): ITERS:3.598e+06: TOTAL:10.6089s, MAX:11415.5ns, MIN:175.375ns, 339148 iter/s
pow(): ITERS:3.3712e+07: TOTAL:9.91444s, MAX:531.669ns, MIN:57.833ns, 3.40029e+06 iter/s
sin(): ITERS:1.96e+07: TOTAL:10.5283s, MAX:1498.83ns, MIN:224.166ns, 1.86165e+06 iter/s
sincos(): ITERS:1.8684e+07: TOTAL:9.84671s, MAX:1599.79ns, MIN:499.417ns, 1.89749e+06 iter/s
tan(): ITERS:2.2701e+07: TOTAL:11.0817s, MAX:1001.79ns, MIN:225.333ns, 2.04852e+06 iter/s

With fenv_private.h:

cos(): ITERS:2.99e+07: TOTAL:9.93882s, MAX:2341.34ns, MIN:43.875ns, 3.00841e+06 iter/s
exp(): ITERS:3.598e+06: TOTAL:10.0066s, MAX:10440.2ns, MIN:26.5ns, 359562 iter/s
pow(): ITERS:5.8093e+07: TOTAL:9.86581s, MAX:1102.29ns, MIN:63.042ns, 5.88832e+06 iter/s
sin(): ITERS:3.08e+07: TOTAL:10.8619s, MAX:3371.59ns, MIN:37.708ns, 2.8356e+06 iter/s
sincos(): ITERS:5.7708e+07: TOTAL:9.88083s, MAX:1348.21ns, MIN:148.875ns, 5.8404e+06 iter/s
tan(): ITERS:3.243e+07: TOTAL:10.1926s, MAX:1840.3ns, MIN:50.042ns, 3.18171e+06 iter/s

The glibc tests pass with the same number of failures as before with the new
fenv_private.h (both with and without HAVE_RM_CTX).

OK for commit?

Wilco

From ba7c978b428967ee8217f7edef88156a288c8014 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@localhost.localdomain>
Date: Tue, 4 Mar 2014 13:44:44 +0000
Subject: [PATCH 1/2] Add support for fenv_private on ARM.

---
 sysdeps/arm/fenv_private.h |  250 ++++++++++++++++++++++++++++++++++++++++++++
 sysdeps/arm/fpu_control.h  |    7 +-
 sysdeps/arm/math_private.h |    6 ++
 3 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/arm/fenv_private.h
 create mode 100644 sysdeps/arm/math_private.h

Comments

Joseph Myers March 6, 2014, 3:56 p.m. UTC | #1
On Thu, 6 Mar 2014, Wilco wrote:

> +#if ARM_HAVE_VFP

ARM_HAVE_VFP isn't suitable for use in #if; see the 
sysdeps/unix/sysv/linux/arm/arm-features.h definition.  You need either a 
new macro meaning "VFP is known at compile time to be available", or to 
move to checks with "if" inside the functions.

(The case of a soft-float build, VFP hardware available at runtime - the 
one addressed by "if" conditionals - has other problems with exceptions 
and rounding modes, that I think would best be addressed by use of IFUNCs 
in libgcc for the relevant RTABI functions; see bug 10064.  Anyway, the 
present patch is purely an optimization, so there's certainly no need for 
it to cover all cases as long as it doesn't break them.)

Patch

diff --git a/sysdeps/arm/fenv_private.h b/sysdeps/arm/fenv_private.h
new file mode 100644
index 0000000..6c65cfa
--- /dev/null
+++ b/sysdeps/arm/fenv_private.h
@@ -0,0 +1,250 @@ 
+/* Private floating point rounding and exceptions handling.  ARM VFP version.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef FENV_PRIVATE_H
+#define FENV_PRIVATE_H 1
+
+#include <fenv.h>
+#include <fpu_control.h>
+#include <arm-features.h>
+/* ARM_HAVE_VFP can be a run-time test, so key on the compile-time ABI.  */
+#ifndef __SOFTFP__
+
+/* Save the current FPSCR in *ENVP, then clear the _FPU_MASK_EXCEPT bits.  */
+static __always_inline void
+libc_feholdexcept_vfp (fenv_t *envp)
+{
+  fpu_control_t saved;
+
+  _FPU_GETCW (saved);
+  envp->__cw = saved;
+
+  /* Write back with exception flags cleared and all exceptions non-stop.  */
+  _FPU_SETCW (saved & ~_FPU_MASK_EXCEPT);
+}
+
+/* Switch to rounding mode ROUND, writing FPSCR only when it differs.  */
+static __always_inline void
+libc_fesetround_vfp (int round)
+{
+  fpu_control_t cur, mode;
+
+  _FPU_GETCW (cur);
+  mode = cur & FE_TOWARDZERO;
+  if (__glibc_unlikely (mode != round))
+    _FPU_SETCW ((cur ^ mode) | round);
+}
+
+/* Save FPSCR in *ENVP, then install ROUND with exceptions held.  */
+static __always_inline void
+libc_feholdexcept_setround_vfp (fenv_t *envp, int round)
+{
+  fpu_control_t saved, updated;
+
+  _FPU_GETCW (saved);
+  envp->__cw = saved;
+
+  /* Drop the exception bits and the old rounding mode, then add ROUND.  */
+  updated = (saved & ~(_FPU_MASK_EXCEPT | FE_TOWARDZERO)) | round;
+  _FPU_SETCW (updated);
+}
+
+/* Save FPSCR in *ENVP and switch to ROUND, writing only on a change.  */
+static __always_inline void
+libc_feholdsetround_vfp (fenv_t *envp, int round)
+{
+  fpu_control_t cur, mode;
+
+  _FPU_GETCW (cur);
+  envp->__cw = cur;
+  mode = cur & FE_TOWARDZERO;
+  if (__glibc_unlikely (mode != round))
+    _FPU_SETCW ((cur ^ mode) | round);
+}
+
+/* Restore the rounding mode saved in *ENVP, leaving other bits alone.  */
+static __always_inline void
+libc_feresetround_vfp (fenv_t *envp)
+{
+  fpu_control_t cur, delta;
+
+  _FPU_GETCW (cur);
+  /* Rounding-mode bits that differ between the saved and live FPSCR.  */
+  delta = (cur ^ envp->__cw) & FE_TOWARDZERO;
+
+  /* Flipping exactly those bits restores the saved mode.  */
+  if (__glibc_unlikely (delta != 0))
+    _FPU_SETCW (cur ^ delta);
+}
+
+/* Return the subset of EX (within FE_ALL_EXCEPT) currently set in FPSCR.  */
+static __always_inline int
+libc_fetestexcept_vfp (int ex)
+{
+  fpu_control_t cur;
+  _FPU_GETCW (cur);
+  return cur & (ex & FE_ALL_EXCEPT);
+}
+
+/* Install the FPSCR saved in *ENVP unless the live value already matches.  */
+static __always_inline void
+libc_fesetenv_vfp (fenv_t *envp)
+{
+  fpu_control_t cur, wanted, changed;
+
+  _FPU_GETCW (cur);
+  wanted = envp->__cw;
+  changed = (cur ^ wanted) & ~_FPU_MASK_NZCV;  /* NZCV is ignored.  */
+  if (__glibc_unlikely (changed != 0))
+    _FPU_SETCW (wanted);
+}
+
+/* Merge the live exception flags into the FPSCR saved in *ENVP, install
+   the result, and return the raised exceptions selected by EX.  */
+static __always_inline int
+libc_feupdateenv_test_vfp (fenv_t *envp, int ex)
+{
+  fpu_control_t cur, merged;
+  int raised;
+
+  _FPU_GETCW (cur);
+  /* Carry the currently raised flags over into the saved state.  */
+  raised = cur & FE_ALL_EXCEPT;
+  merged = envp->__cw | raised;
+
+  /* Skip the write when only the NZCV flags would change.  */
+  if (__glibc_unlikely (((cur ^ merged) & ~_FPU_MASK_NZCV) != 0))
+    _FPU_SETCW (merged);
+
+  /* Re-raise flags whose enable bit is set in the merged state.  */
+  if (__glibc_unlikely (raised & (merged >> FE_EXCEPT_SHIFT)))
+    feraiseexcept (raised);
+  return raised & ex;
+}
+
+static __always_inline void
+libc_feupdateenv_vfp (fenv_t *envp)
+{
+  (void) libc_feupdateenv_test_vfp (envp, 0);  /* Result is not needed.  */
+}
+/* Each hook's unsuffixed, f and l names map to one _vfp implementation.  */
+#define libc_feholdexcept  libc_feholdexcept_vfp
+#define libc_feholdexceptf libc_feholdexcept_vfp
+#define libc_feholdexceptl libc_feholdexcept_vfp
+
+#define libc_fesetround  libc_fesetround_vfp
+#define libc_fesetroundf libc_fesetround_vfp
+#define libc_fesetroundl libc_fesetround_vfp
+
+#define libc_feresetround  libc_feresetround_vfp
+#define libc_feresetroundf libc_feresetround_vfp
+#define libc_feresetroundl libc_feresetround_vfp
+/* The _noex reset is implemented by libc_fesetenv_vfp.  */
+#define libc_feresetround_noex  libc_fesetenv_vfp
+#define libc_feresetround_noexf libc_fesetenv_vfp
+#define libc_feresetround_noexl libc_fesetenv_vfp
+
+#define libc_feholdexcept_setround  libc_feholdexcept_setround_vfp
+#define libc_feholdexcept_setroundf libc_feholdexcept_setround_vfp
+#define libc_feholdexcept_setroundl libc_feholdexcept_setround_vfp
+
+#define libc_feholdsetround  libc_feholdsetround_vfp
+#define libc_feholdsetroundf libc_feholdsetround_vfp
+#define libc_feholdsetroundl libc_feholdsetround_vfp
+
+#define libc_fetestexcept  libc_fetestexcept_vfp
+#define libc_fetestexceptf libc_fetestexcept_vfp
+#define libc_fetestexceptl libc_fetestexcept_vfp
+
+#define libc_fesetenv  libc_fesetenv_vfp
+#define libc_fesetenvf libc_fesetenv_vfp
+#define libc_fesetenvl libc_fesetenv_vfp
+
+#define libc_feupdateenv  libc_feupdateenv_vfp
+#define libc_feupdateenvf libc_feupdateenv_vfp
+#define libc_feupdateenvl libc_feupdateenv_vfp
+
+#define libc_feupdateenv_test  libc_feupdateenv_test_vfp
+#define libc_feupdateenv_testf libc_feupdateenv_test_vfp
+#define libc_feupdateenv_testl libc_feupdateenv_test_vfp
+
+/* We have support for rounding mode context.  */
+#define HAVE_RM_CTX 1
+
+/* Save FPSCR in CTX and switch to rounding mode R, writing FPSCR only
+   when the mode changes.  CTX->updated_status records whether it was
+   written.  */
+static __always_inline void
+libc_feholdsetround_vfp_ctx (struct rm_ctx *ctx, int r)
+{
+  fpu_control_t cur, delta;
+
+  _FPU_GETCW (cur);
+  ctx->env.__cw = cur;
+
+  /* Rounding-mode bits in which R differs from the live FPSCR.  */
+  delta = (cur ^ r) & FE_TOWARDZERO;
+
+  /* Record whether a write is needed, then flip just those bits.  */
+  ctx->updated_status = delta != 0;
+  if (__glibc_unlikely (delta != 0))
+    _FPU_SETCW (cur ^ delta);
+}
+
+/* Restore the rounding mode saved in CTX if CTX->updated_status is set.  */
+static __always_inline void
+libc_feresetround_vfp_ctx (struct rm_ctx *ctx)
+{
+  fpu_control_t cur, saved_mode;
+
+  if (!__glibc_unlikely (ctx->updated_status))
+    return;
+
+  _FPU_GETCW (cur);
+  saved_mode = ctx->env.__cw & FE_TOWARDZERO;
+  _FPU_SETCW ((cur & ~FE_TOWARDZERO) | saved_mode);
+}
+
+/* Install the FPSCR saved in CTX unless the live value already matches.  */
+static __always_inline void
+libc_fesetenv_vfp_ctx (struct rm_ctx *ctx)
+{
+  fpu_control_t cur, wanted, changed;
+
+  _FPU_GETCW (cur);
+  wanted = ctx->env.__cw;
+  changed = (cur ^ wanted) & ~_FPU_MASK_NZCV;  /* NZCV is ignored.  */
+  if (__glibc_unlikely (changed != 0))
+    _FPU_SETCW (wanted);
+}
+/* The rm_ctx hooks: unsuffixed, f and l names share one implementation.  */
+#define libc_feholdsetround_ctx		libc_feholdsetround_vfp_ctx
+#define libc_feresetround_ctx		libc_feresetround_vfp_ctx
+#define libc_feresetround_noex_ctx	libc_fesetenv_vfp_ctx
+
+#define libc_feholdsetroundf_ctx	libc_feholdsetround_vfp_ctx
+#define libc_feresetroundf_ctx		libc_feresetround_vfp_ctx
+#define libc_feresetround_noexf_ctx	libc_fesetenv_vfp_ctx
+
+#define libc_feholdsetroundl_ctx	libc_feholdsetround_vfp_ctx
+#define libc_feresetroundl_ctx		libc_feresetround_vfp_ctx
+#define libc_feresetround_noexl_ctx	libc_fesetenv_vfp_ctx
+
+#endif
+
+#endif /* FENV_PRIVATE_H */
diff --git a/sysdeps/arm/fpu_control.h b/sysdeps/arm/fpu_control.h
index 6d54b9b..0377697 100644
--- a/sysdeps/arm/fpu_control.h
+++ b/sysdeps/arm/fpu_control.h
@@ -37,11 +37,16 @@  extern fpu_control_t __fpu_control;
 #define _FPU_MASK_UM	0x00000800	/* underflow */
 #define _FPU_MASK_PM	0x00001000	/* inexact */
 
+#define _FPU_MASK_NZCV	0xF0000000	/* NZCV flags (bits 31-28) */
+
+#define _FPU_MASK_EXCEPT 0x00001f1f	/* exception enables (0x1f00) and flags (0x1f) */
+
 /* Some bits in the FPSCR are not yet defined.  They must be preserved when
    modifying the contents.  */
 #define _FPU_RESERVED	0x00086060
 #define _FPU_DEFAULT    0x00000000
-/* Default + exceptions enabled. */
+
+/* Default + exceptions enabled.  */
 #define _FPU_IEEE	(_FPU_DEFAULT | 0x00001f00)
 
 /* Type of the control word.  */
diff --git a/sysdeps/arm/math_private.h b/sysdeps/arm/math_private.h
new file mode 100644
index 0000000..541a7f8
--- /dev/null
+++ b/sysdeps/arm/math_private.h
@@ -0,0 +1,6 @@ 
+#ifndef _MATH_PRIVATE_H
+/* Pull in the ARM fenv hooks, then chain to the next math_private.h.  */
+#include "fenv_private.h"
+#include_next <math_private.h>
+
+#endif /* _MATH_PRIVATE_H */