Add generic HAVE_RM_CTX implementation

Message ID 000101cf58af$83900a90$8ab01fb0$@com
State Superseded
Headers

Commit Message

Wilco Dijkstra April 15, 2014, 1:35 p.m. UTC
  Hi,

This patch adds a generic implementation of HAVE_RM_CTX using standard fenv calls. As a result math
functions using SET_RESTORE_ROUND* macros do not suffer from a large slowdown on targets which do
not implement optimized libc_fe*_ctx inline functions. Most of the libc_fe* inline functions are now
unused and could be removed in the future (there are a few math functions left which use a mixture
of standard fenv calls and libc_fe* inline functions - they could be updated to use
SET_RESTORE_ROUND or improved to avoid expensive fenv manipulations across just a few FP
instructions).

libc_feholdsetround*_noex_ctx is added to enable better optimization of SET_RESTORE_ROUND_NOEX*
implementations.

Performance measurements on ARM and x86 of sin() show significant gains over the current default,
fairly close to a highly optimized fenv_private:

                       ARM   x86
no fenv_private      : 100%  100%
generic HAVE_RM_CTX  : 250%  350%
fenv_private (CTX)   : 250%  450%

Wilco

ChangeLog:
2014-04-15  Wilco  <wdijkstr@arm.com>

	* sysdeps/generic/math_private.h: Add generic HAVE_RM_CTX
	implementation. New function (libc_feholdsetround_noex_ctx).

---
 sysdeps/generic/math_private.h |  116 ++++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 23 deletions(-)
  

Comments

Siddhesh Poyarekar April 30, 2014, 11:39 a.m. UTC | #1
On Tue, Apr 15, 2014 at 02:35:06PM +0100, Wilco wrote:
> Hi,
> 
> This patch adds a generic implementation of HAVE_RM_CTX using standard fenv calls. As a result math
> functions using SET_RESTORE_ROUND* macros do not suffer from a large slowdown on targets which do
> not implement optimized libc_fe*_ctx inline functions. Most of the libc_fe* inline functions are now
> unused and could be removed in the future (there are a few math functions left which use a mixture
> of standard fenv calls and libc_fe* inline functions - they could be updated to use
> SET_RESTORE_ROUND or improved to avoid expensive fenv manipulations across just a few FP
> instructions).
> 
> libc_feholdsetround*_noex_ctx is added to enable better optimization of SET_RESTORE_ROUND_NOEX*
> implementations.
> 
> Performance measurements on ARM and x86 of sin() show significant gains over the current default,
> fairly close to a highly optimized fenv_private:
> 
>                        ARM   x86
> no fenv_private      : 100%  100%
> generic HAVE_RM_CTX  : 250%  350%
> fenv_private (CTX)   : 250%  450%

Nice.  This looks useful, but I don't know if you should wait for
feedback from arch maintainers that would actually be affected by this
change.

> Wilco
> 
> ChangeLog:
> 2014-04-15  Wilco  <wdijkstr@arm.com>
> 
> 	* sysdeps/generic/math_private.h: Add generic HAVE_RM_CTX
> 	implementation. New function (libc_feholdsetround_noex_ctx).
> 
> ---
>  sysdeps/generic/math_private.h |  116 ++++++++++++++++++++++++++++++++--------
>  1 file changed, 93 insertions(+), 23 deletions(-)
> 
> diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
> index 9b881a3..fade483 100644
> --- a/sysdeps/generic/math_private.h
> +++ b/sysdeps/generic/math_private.h
> @@ -20,6 +20,7 @@
>  #include <stdint.h>
>  #include <sys/types.h>
>  #include <fenv.h>
> +#include <get-rounding-mode.h>
>  
>  /* The original fdlibm code used statements like:
>  	n0 = ((*(int*)&one)>>29)^1;		* index of high word *
> @@ -557,6 +558,16 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
>     block is different from the current state.  This saves a lot of time when
>     the floating point unit is much slower than the fixed point units.  */
>  
> +# ifndef libc_feholdsetround_noex_ctx
> +#   define libc_feholdsetround_noex_ctx  libc_feholdsetround_ctx
> +# endif
> +# ifndef libc_feholdsetround_noexf_ctx
> +#   define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx
> +# endif
> +# ifndef libc_feholdsetround_noexl_ctx
> +#   define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx
> +# endif
> +
>  # ifndef libc_feresetround_noex_ctx
>  #   define libc_feresetround_noex_ctx  libc_fesetenv_ctx
>  # endif
> @@ -567,24 +578,80 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
>  #   define libc_feresetround_noexl_ctx libc_fesetenvl_ctx
>  # endif
>  
> -# ifndef libc_feholdsetround_53bit_ctx
> -#   define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
> -# endif
> +#else
>  
> -# ifndef libc_feresetround_53bit_ctx
> -#   define libc_feresetround_53bit_ctx libc_feresetround_ctx
> -# endif
> +/* Default implementation using standard fenv functions.
> +   Avoid unnecessary rounding mode changes by first checking the
> +   current rounding mode.  Note the use of __glibc_unlikely is
> +   important for performance.  */
>  
> -# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
> -  struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx)));	      \
> -  ROUNDFUNC ## _ctx (&ctx, (RM))
> -#else
> -# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \
> -  fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC)));	\
> -  ROUNDFUNC (&__libc_save_rm, (RM))
> +static __always_inline void
> +libc_feholdsetround_ctx (struct rm_ctx *ctx, int round)
> +{
> +  ctx->updated_status = false;
> +
> +  /* Update rounding mode only if different.  */
> +  if (__glibc_unlikely (round != get_rounding_mode ()))
> +    {
> +      ctx->updated_status = true;
> +      fegetenv (&ctx->env);
> +      fesetround (round);

Shouldn't this be __fegetenv and __fesetround?  We shouldn't need to
go through the extra dereference since these functions are
internal-only.  Likewise for all uses below.

> +    }
> +}
> +
> +static __always_inline void
> +libc_feresetround_ctx (struct rm_ctx *ctx)
> +{
> +  /* Restore the rounding mode if updated.  */
> +  if (__glibc_unlikely (ctx->updated_status))
> +    feupdateenv (&ctx->env);
> +}
> +
> +static __always_inline void
> +libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round)
> +{
> +  /* Save exception flags and rounding mode.  */
> +  fegetenv (&ctx->env);
> +
> +  /* Update rounding mode only if different.  */
> +  if (__glibc_unlikely (round != get_rounding_mode ()))
> +    fesetround (round);
> +}
> +
> +static __always_inline void
> +libc_feresetround_noex_ctx (struct rm_ctx *ctx)
> +{
> +  /* Restore exception flags and rounding mode.  */
> +  fesetenv (&ctx->env);
> +}
> +
> +# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx
> +# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx
> +# define libc_feresetroundf_ctx   libc_feresetround_ctx
> +# define libc_feresetroundl_ctx   libc_feresetround_ctx
> +
> +# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx
> +# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx
> +# define libc_feresetround_noexf_ctx   libc_feresetround_noex_ctx
> +# define libc_feresetround_noexl_ctx   libc_feresetround_noex_ctx
> +
> +#endif
> +
> +#ifndef libc_feholdsetround_53bit_ctx
> +#  define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
>  #endif
> +#ifndef libc_feresetround_53bit_ctx
> +#  define libc_feresetround_53bit_ctx libc_feresetround_ctx
> +#endif
> +
> +#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
> +  struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \
> +  ROUNDFUNC ## _ctx (&ctx, (RM))
>  
> -/* Save and restore the rounding mode within a lexical block.  */
> +/* Set the rounding mode within a lexical block.  Restore the rounding mode to
> +   the value at the start of the block.  The exception mode must be preserved.
> +   Exceptions raised within the block must be set in the exception flags.
> +   Non-stop mode may be enabled inside the block.  */
>  
>  #define SET_RESTORE_ROUND(RM) \
>    SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround)
> @@ -593,15 +660,18 @@ default_libc_feupdateenv_test (fenv_t *e, int ex)
>  #define SET_RESTORE_ROUNDL(RM) \
>    SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl)
>  
> -/* Save and restore the rounding mode within a lexical block, and also
> -   the set of exceptions raised within the block may be discarded.  */
> -
> -#define SET_RESTORE_ROUND_NOEX(RM) \
> -  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex)
> -#define SET_RESTORE_ROUND_NOEXF(RM) \
> -  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf)
> -#define SET_RESTORE_ROUND_NOEXL(RM) \
> -  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl)
> +/* Set the rounding mode within a lexical block.  Restore the rounding mode to
> +   the value at the start of the block.  The exception mode must be preserved.
> +   Exceptions raised within the block must be discarded, and exception flags
> +   are restored to the value at the start of the block.
> +   Non-stop mode may be enabled inside the block.  */
> +
> +#define SET_RESTORE_ROUND_NOEX(RM) SET_RESTORE_ROUND_GENERIC (RM, \
> +  libc_feholdsetround_noex, libc_feresetround_noex)
> +#define SET_RESTORE_ROUND_NOEXF(RM) SET_RESTORE_ROUND_GENERIC (RM, \
> +  libc_feholdsetround_noexf, libc_feresetround_noexf)
> +#define SET_RESTORE_ROUND_NOEXL(RM) SET_RESTORE_ROUND_GENERIC (RM, \
> +  libc_feholdsetround_noexl, libc_feresetround_noexl)

Minor bikeshedding: the following formatting would look nicer:

#define SET_RESTORE_ROUND_NOEX(RM) \
  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noex, \
			     libc_feresetround_noex)

>  /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits.  */
>  #define SET_RESTORE_ROUND_53BIT(RM) \
> -- 
> 1.7.9.5
> 
> 
> 
> 
>
  
Wilco Dijkstra April 30, 2014, 1:39 p.m. UTC | #2
> From: Siddhesh Poyarekar [mailto:siddhesh@redhat.com]
> On Tue, Apr 15, 2014 at 02:35:06PM +0100, Wilco wrote:
> > Hi,
> >
> > This patch adds a generic implementation of HAVE_RM_CTX using standard fenv calls. As a
> result math
> > functions using SET_RESTORE_ROUND* macros do not suffer from a large slowdown on targets
> which do
> > not implement optimized libc_fe*_ctx inline functions. Most of the libc_fe* inline functions
> are now
> > unused and could be removed in the future (there are a few math functions left which use a
> mixture
> > of standard fenv calls and libc_fe* inline functions - they could be updated to use
> > SET_RESTORE_ROUND or improved to avoid expensive fenv manipulations across just a few FP
> > instructions).
> >
> > libc_feholdsetround*_noex_ctx is added to enable better optimization of
> SET_RESTORE_ROUND_NOEX*
> > implementations.
> >
> > Performance measurements on ARM and x86 of sin() show significant gains over the current
> default,
> > fairly close to a highly optimized fenv_private:
> >
> >                        ARM   x86
> > no fenv_private      : 100%  100%
> > generic HAVE_RM_CTX  : 250%  350%
> > fenv_private (CTX)   : 250%  450%
> 
> Nice.  This looks useful, but I don't know if you should wait for
> feedback from arch maintainers that would actually be affected by this
> change.

Would anyone mind a huge speedup? Basically I see this as a first step towards cleaning
up math_private.h and the way math functions use fenv. Ideally the default implementation 
should be efficient across all targets, and it looks like this is easy to achieve. 
If the rounding mode changes had been done like this when originally added, targets wouldn't
have experienced major slowdowns of common math functions.

Wilco
  
Siddhesh Poyarekar May 12, 2014, 6:02 a.m. UTC | #3
On Wed, Apr 30, 2014 at 02:39:36PM +0100, Wilco wrote:
> Would anyone mind a huge speedup? 

They wouldn't, but I haven't done an exhaustive code check across
architectures (or testing for that matter) to confidently assert that
the change would definitely be welcome by all.  That is why I asked
for another opinion from another maintainer on this.

> Basically I see this as a first step towards cleaning up
> math_private.h and the way math functions use fenv. Ideally the
> default implementation should be efficient across all targets, and
> it looks like this is easy to achieve.

Agreed and if you can get another maintainer to agree, then I think
this patch is good to go with the changes I suggested earlier.

Siddhesh
  
Wilco Dijkstra May 12, 2014, 12:02 p.m. UTC | #4
> Siddhesh Poyarekar wrote:
> On Wed, Apr 30, 2014 at 02:39:36PM +0100, Wilco wrote:
> > Would anyone mind a huge speedup?
> 
> They wouldn't, but I haven't done an exhaustive code check across
> architectures (or testing for that matter) to confidently assert that
> the change would definitely be welcome by all.  That is why I asked
> for another opinion from another maintainer on this.

Marcus (AArch64) already reviewed it.

> > Basically I see this as a first step towards cleaning up
> > math_private.h and the way math functions use fenv. Ideally the
> > default implementation should be efficient across all targets, and
> > it looks like this is easy to achieve.
> 
> Agreed and if you can get another maintainer to agree, then I think
> this patch is good to go with the changes I suggested earlier.

As for your other comment:

> Shouldn't this be __fegetenv and __fesetround?  We shouldn't need to
> go through the extra dereference since these functions are
> internal-only.  Likewise for all uses below.

I agree we should avoid PLTs for internal calls. However I'm certain
most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver
is used inconsistently, so this is yet another area that needs a cleanup.

Solving that wasn't the purpose of my patch - in fact it is consistent
with the rest of math_private.h.

Btw can we agree on what the rules are? Eg.

* Every exported symbol should have a libm_hidden_ver.
* All internal calls must use the hidden symbol.

I'll improve the formatting of SET_RESTORE_ROUND_NOEX*.

Cheers,
Wilco
  
Patchwork Bot May 12, 2014, 1:26 p.m. UTC | #5
On 12 May 2014 17:32, Wilco <wdijkstr@arm.com> wrote:
> Marcus (AArch64) already reviewed it.

I don't see his review on this thread; is it on another thread?

> I agree we should avoid PLTs for internal calls. However I'm certain
> most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver
> is used inconsistently, so this is yet another area that needs a cleanup.

Fair enough.  We can look at it as a separate cleanup then.

> Solving that wasn't the purpose of my patch - in fact it is consistent
> with the rest of math_private.h.
>
> Btw can we agree on what the rules are? Eg.
>
> * Every exported symbol should have a libm_hidden_ver.
> * All internal calls must use the hidden symbol.

An exported symbol should have a hidden alias if it is called
internally (and internal calls should use that hidden symbol), to
avoid the extra PLT dereference in the internal call.  So it is not
necessary to define a hidden alias for *every* exported symbol.

Siddhesh
  
Joseph Myers May 12, 2014, 2:57 p.m. UTC | #6
On Mon, 12 May 2014, Wilco wrote:

> > Shouldn't this be __fegetenv and __fesetround?  We shouldn't need to
> > go through the extra dereference since these functions are
> > internal-only.  Likewise for all uses below.
> 
> I agree we should avoid PLTs for internal calls. However I'm certain
> most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver
> is used inconsistently, so this is yet another area that needs a cleanup.

As I said in <https://sourceware.org/ml/libc-alpha/2014-04/msg00471.html>, 
PLT avoidance (via hidden_def etc.) is a separate matter from 
namespace-cleanness (via __-prefixed names).

I think you can assume that for the symbols with libm_hidden_proto in 
include/fenv.h, all architectures use libm_hidden_def (otherwise they'd 
have had obvious build failures, at least if the symbol is in fact used 
internally).  *But*, if you want to use an __-prefixed name, you need to 
make sure all architectures do in fact define that name and then use a 
weak alias for the public name, as they may well not be consistent in that 
regard.  And it may still be useful to declare that __-prefixed name as 
hidden (i.e. use attribute_hidden on the prototype) or use hidden_proto / 
hidden_def with it; I'm not sure how important any compile-time 
optimizations this enables are, but it does allow for such optimizations 
at compile time rather than just at link time.

Namespace-cleanness is an issue here when the functions get used from C90 
libm functions, since C90 doesn't include the <fenv.h> functions (I expect 
it's also a pre-existing bug there and so any fix should include a bug 
report in Bugzilla, etc.) - C90 functions shouldn't use the fe* names (or 
a reserved name that brings in a non-reserved name as a *strong* alias).  
It's not relevant for calls from non-C90 functions (e.g. those for float 
and long double) except where in some other standard that lacks the 
<fenv.h> functions (e.g. Unix98 has some of the C99 functions for double).  
But in practice it would be fragile to rely on which math_private / 
fenv_private facilities get called from which libm functions, so I think 
systematically using the __-* names should be the eventual goal.
  
Wilco Dijkstra May 13, 2014, 1:35 p.m. UTC | #7
> Siddhesh Poyarekar wrote:
> On 12 May 2014 17:32, Wilco <wdijkstr@arm.com> wrote:
> > Marcus (AArch64) already reviewed it.
> 
> I don't see his review on this thread; is it on another thread?

This was an internal review before the patch goes public.

> > I agree we should avoid PLTs for internal calls. However I'm certain
> > most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver
> > is used inconsistently, so this is yet another area that needs a cleanup.
> 
> Fair enough.  We can look at it as a separate cleanup then.
> 
> > Solving that wasn't the purpose of my patch - in fact it is consistent
> > with the rest of math_private.h.
> >
> > Btw can we agree on what the rules are? Eg.
> >
> > * Every exported symbol should have a libm_hidden_ver.
> > * All internal calls must use the hidden symbol.
> 
> An exported symbol should have a hidden alias if it is called
> internally (and internal calls should use that hidden symbol), to
> avoid the extra PLT dereference in the internal call.  So it is not
> necessary to define a hidden alias for *every* exported symbol.

It may not be necessary but it would be the only way to avoid inconsistencies.

Note it turns out the fenv functions called by math_private.h already use direct 
calls. So libm_hidden_def appears to be sufficient to bypass PLTs, and the 
libm_hidden_ver is only required to avoid the namespace issue Joseph mentioned.

I've attatched the new version. Besides the formatting I blocked the annoying
undefined HAVE_RM_CTX warnings.

Cheers,
Wilco
  

Patch

diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
index 9b881a3..fade483 100644
--- a/sysdeps/generic/math_private.h
+++ b/sysdeps/generic/math_private.h
@@ -20,6 +20,7 @@ 
 #include <stdint.h>
 #include <sys/types.h>
 #include <fenv.h>
+#include <get-rounding-mode.h>
 
 /* The original fdlibm code used statements like:
 	n0 = ((*(int*)&one)>>29)^1;		* index of high word *
@@ -557,6 +558,16 @@  default_libc_feupdateenv_test (fenv_t *e, int ex)
    block is different from the current state.  This saves a lot of time when
    the floating point unit is much slower than the fixed point units.  */
 
+# ifndef libc_feholdsetround_noex_ctx
+#   define libc_feholdsetround_noex_ctx  libc_feholdsetround_ctx
+# endif
+# ifndef libc_feholdsetround_noexf_ctx
+#   define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx
+# endif
+# ifndef libc_feholdsetround_noexl_ctx
+#   define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx
+# endif
+
 # ifndef libc_feresetround_noex_ctx
 #   define libc_feresetround_noex_ctx  libc_fesetenv_ctx
 # endif
@@ -567,24 +578,80 @@  default_libc_feupdateenv_test (fenv_t *e, int ex)
 #   define libc_feresetround_noexl_ctx libc_fesetenvl_ctx
 # endif
 
-# ifndef libc_feholdsetround_53bit_ctx
-#   define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
-# endif
+#else
 
-# ifndef libc_feresetround_53bit_ctx
-#   define libc_feresetround_53bit_ctx libc_feresetround_ctx
-# endif
+/* Default implementation using standard fenv functions.
+   Avoid unnecessary rounding mode changes by first checking the
+   current rounding mode.  Note the use of __glibc_unlikely is
+   important for performance.  */
 
-# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
-  struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx)));	      \
-  ROUNDFUNC ## _ctx (&ctx, (RM))
-#else
-# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \
-  fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC)));	\
-  ROUNDFUNC (&__libc_save_rm, (RM))
+static __always_inline void
+libc_feholdsetround_ctx (struct rm_ctx *ctx, int round)
+{
+  ctx->updated_status = false;
+
+  /* Update rounding mode only if different.  */
+  if (__glibc_unlikely (round != get_rounding_mode ()))
+    {
+      ctx->updated_status = true;
+      fegetenv (&ctx->env);
+      fesetround (round);
+    }
+}
+
+static __always_inline void
+libc_feresetround_ctx (struct rm_ctx *ctx)
+{
+  /* Restore the rounding mode if updated.  */
+  if (__glibc_unlikely (ctx->updated_status))
+    feupdateenv (&ctx->env);
+}
+
+static __always_inline void
+libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round)
+{
+  /* Save exception flags and rounding mode.  */
+  fegetenv (&ctx->env);
+
+  /* Update rounding mode only if different.  */
+  if (__glibc_unlikely (round != get_rounding_mode ()))
+    fesetround (round);
+}
+
+static __always_inline void
+libc_feresetround_noex_ctx (struct rm_ctx *ctx)
+{
+  /* Restore exception flags and rounding mode.  */
+  fesetenv (&ctx->env);
+}
+
+# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx
+# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx
+# define libc_feresetroundf_ctx   libc_feresetround_ctx
+# define libc_feresetroundl_ctx   libc_feresetround_ctx
+
+# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx
+# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx
+# define libc_feresetround_noexf_ctx   libc_feresetround_noex_ctx
+# define libc_feresetround_noexl_ctx   libc_feresetround_noex_ctx
+
+#endif
+
+#ifndef libc_feholdsetround_53bit_ctx
+#  define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx
 #endif
+#ifndef libc_feresetround_53bit_ctx
+#  define libc_feresetround_53bit_ctx libc_feresetround_ctx
+#endif
+
+#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \
+  struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \
+  ROUNDFUNC ## _ctx (&ctx, (RM))
 
-/* Save and restore the rounding mode within a lexical block.  */
+/* Set the rounding mode within a lexical block.  Restore the rounding mode to
+   the value at the start of the block.  The exception mode must be preserved.
+   Exceptions raised within the block must be set in the exception flags.
+   Non-stop mode may be enabled inside the block.  */
 
 #define SET_RESTORE_ROUND(RM) \
   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround)
@@ -593,15 +660,18 @@  default_libc_feupdateenv_test (fenv_t *e, int ex)
 #define SET_RESTORE_ROUNDL(RM) \
   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl)
 
-/* Save and restore the rounding mode within a lexical block, and also
-   the set of exceptions raised within the block may be discarded.  */
-
-#define SET_RESTORE_ROUND_NOEX(RM) \
-  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex)
-#define SET_RESTORE_ROUND_NOEXF(RM) \
-  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf)
-#define SET_RESTORE_ROUND_NOEXL(RM) \
-  SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl)
+/* Set the rounding mode within a lexical block.  Restore the rounding mode to
+   the value at the start of the block.  The exception mode must be preserved.
+   Exceptions raised within the block must be discarded, and exception flags
+   are restored to the value at the start of the block.
+   Non-stop mode may be enabled inside the block.  */
+
+#define SET_RESTORE_ROUND_NOEX(RM) SET_RESTORE_ROUND_GENERIC (RM, \
+  libc_feholdsetround_noex, libc_feresetround_noex)
+#define SET_RESTORE_ROUND_NOEXF(RM) SET_RESTORE_ROUND_GENERIC (RM, \
+  libc_feholdsetround_noexf, libc_feresetround_noexf)
+#define SET_RESTORE_ROUND_NOEXL(RM) SET_RESTORE_ROUND_GENERIC (RM, \
+  libc_feholdsetround_noexl, libc_feresetround_noexl)
 
 /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits.  */
 #define SET_RESTORE_ROUND_53BIT(RM) \