[AArch64] Inline __ieee754_sqrt(f)

Message ID 000101d07776$0c9c4ac0$25d4e040$@com
State Committed
Headers

Commit Message

Wilco Dijkstra April 15, 2015, 12:16 p.m. UTC
  Inline __ieee754_sqrt and __ieee754_sqrtf. Also add external definitions.

OK for commit?

2015-04-15  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/fpu/math_private.h (__ieee754_sqrt):
	New function.  (__ieee754_sqrtf): New function.
	* sysdeps/aarch64/fpu/e_sqrt.c (__ieee754_sqrt):
	New function.
	* sysdeps/aarch64/fpu/e_sqrtf.c (__ieee754_sqrtf):
	New function.

---
 sysdeps/aarch64/fpu/e_sqrt.c       | 28 ++++++++++++++++++++++++++++
 sysdeps/aarch64/fpu/e_sqrtf.c      | 28 ++++++++++++++++++++++++++++
 sysdeps/aarch64/fpu/math_private.h | 16 ++++++++++++++++
 3 files changed, 72 insertions(+)
 create mode 100644 sysdeps/aarch64/fpu/e_sqrt.c
 create mode 100644 sysdeps/aarch64/fpu/e_sqrtf.c
  

Comments

Szabolcs Nagy May 15, 2015, 11:18 a.m. UTC | #1
On 15/04/15 13:16, Wilco Dijkstra wrote:
> Inline __ieee754_sqrt and __ieee754_sqrtf. Also add external definitions.
> 
> OK for commit?
> 

ping

(looks good to me and same should be done for fabs)

> 2015-04-15  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* sysdeps/aarch64/fpu/math_private.h (__ieee754_sqrt):
> 	New function.  (__ieee754_sqrtf): New function.
> 	* sysdeps/aarch64/fpu/e_sqrt.c (__ieee754_sqrt):
> 	New function.
> 	* sysdeps/aarch64/fpu/e_sqrtf.c (__ieee754_sqrtf):
> 	New function.
> 
> ---
>  sysdeps/aarch64/fpu/e_sqrt.c       | 28 ++++++++++++++++++++++++++++
>  sysdeps/aarch64/fpu/e_sqrtf.c      | 28 ++++++++++++++++++++++++++++
>  sysdeps/aarch64/fpu/math_private.h | 16 ++++++++++++++++
>  3 files changed, 72 insertions(+)
>  create mode 100644 sysdeps/aarch64/fpu/e_sqrt.c
>  create mode 100644 sysdeps/aarch64/fpu/e_sqrtf.c
> 
> diff --git a/sysdeps/aarch64/fpu/e_sqrt.c b/sysdeps/aarch64/fpu/e_sqrt.c
> new file mode 100644
> index 0000000..4f11ca2
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/e_sqrt.c
> @@ -0,0 +1,28 @@
> +/* Square root of floating point number.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <math_private.h>
> +
> +double
> +__ieee754_sqrt (double d)
> +{
> +  double res;
> +  asm ("fsqrt   %d0, %d1" : "=w" (res) : "w" (d));
> +  return res;
> +}
> +strong_alias (__ieee754_sqrt, __sqrt_finite)
> diff --git a/sysdeps/aarch64/fpu/e_sqrtf.c b/sysdeps/aarch64/fpu/e_sqrtf.c
> new file mode 100644
> index 0000000..a2e99e1
> --- /dev/null
> +++ b/sysdeps/aarch64/fpu/e_sqrtf.c
> @@ -0,0 +1,28 @@
> +/* Single-precision floating point square root.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <math_private.h>
> +
> +float
> +__ieee754_sqrtf (float s)
> +{
> +  float res;
> +  asm ("fsqrt   %s0, %s1" : "=w" (res) : "w" (s));
> +  return res;
> +}
> +strong_alias (__ieee754_sqrtf, __sqrtf_finite)
> diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h
> index 52a6ad9..b3c2509 100644
> --- a/sysdeps/aarch64/fpu/math_private.h
> +++ b/sysdeps/aarch64/fpu/math_private.h
> @@ -22,6 +22,22 @@
>  #include <fenv.h>
>  #include <fpu_control.h>
>  
> +extern __always_inline double
> +__ieee754_sqrt (double d)
> +{
> +  double res;
> +  asm __volatile__ ("fsqrt   %d0, %d1" : "=w" (res) : "w" (d));
> +  return res;
> +}
> +
> +extern __always_inline float
> +__ieee754_sqrtf (float s)
> +{
> +  float res;
> +  asm __volatile__ ("fsqrt   %s0, %s1" : "=w" (res) : "w" (s));
> +  return res;
> +}
> +
>  static __always_inline void
>  libc_feholdexcept_aarch64 (fenv_t *envp)
>  {
>
  
Wilco Dijkstra May 18, 2015, 10:04 a.m. UTC | #2
> Szabolcs Nagy wrote:
>
> (looks good to me and same should be done for fabs)

Yes - and that is yet another example of unnecessary target code...

Given GLIBC can now rely on a modern GCC for its build, can we avoid
this and actually use the builtins rather than having each target
override the generic implementation with an inline assembler version?

double
 __fabs (double x)
 {
-  u_int32_t high;
-  GET_HIGH_WORD (high, x);
-  SET_HIGH_WORD (x, high & 0x7fffffff);
-  return x;
+  return __builtin_fabs (x);
 }

GCC will always inline __builtin_fabs, with -O0, -Os and -fno-builtin.

Wilco
  
Joseph Myers May 18, 2015, 4:57 p.m. UTC | #3
On Mon, 18 May 2015, Wilco Dijkstra wrote:

> Given GLIBC can now rely on a modern GCC for its build, can we avoid
> this and actually use the builtins rather than having each target
> override the generic implementation with an inline assembler version?
> 
> double
>  __fabs (double x)
>  {
> -  u_int32_t high;
> -  GET_HIGH_WORD (high, x);
> -  SET_HIGH_WORD (x, high & 0x7fffffff);
> -  return x;
> +  return __builtin_fabs (x);
>  }
> 
> GCC will always inline __builtin_fabs, with -O0, -Os and -fno-builtin.

I believe doing this is safe (in that a proper inline expansion will be 
generated, by bit-fiddling if needed) *except for the ldbl-128ibm version 
of fabsl* (where fabs isn't simply a matter of clearing one bit, and GCC 
doesn't know how to do it correctly inline for soft-float / e500 - see GCC 
bug 29253 and the corresponding -fno-builtin-fabsl in 
sysdeps/powerpc/nofpu/Makefile).

However, there might be cases where GCC generates calls to an 
absolute-value instruction that fails to clear the sign bit of a NaN, or 
predates IEEE 754-2008 and raises "invalid" for a signaling NaN.  At least 
the former would be a clear GCC bug (on MIPS, for example, GCC makes sure 
to avoid abs instructions unless -mabs=2008 or -ffinite-math-only, for 
that reason), but it's something to watch out for.
  
Wilco Dijkstra May 19, 2015, 2 p.m. UTC | #4
> Joseph Myers wrote:
> On Mon, 18 May 2015, Wilco Dijkstra wrote:
> 
> > Given GLIBC can now rely on a modern GCC for its build, can we avoid
> > this and actually use the builtins rather than having each target
> > override the generic implementation with an inline assembler version?
> >
> > double
> >  __fabs (double x)
> >  {
> > -  u_int32_t high;
> > -  GET_HIGH_WORD (high, x);
> > -  SET_HIGH_WORD (x, high & 0x7fffffff);
> > -  return x;
> > +  return __builtin_fabs (x);
> >  }
> >
> > GCC will always inline __builtin_fabs, with -O0, -Os and -fno-builtin.
> 
> I believe doing this is safe (in that a proper inline expansion will be
> generated, by bit-fiddling if needed) *except for the ldbl-128ibm version
> of fabsl* (where fabs isn't simply a matter of clearing one bit, and GCC
> doesn't know how to do it correctly inline for soft-float / e500 - see GCC
> bug 29253 and the corresponding -fno-builtin-fabsl in
> sysdeps/powerpc/nofpu/Makefile).

OK, so I'll only do float and double. It seems 128-bit is usually emulated
so the current implementation is already optimal.
 
> However, there might be cases where GCC generates calls to an
> absolute-value instruction that fails to clear the sign bit of a NaN, or
> predates IEEE 754-2008 and raises "invalid" for a signaling NaN.  At least
> the former would be a clear GCC bug (on MIPS, for example, GCC makes sure
> to avoid abs instructions unless -mabs=2008 or -ffinite-math-only, for
> that reason), but it's something to watch out for.

If GCC implements fabs incorrectly then any code that uses fabs (including
libm which calls fabs in many places) would fail today. So the above patch
cannot introduce any new failures. I'll send out my patch at later date.

Wilco
  
Joseph Myers May 19, 2015, 3:03 p.m. UTC | #5
On Tue, 19 May 2015, Wilco Dijkstra wrote:

> > I believe doing this is safe (in that a proper inline expansion will be
> > generated, by bit-fiddling if needed) *except for the ldbl-128ibm version
> > of fabsl* (where fabs isn't simply a matter of clearing one bit, and GCC
> > doesn't know how to do it correctly inline for soft-float / e500 - see GCC
> > bug 29253 and the corresponding -fno-builtin-fabsl in
> > sysdeps/powerpc/nofpu/Makefile).
> 
> OK, so I'll only do float and double. It seems 128-bit is usually emulated
> so the current implementation is already optimal.

I think S/390 uses hardware 128-bit.

> > However, there might be cases where GCC generates calls to an
> > absolute-value instruction that fails to clear the sign bit of a NaN, or
> > predates IEEE 754-2008 and raises "invalid" for a signaling NaN.  At least
> > the former would be a clear GCC bug (on MIPS, for example, GCC makes sure
> > to avoid abs instructions unless -mabs=2008 or -ffinite-math-only, for
> > that reason), but it's something to watch out for.
> 
> If GCC implements fabs incorrectly then any code that uses fabs (including
> libm which calls fabs in many places) would fail today. So the above patch
> cannot introduce any new failures. I'll send out my patch at later date.

I don't think any libm code depends on the handling of signs of NaNs from 
GCC __builtin_fabs, or on what GCC __builtin_fabs does with sNaNs (it does 
depend on handling of signed zeroes, hence the -fno-builtin-fabsl for 
powerpc-nofpu).  The libm testsuite does verify the signs of NaNs from 
libm fabs (so if GCC gets it wrong then that would be visible as libm test 
failures for fabs), but not sNaN handling (Thomas proposed sNaN tests in 
<https://sourceware.org/ml/libc-ports/2013-04/msg00008.html>, but those 
would need a major rework to go in now).
  
Marcus Shawcroft June 23, 2015, 11:08 a.m. UTC | #6
On 15 April 2015 at 13:16, Wilco Dijkstra <wdijkstr@arm.com> wrote:
> Inline __ieee754_sqrt and __ieee754_sqrtf. Also add external definitions.
>
> OK for commit?
>
> 2015-04-15  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * sysdeps/aarch64/fpu/math_private.h (__ieee754_sqrt):
>         New function.  (__ieee754_sqrtf): New function.
>         * sysdeps/aarch64/fpu/e_sqrt.c (__ieee754_sqrt):
>         New function.
>         * sysdeps/aarch64/fpu/e_sqrtf.c (__ieee754_sqrtf):
>         New function.

OK /Marcus
  

Patch

diff --git a/sysdeps/aarch64/fpu/e_sqrt.c b/sysdeps/aarch64/fpu/e_sqrt.c
new file mode 100644
index 0000000..4f11ca2
--- /dev/null
+++ b/sysdeps/aarch64/fpu/e_sqrt.c
@@ -0,0 +1,28 @@ 
+/* Square root of floating point number.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math_private.h>
+
+double
+__ieee754_sqrt (double d)
+{
+  double res;
+  asm ("fsqrt   %d0, %d1" : "=w" (res) : "w" (d));
+  return res;
+}
+strong_alias (__ieee754_sqrt, __sqrt_finite)
diff --git a/sysdeps/aarch64/fpu/e_sqrtf.c b/sysdeps/aarch64/fpu/e_sqrtf.c
new file mode 100644
index 0000000..a2e99e1
--- /dev/null
+++ b/sysdeps/aarch64/fpu/e_sqrtf.c
@@ -0,0 +1,28 @@ 
+/* Single-precision floating point square root.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math_private.h>
+
+float
+__ieee754_sqrtf (float s)
+{
+  float res;
+  asm ("fsqrt   %s0, %s1" : "=w" (res) : "w" (s));
+  return res;
+}
+strong_alias (__ieee754_sqrtf, __sqrtf_finite)
diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h
index 52a6ad9..b3c2509 100644
--- a/sysdeps/aarch64/fpu/math_private.h
+++ b/sysdeps/aarch64/fpu/math_private.h
@@ -22,6 +22,22 @@ 
 #include <fenv.h>
 #include <fpu_control.h>
 
+extern __always_inline double
+__ieee754_sqrt (double d)
+{
+  double res;
+  asm __volatile__ ("fsqrt   %d0, %d1" : "=w" (res) : "w" (d));
+  return res;
+}
+
+extern __always_inline float
+__ieee754_sqrtf (float s)
+{
+  float res;
+  asm __volatile__ ("fsqrt   %s0, %s1" : "=w" (res) : "w" (s));
+  return res;
+}
+
 static __always_inline void
 libc_feholdexcept_aarch64 (fenv_t *envp)
 {