Add math-inline benchmark

Message ID 001c01d0a912$42357710$c6a06530$@com
State Superseded

Commit Message

Wilco Dijkstra June 17, 2015, 3:28 p.m. UTC
  Hi,

Due to popular demand, here is a new benchmark that tests isinf, isnan, 
isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, 
one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN,
the other with 50% inf and 50% NaN.

Results show that using the GCC built-ins in math.h gives huge speedups
by avoiding an explicit call and PLT indirection just to execute a function
of 3-4 instructions. The GCC built-ins have similar performance to the
existing math_private inlines for __isnan, __finite and __isinf_ns.
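
As a rough illustration of the difference being measured (my own sketch,
not part of the patch): a classification done through the exported libc
function costs a call through the PLT, while the built-in expands to a few
inline instructions:

/* Illustration only: exported libc function vs. GCC built-in.  */
extern int __isnan (double);	/* Out-of-line in libc, reached via the PLT.  */

int
check_via_call (double x)
{
  return __isnan (x);		/* Calls __isnan@plt just for ~3-4 insns.  */
}

int
check_via_builtin (double x)
{
  return __builtin_isnan (x);	/* Expands inline; no call, no PLT.  */
}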

OK for commit?

ChangeLog:
2015-06-17  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add bench-math-inlines.c.
	* benchtests/bench-math-inlines.c: New benchmark.

---
 benchtests/Makefile             |  14 +--
 benchtests/bench-math-inlines.c | 203 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 211 insertions(+), 6 deletions(-)
 create mode 100644 benchtests/bench-math-inlines.c
  

Comments

Joseph Myers June 17, 2015, 5:14 p.m. UTC | #1
I'll leave it to Siddhesh to review this patch, but it seems wrong to add 
this benchmark to string-bench (if it can be handled identically to the 
string benchmarks, maybe you need to change the variable name to reflect 
what's actually common between them).
  
Siddhesh Poyarekar June 22, 2015, 3:39 a.m. UTC | #2
On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote:
> Hi,
> 
> Due to popular demand, here is a new benchmark that tests isinf, isnan, 
> isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, 
> one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN,
> the other with 50% inf, and 50% Nan.
> 
> Results shows that using the GCC built-ins in math.h will give huge speedups
> due to avoiding explict calls, PLT indirection to execute a function with
> 3-4 instructions. The GCC builtins have similar performance as the existing 
> math_private inlines for __isnan, __finite and __isinf_ns.
> 
> OK for commit?
> 
> ChangeLog:
> 2015-06-17  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* benchtests/Makefile: Add bench-math-inlines.c.
> 	* benchtests/bench-math-inlines.c: New benchmark.
> 
> ---
>  benchtests/Makefile             |  14 +--
>  benchtests/bench-math-inlines.c | 203 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 211 insertions(+), 6 deletions(-)
>  create mode 100644 benchtests/bench-math-inlines.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 8e615e5..3c20180 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -30,12 +30,13 @@ bench-pthread := pthread_once
>  bench := $(bench-math) $(bench-pthread)
>  
>  # String function benchmarks.
> -string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
> -		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
> -		strcat strchr strchrnul strcmp strcpy strcspn strlen \
> -		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> -		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> -		strcoll
> +string-bench := bcopy bzero math-inlines memccpy memchr memcmp memcpy memmem \
> +		memmove mempcpy memset rawmemchr stpcpy stpncpy strcasecmp \
> +		strcasestr strcat strchr strchrnul strcmp strcpy strcspn \
> +		strlen strncasecmp strncat strncmp strncpy strnlen strpbrk \
> +		strrchr strspn strstr strcpy_chk stpcpy_chk memrchr strsep \
> +		strtok strcoll
> +

Putting the math-inlines test case in string-bench is wrong.  Make a new
benchset-math or something similar.

>  string-bench-all := $(string-bench)
>  
>  # We have to generate locales
> @@ -58,6 +59,7 @@ CFLAGS-bench-ffsll.c += -fno-builtin
>  bench-malloc := malloc-thread
>  
>  $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
> +$(addprefix $(objpfx)bench-,math-inlines): $(libm)

This doesn't need an addprefix, but if you make a benchset-math then you
can use this format for it.

>  $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
>  $(objpfx)bench-malloc-thread: $(shared-thread-library)
>  
> diff --git a/benchtests/bench-math-inlines.c b/benchtests/bench-math-inlines.c
> new file mode 100644
> index 0000000..c21a3d3
> --- /dev/null
> +++ b/benchtests/bench-math-inlines.c
> @@ -0,0 +1,203 @@
> +/* Measure math inline functions.

We've agreed to print outputs in JSON.  Please see other newer
benchtests to see how you can do that.  The string benchtests need to
be ported to the JSON output format.
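
For what it's worth, a minimal sketch of what that could look like here,
assuming the json_* helpers from benchtests/json-lib.h that the newer
benchtests use and the impl_t type from bench-string.h; the attribute
names are only illustrative:

#include "json-lib.h"

/* Hypothetical replacement for TIMING_PRINT_MEAN: emit one JSON object
   per implementation instead of plain text.  */
static void
do_one_test_json (json_ctx_t *json_ctx, impl_t *impl,
		  double cur, double iters)
{
  json_attr_object_begin (json_ctx, impl->name);
  json_attr_double (json_ctx, "duration", cur);
  json_attr_double (json_ctx, "iterations", iters);
  json_attr_double (json_ctx, "mean", cur / iters);
  json_attr_object_end (json_ctx);
}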

> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define SIZE 1024
> +#define TEST_MAIN
> +#define TEST_NAME "math-inlines"
> +#include "bench-string.h"
> +
> +#include <stdlib.h>
> +#include <math.h>
> +#include <stdint.h>
> +
> +#define BOOLTEST(func)					  \
> +int							  \
> +func ## _t (volatile double *p, size_t n, size_t iters)   \
> +{							  \
> +  int i, j;						  \
> +  int res = 0;						  \
> +  for (j = 0; j < iters; j++)				  \
> +    for (i = 0; i < n; i++)				  \
> +      if (func (p[i] * 2.0)) res++;			  \
> +  return res;						  \
> +}
> +
> +#define VALUETEST(func)					  \
> +int							  \
> +func ## _t (volatile double *p, size_t n, size_t iters)	  \
> +{							  \
> +  int i, j;						  \
> +  int res = 0;						  \
> +  for (j = 0; j < iters; j++)				  \
> +    for (i = 0; i < n; i++)				  \
> +      res += func (p[i] * 2.0);				  \
> +  return res;						  \
> +}
> +
> +typedef union
> +{
> +  double value;
> +  uint64_t word;
> +} ieee_double_shape_type;
> +
> +#define EXTRACT_WORDS64(i,d)                              \
> +do {                                                      \
> +  ieee_double_shape_type gh_u;                            \
> +  gh_u.value = (d);                                       \
> +  (i) = gh_u.word;                                        \
> +} while (0)
> +
> +/* Explicit inlines similar to math_private.h versions.  */
> +
> +extern __always_inline int
> +__isnan_inl (double d)
> +{
> +  uint64_t di;
> +  EXTRACT_WORDS64 (di, d);
> +  return (di & 0x7fffffffffffffffull) > 0x7ff0000000000000ull;
> +}
> +
> +extern __always_inline int
> +__isinf_inl (double x)
> +{
> +  uint64_t ix;
> +  EXTRACT_WORDS64 (ix,x);
> +  if ((ix << 1) != 0xffe0000000000000ull)
> +    return 0;
> +  return (int)(ix >> 32);
> +}
> +
> +extern __always_inline int
> +__finite_inl (double d)
> +{
> +  uint64_t di;
> +  EXTRACT_WORDS64 (di, d);
> +  return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull;
> +}
> +
> +/* Explicit inline similar to existing math.h implementation.  */
> +
> +#define __isnormal_inl(X) (__fpclassify (X) == FP_NORMAL)
> +#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL)
> +
> +/* Test fpclassify with use of only 2 of the 5 results.  */
> +
> +extern __always_inline int
> +__fpclassify_test1 (double d)
> +{
> +  int cl = fpclassify (d);
> +  return cl == FP_NAN || cl == FP_INFINITE;
> +}
> +
> +extern __always_inline int
> +__fpclassify_test2 (double d)
> +{
> +  return isnan (d) || isinf (d);
> +}
> +
> +/* Create test functions for each possibility.  */
> +
> +BOOLTEST (__isnan)
> +BOOLTEST (isnan)
> +BOOLTEST (__isnan_inl)
> +
> +BOOLTEST (__isinf)
> +BOOLTEST (isinf)
> +BOOLTEST (__isinf_inl)
> +
> +BOOLTEST (__finite)
> +BOOLTEST (isfinite)
> +BOOLTEST (__finite_inl)
> +
> +BOOLTEST (isnormal)
> +BOOLTEST (__isnormal_inl)
> +BOOLTEST (__isnormal_inl2)
> +
> +VALUETEST (fpclassify)
> +VALUETEST (__fpclassify)
> +BOOLTEST (__fpclassify_test1)
> +BOOLTEST (__fpclassify_test2)
> +
> +IMPL (isnan_t, 0)
> +IMPL (__isnan_t, 1)
> +IMPL (__isnan_inl_t, 2)
> +IMPL (isinf_t, 3)
> +IMPL (__isinf_t, 4)
> +IMPL (__isinf_inl_t, 5)
> +IMPL (isfinite_t, 6)
> +IMPL (__finite_t, 7)
> +IMPL (__finite_inl_t, 8)
> +IMPL (isnormal_t, 9)
> +IMPL (__isnormal_inl_t, 10)
> +IMPL (__isnormal_inl2_t, 11)
> +IMPL (fpclassify_t, 12)
> +IMPL (__fpclassify_t, 13)
> +IMPL (__fpclassify_test1_t, 14)
> +IMPL (__fpclassify_test2_t, 15)
> +
> +typedef int (*proto_t) (volatile double *p, size_t n, size_t iters);
> +
> +static void
> +do_one_test (impl_t *impl, volatile double *arr, size_t len)
> +{
> +  size_t iters = INNER_LOOP_ITERS * 10;
> +  timing_t start, stop, cur;
> +
> +  TIMING_NOW (start);
> +  CALL (impl, arr, len, iters);
> +  TIMING_NOW (stop);
> +  TIMING_DIFF (cur, start, stop);
> +
> +  TIMING_PRINT_MEAN ((double) cur, (double) iters);
> +}
> +
> +static volatile double arr1[SIZE];
> +static volatile double arr2[SIZE];
> +
> +int
> +test_main (void)
> +{
> +  size_t i;
> +
> +  test_init ();
> +
> +  /* Create 2 test arrays, one with 10% zeroes, 10% negative values,
> +     79% positive values and 1% infinity/NaN.  The other contains
> +     50% inf, 50% NaN.  */
> +
> +  for (i = 0; i < SIZE; i++)
> +    {
> +      int x = rand () & 255;
> +      arr1[i] = (x < 25) ? 0.0 : ((x < 50) ? -1 : 100);
> +      if (x == 255) arr1[i] = __builtin_inf ();
> +      if (x == 254) arr1[i] = __builtin_nan ("0");
> +      arr2[i] = (x < 128) ? __builtin_inf () : __builtin_nan ("0");
> +    }
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    {
> +      printf ("%20s: ", impl->name);
> +      do_one_test (impl, arr1, SIZE);
> +      do_one_test (impl, arr2, SIZE);
> +      putchar ('\n');
> +    }
> +
> +  return ret;
> +}
> +
> +#include "../test-skeleton.c"
> -- 
> 1.9.1
> 
>
  
Ondrej Bilka June 22, 2015, 8:36 a.m. UTC | #3
On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote:
> Hi,
> 
> Due to popular demand, here is a new benchmark that tests isinf, isnan, 
> isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, 
> one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN,
> the other with 50% inf, and 50% Nan.
> 
> Results shows that using the GCC built-ins in math.h will give huge speedups
> due to avoiding explict calls, PLT indirection to execute a function with
> 3-4 instructions. The GCC builtins have similar performance as the existing 
> math_private inlines for __isnan, __finite and __isinf_ns.
> 
> OK for commit?
>
I ran these; on x64, using the builtins is a regression even with your benchmark.

The main problem here is what exactly you measure. I don't know how much of
your results is caused by measuring the latency of the load/multiply/move to
int register chain. With OoO execution that latency shouldn't be a problem.

The original results are as follows, when I also inlined isfinite:

__fpclassify_test2_t: 	3660.24	3733.22
__fpclassify_test1_t: 	3696.33	3691.3
      __fpclassify_t: 	14365.8	11116.5
        fpclassify_t: 	6045.69	3128.76
   __isnormal_inl2_t: 	5275.85	14562.6
    __isnormal_inl_t: 	14753.3	11143.5
          isnormal_t: 	4418.84	4411.59
      __finite_inl_t: 	3038.75	3038.4
          __finite_t: 	7712.42	7697.24
          isfinite_t: 	3108.91	3107.85
       __isinf_inl_t: 	2109.05	2817.19
           __isinf_t: 	8555.51	8559.36
             isinf_t: 	3472.62	3408.8
       __isnan_inl_t: 	2682.12	2691.39
           __isnan_t: 	7698.14	7735.29
             isnan_t: 	2592.58	2572.82


But with latency hiding by using the argument first, suddenly even isnan and
isnormal become a regression.

    for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0);    \
      if (func (p[i] * 2.0)) res += 5;}                   \


__fpclassify_test2_t:   92929.4 37256.8
__fpclassify_test1_t:   94020.1 35512.1
      __fpclassify_t:   17321.2 13325.1 
        fpclassify_t:   8021.29 4376.89
   __isnormal_inl2_t:   93896.9 38941.8
    __isnormal_inl_t:   98069.2 46140.4 
          isnormal_t:   94775.6 36941.8
      __finite_inl_t:   84059.9 38304
          __finite_t:   96052.4 45998.2 
          isfinite_t:   93371.5 36659.1
       __isinf_inl_t:   92532.9 36050.1
           __isinf_t:   95929.4 46585.2
             isinf_t:   93290.1 36445.6
       __isnan_inl_t:   82760.7 37452.2 
           __isnan_t:   98064.6 45338.8
             isnan_t:   93386.7 37786.4
  
Wilco Dijkstra July 6, 2015, 2:50 p.m. UTC | #4
> Ondřej Bílka wrote:
> On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote:
> > Hi,
> >
> > Due to popular demand, here is a new benchmark that tests isinf, isnan,
> > isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles,
> > one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN,
> > the other with 50% inf, and 50% Nan.
> >
> > Results shows that using the GCC built-ins in math.h will give huge speedups
> > due to avoiding explict calls, PLT indirection to execute a function with
> > 3-4 instructions. The GCC builtins have similar performance as the existing
> > math_private inlines for __isnan, __finite and __isinf_ns.
> >
> > OK for commit?
> >
> Ran these, on x64 using builtins is regression even with your benchmark.
> 
> Main problem here is what exactly you do measure. I don't know how much
> of your results were caused by measuring latency of load/multiply/move
> to int register chain. With OoO that latency shouldn't be problem.
> 
> Original results are following, when I also inlined isfinite:
> 
> __fpclassify_test2_t: 	3660.24	3733.22
> __fpclassify_test1_t: 	3696.33	3691.3
>       __fpclassify_t: 	14365.8	11116.5
>         fpclassify_t: 	6045.69	3128.76
>    __isnormal_inl2_t: 	5275.85	14562.6
>     __isnormal_inl_t: 	14753.3	11143.5
>           isnormal_t: 	4418.84	4411.59
>       __finite_inl_t: 	3038.75	3038.4
>           __finite_t: 	7712.42	7697.24
>           isfinite_t: 	3108.91	3107.85
>        __isinf_inl_t: 	2109.05	2817.19
>            __isinf_t: 	8555.51	8559.36
>              isinf_t: 	3472.62	3408.8
>        __isnan_inl_t: 	2682.12	2691.39
>            __isnan_t: 	7698.14	7735.29
>              isnan_t: 	2592.58	2572.82
> 
> 
> But with latency hiding by using argument first suddenly even isnan and
> isnormal become regression.
> 
>     for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0);    \
>       if (func (p[i] * 2.0)) res += 5;}                   \
> 
> 
> __fpclassify_test2_t:   92929.4 37256.8
> __fpclassify_test1_t:   94020.1 35512.1
>       __fpclassify_t:   17321.2 13325.1
>         fpclassify_t:   8021.29 4376.89
>    __isnormal_inl2_t:   93896.9 38941.8
>     __isnormal_inl_t:   98069.2 46140.4
>           isnormal_t:   94775.6 36941.8
>       __finite_inl_t:   84059.9 38304
>           __finite_t:   96052.4 45998.2
>           isfinite_t:   93371.5 36659.1
>        __isinf_inl_t:   92532.9 36050.1
>            __isinf_t:   95929.4 46585.2
>              isinf_t:   93290.1 36445.6
>        __isnan_inl_t:   82760.7 37452.2
>            __isnan_t:   98064.6 45338.8
>              isnan_t:   93386.7 37786.4

Can you try this with:

    for (i = 0; i < n; i++)                               \
      { double tmp = p[i] * 2.0;    \
      if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;}                   \

Basically GCC does the array read and multiply twice just like you told it
to (remember this is not using -ffast-math). Also avoid adding unnecessary
FP operations and conversions (which may interact badly with timing the
code we're trying to test). 

For me the fixed version still shows the expected answer: the built-ins are
either faster or as fast as the inlines. So I don't think there is any
regression here (remember also that previously there were no inlines at all
except for a few inside GLIBC, so the real speedup is much larger).

__fpclassify_test2_t:	1.07
__fpclassify_test1_t:	1.07
__fpclassify_t:	1.24
fpclassify_t:	1
__isnormal_inl2_t:	1.11
__isnormal_inl_t:	1.24
isnormal_t:	1.04
__finite_inl_t:	1.04
__finite_t:	1.19
isfinite_t:	1
__isinf_inl_t:	1.07
__isinf_t:	1.22
isinf_t:	1
__isnan_inl_t:	1.04
__isnan_t:	1.14
isnan_t:	1

Wilco
  
Ondrej Bilka July 9, 2015, 12:44 p.m. UTC | #5
On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> 
> 
> > Ondřej Bílka wrote:
> > But with latency hiding by using argument first suddenly even isnan and
> > isnormal become regression.
> > 
> >     for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0);    \
> >       if (func (p[i] * 2.0)) res += 5;}                   \
> > 
> > 
> > __fpclassify_test2_t:   92929.4 37256.8
> > __fpclassify_test1_t:   94020.1 35512.1
> >       __fpclassify_t:   17321.2 13325.1
> >         fpclassify_t:   8021.29 4376.89
> >    __isnormal_inl2_t:   93896.9 38941.8
> >     __isnormal_inl_t:   98069.2 46140.4
> >           isnormal_t:   94775.6 36941.8
> >       __finite_inl_t:   84059.9 38304
> >           __finite_t:   96052.4 45998.2
> >           isfinite_t:   93371.5 36659.1
> >        __isinf_inl_t:   92532.9 36050.1
> >            __isinf_t:   95929.4 46585.2
> >              isinf_t:   93290.1 36445.6
> >        __isnan_inl_t:   82760.7 37452.2
> >            __isnan_t:   98064.6 45338.8
> >              isnan_t:   93386.7 37786.4
> 
> Can you try this with:
> 
>     for (i = 0; i < n; i++)                               \
>       { double tmp = p[i] * 2.0;    \
>       if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;}                   \
>
That doesn't change the outcome:

__fpclassify_test2_t: 	99721	51051.6
__fpclassify_test1_t: 	85015.2	43607.4
      __fpclassify_t: 	13997.3	10475.1
        fpclassify_t: 	13502.5	10253.6
   __isnormal_inl2_t: 	76479.4	41531.7
    __isnormal_inl_t: 	76526.9	41560.8
          isnormal_t: 	76458.6	41547.7
      __finite_inl_t: 	71108.6	33271.3
          __finite_t: 	73031	37452.3
          isfinite_t: 	73024.9	37447
       __isinf_inl_t: 	68599.2	32792.9
           __isinf_t: 	74851	40108.8
             isinf_t: 	74871.9	40109.9
       __isnan_inl_t: 	71100.8	33659.6
           __isnan_t: 	72914	37592.4
             isnan_t: 	72909.4	37635.8
 
> Basically GCC does the array read and multiply twice just like you told it
> to (remember this is not using -ffast-math). Also avoid adding unnecessary
> FP operations and conversions (which may interact badly with timing the
> code we're trying to test). 
> 
And how do you know that most users don't use FP conversions in their
code just before isinf? These interactions make benchtests worthless, as
in practice a different variant would be faster than the one you measure.

> For me the fixed version still shows the expected answer: the built-ins are
> either faster or as fast as the inlines. So I don't think there is any
> regression here (remember also that previously there were no inlines at all
> except for a few inside GLIBC, so the real speedup is much larger).

That's ARM only. So it looks like we need platform-specific headers and testing.

These give a speedup, but as the internal inlines on x64 are better as they
are, it's natural to ask whether using those in general would give the same
speedup. That leads to fixing the GCC builtins.
  
Wilco Dijkstra July 10, 2015, 4:09 p.m. UTC | #6
> Ondřej Bílka wrote:
> On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> >
> >
> > > Ondřej Bílka wrote:
> > > But with latency hiding by using argument first suddenly even isnan and
> > > isnormal become regression.
> > >
> > >     for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0);    \
> > >       if (func (p[i] * 2.0)) res += 5;}                   \
> > >
> > >
> > > __fpclassify_test2_t:   92929.4 37256.8
> > > __fpclassify_test1_t:   94020.1 35512.1
> > >       __fpclassify_t:   17321.2 13325.1
> > >         fpclassify_t:   8021.29 4376.89
> > >    __isnormal_inl2_t:   93896.9 38941.8
> > >     __isnormal_inl_t:   98069.2 46140.4
> > >           isnormal_t:   94775.6 36941.8
> > >       __finite_inl_t:   84059.9 38304
> > >           __finite_t:   96052.4 45998.2
> > >           isfinite_t:   93371.5 36659.1
> > >        __isinf_inl_t:   92532.9 36050.1
> > >            __isinf_t:   95929.4 46585.2
> > >              isinf_t:   93290.1 36445.6
> > >        __isnan_inl_t:   82760.7 37452.2
> > >            __isnan_t:   98064.6 45338.8
> > >              isnan_t:   93386.7 37786.4
> >
> > Can you try this with:
> >
> >     for (i = 0; i < n; i++)                               \
> >       { double tmp = p[i] * 2.0;    \
> >       if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;}                   \
> >
> That doesn't change outcome:
> 
> __fpclassify_test2_t: 	99721	51051.6
> __fpclassify_test1_t: 	85015.2	43607.4
>       __fpclassify_t: 	13997.3	10475.1
>         fpclassify_t: 	13502.5	10253.6
>    __isnormal_inl2_t: 	76479.4	41531.7
>     __isnormal_inl_t: 	76526.9	41560.8
>           isnormal_t: 	76458.6	41547.7
>       __finite_inl_t: 	71108.6	33271.3
>           __finite_t: 	73031	37452.3
>           isfinite_t: 	73024.9	37447
>        __isinf_inl_t: 	68599.2	32792.9
>            __isinf_t: 	74851	40108.8
>              isinf_t: 	74871.9	40109.9
>        __isnan_inl_t: 	71100.8	33659.6
>            __isnan_t: 	72914	37592.4
>              isnan_t: 	72909.4	37635.8

That doesn't look correct - it looks like this didn't use the built-ins at all,
did you forget to apply that patch?

Anyway, I received a new machine, so GLIBC finally builds for x64. Since
there are large variations from run to run, I repeat the same tests 4 times
by copying the FOR_EACH_IMPL loop. The first 1 or 2 runs are bad; the last 2
converge to usable results. So I suspect frequency scaling is an issue here.

Without the sin(tmp) part I get:

   remainder_test2_t:   40786.9 192862
   remainder_test1_t:   43008.2 196311
__fpclassify_test2_t:   2856.56 3020.12
__fpclassify_test1_t:   3043.53 3135.89
      __fpclassify_t:   12500.6 10152.5
        fpclassify_t:   2972.54 3047.65
   __isnormal_inl2_t:   4619.55 14491.1
    __isnormal_inl_t:   12896.3 10306.7
          isnormal_t:   4254.42 3667.87
      __finite_inl_t:   3979.58 3991.6
          __finite_t:   7039.61 7039.37
          isfinite_t:   2992.65 2969.25
       __isinf_inl_t:   2852.1  3239.23
           __isinf_t:   8991.81 8813.44
             isinf_t:   3241.75 3241.54
       __isnan_inl_t:   4003.51 3977.73
           __isnan_t:   7054.54 7054.5
             isnan_t:   2819.66 2801.94

And with the sin() addition:

   remainder_test2_t:   105093  214635
   remainder_test1_t:   106635  218012
__fpclassify_test2_t:   64290.9 32116.6
__fpclassify_test1_t:   64365.1 32310.2
      __fpclassify_t:   72006.1 41607
        fpclassify_t:   64190.3 33450.1
   __isnormal_inl2_t:   65959.1 33672
    __isnormal_inl_t:   71875.7 41727.3
          isnormal_t:   65676.1 32826.1
      __finite_inl_t:   69600.6 35293.3
          __finite_t:   67653.8 38627.2
          isfinite_t:   64435.9 34904.9
       __isinf_inl_t:   68556.6 33176
           __isinf_t:   69066.4 39562.7
             isinf_t:   64755.5 34244.6
       __isnan_inl_t:   69577.3 34776.2
           __isnan_t:   67538.8 38321.3
             isnan_t:   63963   33276.6

The remainder test is basically math/w_remainder.c adapted to use __isinf_inl
and __isnan_inl (test1) or the isinf/isnan built-ins (test2).
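
For readers of the archive, a rough sketch of what such an adaptation might
look like (my own illustration, not the code actually used for the numbers
above); the test1 variant would use __isinf_inl/__isnan_inl from the
benchmark in place of the math.h macros:

#include <math.h>

/* Sketch of a w_remainder.c-style wrapper with the special-case checks
   done via the isinf/isnan macros (built-ins when math.h inlines them).  */
static double
remainder_wrapper (double x, double y)
{
  /* Domain errors: y == 0 with x not NaN, or x infinite with y not NaN.  */
  if (__builtin_expect ((y == 0.0 && !isnan (x))
			|| (isinf (x) && !isnan (y)), 0))
    return (x * y) / (x * y);	/* Raises invalid and returns NaN.  */
  return remainder (x, y);
}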

From this it seems that __isinf_inl is slightly better than the builtin, but
it does not show up as a regression when combined with sin or in the remainder
test.

So I don't see any potential regression here on x64 - in fact it looks like
inlining using the built-ins gives quite good speedups across the board. And
besides inlining these calls in applications that use GLIBC, it also inlines
a lot of call sites within GLIBC that weren't previously inlined.

> > Basically GCC does the array read and multiply twice just like you told it
> > to (remember this is not using -ffast-math). Also avoid adding unnecessary
> > FP operations and conversions (which may interact badly with timing the
> > code we're trying to test).
> >
> And how do you know that most users don't use fp conversions in their
> code just before isinf? These interactions make benchtests worthless as
> in practice a different variant would be faster than one that you
> measure.

You always get such interactions; it's unavoidable. That's why I added some
actual math code that uses isinf/isnan to see how it performs in real life.

> > For me the fixed version still shows the expected answer: the built-ins are
> > either faster or as fast as the inlines. So I don't think there is any
> > regression here (remember also that previously there were no inlines at all
> > except for a few inside GLIBC, so the real speedup is much larger).
> 
> Thats arm only. So it looks that we need platform-specific headers and testing.

Well I just confirmed the same gains apply to x64.

Wilco
  
Ondrej Bilka July 10, 2015, 6:13 p.m. UTC | #7
On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> > >
> > >
> > > > Ondřej Bílka wrote:
> > > > But with latency hiding by using argument first suddenly even isnan and
> > > > isnormal become regression.
> > > >
> 
> That doesn't look correct - it looks like this didn't use the built-ins at all,
> did you forget to apply that patch?
>
No, from what you wrote I expected that the patch already tested the
builtins, which it doesn't. I applied the patch and got different results;
with the patch added, the results are similar.
 
> Anyway I received a new machine so now GLIBC finally builds for x64. Since 
> there appear large variations from run to run I repeat the same tests 4 times 
> by copying the FOR_EACH_IMPL loop. The first 1 or 2 are bad, the last 2 
> converge to useable results. So I suspect frequency scaling is an issue here.
> 
> Without the sin(tmp) part I get:
> 
>    remainder_test2_t:   40786.9 192862
>    remainder_test1_t:   43008.2 196311
> __fpclassify_test2_t:   2856.56 3020.12
> __fpclassify_test1_t:   3043.53 3135.89
>       __fpclassify_t:   12500.6 10152.5
>         fpclassify_t:   2972.54 3047.65
>    __isnormal_inl2_t:   4619.55 14491.1
>     __isnormal_inl_t:   12896.3 10306.7
>           isnormal_t:   4254.42 3667.87
>       __finite_inl_t:   3979.58 3991.6
>           __finite_t:   7039.61 7039.37
>           isfinite_t:   2992.65 2969.25
>        __isinf_inl_t:   2852.1  3239.23
>            __isinf_t:   8991.81 8813.44
>              isinf_t:   3241.75 3241.54
>        __isnan_inl_t:   4003.51 3977.73
>            __isnan_t:   7054.54 7054.5
>              isnan_t:   2819.66 2801.94
> 
> And with the sin() addition:
> 
>    remainder_test2_t:   105093  214635
>    remainder_test1_t:   106635  218012
> __fpclassify_test2_t:   64290.9 32116.6
> __fpclassify_test1_t:   64365.1 32310.2
>       __fpclassify_t:   72006.1 41607
>         fpclassify_t:   64190.3 33450.1
>    __isnormal_inl2_t:   65959.1 33672
>     __isnormal_inl_t:   71875.7 41727.3
>           isnormal_t:   65676.1 32826.1
>       __finite_inl_t:   69600.6 35293.3
>           __finite_t:   67653.8 38627.2
>           isfinite_t:   64435.9 34904.9
>        __isinf_inl_t:   68556.6 33176
>            __isinf_t:   69066.4 39562.7
>              isinf_t:   64755.5 34244.6
>        __isnan_inl_t:   69577.3 34776.2
>            __isnan_t:   67538.8 38321.3
>              isnan_t:   63963   33276.6
> 
> The remainder test is basically math/w_remainder.c adapted to use __isinf_inl
> and __isnan_inl (test1) or the isinf/isnan built-ins (test2).
> 
Which still doesn't have to mean anything; only if you test an application
that frequently uses these will you get a result beyond doubt.

Here a simple modification produces different results. One of many
objections is that with a simple addition like this GCC will try to make
branchless code, converting it to res += 5 * (isnan(tmp)). So use a harder
branch instead; with the following two variants I get __builtin_isinf a lot
slower.

    { double tmp = p[i] * 2.0;    \
       res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \

    { double tmp = p[i] * 2.0;    \
       if (func (tmp)) res += 3 * sin (2 * tmp) ;} \


__fpclassify_test2_t: 	4645.32	34549.8
__fpclassify_test1_t: 	4858.35	35682
      __fpclassify_t: 	12959.6	9402.96
        fpclassify_t: 	6163.18	3573.92
   __isnormal_inl2_t: 	72463.8	14535.8
    __isnormal_inl_t: 	78154.4	10665.7
          isnormal_t: 	72172.7	3978.68
      __finite_inl_t: 	69525.5	2048.91
          __finite_t: 	75383.9	10260.5
          isfinite_t: 	71913.3	3449.38
       __isinf_inl_t: 	2279.96	19166.1
           __isinf_t: 	10462.3	32308.5
             isinf_t: 	3706.35	22804.2
       __isnan_inl_t: 	2376.56	20769.1
           __isnan_t: 	9558.7	23683.9
             isnan_t: 	2025.91	19824.9

and

__fpclassify_test2_t: 	69720.7	70690
__fpclassify_test1_t: 	69575.2	57032
      __fpclassify_t: 	12839.1	9460.82
        fpclassify_t: 	6136.18	3355.79
   __isnormal_inl2_t: 	133754	34673.9
    __isnormal_inl_t: 	137851	41468.9
          isnormal_t: 	133421	37423.4
      __finite_inl_t: 	131153	34229.9
          __finite_t: 	135990	39609.5
          isfinite_t: 	132679	34604.8
       __isinf_inl_t: 	66901.3	44636.5
           __isinf_t: 	74330.2	53375.7
             isinf_t: 	68432.6	45350.5
       __isnan_inl_t: 	67125.3	43663.6
           __isnan_t: 	71864.2	49792.7
             isnan_t: 	67107.9	42478.3

> >From this it seems that __isinf_inl is slightly better than the builtin, but
> it does not show up as a regression when combined with sin or in the remainder
> test.
> 
That doesn't hold generally; in the remainder test it could just be caused by
isnan being slower than isinf.

> So I don't see any potential regression here on x64 - in fact it looks like
> inlining using the built-ins gives quite good speedups across the board. And 
> besides inlining applications using GLIBC it also inlines a lot of callsites
> within GLIBC that weren't previously inlined.
> 
That doesn't mean anything, as this is still a poor benchmark. It only uses
a tight loop, which doesn't measure the cost of loading constants, for example.
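
To make the point about constants concrete, a small sketch of my own (not
code from the thread): the integer-based classification needs a 64-bit bound
in a register, and a tight loop lets the compiler hoist that load out of the
measured region:

#include <stdint.h>
#include <string.h>

static inline int
finite_bits (double d)
{
  uint64_t di;
  memcpy (&di, &d, sizeof di);	/* Type-pun, as EXTRACT_WORDS64 does.  */
  /* The 0x7ff0000000000000 bound must be materialised in a register.
     In a tight benchmark loop it is loaded once and reused; in scattered
     real call sites that cost is paid each time.  */
  return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull;
}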

> > > Basically GCC does the array read and multiply twice just like you told it
> > > to (remember this is not using -ffast-math). Also avoid adding unnecessary
> > > FP operations and conversions (which may interact badly with timing the
> > > code we're trying to test).
> > >
> > And how do you know that most users don't use fp conversions in their
> > code just before isinf? These interactions make benchtests worthless as
> > in practice a different variant would be faster than one that you
> > measure.
> 
> You always get such interactions, it's unavoidable. That's why I added some
> actual math code that uses isinf/isnan to see how it performs in real life.
> 
> > > For me the fixed version still shows the expected answer: the built-ins are
> > > either faster or as fast as the inlines. So I don't think there is any
> > > regression here (remember also that previously there were no inlines at all
> > > except for a few inside GLIBC, so the real speedup is much larger).
> > 
> > Thats arm only. So it looks that we need platform-specific headers and testing.
> 
> Well I just confirmed the same gains apply to x64.
> 
No, that doesn't confirm anything yet. You need to do more extensive
testing to get a somewhat reliable answer, and even then you won't be sure.

I asked you to run my benchmark on ARM to measure the results of inlining.
I attached the version again. You should run it to see how the results differ.
test()
{
gcc -O3 -DBRANCHED -DI1=$1 -DI2=$2 -DT$3 ft.c 2>/dev/null
echo branched
time ./a.out
gcc -O3  -DI1=$1 -DI2=$2 -DT$3 ft.c 2>/dev/null
echo builtin
time ./a.out
}
test2()
{
  echo conditional add
  test $1 $2 1
  echo branch
  test $1 $2 2
  echo sum
  test $1 $2 3

}
echo "don't inline"
test2 '__attribute__((noinline))' '__attribute__((noinline))'
echo inline outer call
test2 '__attribute__((noinline))' '__attribute__((always_inline))'
echo inline inner call
test2 '__attribute__((always_inline))' '__attribute__((noinline))'
echo tigth loop
test2 '__attribute__((always_inline))' '__attribute__((always_inline))'
#include <stdlib.h>
#include <stdio.h>
//#include <math.h>
#undef isinf2
#ifdef BRANCHED
int I2
isinf2 (double dx)
{
  union u {
    double d;
    long l;
  };
  volatile union u u;
  u.d = dx;
  long x = u.l;
  return 2 * x == 0xffe0000000000000 ? (x == 0x7ff0000000000000 ? 1 : -1) : 0;
}
#else
 int I2
isinf2(double x)
{
  return __builtin_isinf (x);
}
#endif

void I1
t1(double x,double *d)
{
  if (isinf2 (x))
   *d+=42;
}
void I1
t2(double x,double *d)
{
  if (isinf2 (x))
    abort();
}
void I1
t3(double x,double *d)
{
  *d += isinf2 (x);
}




int main()
{
  double ret= 0.0;
  int i, j;
  double *d = malloc (800000);
  for (i=0;i<1000;i++)
    d[i]=4.2;//(rand() % 2) ? 0.0 : (rand() % 2 ? 1.3 : 1.0/0.0);

  for (j=0; j<300000; j++)
#ifdef T1
  for (i=0; i<1000; i++)
    t1(d[i]+1.3, &ret);
#endif
#ifdef T2
  for (i=0; i<1000; i++)
    t2(d[i]+1.3, &ret);
#endif
#ifdef T3
  for (i=0; i<1000; i++)
    t3(d[i]+1.3, &ret);
#endif
  
  return ret;
}
  
Wilco Dijkstra July 13, 2015, 11:02 a.m. UTC | #8
> Ondřej Bílka wrote:
> On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote:
> > > Ondřej Bílka wrote:
> > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> > > >
> > > >
> > > > > Ondřej Bílka wrote:
> > > > > But with latency hiding by using argument first suddenly even isnan and
> > > > > isnormal become regression.
> > > > >
> >
> > That doesn't look correct - it looks like this didn't use the built-ins at all,
> > did you forget to apply that patch?
> >
> No, from what you wrote I expected that patch already tests builtins
> which doesn't. Applied patch and got different results. When I added
> patch results are similar.

OK, I extended the benchmark to add the built-ins explicitly so that
you don't need to apply the math.h inline patch first.

> Which still doesn't have to mean anything, only if you test a
> application that frequently uses these you will get result without
> doubt.

We don't have applications that use these, but we can say without any
doubt that they will show huge speedups if they do use these functions
frequently, or any math functions that use them a lot. Remainder() for
example shows ~7% gain with the new inlines.

> Here a simple modification produces different results. One of many
> objections is that by simply adding gcc will try to make branchless code
> like converting that to res += 5 * (isnan(tmp)). So use more difficult
> branch and and with following two I get __builtin_isinf lot slower.
> 
>     { double tmp = p[i] * 2.0;    \
>        res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \
> 
>     { double tmp = p[i] * 2.0;    \
>        if (func (tmp)) res += 3 * sin (2 * tmp) ;} \

So here are the results again for the original test and your 2 tests above:

   remainder_test2_t:   40966.3 192314
   remainder_test1_t:   43697.4 196474
      __fpclassify_t:   12665.2 9951.16
        fpclassify_t:   2979.56 2974.35
__fpclassify_test2_t:   2889.92 2984.95
__fpclassify_test1_t:   3269.67 3199.05
          isnormal_t:   4381.54 4041.78
__isnormal_builtin_t:   4586.15 4318.18
   __isnormal_inl2_t:   4371.76 10737.4
    __isnormal_inl_t:   12635.5 10418.4
          isfinite_t:   2992.79 2979.5
__isfinite_builtin_t:   2982.96 2982.92
      __finite_inl_t:   4090.2  4064.52
          __finite_t:   7058.1  7039.74
             isinf_t:   3274.14 3299.75
   __isinf_builtin_t:   3195.79 3196.05
__isinf_ns_builtin_t:   3241.91 3241.96
        __isinf_ns_t:   3500.85 3493.8
       __isinf_inl_t:   2834.83 3433.89
           __isinf_t:   8794.62 8812.5
             isnan_t:   2801.83 2801.67
   __isnan_builtin_t:   2794.7  2891.37
       __isnan_inl_t:   4216.83 3980.52
           __isnan_t:   7070.36 7088.15

   remainder_test2_t:   105654  239008
   remainder_test1_t:   107533  239310
      __fpclassify_t:   12523.5 10080.2
        fpclassify_t:   2974.47 2983.21
__fpclassify_test2_t:   64227.5 55564.1
__fpclassify_test1_t:   64036.1 55424
          isnormal_t:   122300  34529.5
__isnormal_builtin_t:   122592  34616
   __isnormal_inl2_t:   123425  35056
    __isnormal_inl_t:   129589  41615.3
          isfinite_t:   123254  34041.5
__isfinite_builtin_t:   123302  34093
      __finite_inl_t:   123455  34631.8
          __finite_t:   127298  39587.5
             isinf_t:   63744   45997.6
   __isinf_builtin_t:   63545.2 46100.2
__isinf_ns_builtin_t:   63570.9 46087.5
        __isinf_ns_t:   63890.9 45754.5
       __isinf_inl_t:   64008.5 46505.2
           __isinf_t:   68915.7 51833.8
             isnan_t:   62866.8 45023.5
   __isnan_builtin_t:   62951.9 44956.8
       __isnan_inl_t:   63855.1 45294.6
           __isnan_t:   67156.5 49505.3

   remainder_test2_t:   41147.4 216349
   remainder_test1_t:   43860.8 220614
      __fpclassify_t:   12569.1 10124.3
        fpclassify_t:   3068.91 2974.31
__fpclassify_test2_t:   4048.88 32446.5
__fpclassify_test1_t:   4005.86 32783.1
          isnormal_t:   63707.9 14550.8
__isnormal_builtin_t:   63672   14383
   __isnormal_inl2_t:   65730.4 15059.1
    __isnormal_inl_t:   73352.2 10570.7
          isfinite_t:   64756.7 2719.12
__isfinite_builtin_t:   64748.8 2664.12
      __finite_inl_t:   65331.4 2740.1
          __finite_t:   70374.5 7944.69
             isinf_t:   2927.67 20684.2
   __isinf_builtin_t:   2848.58 20050.9
__isinf_ns_builtin_t:   2932.22 21809.9
        __isinf_ns_t:   2908.41 18973.9
       __isinf_inl_t:   2971.63 18025.5
           __isinf_t:   9010.58 28392.3
             isnan_t:   2841.28 16457.3
   __isnan_builtin_t:   2841.34 15017.8
       __isnan_inl_t:   2846.25 19736.8
           __isnan_t:   8171.32 23874.3

> > >From this it seems that __isinf_inl is slightly better than the builtin, but
> > it does not show up as a regression when combined with sin or in the remainder
> > test.
> >
> That doesn't hold generaly as remainder test it could be just caused by
> isnan being slower than isinf.

No, the new isinf/isnan are both faster than the previous versions (some isinf
calls were inlined as __isinf_ns, but even that one is clearly slower than the
builtin in all the results). Remember once again this patch creates new inlines
that didn't exist before as well as replacing existing inlines in GLIBC with
even faster ones. The combination of these means it is simply an impossibility
that anything could become slower.

> > Well I just confirmed the same gains apply to x64.
> >
> No, that doesn't confirm anything yet. You need to do more extensive
> testing to get somewhat reliable answer and still you won't be sure.

No, this benchmark does give a very clear and reliable answer: everything
speeds up by a huge factor.

> I asked you to run on arm my benchmark to measure results of inlining.
> I attached again version. You should run it to see how results will differ.

I did run it but I don't understand what it's supposed to mean, and I can't share
the results. So do you have something simpler that shows what point you're trying
to make? Or maybe you could add your own benchmark to GLIBC?

Wilco
  
Carlos O'Donell July 13, 2015, 4:53 p.m. UTC | #9
On 07/13/2015 07:02 AM, Wilco Dijkstra wrote:
> OK, I extended the benchmark to add the built-ins explicitly so that
> you don't need to apply the math.h inline patch first.

Please post the new version of your benchmark so we can review for
inclusion in 2.23.

Cheers,
Carlos.
  
Ondrej Bilka July 16, 2015, 8:49 p.m. UTC | #10
On Mon, Jul 13, 2015 at 12:02:51PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote:
> > > > Ondřej Bílka wrote:
> > > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> > > > >
> > > > >
> > > > > > Ondřej Bílka wrote:
> > > > > > But with latency hiding by using argument first suddenly even isnan and
> > > > > > isnormal become regression.
> > > > > >
> > >
> > > That doesn't look correct - it looks like this didn't use the built-ins at all,
> > > did you forget to apply that patch?
> > >
> > No, from what you wrote I expected that patch already tests builtins
> > which doesn't. Applied patch and got different results. When I added
> > patch results are similar.
> 
> OK, I extended the benchmark to add the built-ins explicitly so that
> you don't need to apply the math.h inline patch first.
>
That's also good general policy: include all strategies, so we will see
which one is optimal on which architecture.
 
> > Which still doesn't have to mean anything, only if you test a
> > application that frequently uses these you will get result without
> > doubt.
> 
> We don't have applications that uses these, but we can say without any
> doubt that they will show huge speedups if they do use these functions
> frequently or any math functions that use them a lot. Remainder() for
> example shows ~7% gain with the new inlines.
>
No, this is just a microbenchmark, as Joseph and I have told you ten
times. There are lots of factors that could change performance, like the need
to load constants, which you don't measure as you inline remainder. So your
speedup could be illusory.
 
> > Here a simple modification produces different results. One of many
> > objections is that by simply adding gcc will try to make branchless code
> > like converting that to res += 5 * (isnan(tmp)). So use more difficult
> > branch and and with following two I get __builtin_isinf lot slower.
> > 
> >     { double tmp = p[i] * 2.0;    \
> >        res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \
> > 
> >     { double tmp = p[i] * 2.0;    \
> >        if (func (tmp)) res += 3 * sin (2 * tmp) ;} \
> 
> So here are the results again for the original test and your 2 tests above:

Also, which GCC are you using? There is a problem in that recent GCC has
started to optimize the libc inlines better, but not the builtins.

> > > >From this it seems that __isinf_inl is slightly better than the builtin, but
> > > it does not show up as a regression when combined with sin or in the remainder
> > > test.
> > >
> > That doesn't hold generaly as remainder test it could be just caused by
> > isnan being slower than isinf.
> 
> No, the new isinf/isnan are both faster than the previous versions (some isinf
> calls were inlined as __isinf_ns, but even that one is clearly slower than the
> builtin in all the results). Remember once again this patch creates new inlines
> that didn't exist before as well as replacing existing inlines in GLIBC with
> even faster ones. The combination of these means it is simply an impossibility
> that anything could become slower.
> 
I got that the isinf libc inline is faster, so how do you explain that?

> > > Well I just confirmed the same gains apply to x64.
> > >
> > No, that doesn't confirm anything yet. You need to do more extensive
> > testing to get somewhat reliable answer and still you won't be sure.
> 
> No, this benchmark does give a very clear and reliable answer: everything
> speeds up by a huge factor.
> 
While that is true, it isn't exactly what we asked. We asked which inline
is the best one. So you need an accurate benchmark to also measure that, not
a rough one that can only tell you that inlining helps.

> > I asked you to run on arm my benchmark to measure results of inlining.
> > I attached again version. You should run it to see how results will differ.
> 
> I did run it but I don't understand what it's supposed to mean, and I can't share
> the results. So do you have something simpler that shows what point you're trying
> to make? Or maybe you could add your own benchmark to GLIBC?
> 
It was explained previously in the thread. Please read it again; I will
reply separately about what's wrong with the v2 benchmark.
  
Wilco Dijkstra July 17, 2015, 1:24 p.m. UTC | #11
> Ondřej Bílka wrote: 
> On Mon, Jul 13, 2015 at 12:02:51PM +0100, Wilco Dijkstra wrote:
> > > Ondřej Bílka wrote:
> > > On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote:
> > > > > Ondřej Bílka wrote:
> > > > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote:
> > > > > >

> > We don't have applications that uses these, but we can say without any
> > doubt that they will show huge speedups if they do use these functions
> > frequently or any math functions that use them a lot. Remainder() for
> > example shows ~7% gain with the new inlines.
> >
> No, this is just microbenchmark as I and Joseph are telling you ten
> times. There are lot factors that could change performance like need of
> constants which you don't measure as you inline remainder. So your
> speedup could be only illusory.

Of course it is a microbenchmark; that's how I wrote it. And because the new
inlines show huge speedups, it proves they will improve performance everywhere.
Note that the remainder functions are explicitly not inlined.

> Also which gcc are you using? There is problem that recent gcc started
> optimize better libc inlines but not builtins.

I always use latest trunk.
 
> > > > >From this it seems that __isinf_inl is slightly better than the builtin, but
> > > > it does not show up as a regression when combined with sin or in the remainder
> > > > test.
> > > >
> > > That doesn't hold generaly as remainder test it could be just caused by
> > > isnan being slower than isinf.
> >
> > No, the new isinf/isnan are both faster than the previous versions (some isinf
> > calls were inlined as __isinf_ns, but even that one is clearly slower than the
> > builtin in all the results). Remember once again this patch creates new inlines
> > that didn't exist before as well as replacing existing inlines in GLIBC with
> > even faster ones. The combination of these means it is simply an impossibility
> > that anything could become slower.
> >
> I got that isinf libc inline is faster so how do you explain it?

Why does it matter? My patch as it stands does not change that internal inline.
Again, my patch adds *new* inlines for functions which were previously not inlined.

> > > > Well I just confirmed the same gains apply to x64.
> > > >
> > > No, that doesn't confirm anything yet. You need to do more extensive
> > > testing to get somewhat reliable answer and still you won't be sure.
> >
> > No, this benchmark does give a very clear and reliable answer: everything
> > speeds up by a huge factor.
> >
> While that is true it isn't exactly what we asked. We asked what inline
> is best one. So you need accurate benchmark to also measure that, not
> rough one that could only tell that inlining helps.

No, that's not what anyone asked, not what I agreed with Joseph, and most
definitely not what I intended. I posted a patch to speed up these functions,
as they are currently not inlined and thus extremely slow, plus a benchmark
to prove that my patch makes a huge difference to performance.

There is no doubt it is possible to improve the GCC built-ins further, so I
created a bug to do so (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66462).
However, that is beyond the scope of this patch and is likely microarchitecture
specific, so each target has to decide what code it generates for each builtin.

Wilco
  
Carlos O'Donell July 17, 2015, 3:37 p.m. UTC | #12
On 07/17/2015 09:24 AM, Wilco Dijkstra wrote:
> There is no doubt it is possible to improve the GCC built-ins further, so I 
> created a bug to do so (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66462). 
> However that is beyond the scope of this patch and is likely microarchitecture
> specific, so each target has to decide what code they generate for each builtin.

Agreed. Thanks for the good work Wilco.

c.
  

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8e615e5..3c20180 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -30,12 +30,13 @@  bench-pthread := pthread_once
 bench := $(bench-math) $(bench-pthread)
 
 # String function benchmarks.
-string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
-		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
-		strcat strchr strchrnul strcmp strcpy strcspn strlen \
-		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
-		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
-		strcoll
+string-bench := bcopy bzero math-inlines memccpy memchr memcmp memcpy memmem \
+		memmove mempcpy memset rawmemchr stpcpy stpncpy strcasecmp \
+		strcasestr strcat strchr strchrnul strcmp strcpy strcspn \
+		strlen strncasecmp strncat strncmp strncpy strnlen strpbrk \
+		strrchr strspn strstr strcpy_chk stpcpy_chk memrchr strsep \
+		strtok strcoll
+
 string-bench-all := $(string-bench)
 
 # We have to generate locales
@@ -58,6 +59,7 @@  CFLAGS-bench-ffsll.c += -fno-builtin
 bench-malloc := malloc-thread
 
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
+$(addprefix $(objpfx)bench-,math-inlines): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
 $(objpfx)bench-malloc-thread: $(shared-thread-library)
 
diff --git a/benchtests/bench-math-inlines.c b/benchtests/bench-math-inlines.c
new file mode 100644
index 0000000..c21a3d3
--- /dev/null
+++ b/benchtests/bench-math-inlines.c
@@ -0,0 +1,203 @@ 
+/* Measure math inline functions.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define SIZE 1024
+#define TEST_MAIN
+#define TEST_NAME "math-inlines"
+#include "bench-string.h"
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+#define BOOLTEST(func)					  \
+int							  \
+func ## _t (volatile double *p, size_t n, size_t iters)   \
+{							  \
+  int i, j;						  \
+  int res = 0;						  \
+  for (j = 0; j < iters; j++)				  \
+    for (i = 0; i < n; i++)				  \
+      if (func (p[i] * 2.0)) res++;			  \
+  return res;						  \
+}
+
+#define VALUETEST(func)					  \
+int							  \
+func ## _t (volatile double *p, size_t n, size_t iters)	  \
+{							  \
+  int i, j;						  \
+  int res = 0;						  \
+  for (j = 0; j < iters; j++)				  \
+    for (i = 0; i < n; i++)				  \
+      res += func (p[i] * 2.0);				  \
+  return res;						  \
+}
+
+typedef union
+{
+  double value;
+  uint64_t word;
+} ieee_double_shape_type;
+
+#define EXTRACT_WORDS64(i,d)                              \
+do {                                                      \
+  ieee_double_shape_type gh_u;                            \
+  gh_u.value = (d);                                       \
+  (i) = gh_u.word;                                        \
+} while (0)
+
+/* Explicit inlines similar to math_private.h versions.  */
+
+extern __always_inline int
+__isnan_inl (double d)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, d);
+  return (di & 0x7fffffffffffffffull) > 0x7ff0000000000000ull;
+}
+
+extern __always_inline int
+__isinf_inl (double x)
+{
+  uint64_t ix;
+  EXTRACT_WORDS64 (ix,x);
+  if ((ix << 1) != 0xffe0000000000000ull)
+    return 0;
+  return (int)(ix >> 32);
+}
+
+extern __always_inline int
+__finite_inl (double d)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, d);
+  return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull;
+}
+
+/* Explicit inline similar to existing math.h implementation.  */
+
+#define __isnormal_inl(X) (__fpclassify (X) == FP_NORMAL)
+#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL)
+
+/* Test fpclassify with use of only 2 of the 5 results.  */
+
+extern __always_inline int
+__fpclassify_test1 (double d)
+{
+  int cl = fpclassify (d);
+  return cl == FP_NAN || cl == FP_INFINITE;
+}
+
+extern __always_inline int
+__fpclassify_test2 (double d)
+{
+  return isnan (d) || isinf (d);
+}
+
+/* Create test functions for each possibility.  */
+
+BOOLTEST (__isnan)
+BOOLTEST (isnan)
+BOOLTEST (__isnan_inl)
+
+BOOLTEST (__isinf)
+BOOLTEST (isinf)
+BOOLTEST (__isinf_inl)
+
+BOOLTEST (__finite)
+BOOLTEST (isfinite)
+BOOLTEST (__finite_inl)
+
+BOOLTEST (isnormal)
+BOOLTEST (__isnormal_inl)
+BOOLTEST (__isnormal_inl2)
+
+VALUETEST (fpclassify)
+VALUETEST (__fpclassify)
+BOOLTEST (__fpclassify_test1)
+BOOLTEST (__fpclassify_test2)
+
+IMPL (isnan_t, 0)
+IMPL (__isnan_t, 1)
+IMPL (__isnan_inl_t, 2)
+IMPL (isinf_t, 3)
+IMPL (__isinf_t, 4)
+IMPL (__isinf_inl_t, 5)
+IMPL (isfinite_t, 6)
+IMPL (__finite_t, 7)
+IMPL (__finite_inl_t, 8)
+IMPL (isnormal_t, 9)
+IMPL (__isnormal_inl_t, 10)
+IMPL (__isnormal_inl2_t, 11)
+IMPL (fpclassify_t, 12)
+IMPL (__fpclassify_t, 13)
+IMPL (__fpclassify_test1_t, 14)
+IMPL (__fpclassify_test2_t, 15)
+
+typedef int (*proto_t) (volatile double *p, size_t n, size_t iters);
+
+static void
+do_one_test (impl_t *impl, volatile double *arr, size_t len)
+{
+  size_t iters = INNER_LOOP_ITERS * 10;
+  timing_t start, stop, cur;
+
+  TIMING_NOW (start);
+  CALL (impl, arr, len, iters);
+  TIMING_NOW (stop);
+  TIMING_DIFF (cur, start, stop);
+
+  TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static volatile double arr1[SIZE];
+static volatile double arr2[SIZE];
+
+int
+test_main (void)
+{
+  size_t i;
+
+  test_init ();
+
+  /* Create 2 test arrays, one with 10% zeroes, 10% negative values,
+     79% positive values and 1% infinity/NaN.  The other contains
+     50% inf, 50% NaN.  */
+
+  for (i = 0; i < SIZE; i++)
+    {
+      int x = rand () & 255;
+      arr1[i] = (x < 25) ? 0.0 : ((x < 50) ? -1 : 100);
+      if (x == 255) arr1[i] = __builtin_inf ();
+      if (x == 254) arr1[i] = __builtin_nan ("0");
+      arr2[i] = (x < 128) ? __builtin_inf () : __builtin_nan ("0");
+    }
+
+  FOR_EACH_IMPL (impl, 0)
+    {
+      printf ("%20s: ", impl->name);
+      do_one_test (impl, arr1, SIZE);
+      do_one_test (impl, arr2, SIZE);
+      putchar ('\n');
+    }
+
+  return ret;
+}
+
+#include "../test-skeleton.c"