[v2] Add math-inline benchmark

Message ID 20150716225056.GA24479@domone
State Superseded

Commit Message

Ondrej Bilka July 16, 2015, 10:50 p.m. UTC
  On Thu, Jul 16, 2015 at 12:15:19PM +0100, Wilco Dijkstra wrote:
> Add a benchmark for isinf/isnan/isnormal/isfinite/fpclassify. This new version adds explicit tests
> for the GCC built-ins and uses json format as suggested and no longer includes any string headers.
> The test uses 2 arrays with 1024 doubles, one with 99% finite FP numbers (10% zeroes, 10% negative)
> and 1% inf/NaN, the other with 50% inf, and 50% Nan. 
> 
> Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding explict
> calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64 and 2.8x
> on x64. The GCC builtins have better performance than the existing math_private inlines for __isnan,
> __finite and __isinf_ns, so these should be removed.
>
No, this benchmark is invalid for the following two reasons.

1) It doesn't measure a real workload at all. Constructing a large constant
can be costly, and by inlining everything this benchmark ignores that cost.
2) The results on x64 don't measure the inlines but an inferior version, as
they use assembly to move a double into an integer register.

As Joseph and I told you multiple times, you need to measure these, and I
sent benchmarks to demonstrate the effect. So I fixed your benchmark, and now
it clearly shows that in all cases the math_private inlines are better than
the builtins on x64 (for now it is x64-only due to EXTRACT_WORDS64; you need
to sync that with math_private).
Even remainder is slower.

So at least on x64 we should publish the math_private inlines instead of
using the slow builtins.

I also added better finite and isnormal inlines; these should have the same
speed as isnan. The main improvement is multiplying by 2 instead of ANDing
with a constant to mask off the sign bit (sketched below).

I tried to save a bit of space by using 32-bit constants
instead of 64-bit ones (the foo_new2 variants), but it doesn't help.
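
To illustrate the sign-bit trick, here is a minimal sketch, with di being the
word produced by EXTRACT_WORDS64 (the __finite_inl and finite_new variants in
the benchmark below use exactly these two forms):

  /* AND form: needs two 64-bit immediates, the mask and the bound.  */
  return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull;

  /* Multiply form: 2 * di drops the sign bit with a plain add, and
     2 * 0x7ff0000000000000ull folds to a single constant, so one
     immediate load is saved.  */
  return (2 * di) < 2 * 0x7ff0000000000000ull;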



  "math-inlines": {
   "__isnan_t": {
    "inf/nan": {
     "duration": 1.48695e+07,
     "iterations": 500,
     "mean": 29738
    }
   },
   "__isnan_inl_t": {
    "inf/nan": {
     "duration": 1.12726e+07,
     "iterations": 500,
     "mean": 22545
    }
   },
   "__isnan_builtin_t": {
    "inf/nan": {
     "duration": 1.06417e+07,
     "iterations": 500,
     "mean": 21283
    }
   },
   "isnan_t": {
    "inf/nan": {
     "duration": 1.47359e+07,
     "iterations": 500,
     "mean": 29471
    }
   },
   "isnan_new_t": {
    "inf/nan": {
     "duration": 1.0537e+07,
     "iterations": 500,
     "mean": 21073
    }
   },
   "__isinf_t": {
    "inf/nan": {
     "duration": 1.68862e+07,
     "iterations": 500,
     "mean": 33772
    }
   },
   "__isinf_inl_t": {
    "inf/nan": {
     "duration": 1.14818e+07,
     "iterations": 500,
     "mean": 22963
    }
   },
   "__isinf_ns_t": {
    "inf/nan": {
     "duration": 1.18318e+07,
     "iterations": 500,
     "mean": 23663
    }
   },
   "__isinf_ns_builtin_t": {
    "inf/nan": {
     "duration": 1.20574e+07,
     "iterations": 500,
     "mean": 24114
    }
   },
   "__isinf_builtin_t": {
    "inf/nan": {
     "duration": 1.22987e+07,
     "iterations": 500,
     "mean": 24597
    }
   },
   "isinf_t": {
    "inf/nan": {
     "duration": 1.68442e+07,
     "iterations": 500,
     "mean": 33688
    }
   },
   "isinf_new_t": {
    "inf/nan": {
     "duration": 1.26923e+07,
     "iterations": 500,
     "mean": 25384
    }
   },
   "isinf_new2_t": {
    "inf/nan": {
     "duration": 1.21769e+07,
     "iterations": 500,
     "mean": 24353
    }
   },
   "__finite_t": {
    "inf/nan": {
     "duration": 6.00459e+06,
     "iterations": 500,
     "mean": 12009
    }
   },
   "__finite_inl_t": {
    "inf/nan": {
     "duration": 3.45365e+06,
     "iterations": 500,
     "mean": 6907
    }
   },
   "__isfinite_builtin_t": {
    "inf/nan": {
     "duration": 3.41982e+06,
     "iterations": 500,
     "mean": 6839
    }
   },
   "isfinite_t": {
    "inf/nan": {
     "duration": 5.98703e+06,
     "iterations": 500,
     "mean": 11974
    }
   },
   "finite_new_t": {
    "inf/nan": {
     "duration": 3.41983e+06,
     "iterations": 500,
     "mean": 6839
    }
   },
   "finite_new2_t": {
    "inf/nan": {
     "duration": 3.4199e+06,
     "iterations": 500,
     "mean": 6839
    }
   },
   "__isnormal_inl_t": {
    "inf/nan": {
     "duration": 7.75655e+06,
     "iterations": 500,
     "mean": 15513
    }
   },
   "__isnormal_inl2_t": {
    "inf/nan": {
     "duration": 7.71059e+06,
     "iterations": 500,
     "mean": 15421
    }
   },
   "__isnormal_builtin_t": {
    "inf/nan": {
     "duration": 2.99474e+06,
     "iterations": 500,
     "mean": 5989
    }
   },
   "isnormal_t": {
    "inf/nan": {
     "duration": 7.69216e+06,
     "iterations": 500,
     "mean": 15384
    }
   },
   "isnormal_new_t": {
    "inf/nan": {
     "duration": 3.42363e+06,
     "iterations": 500,
     "mean": 6847
    }
   },
   "isnormal_new2_t": {
    "inf/nan": {
     "duration": 3.01772e+06,
     "iterations": 500,
     "mean": 6035
    }
   },
   "__fpclassify_test1_t": {
    "inf/nan": {
     "duration": 2.31874e+07,
     "iterations": 500,
     "mean": 46374
    }
   },
   "__fpclassify_test2_t": {
    "inf/nan": {
     "duration": 1.78065e+07,
     "iterations": 500,
     "mean": 35613
    }
   },
   "__fpclassify_t": {
    "inf/nan": {
     "duration": 5.13234e+06,
     "iterations": 500,
     "mean": 10264
    }
   },
   "fpclassify_t": {
    "inf/nan": {
     "duration": 5.20095e+06,
     "iterations": 500,
     "mean": 10401
    }
   },
   "remainder_test1_t": {
    "inf/nan": {
     "duration": 2.65477e+07,
     "iterations": 500,
     "mean": 53095
    }
   },
   "remainder_test2_t": {
    "inf/nan": {
     "duration": 2.80853e+07,
     "iterations": 500,
     "mean": 56170
    }
   },
   "__isnan_t": {
    "normal": {
     "duration": 6.50742e+06,
     "iterations": 500,
     "mean": 13014
    }
   },
   "__isnan_inl_t": {
    "normal": {
     "duration": 3.49208e+06,
     "iterations": 500,
     "mean": 6984
    }
   },
   "__isnan_builtin_t": {
    "normal": {
     "duration": 2.65462e+06,
     "iterations": 500,
     "mean": 5309
    }
   },
   "isnan_t": {
    "normal": {
     "duration": 6.47484e+06,
     "iterations": 500,
     "mean": 12949
    }
   },
   "isnan_new_t": {
    "normal": {
     "duration": 2.6487e+06,
     "iterations": 500,
     "mean": 5297
    }
   },
   "__isinf_t": {
    "normal": {
     "duration": 6.50518e+06,
     "iterations": 500,
     "mean": 13010
    }
   },
   "__isinf_inl_t": {
    "normal": {
     "duration": 3.15952e+06,
     "iterations": 500,
     "mean": 6319
    }
   },
   "__isinf_ns_t": {
    "normal": {
     "duration": 3.51585e+06,
     "iterations": 500,
     "mean": 7031
    }
   },
   "__isinf_ns_builtin_t": {
    "normal": {
     "duration": 3.51377e+06,
     "iterations": 500,
     "mean": 7027
    }
   },
   "__isinf_builtin_t": {
    "normal": {
     "duration": 4.36361e+06,
     "iterations": 500,
     "mean": 8727
    }
   },
   "isinf_t": {
    "normal": {
     "duration": 6.51039e+06,
     "iterations": 500,
     "mean": 13020
    }
   },
   "isinf_new_t": {
    "normal": {
     "duration": 3.09707e+06,
     "iterations": 500,
     "mean": 6194
    }
   },
   "isinf_new2_t": {
    "normal": {
     "duration": 3.11053e+06,
     "iterations": 500,
     "mean": 6221
    }
   },
   "__finite_t": {
    "normal": {
     "duration": 3.68569e+07,
     "iterations": 500,
     "mean": 73713
    }
   },
   "__finite_inl_t": {
    "normal": {
     "duration": 3.42074e+07,
     "iterations": 500,
     "mean": 68414
    }
   },
   "__isfinite_builtin_t": {
    "normal": {
     "duration": 3.43805e+07,
     "iterations": 500,
     "mean": 68760
    }
   },
   "isfinite_t": {
    "normal": {
     "duration": 3.67975e+07,
     "iterations": 500,
     "mean": 73595
    }
   },
   "finite_new_t": {
    "normal": {
     "duration": 3.40305e+07,
     "iterations": 500,
     "mean": 68061
    }
   },
   "finite_new2_t": {
    "normal": {
     "duration": 3.40128e+07,
     "iterations": 500,
     "mean": 68025
    }
   },
   "__isnormal_inl_t": {
    "normal": {
     "duration": 3.87965e+07,
     "iterations": 500,
     "mean": 77592
    }
   },
   "__isnormal_inl2_t": {
    "normal": {
     "duration": 3.87941e+07,
     "iterations": 500,
     "mean": 77588
    }
   },
   "__isnormal_builtin_t": {
    "normal": {
     "duration": 3.61693e+07,
     "iterations": 500,
     "mean": 72338
    }
   },
   "isnormal_t": {
    "normal": {
     "duration": 3.87878e+07,
     "iterations": 500,
     "mean": 77575
    }
   },
   "isnormal_new_t": {
    "normal": {
     "duration": 3.45548e+07,
     "iterations": 500,
     "mean": 69109
    }
   },
   "isnormal_new2_t": {
    "normal": {
     "duration": 3.41735e+07,
     "iterations": 500,
     "mean": 68347
    }
   },
   "__fpclassify_test1_t": {
    "normal": {
     "duration": 8.74787e+06,
     "iterations": 500,
     "mean": 17495
    }
   },
   "__fpclassify_test2_t": {
    "normal": {
     "duration": 3.17414e+06,
     "iterations": 500,
     "mean": 6348
    }
   },
   "__fpclassify_t": {
    "normal": {
     "duration": 6.0656e+06,
     "iterations": 500,
     "mean": 12131
    }
   },
   "fpclassify_t": {
    "normal": {
     "duration": 6.07758e+06,
     "iterations": 500,
     "mean": 12155
    }
   },
   "remainder_test1_t": {
    "normal": {
     "duration": 2.54391e+07,
     "iterations": 500,
     "mean": 50878
    }
   },
   "remainder_test2_t": {
    "normal": {
     "duration": 2.65189e+07,
     "iterations": 500,
     "mean": 53037
    }
   }
  }
  

Comments

Wilco Dijkstra July 17, 2015, 1:26 p.m. UTC | #1
> Ondřej Bílka wrote:
> On Thu, Jul 16, 2015 at 12:15:19PM +0100, Wilco Dijkstra wrote:
> > Add a benchmark for isinf/isnan/isnormal/isfinite/fpclassify. This new version adds explicit
> tests
> > for the GCC built-ins and uses json format as suggested and no longer includes any string
> headers.
> > The test uses 2 arrays with 1024 doubles, one with 99% finite FP numbers (10% zeroes, 10%
> negative)
> > and 1% inf/NaN, the other with 50% inf, and 50% Nan.
> >
> > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> explict
> > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64
> and 2.8x
> > on x64. The GCC builtins have better performance than the existing math_private inlines for
> __isnan,
> > __finite and __isinf_ns, so these should be removed.
> >
> No, this benchmark is invalid for following two reasons.
> 
> 1) It doesn't measure real workload at all. Constructing large constant
> could be costy and by inlining this benchmark ignores cost.

It was never meant to measure a real workload. If you'd like to add your own
workload, that would be great, but my micro benchmark is more than sufficient
to prove that the new inlines give a huge performance gain.

> 2) Results on x64 don't measure inlines but inferior version as they use
> assembly to change double into integer.
> 
> As I and Joseph told you multiple times to measure these and I send
> benchmarks to demonstrate its effects. So I fixed your benchmark and now
> it clearly shows that in all cases math_private inlines are better than
> builtins on x64 (Now its x64 only due EXTRACT_WORDS64, you need to sync that with math-
> private).
> Even remainder is slower.

It shouldn't be necessary to use assembly code: GCC is perfectly capable of
generating efficient code for EXTRACT_WORDS. If GCC doesn't on x64, then you
should report it and ensure it gets fixed. I don't understand why people keep
using target-specific hacks and assembly rather than fixing the underlying
issues and using generic code that works well on all targets...
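
(For reference, the generic definition is roughly the union-based one below,
which GCC ought to compile to a single register move such as movq:)

  typedef union
  {
    double value;
    uint64_t word;
  } ieee_double_shape_type;

  #define EXTRACT_WORDS64(i, d)     \
    do {                            \
      ieee_double_shape_type gh_u;  \
      gh_u.value = (d);             \
      (i) = gh_u.word;              \
    } while (0)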

> So at least on x64 we should publish math_private inlines instead using
> slow builtins.

Well it was agreed we are going to use the GCC built-ins and then improve
those. If you want to propose additional patches with special inlines for
x64 then please go ahead, but my plan is to improve the builtins.

> I also added better finite,isnormal inlines, these should have same
> speed as isnan. Main improvement is multiplication by 2 instead anding
> with constant to mask sign bit.

Your inlines may well be very fast, but they are not correct, e.g. this one
doesn't check the exponent at all (so for 1.0, whose mantissa bits are all
zero, it falls through the first test and returns the nonzero value
0x3ff00000):

> +extern __always_inline int
> +isinf_new (double dx)
> +{
> +  uint64_t di;
> +  EXTRACT_WORDS64 (di, dx);
> +
> +  if (__builtin_expect ((di << 12) != 0, 1))
> +    return 0;
> +
> +  return (int) (di >> 32);
> +}

Wilco
  
Carlos O'Donell July 17, 2015, 1:48 p.m. UTC | #2
On 07/17/2015 09:26 AM, Wilco Dijkstra wrote:
> It shouldn't be necessary to use assembly code, GCC is perfectly capable of
> generating efficient code for EXTRACT_WORDS. If GCC doesn't for x64 then you
> should report it and ensure it gets fixed. I don't understand why people keep
> using target-specific hacks and assembly rather than fixing the underlying
> issues and use generic code that works well on all targets...

Agreed. The compiler can't schedule the assembly at all, and worse.

>> So at least on x64 we should publish math_private inlines instead using
>> slow builtins.
> 
> Well it was agreed we are going to use the GCC built-ins and then improve
> those. If you want to propose additional patches with special inlines for
> x64 then please go ahead, but my plan is to improve the builtins.

Agreed.

Cheers,
Carlos.
  
Ondrej Bilka July 18, 2015, 11:34 a.m. UTC | #3
On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Thu, Jul 16, 2015 at 12:15:19PM +0100, Wilco Dijkstra wrote:
> > > Add a benchmark for isinf/isnan/isnormal/isfinite/fpclassify. This new version adds explicit
> > tests
> > > for the GCC built-ins and uses json format as suggested and no longer includes any string
> > headers.
> > > The test uses 2 arrays with 1024 doubles, one with 99% finite FP numbers (10% zeroes, 10%
> > negative)
> > > and 1% inf/NaN, the other with 50% inf, and 50% Nan.
> > >
> > > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> > explict
> > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64
> > and 2.8x
> > > on x64. The GCC builtins have better performance than the existing math_private inlines for
> > __isnan,
> > > __finite and __isinf_ns, so these should be removed.
> > >
> > No, this benchmark is invalid for following two reasons.
> > 
> > 1) It doesn't measure real workload at all. Constructing large constant
> > could be costy and by inlining this benchmark ignores cost.
> 
> It was never meant to measure a real workload. If you'd like to add your own 
> workload, that would be great, but my micro benchmark is more than sufficient
> in proving that the new inlines give a huge performance gain.
> 
But you claimed the following in your original mail, which is wrong:

"
Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding explict
calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64 and 2.8x
on x64. The GCC builtins have better performance than the existing math_private inlines for __isnan,
__finite and __isinf_ns, so these should be removed.
"

Also, when inlines give a speedup, you should add math inlines for the
signaling NaN case as well. That gives a similar speedup. And it would be
natural to ask whether you should use these inlines everywhere if they are
already faster than the builtins.

> > 2) Results on x64 don't measure inlines but inferior version as they use
> > assembly to change double into integer.
> > 
> > As I and Joseph told you multiple times to measure these and I send
> > benchmarks to demonstrate its effects. So I fixed your benchmark and now
> > it clearly shows that in all cases math_private inlines are better than
> > builtins on x64 (Now its x64 only due EXTRACT_WORDS64, you need to sync that with math-
> > private).
> > Even remainder is slower.
> 
> It shouldn't be necessary to use assembly code, GCC is perfectly capable of
> generating efficient code for EXTRACT_WORDS. If GCC doesn't for x64 then you
> should report it and ensure it gets fixed. I don't understand why people keep
> using target-specific hacks and assembly rather than fixing the underlying
> issues and use generic code that works well on all targets...
>
That's the problem: it's a bit of special-casing, and if everybody wanted to
add a special-case assembly hack then gcc would be a huge mess of these. As
for this, I will open a PR to fix that / add something like
builtin_convert_fp_i.
 
> > So at least on x64 we should publish math_private inlines instead using
> > slow builtins.
> 
> Well it was agreed we are going to use the GCC built-ins and then improve
> those. If you want to propose additional patches with special inlines for
> x64 then please go ahead, but my plan is to improve the builtins.
>
And how are you sure that it's just an isolated x64 case? It may also happen
on powerpc, arm, sparc and other architectures, and you need to test
that.

So I ask you again to run my benchmark with the changed EXTRACT_WORDS64 to
see whether this is a problem on arm as well.

> > I also added better finite,isnormal inlines, these should have same
> > speed as isnan. Main improvement is multiplication by 2 instead anding
> > with constant to mask sign bit.
> 
> Your inlines may well be very fast but they are not correct, eg. this doesn't
> check the exponent at all:
>
Most are correct. For isinf I was testing whether this heuristic helps, as
most floats have a nonzero mantissa. The following is correct; performance
should be the same, but it depends on the benchmark and on how often it
produces a nonzero mantissa.

  if (__builtin_expect ((di << 12) != 0, 1))
    return 0;
  if ((di << 1) != 0xffe0000000000000ull)
    return 0;

  return (int) (di >> 32);

 
> > +extern __always_inline int
> > +isinf_new (double dx)
> > +{
> > +  uint64_t di;
> > +  EXTRACT_WORDS64 (di, dx);
> > +
> > +  if (__builtin_expect ((di << 12) != 0, 1))
> > +    return 0;
> > +
> > +  return (int) (di >> 32);
> > +}
  
Wilco Dijkstra July 20, 2015, 11:01 a.m. UTC | #4
> Ondřej Bílka wrote:
> On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > > Ondřej Bílka wrote:
> > > On Thu, Jul 16, 2015 at 12:15:19PM +0100, Wilco Dijkstra wrote:
> > > > Add a benchmark for isinf/isnan/isnormal/isfinite/fpclassify. This new version adds
> explicit
> > > tests
> > > > for the GCC built-ins and uses json format as suggested and no longer includes any
> string
> > > headers.
> > > > The test uses 2 arrays with 1024 doubles, one with 99% finite FP numbers (10% zeroes,
> 10%
> > > negative)
> > > > and 1% inf/NaN, the other with 50% inf, and 50% Nan.
> > > >
> > > > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> > > explict
> > > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on
> AArch64
> > > and 2.8x
> > > > on x64. The GCC builtins have better performance than the existing math_private inlines
> for
> > > __isnan,
> > > > __finite and __isinf_ns, so these should be removed.
> > > >
> > > No, this benchmark is invalid for following two reasons.
> > >
> > > 1) It doesn't measure real workload at all. Constructing large constant
> > > could be costy and by inlining this benchmark ignores cost.
> >
> > It was never meant to measure a real workload. If you'd like to add your own
> > workload, that would be great, but my micro benchmark is more than sufficient
> > in proving that the new inlines give a huge performance gain.
> >
> But you claimed following in original mail which is wrong:
> 
> "
> Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> explict
> calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64 and
> 2.8x
> on x64. The GCC builtins have better performance than the existing math_private inlines for
> __isnan,
> __finite and __isinf_ns, so these should be removed.
> "

No, that statement is 100% correct.

> Also when inlines give speedup you should also add math inlines for
> signaling nan case. That gives similar speedup. And it would be natural
> to ask if you should use these inlines everytime if they are already
> faster than builtins.

I'm not sure what you mean here - I enable the new inlines in exactly the
right case. Improvements to support signalling NaNs or to speed up the
built-ins further will be done in GCC.

> > > So at least on x64 we should publish math_private inlines instead using
> > > slow builtins.
> >
> > Well it was agreed we are going to use the GCC built-ins and then improve
> > those. If you want to propose additional patches with special inlines for
> > x64 then please go ahead, but my plan is to improve the builtins.
> >
> And how are you sure that its just isolated x64 case. It may also happen
> on powerpc, arm, sparc and other architectures and you need to test
> that.

It's obvious the huge speedup applies to all other architectures as well -
it's hard to imagine that avoiding a call, a return, a PLT indirection and 
additional optimization of 3-4 instructions could ever cause a slowdown...

> So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> see if this is problem also and arm.

Here are the results for x64 with inlining disabled (__always_inline changed
to noinline) and using the movq instruction as you suggested:

   "__isnan_t": {
    "normal": {
     "duration": 3.52048e+06,
     "iterations": 500,
     "mean": 7040
    }
   },
   "__isnan_inl_t": {
    "normal": {
     "duration": 3.09247e+06,
     "iterations": 500,
     "mean": 6184
    }
   },
   "__isnan_builtin_t": {
    "normal": {
     "duration": 2.20378e+06,
     "iterations": 500,
     "mean": 4407
    }
   },
   "isnan_t": {
    "normal": {
     "duration": 1.50514e+06,
     "iterations": 500,
     "mean": 3010
    }
   },
   "__isinf_t": {
    "normal": {
     "duration": 4.54681e+06,
     "iterations": 500,
     "mean": 9093
    }
   },
   "__isinf_inl_t": {
    "normal": {
     "duration": 3.09981e+06,
     "iterations": 500,
     "mean": 6199
    }
   },
   "__isinf_ns_t": {
    "normal": {
     "duration": 3.08074e+06,
     "iterations": 500,
     "mean": 6161
    }
   },
   "__isinf_ns_builtin_t": {
    "normal": {
     "duration": 2.64185e+06,
     "iterations": 500,
     "mean": 5283
    }
   },
   "__isinf_builtin_t": {
    "normal": {
     "duration": 3.13833e+06,
     "iterations": 500,
     "mean": 6276
    }
   },
   "isinf_t": {
    "normal": {
     "duration": 1.60055e+06,
     "iterations": 500,
     "mean": 3201
    }
   },
   "__finite_t": {
    "normal": {
     "duration": 3.54966e+06,
     "iterations": 500,
     "mean": 7099
    }
   },
   "__finite_inl_t": {
    "normal": {
     "duration": 3.08112e+06,
     "iterations": 500,
     "mean": 6162
    }
   },
   "__isfinite_builtin_t": {
    "normal": {
     "duration": 2.6426e+06,
     "iterations": 500,
     "mean": 5285
    }
   },
   "isfinite_t": {
    "normal": {
     "duration": 1.49071e+06,
     "iterations": 500,
     "mean": 2981
    }
   },
   "__isnormal_inl_t": {
    "normal": {
     "duration": 6.31925e+06,
     "iterations": 500,
     "mean": 12638
    }
   },
   "__isnormal_inl2_t": {
    "normal": {
     "duration": 2.18113e+06,
     "iterations": 500,
     "mean": 4362
    }
   },
   "__isnormal_builtin_t": {
    "normal": {
     "duration": 3.08183e+06,
     "iterations": 500,
     "mean": 6163
    }
   },
   "isnormal_t": {
    "normal": {
     "duration": 2.19867e+06,
     "iterations": 500,
     "mean": 4397
    }
   },
   "__fpclassify_test1_t": {
    "normal": {
     "duration": 2.71552e+06,
     "iterations": 500,
     "mean": 5431
    }
   },
   "__fpclassify_test2_t": {
    "normal": {
     "duration": 2.69459e+06,
     "iterations": 500,
     "mean": 5389
    }
   },
   "__fpclassify_t": {
    "normal": {
     "duration": 6.42558e+06,
     "iterations": 500,
     "mean": 12851
    }
   },
   "fpclassify_t": {
    "normal": {
     "duration": 1.4907e+06,
     "iterations": 500,
     "mean": 2981
    }
   },
   "remainder_test1_t": {
    "normal": {
     "duration": 2.20506e+07,
     "iterations": 500,
     "mean": 44101
    }
   },
   "remainder_test2_t": {
    "normal": {
     "duration": 2.12782e+07,
     "iterations": 500,
     "mean": 42556
    }
   }
  
Ondrej Bilka July 20, 2015, 7:22 p.m. UTC | #5
On Mon, Jul 20, 2015 at 12:01:50PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > But you claimed following in original mail which is wrong:
> > 
> > "
> > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> > explict
> > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64 and
> > 2.8x
> > on x64. The GCC builtins have better performance than the existing math_private inlines for
> > __isnan,
> > __finite and __isinf_ns, so these should be removed.
> > "
> 
> No that statement is 100% correct.
>
As for isinf_ns, on some architectures the current isinf inline is better, so
it should be replaced by that instead.

Your claim about the __finite builtin is definitely false on x64; it's
slower:

   "__finite_inl_t": {
    "normal": {
     "duration": 3.42074e+07,
     "iterations": 500,
     "mean": 68414
    }
   },
   "__isfinite_builtin_t": {
    "normal": {
     "duration": 3.43805e+07,
     "iterations": 500,
     "mean": 68760
    }
   },
   "finite_new_t": {
    "normal": {
     "duration": 3.40305e+07,
     "iterations": 500,
     "mean": 68061
    }
   },
   "finite_new2_t": {
    "normal": {
     "duration": 3.40128e+07,
     "iterations": 500,
     "mean": 68025
    }

Only isnan looks correct. Probably nothing could beat the builtin, which is
effectively #define isnan(x) (x != x)
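
A sketch of why this is hard to beat: IEEE 754 comparisons involving a NaN
are unordered, so x != x is true exactly for NaNs, and the test needs no
integer constant at all.

  static inline int
  isnan_cmp (double x)
  {
    /* Unordered compare: true only when x is a NaN.  */
    return x != x;
  }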

                       
> > Also when inlines give speedup you should also add math inlines for
> > signaling nan case. That gives similar speedup. And it would be natural
> > to ask if you should use these inlines everytime if they are already
> > faster than builtins.
> 
> I'm not sure what you mean here - I enable the new inlines in exactly the
> right case. Improvements to support signalling NaNs or to speedup the 
> built-ins further will be done in GCC.
> 
Why can't we just use
#ifdef __SUPPORT_SNAN__
math inline
#else
builtin
#endif


> > > > So at least on x64 we should publish math_private inlines instead using
> > > > slow builtins.
> > >
> > > Well it was agreed we are going to use the GCC built-ins and then improve
> > > those. If you want to propose additional patches with special inlines for
> > > x64 then please go ahead, but my plan is to improve the builtins.
> > >
> > And how are you sure that its just isolated x64 case. It may also happen
> > on powerpc, arm, sparc and other architectures and you need to test
> > that.
> 
> It's obvious the huge speedup applies to all other architectures as well -
> it's hard to imagine that avoiding a call, a return, a PLT indirection and 
> additional optimization of 3-4 instructions could ever cause a slowdown...
> 
But I didn't ask about that. My point is that you made the bug x64-specific,
but it's not clear at all whether other architectures are affected or not. So
you must test them.

> > So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> > see if this is problem also and arm.
> 
> Here are the results for x64 with inlining disabled (__always_inline changed
> into noinline) and the movq instruction like you suggested:
> 
I asked you to run my benchmark on arm, not your benchmark on x64. With the
modifications you describe, it measures just the performance of a non-inlined
function, which we don't want. There is a performance difference in how gcc
optimizes that, so you must surround the entire expression in a noinline
function. For example, gcc optimizes

# define isinf(x) (noninf(x) ? 0 : (x == 1.0 / 0.0 ? 1 : -1))
if (isinf(x))
  foo()

into

if (!noninf(x))
  foo()


>    "__isnan_t": {
>     "normal": {
>      "duration": 3.52048e+06,
>      "iterations": 500,
>      "mean": 7040
>     }
>    },
>    "__isnan_inl_t": {
>     "normal": {
>      "duration": 3.09247e+06,
>      "iterations": 500,
>      "mean": 6184
>     }
>    },
>    "__isnan_builtin_t": {
>     "normal": {
>      "duration": 2.20378e+06,
>      "iterations": 500,
>      "mean": 4407
>     }
>    },
>    "isnan_t": {
>     "normal": {
>      "duration": 1.50514e+06,
>      "iterations": 500,
>      "mean": 3010
>     }
>    },

why is isnan faster than the builtin?

>    "__isnormal_inl2_t": {
>     "normal": {
>      "duration": 2.18113e+06,
>      "iterations": 500,
>      "mean": 4362
>     }
>    },
>    "__isnormal_builtin_t": {
>     "normal": {
>      "duration": 3.08183e+06,
>      "iterations": 500,
>      "mean": 6163
>     }
>    },

also, why did isnormal2 do so well here?
  
Wilco Dijkstra July 21, 2015, 4:14 p.m. UTC | #6
> Ondřej Bílka wrote:
> On Mon, Jul 20, 2015 at 12:01:50PM +0100, Wilco Dijkstra wrote:
> > > Ondřej Bílka wrote:
> > > On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > > But you claimed following in original mail which is wrong:
> > >
> > > "
> > > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> > > explict
> > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64
> and
> > > 2.8x
> > > on x64. The GCC builtins have better performance than the existing math_private inlines
> for
> > > __isnan,
> > > __finite and __isinf_ns, so these should be removed.
> > > "
> >
> > No that statement is 100% correct.
> >
> As for isinf_ns on some architectures current isinf inline is better so
> it should be replaced by that instead.

The current inline (__isinf_ns) is not better than the builtin.

> Your claim about __finite builtin is definitely false on x64, its
> slower:

No, that's not what I am getting on x64. With the movq instruction
and inlining, as in my original patch:

   "__finite_inl_t": {
    "normal": {
     "duration": 1.59802e+06,
     "iterations": 500,
     "mean": 3196
    }
   },
   "__isfinite_builtin_t": {
    "normal": {
     "duration": 1.48391e+06,
     "iterations": 500,
     "mean": 2967
    }
   },

With movq but inlining disabled:

   "__finite_inl_t": {
    "normal": {
     "duration": 3.09839e+06,
     "iterations": 500,
     "mean": 6196
    }
   },
   "__isfinite_builtin_t": {
    "normal": {
     "duration": 2.6429e+06,
     "iterations": 500,
     "mean": 5285
    }
   },

So the benefit of the builtin increases when we can't lift the immediate.

> > > Also when inlines give speedup you should also add math inlines for
> > > signaling nan case. That gives similar speedup. And it would be natural
> > > to ask if you should use these inlines everytime if they are already
> > > faster than builtins.
> >
> > I'm not sure what you mean here - I enable the new inlines in exactly the
> > right case. Improvements to support signalling NaNs or to speedup the
> > built-ins further will be done in GCC.
> >
> Why we cant just use
> #ifdef __SUPPORT_SNAN__
> math inline
> #else
> builtin
> #endif

That's a lot of work for little gain given that signalling NaNs are not enabled often.

> > It's obvious the huge speedup applies to all other architectures as well -
> > it's hard to imagine that avoiding a call, a return, a PLT indirection and
> > additional optimization of 3-4 instructions could ever cause a slowdown...
> >
> But I didn't asked about that. I asked that you made bug x64 specific
> but its not clear all all if other architectures are affected or not. So
> you must test these.

All architectures are affected as they will get the speedup from the inlining.

> > > So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> > > see if this is problem also and arm.
> >
> > Here are the results for x64 with inlining disabled (__always_inline changed
> > into noinline) and the movq instruction like you suggested:
> >
> I asked to run my benchmark on arm, not your benchmark on x64. As you described
> modifications it does measure just performance of noninline function
> which we don't want. There is difference in performance how gcc
> optimizes that so you must surround entire expression in noinline. For
> example gcc optimizes
> 
> # define isinf(x) (noninf(x) ? (x == 1.0 / 0.0 ? 1 : 0))
> if (isinf(x))
>   foo()
> 
> into
> 
> if (!noninf)
>   foo()

Which is exactly what we want to happen (and why the non-inlined isinf results
are not interesting, as 99.9% of the time we do the optimization).

> >    "__isnan_t": {
> >     "normal": {
> >      "duration": 3.52048e+06,
> >      "iterations": 500,
> >      "mean": 7040
> >     }
> >    },
> >    "__isnan_inl_t": {
> >     "normal": {
> >      "duration": 3.09247e+06,
> >      "iterations": 500,
> >      "mean": 6184
> >     }
> >    },
> >    "__isnan_builtin_t": {
> >     "normal": {
> >      "duration": 2.20378e+06,
> >      "iterations": 500,
> >      "mean": 4407
> >     }
> >    },
> >    "isnan_t": {
> >     "normal": {
> >      "duration": 1.50514e+06,
> >      "iterations": 500,
> >      "mean": 3010
> >     }
> >    },
> 
> why is isnan faster than builtin?

You asked for results with inlining turned off, so basically this shows the difference
between inlining the isnan builtin into a loop and calling a function which has the
isnan builtin inlined.

> >    "__isnormal_inl2_t": {
> >     "normal": {
> >      "duration": 2.18113e+06,
> >      "iterations": 500,
> >      "mean": 4362
> >     }
> >    },
> >    "__isnormal_builtin_t": {
> >     "normal": {
> >      "duration": 3.08183e+06,
> >      "iterations": 500,
> >      "mean": 6163
> >     }
> >    },
> 
> also here why got isnormal2 so good?

isnormal_inl2_t is still inlined of course as it is a define.
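(For reference, the define in question from the benchmark:
#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL).)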

Wilco
  
Ondrej Bilka July 21, 2015, 5:47 p.m. UTC | #7
On Tue, Jul 21, 2015 at 05:14:51PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Mon, Jul 20, 2015 at 12:01:50PM +0100, Wilco Dijkstra wrote:
> > > > Ondřej Bílka wrote:
> > > > On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > > > But you claimed following in original mail which is wrong:
> > > >
> > > > "
> > > > Results shows that using the GCC built-ins in math.h gives huge speedups due to avoiding
> > > > explict
> > > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on AArch64
> > and
> > > > 2.8x
> > > > on x64. The GCC builtins have better performance than the existing math_private inlines
> > for
> > > > __isnan,
> > > > __finite and __isinf_ns, so these should be removed.
> > > > "
> > >
> > > No that statement is 100% correct.
> > >
> > As for isinf_ns on some architectures current isinf inline is better so
> > it should be replaced by that instead.
> 
> The current inline (__isinf_ns) is not better than the builtin.
>
That's correct, as I also said. But it doesn't change my point:
isinf_ns is slower than the builtin, and the builtin is slower on x64 than
the current inline. So we should replace the internal isinf_ns with the
current inline to get better performance.
 
> > Your claim about __finite builtin is definitely false on x64, its
> > slower:
> 
> No that's not what I am getting on x64. With the movq instruction
> and inlining as my original patch:
> 
That's only your patch. It isn't clear at all that this happens in reality;
my benchmark shows that it doesn't.

So, Carlos, we have two benchmarks, each saying that one alternative is
better than the other; how do you resolve that?
 
> > > It's obvious the huge speedup applies to all other architectures as well -
> > > it's hard to imagine that avoiding a call, a return, a PLT indirection and
> > > additional optimization of 3-4 instructions could ever cause a slowdown...
> > >
> > But I didn't asked about that. I asked that you made bug x64 specific
> > but its not clear all all if other architectures are affected or not. So
> > you must test these.
> 
> All architectures are affected as they will get the speedup from the inlining.
>
Again, this is irrelevant. The current inlines could be faster than the
builtins on most architectures, so you should test that and open gcc bugs.
 
> > > > So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> > > > see if this is problem also and arm.
> > >
> > > Here are the results for x64 with inlining disabled (__always_inline changed
> > > into noinline) and the movq instruction like you suggested:
> > >
> > I asked to run my benchmark on arm, not your benchmark on x64. As you described
> > modifications it does measure just performance of noninline function
> > which we don't want. There is difference in performance how gcc
> > optimizes that so you must surround entire expression in noinline. For
> > example gcc optimizes
> > 
> > # define isinf(x) (noninf(x) ? (x == 1.0 / 0.0 ? 1 : 0))
> > if (isinf(x))
> >   foo()
> > 
> > into
> > 
> > if (!noninf)
> >   foo()
> 
> Which is exactly what we want to happen (and why the non-inlined isinf results
> are not interesting as 99.9% of times we do the optimization).
>
That isn't completely correct. While this happens, most callers do the is*
check only once at the start, so loading the constants costs you. So you need
to be careful how you write the benchmark to get a correct result, as I did
in mine.
> > >    "isnan_t": {
> > >     "normal": {
> > >      "duration": 1.50514e+06,
> > >      "iterations": 500,
> > >      "mean": 3010
> > >     }
> > >    },
> > 
> > why is isnan faster than builtin?
> 
> You asked for results with inlining turned off, so basically this shows the difference
> between inlining the isnan builtin into a loop and calling a function which has the
> isnan builtin inlined.
>
I never asked you to do that. I said that you should measure an expression
using isinf surrounded by a noinline function, to avoid reusing the constants
in the loop.

That will give you different results from what you got.
  
Wilco Dijkstra July 21, 2015, 7:09 p.m. UTC | #8
> Ondřej Bílka wrote:
> On Tue, Jul 21, 2015 at 05:14:51PM +0100, Wilco Dijkstra wrote:
> > > Ondřej Bílka wrote:
> > > On Mon, Jul 20, 2015 at 12:01:50PM +0100, Wilco Dijkstra wrote:
> > > > > Ondřej Bílka wrote:
> > > > > On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > > > > But you claimed following in original mail which is wrong:
> > > > >
> > > > > "
> > > > > Results shows that using the GCC built-ins in math.h gives huge speedups due to
> avoiding
> > > > > explict
> > > > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on
> AArch64
> > > and
> > > > > 2.8x
> > > > > on x64. The GCC builtins have better performance than the existing math_private
> inlines
> > > for
> > > > > __isnan,
> > > > > __finite and __isinf_ns, so these should be removed.
> > > > > "
> > > >
> > > > No that statement is 100% correct.
> > > >
> > > As for isinf_ns on some architectures current isinf inline is better so
> > > it should be replaced by that instead.
> >
> > The current inline (__isinf_ns) is not better than the builtin.
> >
> Thats correct as I also said that. But it doesn't change my point.
> isinf_ns is slower than builtin. builtin is on x64 slower that current
> inline. 
>
> So we should replace internal isinf_ns with current inline to
> get better performance.

Sure it is certainly possible to improve upon the built-ins. But that
was never the goal of my patch, so it is not relevant to this discussion.

> > > Your claim about __finite builtin is definitely false on x64, its
> > > slower:
> >
> > No that's not what I am getting on x64. With the movq instruction
> > and inlining as my original patch:
> >
> Thats only your patch. It isn't clear at all if that happens in reality.
> My benchmark shows that it don't.
> 
> So Carlos we have two benchmarks each saying that one alternative is
> better than other, how you solve that?

You'd solve it by showing that a particular inline is faster than the builtin
on some GLIBC math functions and proposing a patch.

> > > > It's obvious the huge speedup applies to all other architectures as well -
> > > > it's hard to imagine that avoiding a call, a return, a PLT indirection and
> > > > additional optimization of 3-4 instructions could ever cause a slowdown...
> > > >
> > > But I didn't asked about that. I asked that you made bug x64 specific
> > > but its not clear all all if other architectures are affected or not. So
> > > you must test these.
> >
> > All architectures are affected as they will get the speedup from the inlining.
> >
> Again this is irrelevant. Current inlines could be faster than builtins
> on most architectures so you should test that and open gcc bugs.

Since the patch has been out for review, people have been able to test it for 
their architecture. Anyone can suggest further improvements if they want even more 
performance. There is already a GCC bug to improve the built-ins.

> > > > > So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> > > > > see if this is problem also and arm.
> > > >
> > > > Here are the results for x64 with inlining disabled (__always_inline changed
> > > > into noinline) and the movq instruction like you suggested:
> > > >
> > > I asked to run my benchmark on arm, not your benchmark on x64. As you described
> > > modifications it does measure just performance of noninline function
> > > which we don't want. There is difference in performance how gcc
> > > optimizes that so you must surround entire expression in noinline. For
> > > example gcc optimizes
> > >
> > > # define isinf(x) (noninf(x) ? (x == 1.0 / 0.0 ? 1 : 0))
> > > if (isinf(x))
> > >   foo()
> > >
> > > into
> > >
> > > if (!noninf)
> > >   foo()
> >
> > Which is exactly what we want to happen (and why the non-inlined isinf results
> > are not interesting as 99.9% of times we do the optimization).
> >
> That isn't completely correct. While this happens most callers do is*
> check only once at start so loading constants costs you. So you need to
> be careful how you write benchmark to get correct result like I did in
> mine.

Actually almost all uses within GLIBC use the macros multiple times within the 
same function, so the immediates can be shared (sometimes even between 2 different
macros, and any fabs or FP->int moves can be shared too).
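
For illustration, a sketch of the common pattern (handle_nan and handle_inf
are hypothetical helpers): with inline definitions both checks reduce to
integer compares against the same 0x7ff0000000000000ull bound, so after
inlining the FP->int move and the immediate can be materialized once and
reused.

  if (isnan (x))
    return handle_nan (x);
  if (isinf (x))
    return handle_inf (x);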

> > > >    "isnan_t": {
> > > >     "normal": {
> > > >      "duration": 1.50514e+06,
> > > >      "iterations": 500,
> > > >      "mean": 3010
> > > >     }
> > > >    },
> > >
> > > why is isnan faster than builtin?
> >
> > You asked for results with inlining turned off, so basically this shows the difference
> > between inlining the isnan builtin into a loop and calling a function which has the
> > isnan builtin inlined.
> >
> I never asked to do that. I said that you should measure expression
> using isinf surrounded in noinline function to avoid reusing constants
> in loop.
>
> That will give you different results than you get.

No, that wouldn't make any difference - all that matters for that experiment
was encapsulating the immediate in a separate function; you'll get one call
every iteration either way.

Wilco
  
Ondrej Bilka July 23, 2015, 3:39 p.m. UTC | #9
On Tue, Jul 21, 2015 at 08:09:35PM +0100, Wilco Dijkstra wrote:
> > Ondřej Bílka wrote:
> > On Tue, Jul 21, 2015 at 05:14:51PM +0100, Wilco Dijkstra wrote:
> > > > Ondřej Bílka wrote:
> > > > On Mon, Jul 20, 2015 at 12:01:50PM +0100, Wilco Dijkstra wrote:
> > > > > > Ondřej Bílka wrote:
> > > > > > On Fri, Jul 17, 2015 at 02:26:53PM +0100, Wilco Dijkstra wrote:
> > > > > > But you claimed following in original mail which is wrong:
> > > > > >
> > > > > > "
> > > > > > Results shows that using the GCC built-ins in math.h gives huge speedups due to
> > avoiding
> > > > > > explict
> > > > > > calls, PLT indirection to execute a function with 3-4 instructions - around 7x on
> > AArch64
> > > > and
> > > > > > 2.8x
> > > > > > on x64. The GCC builtins have better performance than the existing math_private
> > inlines
> > > > for
> > > > > > __isnan,
> > > > > > __finite and __isinf_ns, so these should be removed.
> > > > > > "
> > > > >
> > > > > No that statement is 100% correct.
> > > > >
> > > > As for isinf_ns on some architectures current isinf inline is better so
> > > > it should be replaced by that instead.
> > >
> > > The current inline (__isinf_ns) is not better than the builtin.
> > >
> > Thats correct as I also said that. But it doesn't change my point.
> > isinf_ns is slower than builtin. builtin is on x64 slower that current
> > inline. 
> >
> > So we should replace internal isinf_ns with current inline to
> > get better performance.
> 
> Sure it is certainly possible to improve upon the built-ins. But that
> was never the goal of my patch, so it is not relevant to this discussion.
>
As the current math_private inlines are better inside libc, there is no
reason to use the builtins internally. Until you convince us that the
builtins result in faster code, they shouldn't be used internally.
 
> > > > Your claim about __finite builtin is definitely false on x64, its
> > > > slower:
> > >
> > > No that's not what I am getting on x64. With the movq instruction
> > > and inlining as my original patch:
> > >
> > Thats only your patch. It isn't clear at all if that happens in reality.
> > My benchmark shows that it don't.
> > 
> > So Carlos we have two benchmarks each saying that one alternative is
> > better than other, how you solve that?
> 
> You'd solve it by showing a particular inline is faster than the builtin on
> some GLIBC math functions and propose a patch.
> 
I told you at the start of the thread that you should convince us by testing,
for example, the ctan function. Remember that it's your responsibility to
show that the change leads to a better result.

> > > > > It's obvious the huge speedup applies to all other architectures as well -
> > > > > it's hard to imagine that avoiding a call, a return, a PLT indirection and
> > > > > additional optimization of 3-4 instructions could ever cause a slowdown...
> > > > >
> > > > But I didn't asked about that. I asked that you made bug x64 specific
> > > > but its not clear all all if other architectures are affected or not. So
> > > > you must test these.
> > >
> > > All architectures are affected as they will get the speedup from the inlining.
> > >
> > Again this is irrelevant. Current inlines could be faster than builtins
> > on most architectures so you should test that and open gcc bugs.
> 
> Since the patch has been out for review, people have been able to test it for 
> their architecture. Anyone can suggest further improvements if they want even more 
> performance. There is already a GCC bug to improve the built-ins.
>
So do it. I ask you again to run my benchmark on arm to see whether the
builtins could be improved or not.
 
> > > > > > So I ask you again to run my benchmark with changed EXTRACT_WORDS64 to
> > > > > > see if this is problem also and arm.
> > > > >
> > > > > Here are the results for x64 with inlining disabled (__always_inline changed
> > > > > into noinline) and the movq instruction like you suggested:
> > > > >
> > > > I asked to run my benchmark on arm, not your benchmark on x64. As you described
> > > > modifications it does measure just performance of noninline function
> > > > which we don't want. There is difference in performance how gcc
> > > > optimizes that so you must surround entire expression in noinline. For
> > > > example gcc optimizes
> > > >
> > > > # define isinf(x) (noninf(x) ? (x == 1.0 / 0.0 ? 1 : 0))
> > > > if (isinf(x))
> > > >   foo()
> > > >
> > > > into
> > > >
> > > > if (!noninf)
> > > >   foo()
> > >
> > > Which is exactly what we want to happen (and why the non-inlined isinf results
> > > are not interesting as 99.9% of times we do the optimization).
> > >
> > That isn't completely correct. While this happens most callers do is*
> > check only once at start so loading constants costs you. So you need to
> > be careful how you write benchmark to get correct result like I did in
> > mine.
> 
> Actually almost all uses within GLIBC use the macros multiple times within the 
> same function, so the immediates can be shared (sometimes even between 2 different
> macros, and any fabs or FP->int moves can be shared too).
>
There is a difference between using a macro several times, where the overhead
is smaller, and using it in a loop, where there is no overhead.

As for sharing, it's complex; I don't know anything better than selecting the
float/int implementation on a case-by-case basis, as the conditions for what
is faster are complex.

A basic example where the integer implementation could be faster, even if the
floating-point one is faster in the general case, is

if (isinf(x))
  foo();
else if (isnan(x))
  bar();
else
  baz();

For isinf you use the check (2 * to_int (x) == c) and for isnan
(2 * to_int (x) > c); gcc then merges these into one comparison instruction,
2 * to_int (x) >= c, as sketched below.
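
A minimal sketch of the fused check (to_int stands for the EXTRACT_WORDS64
result here, and foo/bar/baz are the handlers from the example above):

  uint64_t t = 2 * to_int (x);          /* doubling drops the sign bit */
  if (t >= 2 * 0x7ff0000000000000ull)   /* one compare covers inf and nan */
    {
      if (t == 2 * 0x7ff0000000000000ull)
        foo ();                         /* inf */
      else
        bar ();                         /* nan */
    }
  else
    baz ();                             /* finite */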


 
> > > > >    "isnan_t": {
> > > > >     "normal": {
> > > > >      "duration": 1.50514e+06,
> > > > >      "iterations": 500,
> > > > >      "mean": 3010
> > > > >     }
> > > > >    },
> > > >
> > > > why is isnan faster than builtin?
> > >
> > > You asked for results with inlining turned off, so basically this shows the difference
> > > between inlining the isnan builtin into a loop and calling a function which has the
> > > isnan builtin inlined.
> > >
> > I never asked to do that. I said that you should measure expression
> > using isinf surrounded in noinline function to avoid reusing constants
> > in loop.
> >
> > That will give you different results than you get.
> 
> No that wouldn't make any difference - all that matters for that experiment was 
> encapsulating the immediate in a separate function, you'll get one call every 
> iteration either way.
> 
That's completely false: my benchmark shows that a different implementation
is faster than yours when I use noinline with specific expressions. For
example, your results could be invalid if gcc decides to turn all the
implementations into branchless code, which completely changes the
performance characteristics.
  
Carlos O'Donell July 23, 2015, 6:04 p.m. UTC | #10
On 07/23/2015 11:39 AM, Ondřej Bílka wrote:
>> No that wouldn't make any difference - all that matters for that experiment was 
>> encapsulating the immediate in a separate function, you'll get one call every 
>> iteration either way.
>>
> Thats completely false as my implementation shows that different
> implementation is faster than yours when I use noinline with specific
> expressions. For example these could be invalid as gcc decides to turn
> all implementations into branchless code which completely changes
> performance characteristic.

When a conversation gets to a sufficient length it is beneficial for one party
in the conversation to summarize the current position without responding point
by point through a long thread.

It would be immensely beneficial to additional reviewers to start the thread
again from the v3 version of the patch, and instead of responding point by point,
respond at a high level with a summary of what has been agreed upon and what
issues are still outstanding.

With a sufficiently complex topic we are going to need to learn how to have
proper discussions that gravitate towards consensus rather than spiral apart
towards long threads. That is to say that the emails should get shorter and
shorter as fewer and fewer topics are left to clarify and discuss after
consensus is reached one-by-one on each point.

Cheers,
Carlos.
  

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8e615e5..51de91e 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -36,6 +36,7 @@  string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
 		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		strcoll
+
 string-bench-all := $(string-bench)
 
 # We have to generate locales
@@ -50,7 +51,9 @@  stdlib-bench := strtod
 
 stdio-common-bench := sprintf
 
-benchset := $(string-bench-all) $(stdlib-bench) $(stdio-common-bench)
+math-benchset := math-inlines
+
+benchset := $(math-benchset) $(string-bench-all) $(stdlib-bench) $(stdio-common-bench) 
 
 CFLAGS-bench-ffs.c += -fno-builtin
 CFLAGS-bench-ffsll.c += -fno-builtin
@@ -58,6 +61,7 @@  CFLAGS-bench-ffsll.c += -fno-builtin
 bench-malloc := malloc-thread
 
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
+$(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
 $(objpfx)bench-malloc-thread: $(shared-thread-library)
 
diff --git a/benchtests/bench-math-inlines.c b/benchtests/bench-math-inlines.c
new file mode 100644
index 0000000..222fdc3
--- /dev/null
+++ b/benchtests/bench-math-inlines.c
@@ -0,0 +1,461 @@ 
+/* Measure math inline functions.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define SIZE 1024
+#define TEST_MAIN
+#define TEST_NAME "math-inlines"
+#define TEST_FUNCTION test_main ()
+#include "bench-timing.h"
+#include "json-lib.h"
+
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+
+#define BOOLTEST(func)					  \
+int __attribute__((noinline)) func## _t_t (double d)	  \
+{							  \
+  if (func(d))						  \
+    return 3 * sin (d);					  \
+  else							  \
+    return 2;						  \
+}							  \
+int							  \
+func ## _t (volatile double *p, size_t n, size_t iters)   \
+{							  \
+  int i, j;						  \
+  int res = 0;						  \
+  for (j = 0; j < iters; j++)				  \
+    for (i = 0; i < n; i++)				  \
+      res += func## _t_t (2.0 * p[i]); 			  \
+  return res;						  \
+}
+
+#define VALUETEST(func)					  \
+int							  \
+func ## _t (volatile double *p, size_t n, size_t iters)	  \
+{							  \
+  int i, j;						  \
+  int res = 0;						  \
+  for (j = 0; j < iters; j++)				  \
+    for (i = 0; i < n; i++)				  \
+      { double tmp = p[i] * 2.0;			  \
+	if (func (tmp)) res += 5; }			  \
+  return res;						  \
+}
+
+typedef union
+{
+  double value;
+  uint64_t word;
+} ieee_double_shape_type;
+
+
+
+#define EXTRACT_WORDS64(i, d)                                                 \
+  do {                                                                        \
+    int64_t i_;                                                               \
+    asm ("movq %1, %0" : "=rm" (i_) : "x" ((double) (d)));                   \
+    (i) = i_;                                                                 \
+  } while (0)
+
+extern __always_inline int
+isinf_new (double dx)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  if (__builtin_expect ((di << 12) != 0, 1))
+    return 0;                                
+
+  return (int) (di >> 32);
+}
+
+extern __always_inline int
+isinf_new2 (double dx)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  if (__builtin_expect(((unsigned int)(di >> 31)) != 0xffe00000ull,1))
+    return 0;
+  if ((int)di)
+    return 0;
+
+  return (int) (di >> 32);
+}
+
+
+
+
+extern __always_inline int
+finite_new (double dx)
+{ 
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  return (2 * di) < 2 * 0x7ff0000000000000ull;
+}
+
+extern __always_inline int
+finite_new2 (double dx)
+{ 
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  return ((unsigned int)(di >> 31)) < 2 * 0x7ff00000ull;
+}
+
+
+
+extern __always_inline int
+isnan_new (double dx)
+{ 
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+  
+  return (2 * di) > 2 * 0x7ff0000000000000ull;
+}
+
+
+
+extern __always_inline int
+isnormal_new (double dx)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  return (((2 * di) >> 53) - 1 < 2046);
+}
+
+extern __always_inline int
+isnormal_new2 (double dx)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, dx);
+
+  return (2 * di - (1UL << 53) < (2046UL << 53));
+}
+
+/* Explicit inlines similar to math_private.h versions.  */
+
+extern __always_inline int
+__isnan_inl (double d)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, d);
+  return (di & 0x7fffffffffffffffull) > 0x7ff0000000000000ull;
+}
+
+extern __always_inline int
+__isnan_builtin (double d)
+{
+  return __builtin_isnan (d);
+}
+
+extern __always_inline int
+__isinf_inl (double x)
+{
+  uint64_t ix;
+  EXTRACT_WORDS64 (ix,x);
+  if ((ix << 1) != 0xffe0000000000000ull)
+    return 0;
+  return (int)(ix >> 32);
+}
+
+extern __always_inline int
+__isinf_ns (double d)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, d);
+  return (di & 0x7fffffffffffffffull) == 0x7ff0000000000000ull;
+}
+
+extern __always_inline int
+__isinf_ns_builtin (double d)
+{
+  return __builtin_isinf (d);
+}
+
+extern __always_inline int
+__isinf_builtin (double d)
+{
+  return __builtin_isinf_sign (d);
+}
+
+
+extern __always_inline int
+__finite_inl (double d)
+{
+  uint64_t di;
+  EXTRACT_WORDS64 (di, d);
+  return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull;
+}
+
+extern __always_inline int
+__isfinite_builtin (double d)
+{
+  return __builtin_isfinite (d);
+}
+
+
+/* Explicit inline similar to existing math.h implementation.  */
+
+#define __isnormal_inl(X) (__fpclassify (X) == FP_NORMAL)
+#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL)
+
+extern __always_inline int
+__isnormal_builtin (double d)
+{
+  return __builtin_isnormal (d);
+}
+
+/* Test fpclassify with use of only 2 of the 5 results.  */
+
+extern __always_inline int
+__fpclassify_test1 (double d)
+{
+  int cl = fpclassify (d);
+  return cl == FP_NAN || cl == FP_INFINITE;
+}
+
+extern __always_inline int
+__fpclassify_test2 (double d)
+{
+  return __builtin_isnan (d) || __builtin_isinf (d);
+}
+
+double __attribute ((noinline))
+kernel_standard (double x, double y, int z)
+{
+  return x * y + z;
+}
+
+double __attribute ((noinline))
+remainder2 (double x, double y)
+{
+  if (((__builtin_expect (y == 0.0, 0) && !__builtin_isnan (x))
+	|| (__builtin_expect (__builtin_isinf (x), 0) && !__builtin_isnan (y))))
+    return kernel_standard (x, y, 10);
+
+  return remainder (x, y);
+}
+
+double __attribute ((noinline))
+remainder1 (double x, double y)
+{
+  if (((__builtin_expect (y == 0.0, 0) && !__isnan_inl (x))
+       || (__builtin_expect (__isinf_inl (x), 0) && !__isnan_inl (y))))
+    return kernel_standard (x, y, 10);
+
+  return remainder (x, y);
+}
+
+volatile double rem1 = 2.5;
+
+extern __always_inline int
+remainder_test1 (double d)
+{
+  return remainder1 (d, rem1);
+}
+
+extern __always_inline int
+remainder_test2 (double d)
+{
+  return remainder2 (d, rem1);
+}
+
+/* Create test functions for each possibility.  */
+
+BOOLTEST (__isnan)
+BOOLTEST (__isnan_inl)
+BOOLTEST (__isnan_builtin)
+BOOLTEST (isnan)
+BOOLTEST (isnan_new)
+
+BOOLTEST (__isinf)
+BOOLTEST (__isinf_inl)
+BOOLTEST (__isinf_ns)
+BOOLTEST (__isinf_ns_builtin)
+BOOLTEST (__isinf_builtin)
+BOOLTEST (isinf)
+BOOLTEST (isinf_new)
+BOOLTEST (isinf_new2)
+
+
+BOOLTEST (__finite)
+BOOLTEST (__finite_inl)
+BOOLTEST (__isfinite_builtin)
+BOOLTEST (isfinite)
+BOOLTEST (finite_new)
+BOOLTEST (finite_new2)
+
+
+BOOLTEST (__isnormal_inl)
+BOOLTEST (__isnormal_inl2)
+BOOLTEST (__isnormal_builtin)
+BOOLTEST (isnormal)
+BOOLTEST (isnormal_new)
+BOOLTEST (isnormal_new2)
+
+
+
+BOOLTEST (__fpclassify_test1)
+BOOLTEST (__fpclassify_test2)
+VALUETEST (__fpclassify)
+VALUETEST (fpclassify)
+
+BOOLTEST (remainder_test1)
+BOOLTEST (remainder_test2)
+
+typedef int (*proto_t) (volatile double *p, size_t n, size_t iters);
+
+typedef struct
+{
+  const char *name;
+  proto_t fn;
+} impl_t;
+
+#define IMPL(name) { #name, name }
+
+impl_t test_list[] =
+{
+  IMPL (__isnan_t),
+  IMPL (__isnan_inl_t),
+  IMPL (__isnan_builtin_t),
+  IMPL (isnan_t),
+  IMPL (isnan_new_t),
+
+
+  IMPL (__isinf_t),
+  IMPL (__isinf_inl_t),
+  IMPL (__isinf_ns_t),
+  IMPL (__isinf_ns_builtin_t),
+  IMPL (__isinf_builtin_t),
+  IMPL (isinf_t),
+  IMPL (isinf_new_t),
+  IMPL (isinf_new2_t),
+
+
+  IMPL (__finite_t),
+  IMPL (__finite_inl_t),
+  IMPL (__isfinite_builtin_t),
+  IMPL (isfinite_t),
+  IMPL (finite_new_t),
+  IMPL (finite_new2_t),
+
+
+  IMPL (__isnormal_inl_t),
+  IMPL (__isnormal_inl2_t),
+  IMPL (__isnormal_builtin_t),
+  IMPL (isnormal_t),
+  IMPL (isnormal_new_t),
+  IMPL (isnormal_new2_t),
+
+
+  IMPL (__fpclassify_test1_t),
+  IMPL (__fpclassify_test2_t),
+  IMPL (__fpclassify_t),
+  IMPL (fpclassify_t),
+
+  IMPL (remainder_test1_t),
+  IMPL (remainder_test2_t)
+};
+
+static void
+do_one_test (json_ctx_t *json_ctx, proto_t test_fn, volatile double *arr,
+	     size_t len, const char *testname)
+{
+  size_t iters = 500;
+  timing_t start, stop, cur;
+
+  json_attr_object_begin (json_ctx, testname);
+
+  TIMING_NOW (start);
+  test_fn (arr, len, iters);
+  TIMING_NOW (stop);
+  TIMING_DIFF (cur, start, stop);
+
+  json_attr_double (json_ctx, "duration", cur);
+  json_attr_double (json_ctx, "iterations", iters);
+  json_attr_double (json_ctx, "mean", cur / iters);
+  json_attr_object_end (json_ctx);
+}
+
+volatile unsigned int dontoptimize = 0;
+
+void
+startup (void)
+{
+  /* This loop should cause the CPU to switch to maximal frequency.
+     This makes subsequent measurements more accurate.  We need a side effect
+     to prevent the loop from being deleted by the compiler.
+     This should be enough to make the CPU speed up, and it is simpler than
+     running the loop for a constant time.  This is used when the user does
+     not have root access to set a constant frequency.  */
+  for (int k = 0; k < 100000000; k++)
+    dontoptimize += 23 * dontoptimize + 2;
+}
+
+static volatile double arr1[SIZE];
+static volatile double arr2[SIZE];
+
+int
+test_main (void)
+{
+  json_ctx_t json_ctx;
+  size_t i;
+
+  startup ();
+
+  json_init (&json_ctx, 2, stdout);
+  json_attr_object_begin (&json_ctx, "math-inlines");
+
+  /* Create 2 test arrays, one with 10% zeroes, 10% negative values,
+     79% positive values and 1% infinity/NaN.  The other contains
+     50% inf, 50% NaN.  */
+
+  for (i = 0; i < SIZE; i++)
+    {
+      int x = rand () & 255;
+      arr1[i] = (x < 25) ? 0.1 : ((x < 50) ? -1.1 : 100);
+      if (x == 255) arr1[i] = __builtin_inf ();
+      if (x == 254) arr1[i] = __builtin_nan ("0");
+      arr2[i] = (x < 128) ? __builtin_inf () : __builtin_nan ("0");
+    }
+
+  for (i = 0; i < sizeof (test_list) / sizeof (test_list[0]); i++)
+    {
+      json_attr_object_begin (&json_ctx, test_list[i].name);
+      do_one_test (&json_ctx, test_list[i].fn, arr2, SIZE, "inf/nan");
+      json_attr_object_end (&json_ctx);
+    }
+
+  for (i = 0; i < sizeof (test_list) / sizeof (test_list[0]); i++)
+    {
+      json_attr_object_begin (&json_ctx, test_list[i].name);
+      do_one_test (&json_ctx, test_list[i].fn, arr1, SIZE, "normal");
+      json_attr_object_end (&json_ctx);
+    }
+
+  json_attr_object_end (&json_ctx);
+  return 0;
+}
+
+#include "../test-skeleton.c"