x86-64: Optimize e_expf with FMA [BZ #21912]

Message ID 20170816133450.GA4074@gmail.com
State New, archived

Commit Message

Lu, Hongjiu Aug. 16, 2017, 1:34 p.m. UTC
  FMA optimized e_expf improves performance by more than 50% on Skylake.

Any comments?

H.J.
	[BZ #21912]
	* sysdeps/x86_64/fpu/multiarch/Makefile (libm-sysdep_routines):
	Add e_expf-fma.
	* sysdeps/x86_64/fpu/multiarch/e_expf-fma.S: New file.
	* sysdeps/x86_64/fpu/multiarch/e_expf.c: Likewise.
	* sysdeps/x86_64/fpu/multiarch/ifunc-fma.h: Likewise.
---
 sysdeps/x86_64/fpu/multiarch/Makefile     |   2 +
 sysdeps/x86_64/fpu/multiarch/e_expf-fma.S | 182 ++++++++++++++++++++++++++++++
 sysdeps/x86_64/fpu/multiarch/e_expf.c     |  26 +++++
 sysdeps/x86_64/fpu/multiarch/ifunc-fma.h  |  34 ++++++
 4 files changed, 244 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
  

Comments

Carlos O'Donell Aug. 16, 2017, 2:04 p.m. UTC | #1
On 08/16/2017 09:34 AM, H.J. Lu wrote:
> FMA optimized e_expf improves performance by more than 50% on Skylake.
> 
> Any comments?

Exactly how much of e_expf-fma.S do you need to achieve that 50% speedup?

How does this algorithm compare to what is already implemented for e_expf?

My questions are basically leading to this:

(a) Can we write a generic e_expf in C with the special cases written
    in C, and...

(b) Is there a core kernel computation that we can then implement in assembly?

> H.J.
> 	[BZ #21912]
> 	* sysdeps/x86_64/fpu/multiarch/Makefile (libm-sysdep_routines):
> 	Add e_expf-fma.
> 	* sysdeps/x86_64/fpu/multiarch/e_expf-fma.S: New file.
> 	* sysdeps/x86_64/fpu/multiarch/e_expf.c: Likewise.
> 	* sysdeps/x86_64/fpu/multiarch/ifunc-fma.h: Likewise.
> ---
>  sysdeps/x86_64/fpu/multiarch/Makefile     |   2 +
>  sysdeps/x86_64/fpu/multiarch/e_expf-fma.S | 182 ++++++++++++++++++++++++++++++
>  sysdeps/x86_64/fpu/multiarch/e_expf.c     |  26 +++++
>  sysdeps/x86_64/fpu/multiarch/ifunc-fma.h  |  34 ++++++
>  4 files changed, 244 insertions(+)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
> 
> diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
> index 9daf2cf205..20dd44a7ff 100644
> --- a/sysdeps/x86_64/fpu/multiarch/Makefile
> +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
> @@ -35,6 +35,8 @@ CFLAGS-slowpow-fma.c = -mfma -mavx2
>  CFLAGS-s_sin-fma.c = -mfma -mavx2
>  CFLAGS-s_tan-fma.c = -mfma -mavx2
>  
> +libm-sysdep_routines += e_expf-fma
> +

Add a comment explaining why this is a distinct line from the ones below.

>  libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
>  			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
>  			mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \
> diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
> new file mode 100644
> index 0000000000..e081186667
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
> @@ -0,0 +1,182 @@
> +/* FMA/AVX2 version of IEEE 754 expf.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Short algorithm description:
> + *
> + *  Let K = 64 (table size).
> + *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
> + *  where
> + *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
> + *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
> + *       values of 2^(j/K) are tabulated as T[j].
> + *
> + *       P(y) is a minimax polynomial approximation of expf(x)-1
> + *       on small interval [0.0..log(2)/K].
> + *
> + *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
> + *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
> + *
> + * Special cases:
> + *  expf(NaN) = NaN
> + *  expf(+INF) = +INF
> + *  expf(-INF) = 0
> + *  expf(x) = 1 for subnormals
> + *  for finite argument, only expf(0)=1 is exact
> + *  expf(x) overflows if x>88.7228317260742190
> + *  expf(x) underflows if x<-103.972076416015620
> + */

Use GNU-style comments please.

> +
> +	.section .text.fma,"ax",@progbits
> +ENTRY(__ieee754_expf_fma)
> +	/* Input: single precision x in %xmm0 */
> +	vcvtss2sd %xmm0, %xmm0, %xmm1	/* Convert x to double precision */
> +	vmovd	%xmm0, %ecx		/* Copy x */
> +	vmovsd	L(DP_KLN2)(%rip), %xmm2	/* DP K/log(2) */
> +	vfmadd213sd L(DP_RD)(%rip), %xmm1, %xmm2 /* DP x*K/log(2)+RD */
> +	vmovsd	L(DP_P2)(%rip), %xmm3	/* DP P2 */
> +	movl	%ecx, %eax		/* x */
> +	andl	$0x7fffffff, %ecx	/* |x| */
> +	lea	L(DP_T)(%rip), %rsi	/* address of table T[j] */
> +	vmovsd	L(DP_P3)(%rip), %xmm4	/* DP P3 */
> +
> +	cmpl	$0x42ad496b, %ecx	/* |x|<125*log(2) ? */
> +	jae	L(special_paths_fma)
> +
> +	/* Here if |x|<125*log(2) */
> +	cmpl	$0x31800000, %ecx	/* |x|<2^(-28) ? */
> +	jb	L(small_arg_fma)
> +
> +	/* Main path: here if 2^(-28)<=|x|<125*log(2) */
> +						/* %xmm2 = SP x*K/log(2)+RS */
> +	vmovd	  %xmm2, %eax
> +	vsubsd	  L(DP_RD)(%rip), %xmm2, %xmm2 	/* DP t=round(x*K/log(2)) */
> +	movl	  %eax, %edx			/* n*K+j with trash */
> +	andl	  $0x3f, %eax			/* bits of j */
> +	vmovsd	  (%rsi,%rax,8), %xmm5		/* T[j] */
> +	andl	  $0xffffffc0, %edx		/* bits of n */
> +
> +	vfmadd132sd  L(DP_NLN2K)(%rip), %xmm1, %xmm2 /*  DP y=x-t*log(2)/K */
> +	vmulsd	    %xmm2, %xmm2, %xmm6		/* DP z=y*y */
> +
> +
> +	vfmadd213sd L(DP_P1)(%rip), %xmm6, %xmm4 /* DP P3*z + P1 */
> +	vfmadd213sd L(DP_P0)(%rip), %xmm6, %xmm3 /* DP P2*z+P0 */
> +
> +	addl	    $0x1fc0, %edx		/* bits of n + SP exponent bias */
> +	shll	    $17, %edx			/* SP 2^n */
> +	vmovd       %edx, %xmm1			/* SP 2^n */
> +
> +	vmulsd      %xmm6, %xmm4, %xmm4		/* DP (P3*z+P1)*z */
> +
> +	vfmadd213sd %xmm4, %xmm3, %xmm2		/* DP P(Y)  (P2*z+P0)*y */
> +	vfmadd213sd %xmm5, %xmm5, %xmm2		/* DP T[j]*(P(y)+1) */
> +	vcvtsd2ss   %xmm2, %xmm2, %xmm0		/* SP T[j]*(P(y)+1) */
> +	vmulss	    %xmm1, %xmm0, %xmm0		/* SP result=2^n*(T[j]*(P(y)+1)) */
> +	ret
> +
> +	.p2align	4
> +L(small_arg_fma):
> +	/* Here if 0<=|x|<2^(-28) */
> +	vaddss	L(SP_ONE)(%rip), %xmm0, %xmm0	/* 1.0 + x */
> +	/* Return 1.0 with inexact raised, except for x==0 */
> +	ret
> +
> +	.p2align	4
> +L(special_paths_fma):
> +	/* Here if 125*log(2)<=|x| */
> +	shrl	$31, %eax		/* Get sign bit of x, and depending on it: */
> +	lea	L(SP_RANGE)(%rip), %rdx	/* load over/underflow bound */
> +	cmpl	(%rdx,%rax,4), %ecx	/* |x|<under/overflow bound ? */
> +	jbe	L(near_under_or_overflow_fma)
> +
> +	/* Here if |x|>under/overflow bound */
> +	cmpl	$0x7f800000, %ecx	/* |x| is finite ? */
> +	jae	L(arg_inf_or_nan_fma)
> +
> +	/* Here if |x|>under/overflow bound, and x is finite */
> +	testl	%eax, %eax		/* sign of x nonzero ? */
> +	je	L(res_overflow_fma)
> +
> +	/* Here if -inf<x<underflow bound (x<0) */
> +	vmovss	L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
> +	vmulss	%xmm0, %xmm0, %xmm0	/* Return underflowed result (zero or subnormal) */
> +	ret
> +
> +	.p2align	4
> +L(res_overflow_fma):
> +	/* Here if overflow bound<x<inf (x>0) */
> +	vmovss	L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
> +	vmulss	%xmm0, %xmm0, %xmm0	/* Return overflowed result (Inf or max normal) */
> +	ret
> +
> +	.p2align	4
> +L(arg_inf_or_nan_fma):
> +	/* Here if |x| is Inf or NAN */
> +	jne	L(arg_nan_fma)	/* |x| is Inf ? */
> +
> +	/* Here if |x| is Inf */
> +	lea	L(SP_INF_0)(%rip), %rdx	/* depending on sign of x: */
> +	vmovss	(%rdx,%rax,4), %xmm0	/* return zero or Inf */
> +	ret
> +
> +	.p2align	4
> +L(arg_nan_fma):
> +	/* Here if |x| is NaN */
> +	vaddss	%xmm0, %xmm0, %xmm0	/* Return x+x (raise invalid) */
> +	ret
> +
> +	.p2align	4
> +L(near_under_or_overflow_fma):
> +	/* Here if 125*log(2)<=|x|<under/overflow bound */
> +	vmovd	%xmm2, %eax		/* bits of n*K+j with trash */
> +	vsubsd	L(DP_RD)(%rip), %xmm2, %xmm2 	/* DP t=round(x*K/log(2)) */
> +	movl	%eax, %edx		/* n*K+j with trash */
> +	andl	$0x3f, %eax		/* bits of j */
> +	vmulsd	L(DP_NLN2K)(%rip),%xmm2, %xmm2/* DP -t*log(2)/K */
> +	andl	$0xffffffc0, %edx	/* bits of n */
> +	vaddsd	%xmm1, %xmm2, %xmm0	/* DP y=x-t*log(2)/K */
> +	vmulsd	%xmm0, %xmm0, %xmm2	/* DP z=y*y */
> +	addl	$0xffc0, %edx		/* bits of n + DP exponent bias */
> +	vfmadd213sd L(DP_P0)(%rip), %xmm2, %xmm3/* DP P2*z+P0 */
> +	shlq	$46, %rdx		/* DP 2^n */
> +	vfmadd213sd L(DP_P1)(%rip), %xmm2, %xmm4/* DP P3*z+P1 */
> +	vmovq	%rdx, %xmm1		/* DP 2^n */
> +	vmulsd	%xmm2, %xmm4, %xmm4	/* DP (P3*z+P1)*z */
> +	vfmadd213sd %xmm4, %xmm3, %xmm0	/* DP (P2*z+P0)*y */
> +	vmovsd	(%rsi,%rax,8), %xmm2
> +	vfmadd213sd %xmm2, %xmm2, %xmm0 /* DP T[j]*(P(y)+1) */
> +	vmulsd	%xmm1, %xmm0, %xmm0	/* DP result=2^n*(T[j]*(P(y)+1)) */
> +	vcvtsd2ss %xmm0, %xmm0, %xmm0	/* convert result to single precision */
> +	ret
> +END(__ieee754_expf_fma)
> +
> +	.section .rodata.cst8,"aM",@progbits,8
> +	.p2align 3
> +L(DP_RD): /* double precision 2^52+2^51 */
> +	.long	0x00000000, 0x43380000
> +	.type L(DP_RD), @object
> +	ASM_SIZE_DIRECTIVE(L(DP_RD))
> +
> +#define __ieee754_expf __ieee754_expf_sse2
> +
> +#undef strong_alias
> +#define strong_alias(ignored1, ignored2)
> +
> +#include <sysdeps/x86_64/fpu/e_expf.S>
> diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf.c b/sysdeps/x86_64/fpu/multiarch/e_expf.c
> new file mode 100644
> index 0000000000..864d5f9b21
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/e_expf.c
> @@ -0,0 +1,26 @@
> +/* Multiple versions of IEEE 754 expf.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +extern double __redirect_ieee754_expf (double);
> +
> +#define SYMBOL_NAME ieee754_expf
> +#include "ifunc-fma.h"
> +
> +libc_ifunc_redirected (__redirect_ieee754_expf, __ieee754_expf,
> +		       IFUNC_SELECTOR ());
> +strong_alias (__ieee754_expf, __expf_finite)

OK.

> diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
> new file mode 100644
> index 0000000000..383c41ffb1
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
> @@ -0,0 +1,34 @@
> +/* Common definition for ifunc selections optimized with AVX2/FMA.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable)
> +      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
> +    return OPTIMIZE (fma);
> +
> +  return OPTIMIZE (sse2);
> +}

OK.
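
To make (a) concrete: the algorithm in the quoted header comment maps to C
fairly directly. Here is a rough sketch of just the main path, with
hypothetical externs standing in for the table and polynomial constants that
live in e_expf.S (special cases and the by-hand 2^n construction are omitted;
round-to-nearest mode is assumed):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for the data in e_expf.S: T[j] = 2^(j/64),
   P0..P3 are the minimax polynomial coefficients.  */
extern const double T[64];
extern const double P0, P1, P2, P3;

static float
expf_main_path (float x)
{
  const double KLN2  = 0x1.71547652b82fep+6;   /* 64/log(2) */
  const double NLN2K = -0x1.62e42fefa39efp-7;  /* -log(2)/64 */
  const double RD    = 0x1.8p52;               /* 2^52 + 2^51, rounding shifter */

  double xd = (double) x;
  double t = xd * KLN2 + RD;    /* low bits of t now hold m = n*64 + j */

  uint64_t bits;
  memcpy (&bits, &t, sizeof bits);
  int32_t m = (int32_t) bits;   /* valid while |m| < 2^31 */
  int j = m & 63;               /* table index */
  int n = m >> 6;               /* scale exponent; arithmetic shift assumed */

  double y = (t - RD) * NLN2K + xd;                  /* y = x - m*log(2)/64 */
  double z = y * y;
  double p = (P3 * z + P1) * z + (P2 * z + P0) * y;  /* P(y) ~= e^y - 1 */

  /* The asm builds 2^n by hand in the exponent field and multiplies;
     ldexpf is the C shorthand for the same scaling.  */
  return ldexpf ((float) (T[j] * p + T[j]), n);      /* 2^n * T[j] * (1+P(y)) */
}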
  
Arjan van de Ven Aug. 16, 2017, 2:31 p.m. UTC | #2
On 8/16/2017 7:04 AM, Carlos O'Donell wrote:
> On 08/16/2017 09:34 AM, H.J. Lu wrote:
>> FMA optimized e_expf improves performance by more than 50% on Skylake.
>>
>> Any comments?
>
> Exactly how much of e_expf-fma.S do you need to achieve that 50% speedup?

the core "fast path"
(the bit after	/* Main path: here if 2^(-28)<=|x|<125*log(2) */ )


>
> How does this algorithm compare to what is already implemented for e_expf?

I started with the SSE version of that e_expf, turned it into AVX, used FMA where possible and fixed a few
glass jaws in the fast path that you hit on Skylake.

the slow path is more a direct 1:1 translation from SSE to AVX (because mixing SSE and AVX
is generally a bad idea)

>
> My questions are basically leading to this:
>
> (a) Can we write a generic e_expf in C with the special cases written
>     in C, and...

the special cases are tiny for expf() (compared to, say, powf())
>
> (b) Is there a core kernel computation that we can then implement in assembly?

Some of the hot path instructions have pretty long latency in the CPU, and to
get the performance, those are done before the special case tests
(so that their execution overlaps, effectively making all the special case tests free)

If you do the first part in C, you lose all of that. The whole expf() (when hitting the fast path)
has a throughput cost of somewhere between 10 and 12 cycles (depending on the CPU generation).
A big chunk of achieving that requires overlapping some of the expensive things with more mundane
instructions (like the special case tests). Giving that up for doing these checks in C first
likely doubles the cost of expf() or something very close to that.

(note this new code does not change the basics; the existing SSE code already has all of this in assembly)
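
A rough C rendering of that shape, with hypothetical __expf_special and
__expf_main helpers (and remembering that in C the compiler, not source
order, owns the actual schedule):

#include <stdint.h>
#include <string.h>

extern float __expf_special (float x);          /* hypothetical slow-path helper */
extern float __expf_main (double xd, double t); /* hypothetical fast-path helper */

float
expf_shape (float x)
{
  /* Long-latency work first, exactly as the asm issues vcvtss2sd and
     vfmadd213sd ahead of the cmpl/jae tests, so an out-of-order core
     overlaps it with the integer range checks below.  */
  double xd = (double) x;
  double t = xd * 0x1.71547652b82fep+6 + 0x1.8p52;  /* x*64/log(2) + shifter */

  uint32_t ix;
  memcpy (&ix, &x, sizeof ix);
  ix &= 0x7fffffff;                                 /* |x| */

  if (ix >= 0x42ad496b)      /* |x| >= 125*log(2): over/underflow, Inf, NaN */
    return __expf_special (x);
  if (ix < 0x31800000)       /* |x| < 2^-28 */
    return 1.0f + x;         /* 1.0, raising inexact unless x == 0 */

  return __expf_main (xd, t);  /* t is already in flight by now */
}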
  
Szabolcs Nagy Aug. 16, 2017, 2:56 p.m. UTC | #3
On 16/08/17 15:31, Arjan van de Ven wrote:
> On 8/16/2017 7:04 AM, Carlos O'Donell wrote:
>> On 08/16/2017 09:34 AM, H.J. Lu wrote:
>>> FMA optimized e_expf improves performance by more than 50% on Skylake.
>>>
>>> Any comments?
>>
>> Exactly how much of e_expf-fma.S do you need to achieve that 50% speedup?
> 
> the core "fast path"
> (the bit after    /* Main path: here if 2^(-28)<=|x|<125*log(2) */ )
> 
> 
>>
>> How does this algorithm compare to what is already implemented for e_expf?
> 
> I started with the SSE version of that e_expf, turned it into AVX, used FMA where possible and fixed a few
> glass jaws in the fast path that you hit on Skylake.
> 
> the slow path is more a direct 1:1 translation from SSE to AVX (because mixing SSE and AVX
> is generally a bad idea)
> 

Based on my benchmarks, portable C code can
easily beat the hand-written SSE asm
(I haven't tested with AVX+FMA though).

The idea is that the x86 asm has overkill
precision (very close to 0.5 ulp error, but
not correctly rounded); we can debate this
later, but I think the polynomial can be
reduced, and there should not be much difference
between asm and C performance (only the
round/convert-to-int operation is tricky:
for different targets the optimal code is
different, but that can be a target-specific
macro hook).
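
For illustration, such a hook could look like this (a sketch with made-up
names, not the optimized-routines code):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Round to nearest integer and return it as an int: two common forms a
   generic C expf could select between with a per-target macro.  */

/* Portable form; compilers lower it to cvtsd2si, fcvtns, etc.  */
static inline int64_t
converttoint_portable (double x)
{
  return lrint (x);            /* uses the current rounding mode */
}

/* Shifter-constant form, as in the patch's DP_RD = 2^52 + 2^51: the add
   leaves the integer in the low mantissa bits; assumes round-to-nearest
   mode and |x| < 2^31.  */
static inline int64_t
converttoint_shifter (double x)
{
  double t = x + 0x1.8p52;
  uint64_t u;
  memcpy (&u, &t, sizeof u);
  return (int32_t) u;
}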

Anyway, I posted my code to the ARM
optimized-routines GitHub repo; I'll start
posting the patches to glibc soon.

(One of the reasons posting glibc patches is
difficult is the nonsensical target-specific
asm code and ifunc resolvers that break when
I update the generic code in a way that
bypasses the wrapper function, which is itself
another source of improvements.)
  
Arjan van de Ven Aug. 16, 2017, 2:59 p.m. UTC | #4
On 8/16/2017 7:56 AM, Szabolcs Nagy wrote:
> Based on my benchmarks, portable C code can
> easily beat the hand-written SSE asm

The hand-written SSE asm is not great on Skylake or similar new CPUs
(a few serious glass jaws).

The AVX one is a lot better... but I'll welcome the challenge.

If you have a version in C, can you give a URL or something to it?
I'll just hook it into my local testbench and compare.
  
Adhemerval Zanella Aug. 16, 2017, 3 p.m. UTC | #5
On 16/08/2017 11:56, Szabolcs Nagy wrote:
> On 16/08/17 15:31, Arjan van de Ven wrote:
>> On 8/16/2017 7:04 AM, Carlos O'Donell wrote:
>>> On 08/16/2017 09:34 AM, H.J. Lu wrote:
>>>> FMA optimized e_expf improves performance by more than 50% on Skylake.
>>>>
>>>> Any comments?
>>>
>>> Exactly how much of e_expf-fma.S do you need to achieve that 50% speedup?
>>
>> the core "fast path"
>> (the bit after    /* Main path: here if 2^(-28)<=|x|<125*log(2) */ )
>>
>>
>>>
>>> How does this algorithm compare to what is already implemented for e_expf?
>>
>> I started with the SSE version of that e_expf, turned it into AVX, used FMA where possible and fixed a few
>> glass jaws in the fast path that you hit on Skylake.
>>
>> the slow path is more a direct 1:1 translation from SSE to AVX (because mixing SSE and AVX
>> is generally a bad idea)
>>
> 
> Based on my benchmarks, portable C code can
> easily beat the hand-written SSE asm
> (I haven't tested with AVX+FMA though).
> 
> The idea is that the x86 asm has overkill
> precision (very close to 0.5 ulp error, but
> not correctly rounded); we can debate this
> later, but I think the polynomial can be
> reduced, and there should not be much difference
> between asm and C performance (only the
> round/convert-to-int operation is tricky:
> for different targets the optimal code is
> different, but that can be a target-specific
> macro hook).
> 
> Anyway, I posted my code to the ARM
> optimized-routines GitHub repo; I'll start
> posting the patches to glibc soon.
> 
> (One of the reasons posting glibc patches is
> difficult is the nonsensical target-specific
> asm code and ifunc resolvers that break when
> I update the generic code in a way that
> bypasses the wrapper function, which is itself
> another source of improvements.)
> 

Yes, the way the generic implementation is included for the ifunc default
version could use some cleanup.  However most, if not all, of it can be
checked by build-many-glibcs.py (it would take time though).
  
Szabolcs Nagy Aug. 16, 2017, 3:14 p.m. UTC | #6
On 16/08/17 15:59, Arjan van de Ven wrote:
> On 8/16/2017 7:56 AM, Szabolcs Nagy wrote:
>> Based on my benchmarks, portable C code can
>> easily beat the hand-written SSE asm
> 
> The hand-written SSE asm is not great on Skylake or similar new CPUs
> (a few serious glass jaws).
> 
> The AVX one is a lot better... but I'll welcome the challenge.
> 
> If you have a version in C, can you give a URL or something to it?
> I'll just hook it into my local testbench and compare.
> 

You can wait a few days until I clean up the patches
for glibc; this one has inline error handling and
various config options that may not be relevant for glibc:

https://github.com/ARM-software/optimized-routines/blob/master/math/e_expf.c
https://github.com/ARM-software/optimized-routines/blob/master/math/e_exp2f_data.c
  
Carlos O'Donell Aug. 16, 2017, 3:30 p.m. UTC | #7
On 08/16/2017 10:31 AM, Arjan van de Ven wrote:
> On 8/16/2017 7:04 AM, Carlos O'Donell wrote:
>> On 08/16/2017 09:34 AM, H.J. Lu wrote:
>>> FMA optimized e_expf improves performance by more than 50% on
>>> Skylake.
>>> 
>>> Any comments?
>> 
>> Exactly how much of e_expf-fma.S do you need to achieve that 50%
>> speedup?
> 
> the core "fast path" (the bit after    /* Main path: here if
> 2^(-28)<=|x|<125*log(2) */ )

OK.
 
>> 
>> How does this algorithm compare to what is already implemented for
>> e_expf?
> 
> I started with the SSE version of that e_expf, turned it into AVX,
> used FMA where possible and fixed a few glass jaws in the fast path
> that you hit on Skylake.
> 
> the slow path is more a direct 1:1 translation from SSE to AVX
> (because mixing SSE and AVX is generally a bad idea)

OK.

>> 
>> My questions are basically leading to this:
>> 
>> (a) Can we write a generic e_expf in C with the special cases
>> written in C, and...
> 
> the special cases are tiny for expf() (compared to, say, powf())
>> 
>> (b) Is there a core kernel computation that we can then implement
>> in assembly?
> 
> Some of the hot path instructions have pretty long latency in the CPU,
> and to get the performance, those are done before the special case
> tests (so that their execution overlaps, effectively making all the
> special case tests free)
> 
> If you do the first part in C, you lose all of that. The whole expf()
> (when hitting the fast path) has a throughput cost of somewhere
> between 10 and 12 cycles (depending on the CPU generation). A big
> chunk of achieving that requires overlapping some of the expensive
> things with more mundane instructions (like the special case tests).
> Giving that up for doing these checks in C first likely doubles the
> cost of expf() or something very close to that.
> 
> (note this new code does not change the basics; the existing SSE code
> already has all of this in assembly) 

OK, so that is what I thought would happen. We saw similar problems
when instrumenting the malloc hot paths: you needed access to the
entire function to hide as much of the cost of the profiling as
possible.

I don't object to HJ's patch, it looks good to me, and is just another
assembly implementation optimization.

My goal with this review is just to double-check that we can't do this
in any other way that makes it easier to maintain.
  
Joseph Myers Aug. 16, 2017, 5:11 p.m. UTC | #8
On Wed, 16 Aug 2017, Szabolcs Nagy wrote:

> the idea is that the x86 asm has overkill
> precision (very close to 0.5 ulp error, but
> not correctly rounded), we can debate this
> later, but i think the polynomial can be
> reduced and there should not be much difference

I think that when choosing a level of accuracy for which to tune a 
polynomial (when the function implementation is such that you can readily 
choose the accuracy) it's reasonable to aim for a round-to-nearest result 
that is the result of rounding the mathematical result up or down (which 
implies the exact result when the mathematical result is exactly 
representable), and a result in other modes that is either one of those 
values, or the result of applying nextup or nextdown to one of them.  
(This implies a round-to-nearest error of at most 1ulp in the terms in 
which the libm-test infrastructure measures it, relative to a 
correctly-rounded result, and an error in other rounding modes of at most 
3ulp, normally at most 2ulp, in those terms.)

Correct rounding should explicitly not be a goal for libm functions not 
bound to IEEE operations, and I'm doubtful of aiming for 0.501ulp errors 
either.
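
Spelled out, writing ∇f(x) and Δf(x) for the exact result rounded down and
up to the nearest float (equal when f(x) is representable), that criterion
is roughly:

\[
  \operatorname{expf}_{\mathrm{RN}}(x) \in \{\,\nabla f(x),\ \Delta f(x)\,\}
\]
\[
  \operatorname{expf}_{\mathrm{mode}}(x) \in
    \{\, v,\ \operatorname{nextup}(v),\ \operatorname{nextdown}(v)
      \;:\; v \in \{\nabla f(x),\ \Delta f(x)\} \,\}
\]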
  
