From patchwork Tue Dec 5 20:45:42 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Paul A. Clarke" X-Patchwork-Id: 24748 Received: (qmail 10267 invoked by alias); 5 Dec 2017 20:45:53 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 10247 invoked by uid 89); 5 Dec 2017 20:45:52 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-26.8 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_ASCII_DIVIDERS, KAM_SHORT, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=ham version=3.3.2 spammy=adoption X-HELO: mx0a-001b2d01.pphosted.com From: Paul Clarke Subject: [PATCH] New generic cosf To: libc-alpha@sourceware.org Date: Tue, 5 Dec 2017 14:45:42 -0600 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.4.0 MIME-Version: 1.0 X-TM-AS-GCONF: 00 x-cbid: 17120520-0028-0000-0000-000008C53464 X-IBM-SpamModules-Scores: X-IBM-SpamModules-Versions: BY=3.00008155; HX=3.00000241; KW=3.00000007; PH=3.00000004; SC=3.00000243; SDB=6.00955929; UDB=6.00483172; IPR=6.00735966; BA=6.00005729; NDR=6.00000001; ZLA=6.00000005; ZF=6.00000009; ZB=6.00000000; ZP=6.00000000; ZH=6.00000000; ZU=6.00000002; MB=3.00018367; XFM=3.00000015; UTC=2017-12-05 20:45:45 X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused x-cbparentid: 17120520-0029-0000-0000-0000389C34C8 Message-Id: <72875dc7-0d21-17e6-3802-4cca20b62371@us.ibm.com> X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, , definitions=2017-12-05_07:, , signatures=0 X-Proofpoint-Spam-Details: rule=outbound_spam_definite policy=outbound score=100 priorityscore=1501 malwarescore=0 suspectscore=0 phishscore=0 bulkscore=0 spamscore=100 
clxscore=1011 lowpriorityscore=0 impostorscore=0 adultscore=0 classifier=spam adjust=0 reason=mlx scancount=1 engine=8.0.1-1709140000 definitions=main-1712050296 From 60e12f38145e4699cf231f7815cf1894d5e2d68d Mon Sep 17 00:00:00 2001 From: "Paul A. Clarke" Date: Tue, 5 Dec 2017 09:32:56 -0600 Subject: [PATCH] New generic cosf The same logic used in the s_cosf.S versions for x86 and powerpc is used to create a generic s_cosf.c, so there is no performance improvement on x86_64 and powerpc64. -- 8< -- For s390, this is the improvement noted. With patch: "cosf": { "": { "duration": 1.00479e+10, "iterations": 1.53856e+08, "max": 900.645, "min": 4.264, "mean": 65.3074 } } Without patch: "cosf": { "": { "duration": 9.93841e+09, "iterations": 4.63972e+08, "max": 1010.9, "min": 6.593, "mean": 21.4203 } } Tested on s390x, x86_64, powerpc64le, and powerpc32. I include below a diff against the recent generic s_sinf.c, as it is more instructive than a diff against the existing s_cosf.c. There are a fair number of cosmetic changes, a few hard differences because it's a different computation, and the adoption of changes from recent patches that were deemed acceptable. --- 1c1 < /* Compute sine of argument. --- > /* Compute cosine of argument. 24,25c24,25 < #ifndef SINF < # define SINF_FUNC __sinf --- > #ifndef COSF > # define COSF_FUNC __cosf 27c27 < # define SINF_FUNC SINF --- > # define COSF_FUNC COSF 44,46c44,46 < /* Chebyshev constants for sin, range 2^-27 - 2^-5. */ < static const double SS0 = -0x1.555555543d49dp-3; < static const double SS1 = 0x1.110f475cec8c5p-7; --- > /* Chebyshev constants for cos, range 2^-27 - 2^-5. */ > static const double CC0 = -0x1.fffffff5cc6fdp-2; > static const double CC1 = 0x1.55514b178dac5p-5; 52d51 < static const double SMALL = 0x1p-50; /* 2^-50. 
*/ 78c77 < static const int ones[] = { +1, -1 }; --- > static const double ones[] = { +1, -1 }; 80c79 < /* Compute the sine value using Chebyshev polynomials where --- > /* Compute the cosine value using Chebyshev polynomials where 85,86c84 < SIGNBIT is used to add the correct sign after the Chebyshev < polynomial is computed. */ --- > the sign of the result. */ 88,89c86 < reduced (const double theta, const unsigned long int n, < const unsigned long int signbit) --- > reduced (double theta, unsigned int n) 91c88 < double sx; --- > double sign, cx; 93,95c90 < /* We are operating on |x|, so we need to add back the original < signbit for sinf. */ < int sign; --- > 97c92,94 < sign = ones[((n >> 2) & 1) ^ signbit]; --- > n += 2; > sign = ones[(n >> 2) & 1]; > 101c98 < /* Here sinf() is calculated using sin Chebyshev polynomial: --- > /* Here cosf() is calculated using sin Chebyshev polynomial: 103,107c100,104 < sx = S3 + theta2 * S4; /* S3+x^2*S4. */ < sx = S2 + theta2 * sx; /* S2+x^2*(S3+x^2*S4). */ < sx = S1 + theta2 * sx; /* S1+x^2*(S2+x^2*(S3+x^2*S4)). */ < sx = S0 + theta2 * sx; /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))). */ < sx = theta + theta * theta2 * sx; --- > cx = S3 + theta2 * S4; > cx = S2 + theta2 * cx; > cx = S1 + theta2 * cx; > cx = S0 + theta2 * cx; > cx = theta + theta * theta2 * cx; 111c108 < /* Here sinf() is calculated using cos Chebyshev polynomial: --- > /* Here cosf() is calculated using cos Chebyshev polynomial: 113,117c110,114 < sx = C3 + theta2 * C4; /* C3+x^2*C4. */ < sx = C2 + theta2 * sx; /* C2+x^2*(C3+x^2*C4). */ < sx = C1 + theta2 * sx; /* C1+x^2*(C2+x^2*(C3+x^2*C4)). */ < sx = C0 + theta2 * sx; /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))). */ < sx = 1.0 + theta2 * sx; --- > cx = C3 + theta2 * C4; > cx = C2 + theta2 * cx; > cx = C1 + theta2 * cx; > cx = C0 + theta2 * cx; > cx = 1. + theta2 * cx; 119,121c116 < < /* Add in the signbit and assign the result. 
*/ < return sign * sx; --- > return sign * cx; 125c120 < SINF_FUNC (float x) --- > COSF_FUNC (float x) 127d121 < double cx; 130,131c124 < /* If |x|< Pi/4. */ < if (abstheta < M_PI_4) --- > if (isless (abstheta, M_PI_4)) 133c126,127 < if (abstheta >= 0x1p-5) /* |x| >= 2^-5. */ --- > double cx; > if (abstheta >= 0x1p-5) 136,142c130,136 < /* Chebyshev polynomial of the form for sin < x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ < cx = S3 + theta2 * S4; < cx = S2 + theta2 * cx; < cx = S1 + theta2 * cx; < cx = S0 + theta2 * cx; < cx = theta + theta * theta2 * cx; --- > /* Chebyshev polynomial of the form for cos: > * 1 + x^2 (C0 + x^2 (C1 + x^2 (C2 + x^2 (C3 + x^2 * C4)))). */ > cx = C3 + theta2 * C4; > cx = C2 + theta2 * cx; > cx = C1 + theta2 * cx; > cx = C0 + theta2 * cx; > cx = 1. + theta2 * cx; 145c139 < else if (abstheta >= 0x1p-27) /* |x| >= 2^-27. */ --- > else if (abstheta >= 0x1p-27) 148c142 < for sin: x+x^3*(SS0+x^2*SS1). */ --- > * 1 + x^2 (CC0 + x^3 * CC1). */ 150,151c144,145 < cx = SS0 + theta2 * SS1; < cx = theta + theta * theta2 * cx; --- > cx = CC0 + theta * theta2 * CC1; > cx = 1.0 + theta2 * cx; 156,160c150,151 < /* Handle some special cases. */ < if (theta) < return theta - (theta * SMALL); < else < return theta; --- > /* For small enough |theta|, this is close enough. */ > return 1.0 - abstheta; 163c154 < else /* |x| >= Pi/4. */ --- > else /* |theta| >= Pi/4. */ 165,166c156 < unsigned long int signbit = (x < 0); < if (abstheta < 9 * M_PI_4) /* |x| < 9*Pi/4. */ --- > if (isless (abstheta, 9 * M_PI_4)) 172c162 < unsigned long int n = (abstheta * inv_PI_4) + 1; --- > unsigned int n = (abstheta * inv_PI_4) + 1; 174c164 < return reduced (theta, n, signbit); --- > return reduced (theta, n); 178c168 < if (abstheta < 0x1p+23) /* |x| < 2^23. 
*/ --- > if (abstheta < 0x1p+23) 180,181c170,171 < unsigned long int n = __floor (abstheta * inv_PI_4) + 1.0; < double x = __floor (n / 2.0); --- > unsigned int n = ((unsigned int) (abstheta * inv_PI_4)) + 1.0; > double x = n / 2.0; 184c174 < return reduced (theta, n, signbit); --- > return reduced (theta, n); 186c176 < else /* |x| >= 2^23. */ --- > else /* |theta| >= 2^23. */ 191,192c181,182 < exponent < = (exponent >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS; --- > exponent = (exponent >> FLOAT_EXPONENT_SHIFT) > - FLOAT_EXPONENT_BIAS; 212c202 < return reduced (e, l + 1, signbit); --- > return reduced (e, l + 1); 222c212 < return reduced (e, l + 1, signbit); --- > return reduced (e, l + 1); 229c219 < return reduced (e, l + 1, signbit); --- > return reduced (e, l + 1); 237d226 < /* High word of x. */ 239,240c228,229 < /* Sin(Inf or NaN) is NaN. */ < if (ix == 0x7f800000) --- > /* cos(Inf or NaN) is NaN. */ > if (ix == 0x7f800000) /* Inf. */ 247,248c236,237 < #ifndef SINF < libm_alias_float (__sin, sin) --- > #ifndef COSF > libm_alias_float (__cos, cos) -- 8< -- 2017-12-05 Paul A. Clarke * sysdeps/ieee754/flt-32/s_cosf.c: New implementation. --- sysdeps/ieee754/flt-32/s_cosf.c | 254 +++++++++++++++++++++++++++++++++------- 1 file changed, 214 insertions(+), 40 deletions(-) diff --git a/sysdeps/ieee754/flt-32/s_cosf.c b/sysdeps/ieee754/flt-32/s_cosf.c index 5ed0bca..301b36e 100644 --- a/sysdeps/ieee754/flt-32/s_cosf.c +++ b/sysdeps/ieee754/flt-32/s_cosf.c @@ -1,21 +1,20 @@ -/* s_cosf.c -- float version of s_cos.c. - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ - -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. 
- * ==================================================== - */ - -#if defined(LIBM_SCCS) && !defined(lint) -static char rcsid[] = "$NetBSD: s_cosf.c,v 1.4 1995/05/10 20:47:03 jtc Exp $"; -#endif +/* Compute cosine of argument. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ #include #include @@ -28,35 +27,210 @@ static char rcsid[] = "$NetBSD: s_cosf.c,v 1.4 1995/05/10 20:47:03 jtc Exp $"; # define COSF_FUNC COSF #endif -float COSF_FUNC(float x) +/* Chebyshev constants for cos, range -PI/4 - PI/4. */ +static const double C0 = -0x1.ffffffffe98aep-2; +static const double C1 = 0x1.55555545c50c7p-5; +static const double C2 = -0x1.6c16b348b6874p-10; +static const double C3 = 0x1.a00eb9ac43ccp-16; +static const double C4 = -0x1.23c97dd8844d7p-22; + +/* Chebyshev constants for sin, range -PI/4 - PI/4. */ +static const double S0 = -0x1.5555555551cd9p-3; +static const double S1 = 0x1.1111110c2688bp-7; +static const double S2 = -0x1.a019f8b4bd1f9p-13; +static const double S3 = 0x1.71d7264e6b5b4p-19; +static const double S4 = -0x1.a947e1674b58ap-26; + +/* Chebyshev constants for cos, range 2^-27 - 2^-5. */ +static const double CC0 = -0x1.fffffff5cc6fdp-2; +static const double CC1 = 0x1.55514b178dac5p-5; + +/* PI/2 with 98 bits of accuracy. 
*/ +static const double PI_2_hi = -0x1.921fb544p+0; +static const double PI_2_lo = -0x1.0b4611a626332p-34; + +static const double inv_PI_4 = 0x1.45f306dc9c883p+0; /* 4/PI. */ + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 + +static const double pio2_table[] = { + 0 * M_PI_2, + 1 * M_PI_2, + 2 * M_PI_2, + 3 * M_PI_2, + 4 * M_PI_2, + 5 * M_PI_2 +}; + +static const double invpio4_table[] = { + 0x0p+0, + 0x1.45f306cp+0, + 0x1.c9c882ap-28, + 0x1.4fe13a8p-58, + 0x1.f47d4dp-85, + 0x1.bb81b6cp-112, + 0x1.4acc9ep-142, + 0x1.0e4107cp-169 +}; + +static const double ones[] = { +1, -1 }; + +/* Compute the cosine value using Chebyshev polynomials where + THETA is the range reduced absolute value of the input + and it is less than Pi/4, + N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide + whether a sine or cosine approximation is more accurate and + the sign of the result. */ +static inline float +reduced (double theta, unsigned int n) { - float y[2],z=0.0; - int32_t n,ix; + double sign, cx; + const double theta2 = theta * theta; - GET_FLOAT_WORD(ix,x); + /* Determine positive or negative primary interval. */ + n += 2; + sign = ones[(n >> 2) & 1]; - /* |x| ~< pi/4 */ - ix &= 0x7fffffff; - if(ix <= 0x3f490fd8) return __kernel_cosf(x,z); + /* Are we in the primary interval of sin or cos? */ + if ((n & 2) == 0) + { + /* Here cosf() is calculated using sin Chebyshev polynomial: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + cx = S3 + theta2 * S4; + cx = S2 + theta2 * cx; + cx = S1 + theta2 * cx; + cx = S0 + theta2 * cx; + cx = theta + theta * theta2 * cx; + } + else + { + /* Here cosf() is calculated using cos Chebyshev polynomial: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). */ + cx = C3 + theta2 * C4; + cx = C2 + theta2 * cx; + cx = C1 + theta2 * cx; + cx = C0 + theta2 * cx; + cx = 1. 
+ theta2 * cx; + } + return sign * cx; +} - /* cos(Inf or NaN) is NaN */ - else if (ix>=0x7f800000) { - if (ix == 0x7f800000) - __set_errno (EDOM); - return x-x; +float +COSF_FUNC (float x) +{ + double theta = x; + double abstheta = fabs (theta); + if (isless (abstheta, M_PI_4)) + { + double cx; + if (abstheta >= 0x1p-5) + { + const double theta2 = theta * theta; + /* Chebyshev polynomial of the form for cos: + * 1 + x^2 (C0 + x^2 (C1 + x^2 (C2 + x^2 (C3 + x^2 * C4)))). */ + cx = C3 + theta2 * C4; + cx = C2 + theta2 * cx; + cx = C1 + theta2 * cx; + cx = C0 + theta2 * cx; + cx = 1. + theta2 * cx; + return cx; } - - /* argument reduction needed */ - else { - n = __ieee754_rem_pio2f(x,y); - switch(n&3) { - case 0: return __kernel_cosf(y[0],y[1]); - case 1: return -__kernel_sinf(y[0],y[1],1); - case 2: return -__kernel_cosf(y[0],y[1]); - default: - return __kernel_sinf(y[0],y[1],1); + else if (abstheta >= 0x1p-27) + { + /* A simpler Chebyshev approximation is close enough for this range: + * 1 + x^2 (CC0 + x^3 * CC1). */ + const double theta2 = theta * theta; + cx = CC0 + theta * theta2 * CC1; + cx = 1.0 + theta2 * cx; + return cx; + } + else + { + /* For small enough |theta|, this is close enough. */ + return 1.0 - abstheta; + } + } + else /* |theta| >= Pi/4. */ + { + if (isless (abstheta, 9 * M_PI_4)) + { + /* There are cases where FE_UPWARD rounding mode can + produce a result of abstheta * inv_PI_4 == 9, + where abstheta < 9pi/4, so the domain for + pio2_table must go to 5 (9 / 2 + 1). */ + unsigned int n = (abstheta * inv_PI_4) + 1; + theta = abstheta - pio2_table[n / 2]; + return reduced (theta, n); + } + else if (isless (abstheta, INFINITY)) + { + if (abstheta < 0x1p+23) + { + unsigned int n = ((unsigned int) (abstheta * inv_PI_4)) + 1.0; + double x = n / 2.0; + theta = x * PI_2_lo + (x * PI_2_hi + abstheta); + /* Argument reduction needed. */ + return reduced (theta, n); + } + else /* |theta| >= 2^23. 
*/ + { + x = fabsf (x); + int exponent; + GET_FLOAT_WORD (exponent, x); + exponent = (exponent >> FLOAT_EXPONENT_SHIFT) + - FLOAT_EXPONENT_BIAS; + exponent += 3; + exponent /= 28; + double a = invpio4_table[exponent] * x; + double b = invpio4_table[exponent + 1] * x; + double c = invpio4_table[exponent + 2] * x; + double d = invpio4_table[exponent + 3] * x; + uint64_t l = a; + l &= ~0x7; + a -= l; + double e = a + b; + l = e; + e = a - l; + if (l & 1) + { + e -= 1.0; + e += b; + e += c; + e += d; + e *= M_PI_4; + return reduced (e, l + 1); + } + else + { + e += b; + e += c; + e += d; + if (e <= 1.0) + { + e *= M_PI_4; + return reduced (e, l + 1); + } + else + { + l++; + e -= 2.0; + e *= M_PI_4; + return reduced (e, l + 1); + } + } } } + else + { + int32_t ix; + GET_FLOAT_WORD (ix, abstheta); + /* cos(Inf or NaN) is NaN. */ + if (ix == 0x7f800000) /* Inf. */ + __set_errno (EDOM); + return x - x; + } + } } #ifndef COSF