Patchwork [03/28] powerpc: Remove power4 mpa optimization

login
register
mail settings
Submitter Adhemerval Zanella Netto
Date March 29, 2019, 1:35 p.m.
Message ID <20190329133529.22523-4-adhemerval.zanella@linaro.org>
Download mbox | patch
Permalink /patch/32063/
State New
Headers show

Comments

Adhemerval Zanella Netto - March 29, 2019, 1:35 p.m.
This patch removes the POWER4 optimized mpa optimization used for power4+
build.  For newer chips, GCC generates *worse* code than generic
implementation as benchmaks result below.  One possibilty would to add
IFUNC variants for the mpa routines (as x86_64), but it will add complexity
only for older chips (and one would need to check if power5, power5+, and
power6 do benefict from this optimization), and only for specific
implementation (since most used ones such as sin, cos, exp, pow already
avoid calling the slow multiprecision path).

  * POWER9 patched
    "atan": {
     "": {
      "duration": 5.12565e+09,
      "iterations": 1.552e+08,
      "max": 100.552,
      "min": 7.799,
      "mean": 33.0261
     },
     "144bits": {
      "duration": 5.12745e+09,
      "iterations": 825000,
      "max": 7517.17,
      "min": 6186.3,
      "mean": 6215.09
     }
    }
    "acos": {
     "": {
      "duration": 5.21741e+09,
      "iterations": 1.269e+08,
      "max": 191.738,
      "min": 7.931,
      "mean": 41.1144
     },
     "slow": {
      "duration": 5.25999e+09,
      "iterations": 198000,
      "max": 26681.7,
      "min": 26463.6,
      "mean": 26565.6
     }
    }

  * POWER9 master
    "atan": {
     "": {
      "duration": 5.12815e+09,
      "iterations": 1.552e+08,
      "max": 134.788,
      "min": 7.803,
      "mean": 33.0422
     },
     "144bits": {
      "duration": 5.1209e+09,
      "iterations": 447000,
      "max": 11615.8,
      "min": 11301.8,
      "mean": 11456.2
     }
    }
    "acos": {
     "": {
      "duration": 5.22272e+09,
      "iterations": 1.269e+08,
      "max": 115.981,
      "min": 7.931,
      "mean": 41.1562
     },
     "slow": {
      "duration": 5.28723e+09,
      "iterations": 96000,
      "max": 55434.1,
      "min": 54820.6,
      "mean": 55075.3
     }
    }

  * POWER8 patched
    "acos": {
     "": {
      "duration": 5.16398e+09,
      "iterations": 9.99e+07,
      "max": 174.408,
      "min": 8.645,
      "mean": 51.6915
     },
     "slow": {
      "duration": 5.16982e+09,
      "iterations": 96000,
      "max": 54830.5,
      "min": 53703.8,
      "mean": 53852.3
     }
    }
  * POWER8 master
    "acos": {
     "": {
      "duration": 5.17019e+09,
      "iterations": 9.99e+07,
      "max": 186.127,
      "min": 8.633,
      "mean": 51.7537
     },
     "slow": {
      "duration": 5.34225e+09,
      "iterations": 90000,
      "max": 60353.2,
      "min": 59155.3,
      "mean": 59358.4
     }
    }

  * POWER7 patched
    "asin": {
     "": {
      "duration": 5.15559e+09,
      "iterations": 6.5e+07,
      "max": 193.335,
      "min": 12.227,
      "mean": 79.3168
     },
     "slow": {
      "duration": 5.20538e+09,
      "iterations": 80000,
      "max": 65705.2,
      "min": 64299.4,
      "mean": 65067.3
     }
    }
  * POWER7 master
    "asin": {
     "": {
      "duration": 5.15446e+09,
      "iterations": 6.5e+07,
      "max": 184.575,
      "min": 12.226,
      "mean": 79.2994
     },
     "slow": {
      "duration": 5.20616e+09,
      "iterations": 80000,
      "max": 65705.1,
      "min": 64336.6,
      "mean": 65076.9
     }
    }

Checked on powerpc-linux-gnu (built without --with-cpu, with
--with-cpu=power4 and with --with-cpu=power5+ and --disable-multi-arch),
powerpc64-linux-gnu (built without --with-cp and with --with-cpu=power5+
and --disable-multi-arch).

	* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
	* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
	* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.
---
 sysdeps/powerpc/power4/fpu/Makefile   |   5 -
 sysdeps/powerpc/power4/fpu/mpa-arch.h |  56 -------
 sysdeps/powerpc/power4/fpu/mpa.c      | 214 --------------------------
 3 files changed, 275 deletions(-)
 delete mode 100644 sysdeps/powerpc/power4/fpu/Makefile
 delete mode 100644 sysdeps/powerpc/power4/fpu/mpa-arch.h
 delete mode 100644 sysdeps/powerpc/power4/fpu/mpa.c
Gabriel F. T. Gomes - April 24, 2019, 9:51 p.m.
On Fri, Mar 29 2019, Adhemerval Zanella wrote:
> This patch removes the POWER4 optimized mpa optimization used for power4+
> build.  For newer chips, GCC generates *worse* code than generic
> implementation as benchmaks result below.

I don't think I have access to POWER4 hardware, but when running
power4-targeted builds on newer processors, I saw no performance
degradation even in that scenario.

> One possibilty would to add
> IFUNC variants for the mpa routines (as x86_64), but it will add complexity
> only for older chips (and one would need to check if power5, power5+, and
> power6 do benefict from this optimization), and only for specific
> implementation (since most used ones such as sin, cos, exp, pow already
> avoid calling the slow multiprecision path).

I don't think it's worth the effort.

> Checked on powerpc-linux-gnu (built without --with-cpu, with
> --with-cpu=power4 and with --with-cpu=power5+ and --disable-multi-arch),
> powerpc64-linux-gnu (built without --with-cp and with --with-cpu=power5+
> and --disable-multi-arch).

Thanks for the thorough checking.

> 	* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
> 	* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
> 	* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.

Looks good to me.

Reviewed-by: Gabriel F. T. Gomes <gabriel@inconstante.eti.br>
Adhemerval Zanella Netto - April 25, 2019, 12:19 p.m.
On 24/04/2019 18:51, Gabriel F. T. Gomes wrote:
> 
> On Fri, Mar 29 2019, Adhemerval Zanella wrote:
>> This patch removes the POWER4 optimized mpa optimization used for power4+
>> build.  For newer chips, GCC generates *worse* code than generic
>> implementation as benchmaks result below.
> 
> I don't think I have access to POWER4 hardware, but when running
> power4-targeted builds on newer processors, I saw no performance
> degradation even in that scenario.

My guess is the optimization was trying to overcome some compiler
missed optimization (POWER4 was released at same time as GCC 3.2.3
and mostly likely distributions were using even older releases to
build system libc).

> 
>> One possibilty would to add
>> IFUNC variants for the mpa routines (as x86_64), but it will add complexity
>> only for older chips (and one would need to check if power5, power5+, and
>> power6 do benefict from this optimization), and only for specific
>> implementation (since most used ones such as sin, cos, exp, pow already
>> avoid calling the slow multiprecision path).
> 
> I don't think it's worth the effort.

Agreed.

> 
>> Checked on powerpc-linux-gnu (built without --with-cpu, with
>> --with-cpu=power4 and with --with-cpu=power5+ and --disable-multi-arch),
>> powerpc64-linux-gnu (built without --with-cp and with --with-cpu=power5+
>> and --disable-multi-arch).
> 
> Thanks for the thorough checking.
> 
>> 	* sysdeps/powerpc/power4/fpu/Makefile: Remove file.
>> 	* sysdeps/powerpc/power4/fpu/mpa-arch.h: Likewise.
>> 	* sysdeps/powerpc/power4/fpu/mpa.c: Likewise.
> 
> Looks good to me.
> 
> Reviewed-by: Gabriel F. T. Gomes <gabriel@inconstante.eti.br>
> 

Thanks.

Patch

diff --git a/sysdeps/powerpc/power4/fpu/Makefile b/sysdeps/powerpc/power4/fpu/Makefile
deleted file mode 100644
index f487ed6014..0000000000
--- a/sysdeps/powerpc/power4/fpu/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@ 
-# Makefile fragment for POWER4/5/5+ with FPU.
-
-ifeq ($(subdir),math)
-CFLAGS-mpa.c += --param max-unroll-times=4 -funroll-loops -fpeel-loops
-endif
diff --git a/sysdeps/powerpc/power4/fpu/mpa-arch.h b/sysdeps/powerpc/power4/fpu/mpa-arch.h
deleted file mode 100644
index 929c60b314..0000000000
--- a/sysdeps/powerpc/power4/fpu/mpa-arch.h
+++ /dev/null
@@ -1,56 +0,0 @@ 
-/* Overridable constants and operations.
-   Copyright (C) 2013-2019 Free Software Foundation, Inc.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU Lesser General Public License as published by
-   the Free Software Foundation; either version 2.1 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public License
-   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
-
-typedef double mantissa_t;
-typedef double mantissa_store_t;
-
-#define TWOPOW(i) (0x1.0p##i)
-
-#define RADIX TWOPOW (24)		/* 2^24    */
-#define CUTTER TWOPOW (76)		/* 2^76    */
-#define RADIXI 0x1.0p-24		/* 2^-24 */
-#define TWO52 TWOPOW (52)		/* 2^52 */
-
-/* Divide D by RADIX and put the remainder in R.  */
-#define DIV_RADIX(d,r) \
-  ({									      \
-    double u = ((d) + CUTTER) - CUTTER;					      \
-    if (u > (d))							      \
-      u -= RADIX;							      \
-    r = (d) - u;							      \
-    (d) = u * RADIXI;							      \
-  })
-
-/* Put the integer component of a double X in R and retain the fraction in
-   X.  */
-#define INTEGER_OF(x, r) \
-  ({									      \
-    double u = ((x) + TWO52) - TWO52;					      \
-    if (u > (x))							      \
-      u -= 1;								      \
-    (r) = u;								      \
-    (x) -= u;								      \
-  })
-
-/* Align IN down to a multiple of F, where F is a power of two.  */
-#define ALIGN_DOWN_TO(in, f) \
-  ({									      \
-    double factor = f * TWO52;						      \
-    double u = (in + factor) - factor;					      \
-    if (u > in)								      \
-      u -= f;								      \
-    u;									      \
-  })
diff --git a/sysdeps/powerpc/power4/fpu/mpa.c b/sysdeps/powerpc/power4/fpu/mpa.c
deleted file mode 100644
index 1be2e93cb7..0000000000
--- a/sysdeps/powerpc/power4/fpu/mpa.c
+++ /dev/null
@@ -1,214 +0,0 @@ 
-
-/*
- * IBM Accurate Mathematical Library
- * written by International Business Machines Corp.
- * Copyright (C) 2001-2019 Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/* Define __mul and __sqr and use the rest from generic code.  */
-#define NO__MUL
-#define NO__SQR
-
-#include <sysdeps/ieee754/dbl-64/mpa.c>
-
-/* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
-   and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
-   digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
-void
-__mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
-{
-  long i, i1, i2, j, k, k2;
-  long p2 = p;
-  double u, zk, zk2;
-
-  /* Is z=0?  */
-  if (__glibc_unlikely (X[0] * Y[0] == 0))
-    {
-      Z[0] = 0;
-      return;
-    }
-
-  /* Multiply, add and carry */
-  k2 = (p2 < 3) ? p2 + p2 : p2 + 3;
-  zk = Z[k2] = 0;
-  for (k = k2; k > 1;)
-    {
-      if (k > p2)
-	{
-	  i1 = k - p2;
-	  i2 = p2 + 1;
-	}
-      else
-	{
-	  i1 = 1;
-	  i2 = k;
-	}
-#if 1
-      /* Rearrange this inner loop to allow the fmadd instructions to be
-         independent and execute in parallel on processors that have
-         dual symmetrical FP pipelines.  */
-      if (i1 < (i2 - 1))
-	{
-	  /* Make sure we have at least 2 iterations.  */
-	  if (((i2 - i1) & 1L) == 1L)
-	    {
-	      /* Handle the odd iterations case.  */
-	      zk2 = x->d[i2 - 1] * y->d[i1];
-	    }
-	  else
-	    zk2 = 0.0;
-	  /* Do two multiply/adds per loop iteration, using independent
-	     accumulators; zk and zk2.  */
-	  for (i = i1, j = i2 - 1; i < i2 - 1; i += 2, j -= 2)
-	    {
-	      zk += x->d[i] * y->d[j];
-	      zk2 += x->d[i + 1] * y->d[j - 1];
-	    }
-	  zk += zk2;		/* Final sum.  */
-	}
-      else
-	{
-	  /* Special case when iterations is 1.  */
-	  zk += x->d[i1] * y->d[i1];
-	}
-#else
-      /* The original code.  */
-      for (i = i1, j = i2 - 1; i < i2; i++, j--)
-	zk += X[i] * Y[j];
-#endif
-
-      u = (zk + CUTTER) - CUTTER;
-      if (u > zk)
-	u -= RADIX;
-      Z[k] = zk - u;
-      zk = u * RADIXI;
-      --k;
-    }
-  Z[k] = zk;
-
-  int e = EX + EY;
-  /* Is there a carry beyond the most significant digit?  */
-  if (Z[1] == 0)
-    {
-      for (i = 1; i <= p2; i++)
-	Z[i] = Z[i + 1];
-      e--;
-    }
-
-  EZ = e;
-  Z[0] = X[0] * Y[0];
-}
-
-/* Square *X and store result in *Y.  X and Y may not overlap.  For P in
-   [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
-   error is bounded by 1.001 ULP.  This is a faster special case of
-   multiplication.  */
-void
-__sqr (const mp_no *x, mp_no *y, int p)
-{
-  long i, j, k, ip;
-  double u, yk;
-
-  /* Is z=0?  */
-  if (__glibc_unlikely (X[0] == 0))
-    {
-      Y[0] = 0;
-      return;
-    }
-
-  /* We need not iterate through all X's since it's pointless to
-     multiply zeroes.  */
-  for (ip = p; ip > 0; ip--)
-    if (X[ip] != 0)
-      break;
-
-  k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
-
-  while (k > 2 * ip + 1)
-    Y[k--] = 0;
-
-  yk = 0;
-
-  while (k > p)
-    {
-      double yk2 = 0.0;
-      long lim = k / 2;
-
-      if (k % 2 == 0)
-        {
-	  yk += X[lim] * X[lim];
-	  lim--;
-	}
-
-      /* In __mul, this loop (and the one within the next while loop) run
-         between a range to calculate the mantissa as follows:
-
-         Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
-		+ X[n] * Y[k]
-
-         For X == Y, we can get away with summing halfway and doubling the
-	 result.  For cases where the range size is even, the mid-point needs
-	 to be added separately (above).  */
-      for (i = k - p, j = p; i <= lim; i++, j--)
-	yk2 += X[i] * X[j];
-
-      yk += 2.0 * yk2;
-
-      u = (yk + CUTTER) - CUTTER;
-      if (u > yk)
-	u -= RADIX;
-      Y[k--] = yk - u;
-      yk = u * RADIXI;
-    }
-
-  while (k > 1)
-    {
-      double yk2 = 0.0;
-      long lim = k / 2;
-
-      if (k % 2 == 0)
-        {
-	  yk += X[lim] * X[lim];
-	  lim--;
-	}
-
-      /* Likewise for this loop.  */
-      for (i = 1, j = k - 1; i <= lim; i++, j--)
-	yk2 += X[i] * X[j];
-
-      yk += 2.0 * yk2;
-
-      u = (yk + CUTTER) - CUTTER;
-      if (u > yk)
-	u -= RADIX;
-      Y[k--] = yk - u;
-      yk = u * RADIXI;
-    }
-  Y[k] = yk;
-
-  /* Squares are always positive.  */
-  Y[0] = 1.0;
-
-  int e = EX * 2;
-  /* Is there a carry beyond the most significant digit?  */
-  if (__glibc_unlikely (Y[1] == 0))
-    {
-      for (i = 1; i <= p; i++)
-	Y[i] = Y[i + 1];
-      e--;
-    }
-  EY = e;
-}