libstdc++: Another merge from fast_float upstream [PR107468]

Message ID Y384D/1PiDqjqBdt@tucnak
State New
Headers
Series libstdc++: Another merge from fast_float upstream [PR107468] |

Commit Message

Jakub Jelinek Nov. 24, 2022, 9:23 a.m. UTC
  Hi!

Upstream fast_float came up with a cheaper test for
fegetround () == FE_TONEAREST using one float addition, one subtraction
and one comparison.  If we know we are rounding to nearest, we can use
fast path in more cases as before.
The following patch merges those changes into libstdc++.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-11-24  Jakub Jelinek  <jakub@redhat.com>

	PR libstdc++/107468
	* src/c++17/fast_float/MERGE: Adjust for merge from upstream.
	* src/c++17/fast_float/fast_float.h: Merge from fast_float
	2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit.


	Jakub
  

Comments

Jonathan Wakely Nov. 24, 2022, 9:28 a.m. UTC | #1
On Thu, 24 Nov 2022 at 09:23, Jakub Jelinek wrote:
>
> Hi!
>
> Upstream fast_float came up with a cheaper test for
> fegetround () == FE_TONEAREST using one float addition, one subtraction
> and one comparison.  If we know we are rounding to nearest, we can use
> fast path in more cases as before.
> The following patch merges those changes into libstdc++.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK, thanks.


>
> 2022-11-24  Jakub Jelinek  <jakub@redhat.com>
>
>         PR libstdc++/107468
>         * src/c++17/fast_float/MERGE: Adjust for merge from upstream.
>         * src/c++17/fast_float/fast_float.h: Merge from fast_float
>         2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit.
>
> --- libstdc++-v3/src/c++17/fast_float/MERGE.jj  2022-11-07 15:17:14.035071694 +0100
> +++ libstdc++-v3/src/c++17/fast_float/MERGE     2022-11-23 17:09:20.940866070 +0100
> @@ -1,4 +1,4 @@
> -662497742fea7055f0e0ee27e5a7ddc382c2c38e
> +2ef9abbcf6a11958b6fa685a89d0150022e82e78
>
>  The first line of this file holds the git revision number of the
>  last merge done from the master library sources.
> --- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj   2022-11-07 15:17:14.066071268 +0100
> +++ libstdc++-v3/src/c++17/fast_float/fast_float.h      2022-11-23 17:19:41.735693122 +0100
> @@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(co
>         || defined(__MINGW64__)                                          \
>         || defined(__s390x__)                                            \
>         || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) )
> -#define FASTFLOAT_64BIT
> +#define FASTFLOAT_64BIT 1
>  #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86)   \
>       || defined(__arm__) || defined(_M_ARM)                   \
>       || defined(__MINGW32__) || defined(__EMSCRIPTEN__))
> -#define FASTFLOAT_32BIT
> +#define FASTFLOAT_32BIT 1
>  #else
>    // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow.
>    // We can never tell the register width, but the SIZE_MAX is a good approximation.
> @@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(co
>    #if SIZE_MAX == 0xffff
>      #error Unknown platform (16-bit, unsupported)
>    #elif SIZE_MAX == 0xffffffff
> -    #define FASTFLOAT_32BIT
> +    #define FASTFLOAT_32BIT 1
>    #elif SIZE_MAX == 0xffffffffffffffff
> -    #define FASTFLOAT_64BIT
> +    #define FASTFLOAT_64BIT 1
>    #else
>      #error Unknown platform (not 32-bit, not 64-bit?)
>    #endif
> @@ -359,10 +359,12 @@ template <typename T> struct binary_form
>    static inline constexpr int minimum_exponent();
>    static inline constexpr int infinite_power();
>    static inline constexpr int sign_index();
> +  static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST
>    static inline constexpr int max_exponent_fast_path();
>    static inline constexpr int max_exponent_round_to_even();
>    static inline constexpr int min_exponent_round_to_even();
>    static inline constexpr uint64_t max_mantissa_fast_path(int64_t power);
> +  static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST
>    static inline constexpr int largest_power_of_ten();
>    static inline constexpr int smallest_power_of_ten();
>    static inline constexpr T exact_power_of_ten(int64_t power);
> @@ -372,6 +374,22 @@ template <typename T> struct binary_form
>    static inline constexpr equiv_uint hidden_bit_mask();
>  };
>
> +template <> inline constexpr int binary_format<double>::min_exponent_fast_path() {
> +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
> +  return 0;
> +#else
> +  return -22;
> +#endif
> +}
> +
> +template <> inline constexpr int binary_format<float>::min_exponent_fast_path() {
> +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
> +  return 0;
> +#else
> +  return -10;
> +#endif
> +}
> +
>  template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() {
>    return 52;
>  }
> @@ -418,13 +436,18 @@ template <> inline constexpr int binary_
>  template <> inline constexpr int binary_format<float>::max_exponent_fast_path() {
>    return 10;
>  }
> -
> +template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() {
> +  return uint64_t(2) << mantissa_explicit_bits();
> +}
>  template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) {
>    // caller is responsible to ensure that
>    // power >= 0 && power <= 22
>    //
>    return max_mantissa_double[power];
>  }
> +template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() {
> +  return uint64_t(2) << mantissa_explicit_bits();
> +}
>  template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) {
>    // caller is responsible to ensure that
>    // power >= 0 && power <= 10
> @@ -619,10 +642,6 @@ parsed_number_string parse_number_string
>
>    uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
>
> -  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
> -    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
> -    p += 8;
> -  }
>    while ((p != pend) && is_integer(*p)) {
>      // a multiplication by 10 is cheaper than an arbitrary integer
>      // multiplication
> @@ -1640,7 +1659,7 @@ namespace fast_float {
>  // we might have platforms where `CHAR_BIT` is not 8, so let's avoid
>  // doing `8 * sizeof(limb)`.
>  #if defined(FASTFLOAT_64BIT) && !defined(__sparc)
> -#define FASTFLOAT_64BIT_LIMB
> +#define FASTFLOAT_64BIT_LIMB 1
>  typedef uint64_t limb;
>  constexpr size_t limb_bits = 64;
>  #else
> @@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string
>
>    uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
>
> -  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
> -    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
> -    p += 8;
> -  }
>    while ((p != pend) && is_integer(*p)) {
>      // a multiplication by 10 is cheaper than an arbitrary integer
>      // multiplication
> @@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const cha
>    return answer;
>  }
>
> +/**
> + * Returns true if the floating-pointing rounding mode is to 'nearest'.
> + * It is the default on most system. This function is meant to be inexpensive.
> + * Credit : @mwalcott3
> + */
> +fastfloat_really_inline bool rounds_to_nearest() noexcept {
> +  // See
> +  // A fast function to check your floating-point rounding mode
> +  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
> +  //
> +  // This function is meant to be equivalent to :
> +  // prior: #include <cfenv>
> +  //  return fegetround() == FE_TONEAREST;
> +  // However, it is expected to be much faster than the fegetround()
> +  // function call.
> +  //
> +  // The volatile keywoard prevents the compiler from computing the function
> +  // at compile-time.
> +  // There might be other ways to prevent compile-time optimizations (e.g., asm).
> +  // The value does not need to be std::numeric_limits<float>::min(), any small
> +  // value so that 1 + x should round to 1 would do (after accounting for excess
> +  // precision, as in 387 instructions).
> +  static volatile float fmin = std::numeric_limits<float>::min();
> +  float fmini = fmin; // we copy it so that it gets loaded at most once.
> +  //
> +  // Explanation:
> +  // Only when fegetround() == FE_TONEAREST do we have that
> +  // fmin + 1.0f == 1.0f - fmin.
> +  //
> +  // FE_UPWARD:
> +  //  fmin + 1.0f > 1
> +  //  1.0f - fmin == 1
> +  //
> +  // FE_DOWNWARD or  FE_TOWARDZERO:
> +  //  fmin + 1.0f == 1
> +  //  1.0f - fmin < 1
> +  //
> +  // Note: This may fail to be accurate if fast-math has been
> +  // enabled, as rounding conventions may not apply.
> +  return (fmini + 1.0f == 1.0f - fmini);
> +}
> +
>  } // namespace detail
>
>  template<typename T>
> @@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(co
>    }
>    answer.ec = std::errc(); // be optimistic
>    answer.ptr = pns.lastmatch;
> -  // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
> -  if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) {
> -    value = T(pns.mantissa);
> -    value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
> -    if (pns.negative) { value = -value; }
> -    return answer;
> +  // The implementation of the Clinger's fast path is convoluted because
> +  // we want round-to-nearest in all cases, irrespective of the rounding mode
> +  // selected on the thread.
> +  // We proceed optimistically, assuming that detail::rounds_to_nearest() returns
> +  // true.
> +  if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) {
> +    // Unfortunately, the conventional Clinger's fast path is only possible
> +    // when the system rounds to the nearest float.
> +    //
> +    // We expect the next branch to almost always be selected.
> +    // We could check it first (before the previous branch), but
> +    // there might be performance advantages at having the check
> +    // be last.
> +    if(detail::rounds_to_nearest())  {
> +      // We have that fegetround() == FE_TONEAREST.
> +      // Next is Clinger's fast path.
> +      if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
> +        value = T(pns.mantissa);
> +        if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
> +        else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
> +        if (pns.negative) { value = -value; }
> +        return answer;
> +      }
> +    } else {
> +      // We do not have that fegetround() == FE_TONEAREST.
> +      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
> +      if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
> +#if (defined(_WIN32) && defined(__clang__))
> +        // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD
> +        if(pns.mantissa == 0) {
> +          value = 0;
> +          return answer;
> +        }
> +#endif
> +        value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
> +        if (pns.negative) { value = -value; }
> +        return answer;
> +      }
> +    }
>    }
>    adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
>    if(pns.too_many_digits && am.power2 >= 0) {
>
>         Jakub
>
  

Patch

--- libstdc++-v3/src/c++17/fast_float/MERGE.jj	2022-11-07 15:17:14.035071694 +0100
+++ libstdc++-v3/src/c++17/fast_float/MERGE	2022-11-23 17:09:20.940866070 +0100
@@ -1,4 +1,4 @@ 
-662497742fea7055f0e0ee27e5a7ddc382c2c38e
+2ef9abbcf6a11958b6fa685a89d0150022e82e78
 
 The first line of this file holds the git revision number of the
 last merge done from the master library sources.
--- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj	2022-11-07 15:17:14.066071268 +0100
+++ libstdc++-v3/src/c++17/fast_float/fast_float.h	2022-11-23 17:19:41.735693122 +0100
@@ -99,11 +99,11 @@  from_chars_result from_chars_advanced(co
        || defined(__MINGW64__)                                          \
        || defined(__s390x__)                                            \
        || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) )
-#define FASTFLOAT_64BIT
+#define FASTFLOAT_64BIT 1
 #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86)   \
      || defined(__arm__) || defined(_M_ARM)                   \
      || defined(__MINGW32__) || defined(__EMSCRIPTEN__))
-#define FASTFLOAT_32BIT
+#define FASTFLOAT_32BIT 1
 #else
   // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow.
   // We can never tell the register width, but the SIZE_MAX is a good approximation.
@@ -111,9 +111,9 @@  from_chars_result from_chars_advanced(co
   #if SIZE_MAX == 0xffff
     #error Unknown platform (16-bit, unsupported)
   #elif SIZE_MAX == 0xffffffff
-    #define FASTFLOAT_32BIT
+    #define FASTFLOAT_32BIT 1
   #elif SIZE_MAX == 0xffffffffffffffff
-    #define FASTFLOAT_64BIT
+    #define FASTFLOAT_64BIT 1
   #else
     #error Unknown platform (not 32-bit, not 64-bit?)
   #endif
@@ -359,10 +359,12 @@  template <typename T> struct binary_form
   static inline constexpr int minimum_exponent();
   static inline constexpr int infinite_power();
   static inline constexpr int sign_index();
+  static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST
   static inline constexpr int max_exponent_fast_path();
   static inline constexpr int max_exponent_round_to_even();
   static inline constexpr int min_exponent_round_to_even();
   static inline constexpr uint64_t max_mantissa_fast_path(int64_t power);
+  static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST
   static inline constexpr int largest_power_of_ten();
   static inline constexpr int smallest_power_of_ten();
   static inline constexpr T exact_power_of_ten(int64_t power);
@@ -372,6 +374,22 @@  template <typename T> struct binary_form
   static inline constexpr equiv_uint hidden_bit_mask();
 };
 
+template <> inline constexpr int binary_format<double>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -22;
+#endif
+}
+
+template <> inline constexpr int binary_format<float>::min_exponent_fast_path() {
+#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0)
+  return 0;
+#else
+  return -10;
+#endif
+}
+
 template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() {
   return 52;
 }
@@ -418,13 +436,18 @@  template <> inline constexpr int binary_
 template <> inline constexpr int binary_format<float>::max_exponent_fast_path() {
   return 10;
 }
-
+template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
 template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) {
   // caller is responsible to ensure that
   // power >= 0 && power <= 22
   //
   return max_mantissa_double[power];
 }
+template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() {
+  return uint64_t(2) << mantissa_explicit_bits();
+}
 template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) {
   // caller is responsible to ensure that
   // power >= 0 && power <= 10
@@ -619,10 +642,6 @@  parsed_number_string parse_number_string
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-    p += 8;
-  }
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
@@ -1640,7 +1659,7 @@  namespace fast_float {
 // we might have platforms where `CHAR_BIT` is not 8, so let's avoid
 // doing `8 * sizeof(limb)`.
 #if defined(FASTFLOAT_64BIT) && !defined(__sparc)
-#define FASTFLOAT_64BIT_LIMB
+#define FASTFLOAT_64BIT_LIMB 1
 typedef uint64_t limb;
 constexpr size_t limb_bits = 64;
 #else
@@ -2314,10 +2333,6 @@  parsed_number_string parse_number_string
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-    i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
-    p += 8;
-  }
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
@@ -2892,6 +2907,48 @@  from_chars_result parse_infnan(const cha
   return answer;
 }
 
+/**
+ * Returns true if the floating-pointing rounding mode is to 'nearest'.
+ * It is the default on most system. This function is meant to be inexpensive.
+ * Credit : @mwalcott3
+ */
+fastfloat_really_inline bool rounds_to_nearest() noexcept {
+  // See
+  // A fast function to check your floating-point rounding mode
+  // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/
+  //
+  // This function is meant to be equivalent to :
+  // prior: #include <cfenv>
+  //  return fegetround() == FE_TONEAREST;
+  // However, it is expected to be much faster than the fegetround()
+  // function call.
+  //
+  // The volatile keywoard prevents the compiler from computing the function
+  // at compile-time.
+  // There might be other ways to prevent compile-time optimizations (e.g., asm).
+  // The value does not need to be std::numeric_limits<float>::min(), any small
+  // value so that 1 + x should round to 1 would do (after accounting for excess
+  // precision, as in 387 instructions).
+  static volatile float fmin = std::numeric_limits<float>::min();
+  float fmini = fmin; // we copy it so that it gets loaded at most once.
+  //
+  // Explanation:
+  // Only when fegetround() == FE_TONEAREST do we have that
+  // fmin + 1.0f == 1.0f - fmin.
+  //
+  // FE_UPWARD:
+  //  fmin + 1.0f > 1
+  //  1.0f - fmin == 1
+  //
+  // FE_DOWNWARD or  FE_TOWARDZERO:
+  //  fmin + 1.0f == 1
+  //  1.0f - fmin < 1
+  //
+  // Note: This may fail to be accurate if fast-math has been
+  // enabled, as rounding conventions may not apply.
+  return (fmini + 1.0f == 1.0f - fmini);
+}
+
 } // namespace detail
 
 template<typename T>
@@ -2919,12 +2976,45 @@  from_chars_result from_chars_advanced(co
   }
   answer.ec = std::errc(); // be optimistic
   answer.ptr = pns.lastmatch;
-  // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
-  if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) {
-    value = T(pns.mantissa);
-    value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
-    if (pns.negative) { value = -value; }
-    return answer;
+  // The implementation of the Clinger's fast path is convoluted because
+  // we want round-to-nearest in all cases, irrespective of the rounding mode
+  // selected on the thread.
+  // We proceed optimistically, assuming that detail::rounds_to_nearest() returns
+  // true.
+  if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) {
+    // Unfortunately, the conventional Clinger's fast path is only possible
+    // when the system rounds to the nearest float.
+    //
+    // We expect the next branch to almost always be selected.
+    // We could check it first (before the previous branch), but
+    // there might be performance advantages at having the check
+    // be last.
+    if(detail::rounds_to_nearest())  {
+      // We have that fegetround() == FE_TONEAREST.
+      // Next is Clinger's fast path.
+      if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
+        value = T(pns.mantissa);
+        if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
+        else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
+        if (pns.negative) { value = -value; }
+        return answer;
+      }
+    } else {
+      // We do not have that fegetround() == FE_TONEAREST.
+      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
+      if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
+#if (defined(_WIN32) && defined(__clang__))
+        // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD
+        if(pns.mantissa == 0) {
+          value = 0;
+          return answer;
+        }
+#endif
+        value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
+        if (pns.negative) { value = -value; }
+        return answer;
+      }
+    }
   }
   adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
   if(pns.too_many_digits && am.power2 >= 0) {