[v4] libstdc++: Add platform wait functions for FreeBSD [PR120527]

Message ID 20260111163321.2979132-1-jwakely@redhat.com
State New
Headers
Series [v4] libstdc++: Add platform wait functions for FreeBSD [PR120527] |

Commit Message

Jonathan Wakely Jan. 11, 2026, 4:28 p.m. UTC
  This defines __platform_wait, __platform_notify, and
__platform_wait_until for FreeBSD, making use of the _umtx_op syscall.

The Linux versions of those functions only support 32-bit integers, but
the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
as the _umtx_op supports both.

libstdc++-v3/ChangeLog:

	PR libstdc++/120527
	* include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
	Define typedef.
	[__FreeBSD__] (__platform_wait_uses_type): Define variable
	template.
	* src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
	(__platform_wait, __platform_notify, __platform_wait_until):
	Define.
	(__platform_load): Define.
---

v4: Tomasz noticed that __spin_impl assumes the waitable is always a
__platform_wait_t, but the FreeBSD patch enables the fast path for both
32-bit and 64-bit integers, so we need to adjust how __spin_impl loads
the current value of the waitable. This adds a __platform_load function,
which does an atomic load of the right size. For Linux, it's always
loading from int* but for FreeBSD it might be a 4B or 8B type.

Tested x86_64-linux and x86_64-freebsd.

I see a few unexplained FAILs when testing with -m32 on freebsd, but
they're already present on trunk so aren't caused by this patch. This
passes all tests with -m64 (and the changes should only affect 64-bit
mode, because of the __SIZEOF_LONG__ == 8 check).

 libstdc++-v3/include/bits/atomic_wait.h | 11 ++++
 libstdc++-v3/src/c++20/atomic.cc        | 78 ++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 2 deletions(-)
  

Comments

Tomasz Kaminski Jan. 12, 2026, 8:28 a.m. UTC | #1
On Sun, Jan 11, 2026 at 5:34 PM Jonathan Wakely <jwakely@redhat.com> wrote:

> This defines __platform_wait, __platform_notify, and
> __platform_wait_until for FreeBSD, making use of the _umtx_op syscall.
>
> The Linux versions of those functions only support 32-bit integers, but
> the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
> as the _umtx_op supports both.
>
> libstdc++-v3/ChangeLog:
>
>         PR libstdc++/120527
>         * include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
>         Define typedef.
>         [__FreeBSD__] (__platform_wait_uses_type): Define variable
>         template.
>         * src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
>         (__platform_wait, __platform_notify, __platform_wait_until):
>         Define.
>         (__platform_load): Define.
> ---
>
> v4: Tomasz noticed that __spin_impl assumes the waitable is always a
> __platform_wait_t, but the FreeBSD patch enables the fast path for both
> 32-bit and 64-bit integers, so we need to adjust how __spin_impl loads
> the current value of the waitable. This adds a __platform_load function,
> which does an atomic load of the right size. For Linux, it's always
> loading from int* but for FreeBSD it might be a 4B or 8B type.
>
I think this should go into the patch description or, preferably, follow what
you did in previous patches and put the introduction of __platform_load into
a separate commit, explaining why we need it.

Outside of that LGTM.

>
> Tested x86_64-linux and x86_64-freebsd.
>
> I see a few unexplained FAILs when testing with -m32 on freebsd, but
> they're already present on trunk so aren't caused by this patch. This
> passes all tests with -m64 (and the changes should only affect 64-bit
> mode, because of the __SIZEOF_LONG__ == 8 check).
>
>  libstdc++-v3/include/bits/atomic_wait.h | 11 ++++
>  libstdc++-v3/src/c++20/atomic.cc        | 78 ++++++++++++++++++++++++-
>  2 files changed, 87 insertions(+), 2 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/atomic_wait.h b/libstdc++-v3/include/bits/atomic_wait.h
> index eff1be604eb4..e8487390ecb5 100644
> --- a/libstdc++-v3/include/bits/atomic_wait.h
> +++ b/libstdc++-v3/include/bits/atomic_wait.h
> @@ -69,6 +69,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>      inline constexpr bool __platform_wait_uses_type
>        = __detail::__waitable<_Tp>
>           && sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4;
> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
> +  namespace __detail
> +  {
> +    using __platform_wait_t = __UINT64_TYPE__;
> +    inline constexpr size_t __platform_wait_alignment = 8;
> +  }
> +  template<typename _Tp>
> +    inline constexpr bool __platform_wait_uses_type
> +      = __detail::__waitable<_Tp>
> +         && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4)
> +               || (sizeof(_Tp) == 8 && alignof(_Tp) >= 8));
>  #else
>  // define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait()
>  // and __platform_notify() if there is a more efficient primitive supported
> diff --git a/libstdc++-v3/src/c++20/atomic.cc b/libstdc++-v3/src/c++20/atomic.cc
> index 3167592786e1..215f1212dd99 100644
> --- a/libstdc++-v3/src/c++20/atomic.cc
> +++ b/libstdc++-v3/src/c++20/atomic.cc
> @@ -27,7 +27,7 @@
>  #if __glibcxx_atomic_wait
>  #include <atomic>
>  #include <bits/atomic_timed_wait.h>
> -#include <cstdint> // uint32_t, uint64_t
> +#include <cstdint> // uint32_t, uint64_t, uintptr_t
>  #include <climits> // INT_MAX
>  #include <cerrno>  // errno, ETIMEDOUT, etc.
>  #include <bits/std_mutex.h>  // std::mutex, std::__condvar
> @@ -39,6 +39,11 @@
>  # include <unistd.h>
>  # include <sys/time.h> // timespec
>  # define _GLIBCXX_HAVE_PLATFORM_WAIT 1
> +#elif defined __FreeBSD__ && __FreeBSD__ >= 11 && __SIZEOF_LONG__ == 8
> +# include <sys/types.h>
> +# include <sys/umtx.h>
> +# include <sys/time.h>
> +# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
>  #endif
>
>  #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
> @@ -87,6 +92,13 @@ namespace
>                         __wait_clock_t::time_point timeout,
>                         int obj_size) = delete;
>
> +  // This is needed even when we don't have __platform_wait
> +  [[gnu::always_inline]]
> +  inline __wait_value_type
> +  __platform_load(const __platform_wait_t* addr, int memory_order,
> +                 int /* obj_sz */) noexcept
> +  { return __atomic_load_n(addr, memory_order); }
> +
>  #elif defined _GLIBCXX_HAVE_LINUX_FUTEX
>
>    const int futex_private_flag = 128;
> @@ -136,6 +148,68 @@ namespace
>        }
>      return true;
>    }
> +
> +  [[gnu::always_inline]]
> +  inline __wait_value_type
> +  __platform_load(const int* addr, int order, int /* obj_sz */) noexcept
>
I checked that we use int  for other __platform functions with futex patch.

> +  { return __atomic_load_n(addr, order); }
> +
> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
> +  [[gnu::always_inline]]
> +  inline int
> +  wait_op(int obj_sz) noexcept
> +  { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT; }
> +
> +  void
> +  __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept
> +  {
> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
> +                nullptr, nullptr))
> +      if (errno != EINTR)
> +       __throw_system_error(errno);
> +  }
> +
> +  void
> +  __platform_notify(const void* addr, bool all, int /* obj_sz */) noexcept
> +  {
> +    const int count = all ? INT_MAX : 1;
> +    _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr, nullptr);
> +  }
> +
> +  // returns true if wait ended before timeout
> +  bool
> +  __platform_wait_until(const void* addr, uint64_t val,
> +                       const __wait_clock_t::time_point& atime,
> +                       int obj_sz) noexcept
> +  {
> +    struct _umtx_time timeout = {
> +      ._timeout = chrono::__to_timeout_timespec(atime),
> +      ._flags = UMTX_ABSTIME,
> +      ._clockid = CLOCK_MONOTONIC
> +    };
> +    // _umtx_op hangs if timeout._timeout is {0, 0}
> +    if (atime.time_since_epoch() < chrono::nanoseconds(1))
> +      return false;
> +    constexpr uintptr_t timeout_sz = sizeof(timeout);
> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
> +                (void*)timeout_sz, &timeout))
> +      {
> +       if (errno == ETIMEDOUT)
> +         return false;
> +       if (errno != EINTR)
> +         __throw_system_error(errno);
> +      }
> +    return true;
> +  }
> +
> +  [[gnu::always_inline]]
> +  inline __wait_value_type
> +  __platform_load(const void* addr, int order, int obj_sz) noexcept
> +  {
> +    if (obj_sz == sizeof(long))
> +      return __atomic_load_n(static_cast<const long*>(addr), order);
> +    return __atomic_load_n(static_cast<const unsigned*>(addr), order);
> +  }
>  #endif // HAVE_PLATFORM_WAIT
>
>    // The state used by atomic waiting and notifying functions.
> @@ -259,7 +333,7 @@ namespace
>      __wait_value_type wval;
>      for (auto i = 0; i < atomic_spin_count; ++i)
>        {
> -       wval = __atomic_load_n(addr, args._M_order);
> +       wval = __platform_load(addr, args._M_order, args._M_obj_size);
>
I was wondering whether we should have a separate __spin_impl, which would
hoist the check outside of the loop, but this is a busy-wait anyway, and the
compiler sees the __platform_load body. And the __platform_load solution does
not add any branch on platforms where waiting on only a single size is
supported.

>         if (wval != args._M_old)
>           return { ._M_val = wval, ._M_has_val = true, ._M_timeout = false };
>         if (i < atomic_spin_count_relax)
> --
> 2.52.0
>
>
  
Jonathan Wakely Jan. 12, 2026, 10:52 a.m. UTC | #2
On Mon, 12 Jan 2026 at 08:28, Tomasz Kaminski <tkaminsk@redhat.com> wrote:
>
>
>
> On Sun, Jan 11, 2026 at 5:34 PM Jonathan Wakely <jwakely@redhat.com> wrote:
>>
>> This defines __platform_wait, __platform_notify, and
>> __platform_wait_until for FreeBSD, making use of the _umtx_op syscall.
>>
>> The Linux versions of those functions only support 32-bit integers, but
>> the FreeBSD versions use the syscall for both 32-bit and 64-bit types,
>> as the _umtx_op supports both.
>>
>> libstdc++-v3/ChangeLog:
>>
>>         PR libstdc++/120527
>>         * include/bits/atomic_wait.h [__FreeBSD__] (__platform_wait_t):
>>         Define typedef.
>>         [__FreeBSD__] (__platform_wait_uses_type): Define variable
>>         template.
>>         * src/c++20/atomic.cc [__FreeBSD__] (_GLIBCXX_HAVE_PLATFORM_WAIT)
>>         (__platform_wait, __platform_notify, __platform_wait_until):
>>         Define.
>>         (__platform_load): Define.
>> ---
>>
>> v4: Tomasz noticed that __spin_impl assumes the waitable is always a
>> __platform_wait_t, but the FreeBSD patch enables the fast path for both
>> 32-bit and 64-bit integers, so we need to adjust how __spin_impl loads
>> the current value of the waitable. This adds a __platform_load function,
>>
>> which does an atomic load of the right size. For Linux, it's always
>> loading from int* but for FreeBSD it might be a 4B or 8B type.
>
> I think this should go into the patch description or, preferably, follow what you did
> in previous patches and put the introduction of __platform_load into a separate commit,
> explaining why we need it.

I'll add it to the commit msg, that would be better.

>
> Outside of that LGTM.
>>
>>
>> Tested x86_64-linux and x86_64-freebsd.
>>
>> I see a few unexplained FAILs when testing with -m32 on freebsd, but
>> they're already present on trunk so aren't caused by this patch. This
>> passes all tests with -m64 (and the changes should only affect 64-bit
>> mode, because of the __SIZEOF_LONG__ == 8 check).
>>
>>  libstdc++-v3/include/bits/atomic_wait.h | 11 ++++
>>  libstdc++-v3/src/c++20/atomic.cc        | 78 ++++++++++++++++++++++++-
>>  2 files changed, 87 insertions(+), 2 deletions(-)
>>
>> diff --git a/libstdc++-v3/include/bits/atomic_wait.h b/libstdc++-v3/include/bits/atomic_wait.h
>> index eff1be604eb4..e8487390ecb5 100644
>> --- a/libstdc++-v3/include/bits/atomic_wait.h
>> +++ b/libstdc++-v3/include/bits/atomic_wait.h
>> @@ -69,6 +69,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>      inline constexpr bool __platform_wait_uses_type
>>        = __detail::__waitable<_Tp>
>>           && sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4;
>> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
>> +  namespace __detail
>> +  {
>> +    using __platform_wait_t = __UINT64_TYPE__;
>> +    inline constexpr size_t __platform_wait_alignment = 8;
>> +  }
>> +  template<typename _Tp>
>> +    inline constexpr bool __platform_wait_uses_type
>> +      = __detail::__waitable<_Tp>
>> +         && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4)
>> +               || (sizeof(_Tp) == 8 && alignof(_Tp) >= 8));
>>  #else
>>  // define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait()
>>  // and __platform_notify() if there is a more efficient primitive supported
>> diff --git a/libstdc++-v3/src/c++20/atomic.cc b/libstdc++-v3/src/c++20/atomic.cc
>> index 3167592786e1..215f1212dd99 100644
>> --- a/libstdc++-v3/src/c++20/atomic.cc
>> +++ b/libstdc++-v3/src/c++20/atomic.cc
>> @@ -27,7 +27,7 @@
>>  #if __glibcxx_atomic_wait
>>  #include <atomic>
>>  #include <bits/atomic_timed_wait.h>
>> -#include <cstdint> // uint32_t, uint64_t
>> +#include <cstdint> // uint32_t, uint64_t, uintptr_t
>>  #include <climits> // INT_MAX
>>  #include <cerrno>  // errno, ETIMEDOUT, etc.
>>  #include <bits/std_mutex.h>  // std::mutex, std::__condvar
>> @@ -39,6 +39,11 @@
>>  # include <unistd.h>
>>  # include <sys/time.h> // timespec
>>  # define _GLIBCXX_HAVE_PLATFORM_WAIT 1
>> +#elif defined __FreeBSD__ && __FreeBSD__ >= 11 && __SIZEOF_LONG__ == 8
>> +# include <sys/types.h>
>> +# include <sys/umtx.h>
>> +# include <sys/time.h>
>> +# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
>>  #endif
>>
>>  #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
>> @@ -87,6 +92,13 @@ namespace
>>                         __wait_clock_t::time_point timeout,
>>                         int obj_size) = delete;
>>
>> +  // This is needed even when we don't have __platform_wait
>> +  [[gnu::always_inline]]
>> +  inline __wait_value_type
>> +  __platform_load(const __platform_wait_t* addr, int memory_order,
>> +                 int /* obj_sz */) noexcept
>> +  { return __atomic_load_n(addr, memory_order); }
>> +
>>  #elif defined _GLIBCXX_HAVE_LINUX_FUTEX
>>
>>    const int futex_private_flag = 128;
>> @@ -136,6 +148,68 @@ namespace
>>        }
>>      return true;
>>    }
>> +
>> +  [[gnu::always_inline]]
>> +  inline __wait_value_type
>> +  __platform_load(const int* addr, int order, int /* obj_sz */) noexcept
>
> I checked that we use int  for other __platform functions with futex patch.

Yes, that's intentional so that we would get a compilation error if we
passed a __wait_value_type* or some other type here. The argument
should be __platform_wait_t* and for Linux that should be int*.


>>
>> +  { return __atomic_load_n(addr, order); }
>> +
>> +#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
>> +  [[gnu::always_inline]]
>> +  inline int
>> +  wait_op(int obj_sz) noexcept
>> +  { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT; }
>> +
>> +  void
>> +  __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept
>> +  {
>> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
>> +                nullptr, nullptr))
>> +      if (errno != EINTR)
>> +       __throw_system_error(errno);
>> +  }
>> +
>> +  void
>> +  __platform_notify(const void* addr, bool all, int /* obj_sz */) noexcept
>> +  {
>> +    const int count = all ? INT_MAX : 1;
>> +    _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr, nullptr);
>> +  }
>> +
>> +  // returns true if wait ended before timeout
>> +  bool
>> +  __platform_wait_until(const void* addr, uint64_t val,
>> +                       const __wait_clock_t::time_point& atime,
>> +                       int obj_sz) noexcept
>> +  {
>> +    struct _umtx_time timeout = {
>> +      ._timeout = chrono::__to_timeout_timespec(atime),
>> +      ._flags = UMTX_ABSTIME,
>> +      ._clockid = CLOCK_MONOTONIC
>> +    };
>> +    // _umtx_op hangs if timeout._timeout is {0, 0}
>> +    if (atime.time_since_epoch() < chrono::nanoseconds(1))
>> +      return false;
>> +    constexpr uintptr_t timeout_sz = sizeof(timeout);
>> +    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
>> +                (void*)timeout_sz, &timeout))
>> +      {
>> +       if (errno == ETIMEDOUT)
>> +         return false;
>> +       if (errno != EINTR)
>> +         __throw_system_error(errno);
>> +      }
>> +    return true;
>> +  }
>> +
>> +  [[gnu::always_inline]]
>> +  inline __wait_value_type
>> +  __platform_load(const void* addr, int order, int obj_sz) noexcept
>> +  {
>> +    if (obj_sz == sizeof(long))
>> +      return __atomic_load_n(static_cast<const long*>(addr), order);
>> +    return __atomic_load_n(static_cast<const unsigned*>(addr), order);
>> +  }
>>  #endif // HAVE_PLATFORM_WAIT
>>
>>    // The state used by atomic waiting and notifying functions.
>> @@ -259,7 +333,7 @@ namespace
>>      __wait_value_type wval;
>>      for (auto i = 0; i < atomic_spin_count; ++i)
>>        {
>> -       wval = __atomic_load_n(addr, args._M_order);
>> +       wval = __platform_load(addr, args._M_order, args._M_obj_size);
>
> I was wondering whether we should have a separate __spin_impl, which would hoist the check
> outside of the loop, but this is a busy-wait anyway, and the compiler sees the __platform_load
> body. And the __platform_load solution does not add any branch on platforms where waiting on
> only a single size is supported.

Yes, exactly. For Linux we don't need a runtime branch on _M_obj_size
because it's always 4.

Only the targets that support waiting on different sizes need the
extra branch in __platform_load.

>>
>>         if (wval != args._M_old)
>>           return { ._M_val = wval, ._M_has_val = true, ._M_timeout = false };
>>         if (i < atomic_spin_count_relax)
>> --
>> 2.52.0
>>
  

Patch

diff --git a/libstdc++-v3/include/bits/atomic_wait.h b/libstdc++-v3/include/bits/atomic_wait.h
index eff1be604eb4..e8487390ecb5 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -69,6 +69,17 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
     inline constexpr bool __platform_wait_uses_type
       = __detail::__waitable<_Tp>
 	  && sizeof(_Tp) == sizeof(int) && alignof(_Tp) >= 4;
+#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
+  namespace __detail
+  {
+    using __platform_wait_t = __UINT64_TYPE__;
+    inline constexpr size_t __platform_wait_alignment = 8;
+  }
+  template<typename _Tp>
+    inline constexpr bool __platform_wait_uses_type
+      = __detail::__waitable<_Tp>
+	  && ((sizeof(_Tp) == 4 && alignof(_Tp) >= 4)
+		|| (sizeof(_Tp) == 8 && alignof(_Tp) >= 8));
 #else
 // define _GLIBCX_HAVE_PLATFORM_WAIT and implement __platform_wait()
 // and __platform_notify() if there is a more efficient primitive supported
diff --git a/libstdc++-v3/src/c++20/atomic.cc b/libstdc++-v3/src/c++20/atomic.cc
index 3167592786e1..215f1212dd99 100644
--- a/libstdc++-v3/src/c++20/atomic.cc
+++ b/libstdc++-v3/src/c++20/atomic.cc
@@ -27,7 +27,7 @@ 
 #if __glibcxx_atomic_wait
 #include <atomic>
 #include <bits/atomic_timed_wait.h>
-#include <cstdint> // uint32_t, uint64_t
+#include <cstdint> // uint32_t, uint64_t, uintptr_t
 #include <climits> // INT_MAX
 #include <cerrno>  // errno, ETIMEDOUT, etc.
 #include <bits/std_mutex.h>  // std::mutex, std::__condvar
@@ -39,6 +39,11 @@ 
 # include <unistd.h>
 # include <sys/time.h> // timespec
 # define _GLIBCXX_HAVE_PLATFORM_WAIT 1
+#elif defined __FreeBSD__ && __FreeBSD__ >= 11 && __SIZEOF_LONG__ == 8
+# include <sys/types.h>
+# include <sys/umtx.h>
+# include <sys/time.h>
+# define _GLIBCXX_HAVE_PLATFORM_WAIT 1
 #endif
 
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
@@ -87,6 +92,13 @@  namespace
 			__wait_clock_t::time_point timeout,
 			int obj_size) = delete;
 
+  // This is needed even when we don't have __platform_wait
+  [[gnu::always_inline]]
+  inline __wait_value_type
+  __platform_load(const __platform_wait_t* addr, int memory_order,
+		  int /* obj_sz */) noexcept
+  { return __atomic_load_n(addr, memory_order); }
+
 #elif defined _GLIBCXX_HAVE_LINUX_FUTEX
 
   const int futex_private_flag = 128;
@@ -136,6 +148,68 @@  namespace
       }
     return true;
   }
+
+  [[gnu::always_inline]]
+  inline __wait_value_type
+  __platform_load(const int* addr, int order, int /* obj_sz */) noexcept
+  { return __atomic_load_n(addr, order); }
+
+#elif defined __FreeBSD__ && __SIZEOF_LONG__ == 8
+  [[gnu::always_inline]]
+  inline int
+  wait_op(int obj_sz) noexcept
+  { return obj_sz == sizeof(unsigned) ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT; }
+
+  void
+  __platform_wait(const void* addr, uint64_t val, int obj_sz) noexcept
+  {
+    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
+		 nullptr, nullptr))
+      if (errno != EINTR)
+	__throw_system_error(errno);
+  }
+
+  void
+  __platform_notify(const void* addr, bool all, int /* obj_sz */) noexcept
+  {
+    const int count = all ? INT_MAX : 1;
+    _umtx_op(const_cast<void*>(addr), UMTX_OP_WAKE, count, nullptr, nullptr);
+  }
+
+  // returns true if wait ended before timeout
+  bool
+  __platform_wait_until(const void* addr, uint64_t val,
+			const __wait_clock_t::time_point& atime,
+			int obj_sz) noexcept
+  {
+    struct _umtx_time timeout = {
+      ._timeout = chrono::__to_timeout_timespec(atime),
+      ._flags = UMTX_ABSTIME,
+      ._clockid = CLOCK_MONOTONIC
+    };
+    // _umtx_op hangs if timeout._timeout is {0, 0}
+    if (atime.time_since_epoch() < chrono::nanoseconds(1))
+      return false;
+    constexpr uintptr_t timeout_sz = sizeof(timeout);
+    if (_umtx_op(const_cast<void*>(addr), wait_op(obj_sz), val,
+		 (void*)timeout_sz, &timeout))
+      {
+	if (errno == ETIMEDOUT)
+	  return false;
+	if (errno != EINTR)
+	  __throw_system_error(errno);
+      }
+    return true;
+  }
+
+  [[gnu::always_inline]]
+  inline __wait_value_type
+  __platform_load(const void* addr, int order, int obj_sz) noexcept
+  {
+    if (obj_sz == sizeof(long))
+      return __atomic_load_n(static_cast<const long*>(addr), order);
+    return __atomic_load_n(static_cast<const unsigned*>(addr), order);
+  }
 #endif // HAVE_PLATFORM_WAIT
 
   // The state used by atomic waiting and notifying functions.
@@ -259,7 +333,7 @@  namespace
     __wait_value_type wval;
     for (auto i = 0; i < atomic_spin_count; ++i)
       {
-	wval = __atomic_load_n(addr, args._M_order);
+	wval = __platform_load(addr, args._M_order, args._M_obj_size);
 	if (wval != args._M_old)
 	  return { ._M_val = wval, ._M_has_val = true, ._M_timeout = false };
 	if (i < atomic_spin_count_relax)