From patchwork Tue Jun 6 14:13:52 2023
X-Patchwork-Submitter: Matthias Kretz
X-Patchwork-Id: 70656
From: Matthias Kretz
Organization: GSI Helmholtz Centre for Heavy Ion Research
Subject: [PATCH] libstdc++: Rewrite or avoid casts to 64-bit element types
Date: Tue, 6 Jun 2023 16:13:52 +0200
Message-ID: <4608206.VLH7GnMWUR@minbar>

Last part to resolve the test failure introduced with PR109822. OK for
master and backports (gcc-11 doesn't have __builtin_shufflevector,
though)?

Tested on x86_64-pc-linux-gnu and powerpc64le-linux-gnu

----- >8 -----

Fix SFINAE on __is_intrinsic_type for 64-bit element types on non-VSX
POWER targets.
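(Aside, for reviewers less familiar with the trait machinery: a minimal
standalone sketch of the SFINAE pattern this relies on. The names are
simplified stand-ins, not the actual libstdc++ definitions:

  #include <type_traits>

  // Unsupported element types get a specialization *without* a type member
  // (modeling __intrinsic_type_impl for double on non-VSX POWER) ...
  template <typename T>
    struct intrinsic_type_impl
    {};

  // ... while supported types define one (pretend only float is supported).
  template <>
    struct intrinsic_type_impl<float>
    { using type = float; };

  // A void_t-based detector then yields false_type instead of a hard error
  // whenever `typename intrinsic_type_impl<T>::type` is ill-formed.
  template <typename T, typename = void>
    struct is_intrinsic_type : std::false_type {};

  template <typename T>
    struct is_intrinsic_type<
	     T, std::void_t<typename intrinsic_type_impl<T>::type>>
    : std::true_type {};

  static_assert(is_intrinsic_type<float>::value);
  static_assert(!is_intrinsic_type<double>::value); // false, not an error

The patch below defines such empty specializations for the element types a
target doesn't support, so the detection can fail gracefully there.)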
Replace the __extract_part implementation (which was the only user of
__shift_elements_right) by one relying on __builtin_shufflevector (not
available in GCC 11). This removes another cast to a 64-bit element type,
which breaks on non-VSX POWER.

Signed-off-by: Matthias Kretz

libstdc++-v3/ChangeLog:

	PR libstdc++/109822
	* include/experimental/bits/simd.h: Include <algorithm> for
	min(initializer_list). Define __intrinsic_type_impl for all
	vectorizable types, but without type member, if the target
	doesn't support it.
	* include/experimental/bits/simd_builtin.h (__idx_permute): New.
	(__shift_elements_right): Removed.
	(__extract_part): Rewrite using __idx_permute.
	(_S_reduce): Avoid unconditional cast to 64-bit element type
	when the element type is known to be smaller.
---
 libstdc++-v3/include/experimental/bits/simd.h |  34 ++-
 .../include/experimental/bits/simd_builtin.h  | 250 +++++-------------
 2 files changed, 96 insertions(+), 188 deletions(-)

-- 
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research               https://gsi.de
 stdₓ::simd
──────────────────────────────────────────────────────────────────────────

diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h
index 90523ea57dc..effbc60ae46 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -29,6 +29,7 @@
 
 #include "simd_detail.h"
 #include "numeric_traits.h"
+#include <algorithm>
 #include <bitset>
 #include <climits>
 #ifdef _GLIBCXX_DEBUG_UB
@@ -2431,25 +2432,38 @@ struct __intrinsic_type_impl
 #define _GLIBCXX_SIMD_PPC_INTRIN(_Tp) \
   template <> \
   struct __intrinsic_type_impl<_Tp> { using type = __vector _Tp; }
-_GLIBCXX_SIMD_PPC_INTRIN(float);
 #ifdef __VSX__
-_GLIBCXX_SIMD_PPC_INTRIN(double);
+#define _GLIBCXX_SIMD_PPC_INTRIN_VSX(_Tp) _GLIBCXX_SIMD_PPC_INTRIN(_Tp)
+#else
+#define _GLIBCXX_SIMD_PPC_INTRIN_VSX(_Tp) \
+  template <> \
+  struct __intrinsic_type_impl<_Tp> \
+  {}
+#endif
+#if defined __VSX__ || __SIZEOF_LONG__ == 4
+#define _GLIBCXX_SIMD_PPC_INTRIN_VSX_4(_Tp) _GLIBCXX_SIMD_PPC_INTRIN(_Tp)
+#else
+#define _GLIBCXX_SIMD_PPC_INTRIN_VSX_4(_Tp) \
+  template <> \
+  struct __intrinsic_type_impl<_Tp> \
+  {}
 #endif
+
+_GLIBCXX_SIMD_PPC_INTRIN(float);
+_GLIBCXX_SIMD_PPC_INTRIN_VSX(double);
 _GLIBCXX_SIMD_PPC_INTRIN(signed char);
 _GLIBCXX_SIMD_PPC_INTRIN(unsigned char);
 _GLIBCXX_SIMD_PPC_INTRIN(signed short);
 _GLIBCXX_SIMD_PPC_INTRIN(unsigned short);
 _GLIBCXX_SIMD_PPC_INTRIN(signed int);
 _GLIBCXX_SIMD_PPC_INTRIN(unsigned int);
-#if defined __VSX__ || __SIZEOF_LONG__ == 4
-_GLIBCXX_SIMD_PPC_INTRIN(signed long);
-_GLIBCXX_SIMD_PPC_INTRIN(unsigned long);
-#endif
-#ifdef __VSX__
-_GLIBCXX_SIMD_PPC_INTRIN(signed long long);
-_GLIBCXX_SIMD_PPC_INTRIN(unsigned long long);
-#endif
+_GLIBCXX_SIMD_PPC_INTRIN_VSX_4(signed long);
+_GLIBCXX_SIMD_PPC_INTRIN_VSX_4(unsigned long);
+_GLIBCXX_SIMD_PPC_INTRIN_VSX(signed long long);
+_GLIBCXX_SIMD_PPC_INTRIN_VSX(unsigned long long);
 #undef _GLIBCXX_SIMD_PPC_INTRIN
+#undef _GLIBCXX_SIMD_PPC_INTRIN_VSX
+#undef _GLIBCXX_SIMD_PPC_INTRIN_VSX_4
 
 template <typename _Tp, size_t _Bytes>
   struct __intrinsic_type<_Tp, _Bytes,
			  enable_if_t<__is_vectorizable_v<_Tp> && _Bytes <= 16>>
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index 6ccc2fcec9c..f131b4ebba8 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -48,6 +48,57 @@
     = __andnot(_S_signmask<_V>, _S_allbits<_V>);
 //}}}
+
+constexpr int __idx_permute_dontcare = -1;
+constexpr int __idx_permute_zero = -2;
+
+template <size_t _NewN, typename _Tp, size_t _Np, typename _IdxPerm,
+	  int... _Is>
+  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _NewN>
+  __idx_permute_impl(_SimdWrapper<_Tp, _Np> __x, _IdxPerm __perm,
+		     std::integer_sequence<int, _Is...>)
+  {
+    constexpr int _InputWidth = _Np;
+    constexpr int _FullWidth = __x._S_full_size;
+    return __builtin_shufflevector(__x._M_data, __vector_type_t<_Tp, _Np>(),
+				   [&](auto __i) constexpr -> int {
+				     if constexpr (__i >= int(_NewN))
+				       return -1;
+				     else
+				       {
+					 constexpr int __j = __perm(__i);
+					 static_assert(__j < _InputWidth);
+					 if constexpr (__j == __idx_permute_dontcare)
+					   return -1;
+					 else if constexpr (__j == __idx_permute_zero)
+					   return _FullWidth;
+					 else
+					   return __j;
+				       }
+				   }(std::integral_constant<int, _Is>())...);
+  }
+
+/** @internal
+ * Permutes elements of @p __x according to the index mapping defined by
+ * @p __perm.
+ *
+ * The special values @c __idx_permute_zero and @c __idx_permute_dontcare can
+ * be returned by @p __perm in addition to any number between 0 and @c _Np-1
+ * (inclusive).
+ *
+ * @param __x Input vector, to be permuted.
+ * @param __perm Callable (consteval), called with arguments of type
+ *		 integral_constant<int, i>, where i is a valid index of the
+ *		 output vector.
+ *
+ * @tparam _NewN The width of the returned vector.
+ */
+template <int _NewN = 0, typename _Tp, size_t _Np, typename _IdxPerm>
+  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _NewN == 0 ? _Np : _NewN>
+  __idx_permute(_SimdWrapper<_Tp, _Np> __x, _IdxPerm __perm)
+  {
+    static_assert(_NewN >= 0);
+    constexpr size_t __new_width = _NewN == 0 ? _Np : _NewN;
+    return __idx_permute_impl<__new_width>(
+	     __x, __perm, std::make_integer_sequence<int, __new_width>());
+  }
+
 // __vector_permute{{{
 // Index == -1 requests zeroing of the output element
 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
@@ -92,116 +143,6 @@ __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
     return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
   }
 
-// }}}
-// __shift_elements_right{{{
-// if (__shift % 2ⁿ == 0) => the low n Bytes are correct
-template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
-  _GLIBCXX_SIMD_INTRINSIC _Tp
-  __shift_elements_right(_Tp __v)
-  {
-    [[maybe_unused]] const auto __iv = __to_intrin(__v);
-    static_assert(__shift <= sizeof(_Tp));
-    if constexpr (__shift == 0)
-      return __v;
-    else if constexpr (__shift == sizeof(_Tp))
-      return _Tp();
-#if _GLIBCXX_SIMD_X86INTRIN // {{{
-    else if constexpr (__have_sse && __shift == 8
-		       && _TVT::template _S_is<float, 4>)
-      return _mm_movehl_ps(__iv, __iv);
-    else if constexpr (__have_sse2 && __shift == 8
-		       && _TVT::template _S_is<double, 2>)
-      return _mm_unpackhi_pd(__iv, __iv);
-    else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
-      return reinterpret_cast<typename _TVT::type>(
-	_mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
-    else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
-      {
-	/*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
-	  return _mm256_permute2f128_pd(__iv, __iv, 0x81);
-	else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
-	  return _mm256_permute2f128_ps(__iv, __iv, 0x81);
-	else if constexpr (__have_avx)
-	  return reinterpret_cast<typename _TVT::type>(
-	    _mm256_permute2f128_si256(__iv, __iv, 0x81));
-	else*/
-	return __zero_extend(__hi128(__v));
-      }
-    else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
-      {
-	const auto __vll = __vector_bitcast<_LLong>(__v);
-	return reinterpret_cast<typename _TVT::type>(
-	  _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
-			     __vll, __shift));
-      }
-    else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
-      {
-	const auto __vll = __vector_bitcast<_LLong>(__v);
-	return reinterpret_cast<typename _TVT::type>(
-	  __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
-		   _mm_srli_si128(__hi128(__vll), __shift)));
-      }
-    else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
-      return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
-    else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
-      return __zero_extend(__hi256(__v));
-    else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
-      {
-	if constexpr (__shift >= 48)
-	  return __zero_extend(
-		   __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
-	else if constexpr (__shift >= 32)
-	  return __zero_extend(
-		   __shift_elements_right<__shift - 32>(__hi256(__v)));
-	else if constexpr (__shift % 8 == 0)
-	  return reinterpret_cast<typename _TVT::type>(
-		   _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
				       __shift / 8));
-	else if constexpr (__shift % 4 == 0)
-	  return reinterpret_cast<typename _TVT::type>(
-		   _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
				       __shift / 4));
-	else if constexpr (__have_avx512bw && __shift < 16)
-	  {
-	    const auto __vll = __vector_bitcast<_LLong>(__v);
-	    return reinterpret_cast<typename _TVT::type>(
-		     _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
					__vll, __shift));
-	  }
-	else if constexpr (__have_avx512bw && __shift < 32)
-	  {
-	    const auto __vll = __vector_bitcast<_LLong>(__v);
-	    return reinterpret_cast<typename _TVT::type>(
-		     _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
					_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
					__shift - 16));
-	  }
-	else
-	  __assert_unreachable<_Tp>();
-      }
-    /*
-    } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
-      return __auto_bitcast(__extract<__shift / 16, 4>(__v));
-    */
-#endif // _GLIBCXX_SIMD_X86INTRIN }}}
-    else
-      {
-	constexpr int __chunksize = __shift % 8 == 0   ? 8
				    : __shift % 4 == 0 ? 4
				    : __shift % 2 == 0 ? 2
						       : 1;
-	auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
-	using _Up = decltype(__w);
-	return __intrin_bitcast<_Tp>(
-	  __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
-	    [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
-	      return _Up{__chunks...};
-	    }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
-	      return __w[__shift / __chunksize + __i];
-	    }));
-      }
-  }
-
 // }}}
 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
@@ -209,66 +150,13 @@ __shift_elements_right(_Tp __v)
   _SimdWrapper<_Tp, _Np / _Total * _Combine>
   __extract_part(const _SimdWrapper<_Tp, _Np> __x)
   {
-    if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
-      return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
-    else
-      {
-	constexpr size_t __values_per_part = _Np / _Total;
-	constexpr size_t __values_to_skip = _Index * __values_per_part;
-	constexpr size_t __return_size = __values_per_part * _Combine;
-	using _R = __vector_type_t<_Tp, __return_size>;
-	static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
			<= sizeof(__x),
		      "out of bounds __extract_part");
-	// the following assertion would ensure no "padding" to be read
-	// static_assert(_Total >= _Index + _Combine, "_Total must be greater
-	// than _Index");
-
-	// static_assert(__return_size * _Total == _Np, "_Np must be divisible
-	// by _Total");
-	if (__x._M_is_constprop())
-	  return __generate_from_n_evaluations<__return_size, _R>(
		   [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
		     return __x[__values_to_skip + __i];
		   });
-	if constexpr (_Index == 0 && _Total == 1)
-	  return __x;
-	else if constexpr (_Index == 0)
-	  return __intrin_bitcast<_R>(__as_vector(__x));
-#if _GLIBCXX_SIMD_X86INTRIN // {{{
-	else if constexpr (sizeof(__x) == 32
			   && __return_size * sizeof(_Tp) <= 16)
-	  {
-	    constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
-	    if constexpr (__bytes_to_skip == 16)
-	      return __vector_bitcast<_Tp, __return_size>(
		       __hi128(__as_vector(__x)));
-	    else
-	      return __vector_bitcast<_Tp, __return_size>(
		       _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
				       __lo128(__vector_bitcast<_LLong>(__x)),
				       __bytes_to_skip));
-	  }
-#endif // _GLIBCXX_SIMD_X86INTRIN }}}
-	else if constexpr (_Index > 0
			   && (__values_to_skip % __return_size != 0
			       || sizeof(_R) >= 8)
			   && (__values_to_skip + __return_size) * sizeof(_Tp)
				<= 64
			   && sizeof(__x) >= 16)
-	  return __intrin_bitcast<_R>(
		   __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
		     __as_vector(__x)));
-	else
-	  {
-	    _R __r = {};
-	    __builtin_memcpy(&__r,
			     reinterpret_cast<const char*>(&__x)
			       + sizeof(_Tp) * __values_to_skip,
			     __return_size * sizeof(_Tp));
-	    return __r;
-	  }
-      }
+    constexpr int __values_per_part = _Np / _Total;
+    constexpr int __return_size = __values_per_part * _Combine;
+    constexpr int __values_to_skip = _Index * __values_per_part;
+    return __idx_permute<__return_size>(__x, [](int __i) constexpr -> int {
+	     const unsigned __j = __values_to_skip + __i;
+	     return __j < _Np ? __j : __idx_permute_dontcare;
+	   });
   }
 
 // }}}
@@ -1921,16 +1809,22 @@ for (size_t __i = 1; __i < _Np; ++__i)
	  if constexpr (_Np >= 4)
	    {
	      using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
+	      auto __y = __vector_bitcast<_Up>(__data(__x));
+	      __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(
+		      __vector_bitcast<_Tp>(__vector_permute<3, 2, 1, 0>(__y))));
+	      __y = __vector_bitcast<_Up>(__data(__x));
+	      __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(
+		      __vector_bitcast<_Tp>(__vector_permute<1, 0, 3, 2>(__y))));
+	      return __x[0];
+	    }
+	  else
+	    {
+	      using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
	      const auto __y = __vector_bitcast<_Up>(__data(__x));
-	      __x = __binary_op(__x,
				_M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
				  __vector_permute<3, 2, 1, 0>(__y))));
+	      __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
				       __vector_permute<1, 1>(__y))));
+	      return __x[0];
	    }
-	  using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
-	  const auto __y = __vector_bitcast<_Up>(__data(__x));
-	  __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
				   __vector_permute<1, 1>(__y))));
-	  return __x[0];
	}
      //}}}
      else
	{
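
P.S. for archive readers: the essence of the new code paths, as a standalone
sketch using plain GCC vector extensions. Names are simplified and the code
is hypothetical, not the library implementation; __builtin_shufflevector
requires GCC >= 12, which is why gcc-11 can't take this backport as-is:

  typedef int v8si __attribute__((vector_size(32)));
  typedef int v4si __attribute__((vector_size(16)));

  // Like __extract_part<Index, 2>: return half number Index of x. Shuffle
  // indices must be integer constant expressions, one per result element;
  // -1 means "don't care" (cf. __idx_permute_dontcare above).
  template <int Index>
    v4si
    extract_half(v8si x)
    {
      constexpr int skip = Index * 4;
      return __builtin_shufflevector(x, v8si{},
				     skip, skip + 1, skip + 2, skip + 3);
    }

  // The reworked _S_reduce likewise becomes log2(N) permute+op steps that
  // never leave the original element width (sketched for int, N == 4):
  inline int
  reduce_add(v4si x)
  {
    x += __builtin_shufflevector(x, x, 3, 2, 1, 0); // reverse
    x += __builtin_shufflevector(x, x, 1, 0, 3, 2); // swap adjacent pairs
    return x[0];                                    // now holds the total
  }

  int main()
  {
    v8si x = {0, 1, 2, 3, 4, 5, 6, 7};
    v4si hi = extract_half<1>(x);        // {4, 5, 6, 7}
    return reduce_add(hi) == 22 ? 0 : 1; // exits 0
  }

No target-specific intrinsics, no cast through a 64-bit element type, so the
same lowering works on non-VSX POWER.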