diff --git a/include/eve/arch/cpu/logical_wide.hpp b/include/eve/arch/cpu/logical_wide.hpp index c6d8eba651..8db52ba8c2 100644 --- a/include/eve/arch/cpu/logical_wide.hpp +++ b/include/eve/arch/cpu/logical_wide.hpp @@ -125,11 +125,11 @@ namespace eve //! Constructs a eve::logical by splatting a scalar value in all lanes template EVE_FORCEINLINE explicit logical(S v) noexcept - : storage_base(detail::make(eve::as{}, v)) {} + : storage_base(make(eve::as{}, v)) {} //! Construct from a `bool` EVE_FORCEINLINE explicit logical(bool v) noexcept - : storage_base(detail::make(eve::as{}, v)) {} + : storage_base(make(eve::as{}, v)) {} //! Constructs a eve::logical from a sequence of scalar values of proper size template @@ -138,7 +138,7 @@ namespace eve && (... && std::convertible_to>) && (Cardinal::value == 2 + sizeof...(Ts)) ) - : storage_base(detail::make(eve::as{}, v0, v1, vs...)) + : storage_base(make(eve::as{}, v0, v1, vs...)) {} //============================================================================================== diff --git a/include/eve/arch/cpu/wide.hpp b/include/eve/arch/cpu/wide.hpp index 52e921eb14..6a2f16c1a8 100644 --- a/include/eve/arch/cpu/wide.hpp +++ b/include/eve/arch/cpu/wide.hpp @@ -143,7 +143,7 @@ namespace eve template requires std::constructible_from EVE_FORCEINLINE explicit wide(S const& v) noexcept - : storage_base(detail::make(eve::as{}, Type(v))) + : storage_base(make(eve::as{}, Type(v))) {} //! Constructs a eve::wide from a sequence of scalar values of proper size @@ -153,7 +153,7 @@ namespace eve && std::is_convertible_v && (std::is_convertible_v && ... && std::is_convertible_v) ) - : storage_base(detail::make(eve::as {}, + : storage_base(make(eve::as {}, static_cast(v0), static_cast(v1), static_cast(vs)...)) diff --git a/include/eve/detail/function/make.hpp b/include/eve/detail/function/make.hpp index f8d18c9252..2fd24b468e 100644 --- a/include/eve/detail/function/make.hpp +++ b/include/eve/detail/function/make.hpp @@ -8,19 +8,30 @@ #pragma once #include -#include +#include -#if defined(EVE_INCLUDE_X86_HEADER) -# include -#endif +namespace eve +{ + template + struct make_t : callable + { + template + EVE_FORCEINLINE constexpr auto operator()(as tgt, T... vs) const noexcept + { + return EVE_DISPATCH_CALL(tgt, vs...); + } -#if defined(EVE_INCLUDE_POWERPC_HEADER) -# include -#endif + EVE_CALLABLE_OBJECT(make_t, make_); + }; -#if defined(EVE_INCLUDE_ARM_HEADER) -# include -#endif + inline constexpr auto make = functor; +} + +#include + +// #if defined(EVE_INCLUDE_X86_HEADER) +// # include +// #endif #if defined(EVE_INCLUDE_SVE_HEADER) # include diff --git a/include/eve/detail/function/simd/arm/neon/make.hpp b/include/eve/detail/function/simd/arm/neon/make.hpp deleted file mode 100644 index 3f5ffb986c..0000000000 --- a/include/eve/detail/function/simd/arm/neon/make.hpp +++ /dev/null @@ -1,95 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include - -namespace eve::detail -{ - //================================================================================================ - // Arithmetic cases - //================================================================================================ - template struct neon_maker - { - using abi_type = abi_t; - template auto operator()(Vs... vs) const - { - using type = as_register_t, abi_type>; - type that {static_cast(vs)...}; - return that; - } - - template static U val(U u, int i) - { - if constexpr( std::same_as ) return (i < Size::value) ? u : U{0}; - else return u; - } - - template auto operator()(V v) const - { - auto impl = [&](auto... I) - { - using type = as_register_t, abi_type>; - auto u = static_cast(v); - return type {val(u, I)...}; - }; - - return apply>(impl); - } - }; - - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires arm_abi> - { - return neon_maker {}(vs...); - } - - //================================================================================================ - // Logical cases - //================================================================================================ - template struct neon_maker, Size> - { - using abi_type = abi_t; - - template auto operator()(Vs... vs) const - { - using type = as_logical_register_t, abi_type>; - type that {logical(vs).bits()...}; - return that; - } - - template static U val(U u, U z, int i) - { - if constexpr( std::same_as ) return (i < Size::value) ? u : z; - else return u; - } - - template auto operator()(V v) const - { - auto impl = [&](auto... I) { - using type = as_logical_register_t, abi_type>; - - auto u = logical(v).bits(); - auto z = logical(false).bits(); - return type {val(u,z,I)...}; - }; - - return apply>(impl); - } - }; - - template - EVE_FORCEINLINE auto make(eve::as>> const &, Vs... vs) noexcept - requires arm_abi> - { - return neon_maker, N> {}(vs...); - } -} diff --git a/include/eve/detail/function/simd/arm/sve/make.hpp b/include/eve/detail/function/simd/arm/sve/make.hpp index 002f9eded4..30ae88d574 100644 --- a/include/eve/detail/function/simd/arm/sve/make.hpp +++ b/include/eve/detail/function/simd/arm/sve/make.hpp @@ -18,95 +18,81 @@ namespace eve::detail { -//============================s==================================================================== -// Enumerated make -//================================================================================================ -template -EVE_FORCEINLINE auto -make(eve::as>, Vs... vs) noexcept -requires sve_abi> -{ - static_assert(sizeof...(Vs) == N::value, "[eve::make] - Invalid number of arguments"); - if constexpr( wide::size() < eve::fundamental_cardinal_v ) +template +requires sve_abi> && (N::value > 1) +EVE_FORCEINLINE auto make_(EVE_REQUIRES(sve_), O const&, as> tgt, V0 v, Vs... vs) noexcept +{ + if constexpr (sizeof...(Vs) == 0) { - return [&](std::index_sequence) + // This may be suboptimal, we a one instruction iota on sve + if constexpr(N::value < fundamental_cardinal_t::value) { - return make(as>> {}, vs..., ((void)i, 0)...); + // Use svdup then mask using optimized iota comparison + return wide{v} & (iota(as>>{}) < N::value).mask(); } - (std::make_index_sequence - N::value> {}); - } - else - { - std::array on_stack {static_cast(vs)...}; - return load(on_stack.data(), N {}); - } -} + else + { + constexpr auto c = categorize>(); -//================================================================================================ -// splat make -//================================================================================================ -template -EVE_FORCEINLINE auto -make(eve::as>, T x) noexcept -requires sve_abi> && (N::value > 1) -{ - // This may be suboptimal, we a one instruction iota on sve - if constexpr( N::value < eve::fundamental_cardinal_v ) - { - // Use svdup then mask using optimized iota comparison - return wide{x} & (iota(as>>{}) < N::value).mask(); + if constexpr( match(c, category::int8) ) return svdup_s8(v); + else if constexpr( match(c, category::uint8) ) return svdup_u8(v); + else if constexpr( match(c, category::int16) ) return svdup_s16(v); + else if constexpr( match(c, category::uint16) ) return svdup_u16(v); + else if constexpr( match(c, category::int32) ) return svdup_s32(v); + else if constexpr( match(c, category::uint32) ) return svdup_u32(v); + else if constexpr( match(c, category::int64) ) return svdup_s64(v); + else if constexpr( match(c, category::uint64) ) return svdup_u64(v); + else if constexpr( match(c, category::float32) ) return svdup_f32(v); + else if constexpr( match(c, category::float64) ) return svdup_f64(v); + } } else { - constexpr auto c = categorize>(); + static_assert(sizeof...(Vs) == N::value, "[eve::make] - Invalid number of arguments"); - if constexpr( match(c, category::int8) ) return svdup_s8(x); - else if constexpr( match(c, category::uint8) ) return svdup_u8(x); - else if constexpr( match(c, category::int16) ) return svdup_s16(x); - else if constexpr( match(c, category::uint16) ) return svdup_u16(x); - else if constexpr( match(c, category::int32) ) return svdup_s32(x); - else if constexpr( match(c, category::uint32) ) return svdup_u32(x); - else if constexpr( match(c, category::int64) ) return svdup_s64(x); - else if constexpr( match(c, category::uint64) ) return svdup_u64(x); - else if constexpr( match(c, category::float32) ) return svdup_f32(x); - else if constexpr( match(c, category::float64) ) return svdup_f64(x); + if constexpr( wide::size() < eve::fundamental_cardinal_v ) + { + return [&](std::index_sequence) + { + return make(as>> {}, vs..., ((void)i, 0)...); + } + (std::make_index_sequence - N::value> {}); + } + else + { + std::array on_stack {static_cast(vs)...}; + return load(on_stack.data(), N {}); + } } } -//================================================================================================ -// logical cases -//================================================================================================ -template -EVE_FORCEINLINE auto -make(as>>, Vs... vs) noexcept -requires sve_abi> -{ - using bits_type = typename logical>::bits_type; - using e_t = element_type_t; - - auto bits = make(as {}, (vs ? (e_t)-1 : 0)...); - return svcmpne(sve_true(), bits, (e_t)0); -} - -template -EVE_FORCEINLINE auto -make(eve::as>>, V x) noexcept +template requires sve_abi> && (N::value > 1) +EVE_FORCEINLINE auto make_(EVE_REQUIRES(sve_), O const&, as>> tgt, V0 v, Vs... vs) noexcept { - using f_t = fundamental_cardinal_t; - - if constexpr( N::value < f_t::value ) + if constexpr (sizeof...(Vs) == 0) { - // Use svdup then mask using optimized iota comparison - return logical>{(bool)x} && (iota(as>>{}) < N::value); + if constexpr(N::value < fundamental_cardinal_t::value) + { + // Use svdup then mask using optimized iota comparison + return logical>{(bool) v} && (iota(as>>{}) < N::value); + } + else + { + if constexpr( sizeof(T) == 1 ) return svdup_b8(v); + else if constexpr( sizeof(T) == 2 ) return svdup_b16(v); + else if constexpr( sizeof(T) == 4 ) return svdup_b32(v); + else if constexpr( sizeof(T) == 8 ) return svdup_b64(v); + } } else { - if constexpr( sizeof(T) == 1 ) return svdup_b8(x); - else if constexpr( sizeof(T) == 2 ) return svdup_b16(x); - else if constexpr( sizeof(T) == 4 ) return svdup_b32(x); - else if constexpr( sizeof(T) == 8 ) return svdup_b64(x); + using bits_type = typename logical>::bits_type; + using e_t = element_type_t; + + auto bits = make(as {}, (vs ? (e_t)-1 : 0)...); + return svcmpne(sve_true(), bits, (e_t)0); } } diff --git a/include/eve/detail/function/simd/common/make.hpp b/include/eve/detail/function/simd/common/make.hpp index 0dd0bb5cbd..87f2b7da99 100644 --- a/include/eve/detail/function/simd/common/make.hpp +++ b/include/eve/detail/function/simd/common/make.hpp @@ -16,7 +16,7 @@ namespace eve::detail { //================================================================================================ - // Emulation + // Emulation Helpers //================================================================================================ template EVE_FORCEINLINE auto make_emulated(V0 v0, Vs... vs) noexcept @@ -36,53 +36,8 @@ namespace eve::detail }(std::make_index_sequence{}); } - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, emulated_> - { - if constexpr (has_plain_translation) - { - return bit_cast(make(eve::as, N>>{}, translate(vs)...), as>{}); - } - else - { - return make_emulated>(vs...); - } - } - - template - EVE_FORCEINLINE auto make(eve::as>> const &, Vs... vs) noexcept - requires std::same_as, emulated_> - { - if constexpr (has_plain_translation) - { - return bit_cast(make(eve::as, N>>>{}, translate(vs)...), as>>{}); - } - else - { - return make_emulated>>(vs...); - } - } - //================================================================================================ - // Bundle - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, bundle_> - { - using kumi::get; - typename wide::storage_type that; - - kumi::for_each_index( [&](I, M& m) { m = M{ get(vs)... }; } - , that - ); - - return that; - } - - //================================================================================================ - // Aggregation + // Aggregation Helpers //================================================================================================ template EVE_FORCEINLINE Pack make_aggregated(V0 v0, Vs... vs) noexcept @@ -108,17 +63,51 @@ namespace eve::detail return that; } - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, aggregated_> + template + EVE_FORCEINLINE constexpr auto make_(EVE_REQUIRES(cpu_), O const&, as tgt, T0 v, TS... vs) noexcept { - return make_aggregated>(vs...); - } + using type = typename Target::storage_type; + using v_type = typename Target::value_type; - template - EVE_FORCEINLINE auto make(eve::as>> const &, Vs... vs) noexcept - requires std::same_as, aggregated_> - { - return make_aggregated>>(vs...); + if constexpr (has_aggregated_abi_v) + { + return make_aggregated(v, vs...); + } + else if constexpr (has_emulated_abi_v) + { + return make_emulated(v, vs...); + } + else if constexpr (kumi::product_type) + { + using kumi::get; + type that; + + kumi::for_each_index( [&](I, M& m) { m = M{ get(v), get(vs)... }; } + , that + ); + + return that; + } + else + { + if constexpr (sizeof...(vs) == 0) + { + return [&](std::index_sequence const&) + { + auto val = [](auto vv, auto) + { + if constexpr (logical_value) return as_logical_t(vv).bits(); + else return vv; + }; + + return type { val(v, N)... }; + }(std::make_index_sequence()); + } + else + { + if constexpr (logical_value) return type {as_logical_t(v).bits(), as_logical_t(vs).bits()...}; + else return type {v, vs...}; + } + } } } diff --git a/include/eve/detail/function/simd/ppc/make.hpp b/include/eve/detail/function/simd/ppc/make.hpp deleted file mode 100644 index 119fe903d5..0000000000 --- a/include/eve/detail/function/simd/ppc/make.hpp +++ /dev/null @@ -1,67 +0,0 @@ -//================================================================================================== -/* - EVE - Expressive Vector Engine - Copyright : EVE Project Contributors - SPDX-License-Identifier: BSL-1.0 -*/ -//================================================================================================== -#pragma once - -#include -#include -#include -#include - -namespace eve::detail -{ - //================================================================================================ - // arithmetic cases - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires ppc_abi> - { - using type = as_register_t; - type that = {static_cast(vs)...}; - return that; - } - - template - EVE_FORCEINLINE auto make(eve::as> const &, V v) noexcept - requires ppc_abi> - { - using type = as_register_t; - - return [&](std::index_sequence const&) - { - auto val = [](auto vv, auto) { return vv; }; - return type { val(v, N)... }; - }(std::make_index_sequence()); - } - - //================================================================================================ - // logical cases - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as>> const &, Vs... vs) noexcept - requires ppc_abi> - { - using type = as_logical_register_t; - type that = {logical(vs).bits()...}; - return that; - } - - template - EVE_FORCEINLINE auto make(eve::as>> const &, V v) noexcept - requires ppc_abi> - { - using type = as_logical_register_t; - - return [&](std::index_sequence const&) - { - auto u = logical(v).bits(); - auto val = [](auto vv, auto) { return vv; }; - return type { val(u, N)... }; - }(std::make_index_sequence()); - } -} diff --git a/include/eve/detail/function/simd/x86/make.hpp b/include/eve/detail/function/simd/x86/make.hpp index 7a576d5a43..71948a0735 100644 --- a/include/eve/detail/function/simd/x86/make.hpp +++ b/include/eve/detail/function/simd/x86/make.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -21,185 +22,167 @@ namespace eve::detail { - //================================================================================================ - // enumerated make - 128bits - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, x86_128_> + template + requires x86_abi> + EVE_FORCEINLINE auto make_(EVE_REQUIRES(sse2_), O const&, as> tgt, V0 v, Vs... vs) noexcept { - static_assert ( sizeof...(Vs) <= S::value - , "[eve::make] - Invalid number of arguments" - ); + static_assert((sizeof...(Vs) + 1) <= N::value, "[eve::make] - Invalid number of arguments"); - constexpr auto c = categorize>(); + constexpr auto c = categorize>(); + using abi = abi_t; - if constexpr( c == category::float64x2) return _mm_setr_pd(static_cast(vs)...); - else if constexpr( c == category::float32x4) - { - return [&](std::index_sequence const&) - { - return _mm_setr_ps(vs..., (N ? 0:0)...); - }(std::make_index_sequence<4 - sizeof...(vs)>()); - } - else if constexpr( match(c,category::int8x16, category::uint8x16) ) - { - return [&](std::index_sequence const&) - { - return _mm_setr_epi8(vs..., (N ? 0:0)...); - }(std::make_index_sequence<16 - sizeof...(vs)>()); - } - else if constexpr( match(c,category::int16x8, category::uint16x8) ) + if constexpr (sizeof...(vs) == 0) { - return [&](std::index_sequence const&) + // splat make + if constexpr(wide::size() < eve::fundamental_cardinal_v) { - return _mm_setr_epi16(vs..., (N ? 0:0)...); - }(std::make_index_sequence<8 - sizeof...(vs)>()); - } - else if constexpr( match(c,category::int32x4, category::uint32x4) ) - { - return [&](std::index_sequence const&) + return [&](std::index_sequence const&) + { + return make(as>>{}, (I < N::value ? v : 0)...); + }(std::make_index_sequence>()); + } + else { - return _mm_setr_epi32(vs..., (N ? 0:0)...); - }(std::make_index_sequence<4 - sizeof...(vs)>()); + constexpr auto c = categorize>(); + + if constexpr( c == category::float64x8 ) return _mm512_set1_pd(v); + else if constexpr( c == category::float64x4 ) return _mm256_set1_pd(v); + else if constexpr( c == category::float64x2 ) return _mm_set1_pd(v); + else if constexpr( c == category::float32x16 ) return _mm512_set1_ps(v); + else if constexpr( c == category::float32x8 ) return _mm256_set1_ps(v); + else if constexpr( c == category::float32x4 ) return _mm_set1_ps(v); + else if constexpr( match(c,category::int8x64 , category::uint8x64) ) return _mm512_set1_epi8(v); + else if constexpr( match(c,category::int8x32 , category::uint8x32) ) return _mm256_set1_epi8(v); + else if constexpr( match(c,category::int8x16 , category::uint8x16) ) return _mm_set1_epi8(v); + else if constexpr( match(c,category::int16x32, category::uint16x32) ) return _mm512_set1_epi16(v); + else if constexpr( match(c,category::int16x16, category::uint16x16) ) return _mm256_set1_epi16(v); + else if constexpr( match(c,category::int16x8 , category::uint16x8) ) return _mm_set1_epi16(v); + else if constexpr( match(c,category::int32x16, category::uint32x16) ) return _mm512_set1_epi32(v); + else if constexpr( match(c,category::int32x8 , category::uint32x8) ) return _mm256_set1_epi32(v); + else if constexpr( match(c,category::int32x4 , category::uint32x4) ) return _mm_set1_epi32(v); + else if constexpr( match(c,category::int64x8 , category::uint64x8) ) return _mm512_set1_epi64(v); + else if constexpr( match(c,category::int64x4 , category::uint64x4) ) return _mm256_set1_epi64x(v); + else if constexpr( match(c,category::int64x2 , category::uint64x2) ) + { + [[maybe_unused]] __m128i that; + T *ptr = reinterpret_cast*>(&that); + ptr[0] = ptr[1] = static_cast(v); + return that; + } + } } - else if constexpr( match(c,category::int64x2, category::uint64x2) ) + else // sizeof...(vs) > 0 { - [[maybe_unused]] __m128i that; + // ############### + // enumerated make + // ############### - T *ptr = reinterpret_cast *>(&that); - T d[] = {static_cast(vs)...}; - - ptr[0] = d[0]; - ptr[1] = d[1]; - - return that; - } - } + // 128bits + if constexpr (std::is_same_v) + { + if constexpr( c == category::float64x2) return _mm_setr_pd(v, static_cast(vs)...); + else if constexpr( c == category::float32x4) + { + return [&](std::index_sequence const&) + { + return _mm_setr_ps(v, vs..., (I ? 0 : 0)...); + }(std::make_index_sequence<4 - sizeof...(vs) - 1>()); + } + else if constexpr( match(c,category::int8x16, category::uint8x16) ) + { + return [&](std::index_sequence const&) + { + return _mm_setr_epi8(v, vs..., (I ? 0 : 0)...); + }(std::make_index_sequence<16 - sizeof...(vs) - 1>()); + } + else if constexpr( match(c,category::int16x8, category::uint16x8) ) + { + return [&](std::index_sequence const&) + { + return _mm_setr_epi16(v, vs..., (I ? 0 : 0)...); + }(std::make_index_sequence<8 - sizeof...(vs) - 1>()); + } + else if constexpr( match(c,category::int32x4, category::uint32x4) ) + { + return [&](std::index_sequence const&) + { + return _mm_setr_epi32(v, vs..., (I ? 0 : 0)...); + }(std::make_index_sequence<4 - sizeof...(vs) - 1>()); + } + else if constexpr( match(c,category::int64x2, category::uint64x2) ) + { + [[maybe_unused]] __m128i that; + + T *ptr = reinterpret_cast *>(&that); + T d[] = { v, static_cast(vs)... }; + + ptr[0] = d[0]; + ptr[1] = d[1]; + + return that; + } + } - //================================================================================================ - // enumerated make - 256bits - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, x86_256_> - { - static_assert ( sizeof...(Vs) <= S::value - , "[eve::make] - Invalid number of arguments" - ); - - constexpr auto c = categorize>(); - - if constexpr( c == category::float64x4) return _mm256_setr_pd(vs...); - else if constexpr( c == category::float32x8) return _mm256_setr_ps(vs...); - else if constexpr( sizeof(T) == 1) return _mm256_setr_epi8(vs...); - else if constexpr( sizeof(T) == 2) return _mm256_setr_epi16(vs...); - else if constexpr( sizeof(T) == 4) return _mm256_setr_epi32(vs...); - else if constexpr( sizeof(T) == 8) return _mm256_setr_epi64x(vs...); - } + // 256 bits + else if constexpr (std::is_same_v) + { + if constexpr( c == category::float64x4) return _mm256_setr_pd(v, vs...); + else if constexpr( c == category::float32x8) return _mm256_setr_ps(v, vs...); + else if constexpr( sizeof(T) == 1) return _mm256_setr_epi8(v, vs...); + else if constexpr( sizeof(T) == 2) return _mm256_setr_epi16(v, vs...); + else if constexpr( sizeof(T) == 4) return _mm256_setr_epi32(v, vs...); + else if constexpr( sizeof(T) == 8) return _mm256_setr_epi64x(v, vs...); + } - //================================================================================================ - // enumerated make - 512bits - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const &, Vs... vs) noexcept - requires std::same_as, x86_512_> - { - static_assert ( sizeof...(Vs) <= S::value - , "[eve::make] - Invalid number of arguments" - ); - - constexpr auto c = categorize>(); - - /* - Please take a minute to acknowledge the effect of deciding _mm512_setr should be - a macro on g++. Thanks, I hate it - - Press F for respect. - */ - if constexpr( c == category::float64x8) - return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7) - { return _mm512_setr_pd(a0,a1,a2,a3,a4,a5,a6,a7); }(vs...); - else if constexpr( c == category::float32x16) - return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 - , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 - ) - { return _mm512_setr_ps(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7); }(vs...); - else if constexpr( sizeof(T) == 8) - return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7) - { return _mm512_setr_epi64(a0,a1,a2,a3,a4,a5,a6,a7); }(vs...); - else if constexpr( sizeof(T) == 4) - return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 - , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 - ) - { return _mm512_setr_epi32(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7); }(vs...); - else if constexpr( sizeof(T) == 2) - return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 - , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 - , auto c0,auto c1,auto c2,auto c3, auto c4,auto c5,auto c6,auto c7 - , auto d0,auto d1,auto d2,auto d3, auto d4,auto d5,auto d6,auto d7 - ) - { return _mm512_set_epi16 ( d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7, - b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0 - ); }(vs...); - else if constexpr( sizeof(T) == 1) + // 512 bits + else if constexpr (std::is_same_v) + { + /* + Please take a minute to acknowledge the effect of deciding _mm512_setr should be + a macro on g++. Thanks, I hate it + + Press F for respect. + */ + if constexpr( c == category::float64x8) + return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7) + { return _mm512_setr_pd(a0,a1,a2,a3,a4,a5,a6,a7); }(v, vs...); + else if constexpr( c == category::float32x16) + return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 + , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 + ) + { return _mm512_setr_ps(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7); }(v, vs...); + else if constexpr( sizeof(T) == 8) + return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7) + { return _mm512_setr_epi64(a0,a1,a2,a3,a4,a5,a6,a7); }(v, vs...); + else if constexpr( sizeof(T) == 4) + return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 + , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 + ) + { return _mm512_setr_epi32(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7); }(v, vs...); + else if constexpr( sizeof(T) == 2) return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 , auto c0,auto c1,auto c2,auto c3, auto c4,auto c5,auto c6,auto c7 , auto d0,auto d1,auto d2,auto d3, auto d4,auto d5,auto d6,auto d7 - , auto e0,auto e1,auto e2,auto e3, auto e4,auto e5,auto e6,auto e7 - , auto f0,auto f1,auto f2,auto f3, auto f4,auto f5,auto f6,auto f7 - , auto g0,auto g1,auto g2,auto g3, auto g4,auto g5,auto g6,auto g7 - , auto h0,auto h1,auto h2,auto h3, auto h4,auto h5,auto h6,auto h7 ) - { return _mm512_set_epi8( h7,h6,h5,h4,h3,h2,h1,h0,g7,g6,g5,g4,g3,g2,g1,g0, - f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0, - d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7, - b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0 - ); }(vs...); - } - - //================================================================================================ - // splat make - //================================================================================================ - template - EVE_FORCEINLINE auto make(eve::as> const&, V v) noexcept - requires x86_abi> - { - if constexpr(wide::size() < eve::fundamental_cardinal_v) - { - return [&](std::index_sequence const&) - { - return make(as>>{}, (N>()); - } - else - { - constexpr auto c = categorize>(); - - if constexpr( c == category::float64x8 ) return _mm512_set1_pd(v); - else if constexpr( c == category::float64x4 ) return _mm256_set1_pd(v); - else if constexpr( c == category::float64x2 ) return _mm_set1_pd(v); - else if constexpr( c == category::float32x16 ) return _mm512_set1_ps(v); - else if constexpr( c == category::float32x8 ) return _mm256_set1_ps(v); - else if constexpr( c == category::float32x4 ) return _mm_set1_ps(v); - else if constexpr( match(c,category::int8x64 , category::uint8x64) ) return _mm512_set1_epi8(v); - else if constexpr( match(c,category::int8x32 , category::uint8x32) ) return _mm256_set1_epi8(v); - else if constexpr( match(c,category::int8x16 , category::uint8x16) ) return _mm_set1_epi8(v); - else if constexpr( match(c,category::int16x32, category::uint16x32)) return _mm512_set1_epi16(v); - else if constexpr( match(c,category::int16x16, category::uint16x16)) return _mm256_set1_epi16(v); - else if constexpr( match(c,category::int16x8 , category::uint16x8) ) return _mm_set1_epi16(v); - else if constexpr( match(c,category::int32x16, category::uint32x16)) return _mm512_set1_epi32(v); - else if constexpr( match(c,category::int32x8 , category::uint32x8) ) return _mm256_set1_epi32(v); - else if constexpr( match(c,category::int32x4 , category::uint32x4) ) return _mm_set1_epi32(v); - else if constexpr( match(c,category::int64x8 , category::uint64x8) ) return _mm512_set1_epi64(v); - else if constexpr( match(c,category::int64x4 , category::uint64x4) ) return _mm256_set1_epi64x(v); - else if constexpr( match(c,category::int64x2 , category::uint64x2) ) - { - [[maybe_unused]] __m128i that; - T *ptr = reinterpret_cast *>(&that); - ptr[0] = ptr[1] = static_cast(v); - return that; + { return _mm512_set_epi16 ( d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7, + b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0 + ); }(v, vs...); + else if constexpr( sizeof(T) == 1) + return []( auto a0,auto a1,auto a2,auto a3, auto a4,auto a5,auto a6,auto a7 + , auto b0,auto b1,auto b2,auto b3, auto b4,auto b5,auto b6,auto b7 + , auto c0,auto c1,auto c2,auto c3, auto c4,auto c5,auto c6,auto c7 + , auto d0,auto d1,auto d2,auto d3, auto d4,auto d5,auto d6,auto d7 + , auto e0,auto e1,auto e2,auto e3, auto e4,auto e5,auto e6,auto e7 + , auto f0,auto f1,auto f2,auto f3, auto f4,auto f5,auto f6,auto f7 + , auto g0,auto g1,auto g2,auto g3, auto g4,auto g5,auto g6,auto g7 + , auto h0,auto h1,auto h2,auto h3, auto h4,auto h5,auto h6,auto h7 + ) + { return _mm512_set_epi8( h7,h6,h5,h4,h3,h2,h1,h0,g7,g6,g5,g4,g3,g2,g1,g0, + f7,f6,f5,f4,f3,f2,f1,f0,e7,e6,e5,e4,e3,e2,e1,e0, + d7,d6,d5,d4,d3,d2,d1,d0,c7,c6,c5,c4,c3,c2,c1,c0,b7, + b6,b5,b4,b3,b2,b1,b0,a7,a6,a5,a4,a3,a2,a1,a0 + ); }(v, vs...); } } } @@ -207,44 +190,38 @@ namespace eve::detail //================================================================================================ // logical cases //================================================================================================ - template - EVE_FORCEINLINE auto make(as>> const &, Vs... vs) noexcept - requires x86_abi> + template + requires x86_abi> + EVE_FORCEINLINE auto make_(EVE_REQUIRES(sse2_), O const&, as>> tgt, V0 v, Vs... vs) noexcept { - if constexpr( !abi_t::is_wide_logical ) + if constexpr( !abi_t::is_wide_logical ) { - typename logical>::storage_type that{}; - [&](auto& v, std::index_sequence){ (( v |= vs?(1ULL<{}); - - return that; - } - else - { - return make(as> {}, logical(vs).mask()...); - } - } - - template - EVE_FORCEINLINE auto make(as>> const &, V v) noexcept - requires x86_abi> - { - if constexpr( !abi_t::is_wide_logical ) - { - using s_t = typename logical>::storage_type; + using s_t = typename logical>::storage_type; using i_t = typename s_t::type; - constexpr i_t false_bits = i_t{0}; - constexpr i_t true_bits = []{ - if constexpr ( S() < s_t::bits ) return i_t{ (1ULL << S::value) -1 }; - else return ~i_t{0}; - }(); + if constexpr (sizeof...(vs) == 0) + { + constexpr i_t false_bits = i_t{0}; + constexpr i_t true_bits = []{ + if constexpr ( N() < s_t::bits ) return i_t{ (1ULL << N::value) -1 }; + else return ~i_t{0}; + }(); - return s_t{ !!v ? true_bits : false_bits }; + return s_t{ !!v ? true_bits : false_bits }; + } + else + { + typename logical>::storage_type that{}; + that.value |= v ? 1ULL : 0; + [&](auto& val, std::index_sequence){ (( val |= vs ? (1ULL << (I + 1)) : 0), ...); } + (that.value, std::make_index_sequence{}); + + return that; + } } else { - return make(as> {}, logical(v).mask()); + return make(as> {}, logical(v).mask(), logical(vs).mask()...); } } }