diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 4a45e09aa..8c9073f19 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -78,22 +78,114 @@ namespace xsimd // enable for all SVE supported types template using sve_enable_all_t = std::enable_if_t::value, int>; + + // Trait describing the SVE types that correspond to a scalar, + // parameterised by (byte size, signedness, floating-point-ness). + // + // `scalar` is the matching fixed-width scalar (int8_t, ..., float, + // double). SVE load/store intrinsics are overloaded on these + // pointer types, so remapping integers through `scalar` avoids + // platform quirks such as darwin arm64's `long` vs `long long` + // distinction and rejects `char` as an element type. + // + // `sizeless` is the matching sizeless SVE type. xsimd stores SVE + // vectors as fixed-size attributed types (arm_sve_vector_bits), + // which clang treats as implicitly convertible to every sizeless + // SVE type — including multi-vector tuples — making the overloaded + // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting + // to `sizeless` first collapses the overload set to the single + // 1-vector candidate. + template + struct sve_type; + template <> + struct sve_type<1, true, false> + { + using scalar = int8_t; + using sizeless = svint8_t; + }; + template <> + struct sve_type<1, false, false> + { + using scalar = uint8_t; + using sizeless = svuint8_t; + }; + template <> + struct sve_type<2, true, false> + { + using scalar = int16_t; + using sizeless = svint16_t; + }; + template <> + struct sve_type<2, false, false> + { + using scalar = uint16_t; + using sizeless = svuint16_t; + }; + template <> + struct sve_type<4, true, false> + { + using scalar = int32_t; + using sizeless = svint32_t; + }; + template <> + struct sve_type<4, false, false> + { + using scalar = uint32_t; + using sizeless = svuint32_t; + }; + template <> + struct sve_type<8, true, false> + { + using scalar = int64_t; + using sizeless = svint64_t; + }; + template <> + struct sve_type<8, false, false> + { + using scalar = uint64_t; + using sizeless = svuint64_t; + }; + template <> + struct sve_type<4, true, true> + { + using scalar = float; + using sizeless = svfloat32_t; + }; + template <> + struct sve_type<8, true, true> + { + using scalar = double; + using sizeless = svfloat64_t; + }; + + template + using sve_type_for = sve_type::value, std::is_floating_point::value>; + + template + using sve_sizeless_t = typename sve_type_for::sizeless; + + // Remap integer Ts to their matching fixed-width counterpart (via + // sve_type::scalar) so svld1/svst1 see the pointer type their + // overload set expects; pass non-integer Ts through unchanged. + template >::value> + struct sve_fix_integer_impl + { + using type = T; + }; + template + struct sve_fix_integer_impl + { + using type = typename sve_type_for>::scalar; + }; + + template + using sve_fix_char_t = typename sve_fix_integer_impl::type; } // namespace detail /********* * Load * *********/ - namespace detail - { - // "char" is not allowed in SVE load/store operations - using sve_fix_char_t_impl = std::conditional_t::value, int8_t, uint8_t>; - - template - using sve_fix_char_t = std::conditional_t>::value, - sve_fix_char_t_impl, T>; - } - template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { @@ -108,7 +200,7 @@ namespace xsimd // load_masked template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept { return svld1(detail::sve_pmask(), reinterpret_cast const*>(mem)); } @@ -323,25 +415,25 @@ namespace xsimd template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(arg))); + return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(arg))); + return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(arg))); + return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { - return svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(arg))); + return svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); } template = 0> @@ -405,8 +497,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(lhs); - const auto rhs_bits = svreinterpret_u32(rhs); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } @@ -414,8 +506,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(lhs); - const auto rhs_bits = svreinterpret_u64(rhs); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } @@ -436,8 +528,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(lhs); - const auto rhs_bits = svreinterpret_u32(rhs); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } @@ -445,8 +537,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(lhs); - const auto rhs_bits = svreinterpret_u64(rhs); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } @@ -467,8 +559,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(lhs); - const auto rhs_bits = svreinterpret_u32(rhs); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } @@ -476,8 +568,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(lhs); - const auto rhs_bits = svreinterpret_u64(rhs); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } @@ -498,8 +590,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u32(lhs); - const auto rhs_bits = svreinterpret_u32(rhs); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } @@ -507,8 +599,8 @@ namespace xsimd template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { - const auto lhs_bits = svreinterpret_u64(lhs); - const auto rhs_bits = svreinterpret_u64(rhs); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } @@ -529,7 +621,7 @@ namespace xsimd template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - const auto arg_bits = svreinterpret_u32(arg); + const auto arg_bits = svreinterpret_u32(static_cast>(arg)); const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); return svreinterpret_f32(result_bits); } @@ -537,7 +629,7 @@ namespace xsimd template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { - const auto arg_bits = svreinterpret_u64(arg); + const auto arg_bits = svreinterpret_u64(static_cast>(arg)); const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); return svreinterpret_f64(result_bits); } @@ -557,25 +649,25 @@ namespace xsimd template XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept { - return svreinterpret_u8(arg); + return svreinterpret_u8(static_cast>(arg)); } template XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept { - return svreinterpret_u16(arg); + return svreinterpret_u16(static_cast>(arg)); } template XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept { - return svreinterpret_u32(arg); + return svreinterpret_u32(static_cast>(arg)); } template XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept { - return svreinterpret_u64(arg); + return svreinterpret_u64(static_cast>(arg)); } template > @@ -825,7 +917,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { - return svsel(cond, a, b); + return svsel(cond, static_cast>(a), static_cast>(b)); } template @@ -964,7 +1056,7 @@ namespace xsimd // create a predicate with only the I-th lane activated const auto iota = detail::sve_iota(); const auto index_predicate = svcmpeq(detail::sve_ptrue(), iota, static_cast>(I)); - return svsel(index_predicate, broadcast(val, sve {}), arg); + return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); } // first @@ -992,61 +1084,61 @@ namespace xsimd template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u8(arg); + return svreinterpret_u8(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s8(arg); + return svreinterpret_s8(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u16(arg); + return svreinterpret_u16(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s16(arg); + return svreinterpret_s16(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u32(arg); + return svreinterpret_u32(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s32(arg); + return svreinterpret_s32(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_u64(arg); + return svreinterpret_u64(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_s64(arg); + return svreinterpret_s64(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_f32(arg); + return svreinterpret_f32(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { - return svreinterpret_f64(arg); + return svreinterpret_f64(static_cast>(arg)); } // batch_bool_cast