diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 6ead88f99..24220a055 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -224,7 +224,8 @@ namespace xsimd template XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept { - alignas(A::alignment()) T buffer[batch, A>::size]; + using value_type = typename batch, A>::value_type; + alignas(A::alignment()) value_type buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[I]; } diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 441371643..82b41ff8d 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -748,6 +748,61 @@ namespace xsimd return self - batch(mask.data); } + // get + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 4; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m128 half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm256_castps256_ps128(self); + } + else + { + half = detail::upper_half((__m256)self); + } + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 2; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m128d half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm256_castpd256_pd128(self); + } + else + { + half = detail::upper_half((__m256d)self); + } + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 16 / sizeof(T); + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m128i half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm256_castsi256_si128(self); + } + else + { + half = detail::upper_half((__m256i)self); + } + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index fe8d33d99..ab3a532eb 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1346,6 +1346,61 @@ namespace xsimd } } + // get + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 8; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m256 half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm512_castps512_ps256(self); + } + else + { + half = detail::upper_half((__m512)self); + } + return kernel::get(batch(half), ::xsimd::index {}, avx {}); + } + + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 4; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m256d half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm512_castpd512_pd256(self); + } + else + { + half = detail::upper_half((__m512d)self); + } + return kernel::get(batch(half), ::xsimd::index {}, avx {}); + } + + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + constexpr size_t elements_per_lane = 32 / sizeof(T); + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + __m256i half; + XSIMD_IF_CONSTEXPR(lane == 0) + { + half = _mm512_castsi512_si256(self); + } + else + { + half = detail::upper_half((__m512i)self); + } + return kernel::get(batch(half), ::xsimd::index {}, avx {}); + } + // insert template XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4af19a650..8d33d5a9e 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -2742,6 +2742,61 @@ namespace xsimd return vshrq_n_s64(x, shift); } + // get + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_f32(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_u8(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_s8(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_u16(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_s16(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_u32(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_s32(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_u64(self, I); + } + + template = 0> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_s64(self, I); + } + // first template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index 602b4b207..39b6a2edf 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -31,6 +31,13 @@ namespace xsimd { using namespace types; + // get + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + return vgetq_lane_f64(self, I); + } + // first template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 7e00b2b74..c018818f9 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -798,6 +798,102 @@ namespace xsimd return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } + // get + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return _mm_cvtss_f32(self); + } + else + { + return _mm_cvtss_f32(_mm_shuffle_ps(self, self, _MM_SHUFFLE(I, I, I, I))); + } + } + + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return _mm_cvtsd_f64(self); + } + else + { + return _mm_cvtsd_f64(_mm_unpackhi_pd(self, self)); + } + } + + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFF); + } + else + { + return static_cast((_mm_cvtsi128_si32(_mm_srli_si128(self, I)) & 0xFF)); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); + } + else + { + return static_cast((_mm_cvtsi128_si32(_mm_srli_si128(self, I * 2)) & 0xFFFF)); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return static_cast(_mm_cvtsi128_si32(self)); + } + else + { + return static_cast(_mm_cvtsi128_si32(_mm_shuffle_epi32(self, _MM_SHUFFLE(I, I, I, I)))); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { +#if defined(__x86_64__) + XSIMD_IF_CONSTEXPR(I == 0) + { + return static_cast(_mm_cvtsi128_si64(self)); + } + else + { + return static_cast(_mm_cvtsi128_si64(_mm_srli_si128(self, 8))); + } +#else + __m128i shifted; + XSIMD_IF_CONSTEXPR(I == 0) + { + shifted = self; + } + else + { + shifted = _mm_srli_si128(self, 8); + } + int64_t i; + _mm_storel_epi64(reinterpret_cast<__m128i*>(&i), shifted); + return static_cast(i); +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + // first template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index bb3a6ca2c..903466ae5 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -105,6 +105,50 @@ namespace xsimd return _mm_floor_pd(self); } + // get + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return static_cast(_mm_extract_epi8(self, I)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return static_cast(_mm_extract_epi16(self, I)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return static_cast(_mm_extract_epi32(self, I)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { +#if defined(__x86_64__) + return static_cast(_mm_extract_epi64(self, I)); +#else + return get(self, ::xsimd::index {}, sse2 {}); +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return _mm_cvtss_f32(self); + } + else + { + return bit_cast(static_cast(_mm_extract_epi32(_mm_castps_si128(self), I))); + } + } + // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 5e6b3a209..b88f49033 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1109,6 +1109,37 @@ namespace xsimd return x > y; } + /** + * @ingroup batch_data_transfer + * + * Extract the scalar element at compile-time index \c I from batch \c b. + * @param b the batch to extract from. + * @return the scalar element at index \c I. + */ + template + XSIMD_INLINE T get(batch const& b) noexcept + { + static_assert(I < batch::size, "index out of bounds"); + detail::static_check_supported_config(); + return kernel::get(b, index {}, A {}); + } + + template + XSIMD_INLINE bool get(batch_bool const& b) noexcept + { + static_assert(I < batch_bool::size, "index out of bounds"); + detail::static_check_supported_config(); + return kernel::get(b, index {}, A {}); + } + + template + XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& b) noexcept + { + static_assert(I < batch, A>::size, "index out of bounds"); + detail::static_check_supported_config(); + return kernel::get(b, index {}, A {}); + } + /** * @ingroup batch_reducers * diff --git a/test/test_batch.cpp b/test/test_batch.cpp index 5cf47f3d7..6689dfc7b 100644 --- a/test/test_batch.cpp +++ b/test/test_batch.cpp @@ -158,6 +158,26 @@ struct batch_test CHECK_EQ(res.first(), lhs[0]); } + template + void check_get_element(batch_type const& res) const + { + CHECK_EQ(xsimd::get(res), lhs[I]); + } + + template + void check_get_all(batch_type const& res, std::index_sequence) const + { + int dummy[] = { (check_get_element(res), 0)... }; + (void)dummy; + } + + void test_get() const + { + batch_type res = batch_lhs(); + check_get_all(res, std::make_index_sequence {}); + CHECK_EQ(xsimd::get<0>(res), res.first()); + } + void test_arithmetic() const { // +batch @@ -986,6 +1006,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES) Test.test_first_element(); } + SUBCASE("get") + { + Test.test_get(); + } + SUBCASE("arithmetic") { Test.test_arithmetic(); diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp index e06ad83fb..ff3a24f14 100644 --- a/test/test_batch_complex.cpp +++ b/test/test_batch_complex.cpp @@ -182,6 +182,26 @@ struct batch_complex_test CHECK_EQ(res.first(), lhs[0]); } + template + void check_get_element(batch_type const& res) const + { + CHECK_EQ(xsimd::get(res), lhs[I]); + } + + template + void check_get_all(batch_type const& res, std::index_sequence) const + { + int dummy[] = { (check_get_element(res), 0)... }; + (void)dummy; + } + + void test_get() const + { + batch_type res = batch_lhs(); + check_get_all(res, std::make_index_sequence {}); + CHECK_EQ(xsimd::get<0>(res), res.first()); + } + void test_arithmetic() const { // +batch @@ -689,6 +709,8 @@ TEST_CASE_TEMPLATE("[xsimd complex batches]", B, BATCH_COMPLEX_TYPES) SUBCASE("first element") { Test.test_first_element(); } + SUBCASE("get") { Test.test_get(); } + SUBCASE("arithmetic") { Test.test_arithmetic(); } SUBCASE("computed_assignment") { Test.test_computed_assignment(); }