From 108a99b92a332e35b425040ff1170d5eae1e6134 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 1 Jun 2026 11:51:09 -0400 Subject: [PATCH 1/4] ci: support XSIMD_DEFAULT_ARCH override and verify default_arch Let CMake force a specific default arch via -DXSIMD_DEFAULT_ARCH (idiomatic if(XSIMD_DEFAULT_ARCH) guard), add a test_arch.cpp check that the forced arch is the default, and fix the linux.yml CXXFLAGS typo. --- .github/workflows/linux.yml | 2 +- test/CMakeLists.txt | 3 +++ test/test_arch.cpp | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03a914bda..b66c9a97c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -100,7 +100,7 @@ jobs: fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" + CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 662dcdc3f..c159977f7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -49,6 +49,9 @@ if (TARGET_EMULATED) message(STATUS "Using emulated target: ${TARGET_EMULATED}") set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1) unset(TARGET_ARCH CACHE) +elseif (XSIMD_DEFAULT_ARCH) + message(STATUS "Forcing default arch to xsimd::${XSIMD_DEFAULT_ARCH}") + set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${XSIMD_DEFAULT_ARCH}) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") diff --git a/test/test_arch.cpp b/test/test_arch.cpp index d2d0df249..5d479bbaf 100644 --- a/test/test_arch.cpp +++ b/test/test_arch.cpp @@ -23,6 +23,12 @@ static_assert(xsimd::default_arch::supported(), "default arch must be supported" static_assert(std::is_same::value, "default arch is the best available"); static_assert(xsimd::supported_architectures::contains(), "default arch is supported"); static_assert(xsimd::all_architectures::contains(), "default arch is a valid arch"); +#else +namespace xsimd +{ + static_assert(std::is_same::value, + "default_arch does not match XSIMD_DEFAULT_ARCH"); +} #endif #if !XSIMD_WITH_SVE From 9ebcf0f4ea2c9a51b80a776e25eb097bbe629a26 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 1 Jun 2026 11:51:09 -0400 Subject: [PATCH 2/4] chore: small drive-by fixes (avx_128 swizzle, doc typo) Split the avx_128 variable swizzle into explicit float/double overloads with a width static_assert, and fix an AVX512DQ -> AVX512VL doc comment. --- include/xsimd/arch/xsimd_avx_128.hpp | 28 +++++++++---------- .../xsimd/types/xsimd_avx512vl_register.hpp | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 46fc9acb7..07dafd78b 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -129,20 +129,20 @@ namespace xsimd } // swizzle (dynamic mask) - template ::value && sizeof(T) == sizeof(ITy)>> - XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - return _mm_permutevar_ps(self, mask); - } - else - { - // VPERMILPD's variable control reads bit 1 of each 64-bit selector - // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. - // Negation is a cheap alternative to a left shift by 1. - return _mm_permutevar_pd(self, -mask); - } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(float) == sizeof(ITy), "index type must match value width"); + return _mm_permutevar_ps(self, mask); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(double) == sizeof(ITy), "index type must match value width"); + // VPERMILPD's variable control reads bit 1 of each 64-bit selector + // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. + // Negation is a cheap alternative to a left shift by 1. + return _mm_permutevar_pd(self, -mask); } // swizzle (constant mask) diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp index c73c2a963..46d887333 100644 --- a/include/xsimd/types/xsimd_avx512vl_register.hpp +++ b/include/xsimd/types/xsimd_avx512vl_register.hpp @@ -20,7 +20,7 @@ namespace xsimd /** * @ingroup architectures * - * AVX512DQ instructions + * AVX512VL instructions */ struct avx512vl : avx512cd { From 34bca15a20a2065b301ad5476b0eb9aa6ef24b3d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 1 Jun 2026 11:51:09 -0400 Subject: [PATCH 3/4] feat(avx512vl): native EVEX masked load/store on avx512vl_128/256 Add the missing int64/uint64/float/double load_masked overloads and correct the store_masked batch_bool_constant typing on avx512vl_128 and avx512vl_256, branching aligned vs unaligned to the right EVEX intrinsic (vmovdqu{32,64}{k}{z} / vmov{a,u}p{s,d}{k}{z}); unsigned overloads delegate via bitwise_cast. Resolve the avx/avx2/avx512f half-fold target through make_sized_batch_t::arch_type so a 512-bit masked op picks the VL arch and emits EVEX instead of VEX vpmaskmov*/vmaskmov*. --- include/xsimd/arch/xsimd_avx512f.hpp | 31 ++--- include/xsimd/arch/xsimd_avx512vl_128.hpp | 134 +++++++++++++++++----- include/xsimd/arch/xsimd_avx512vl_256.hpp | 134 +++++++++++++++++----- 3 files changed, 223 insertions(+), 76 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..cc057eacf 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -305,16 +305,17 @@ namespace xsimd convert, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = load_masked(mem, mlo, convert {}, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = load_masked(mem, mlo, convert {}, Mode {}, half_arch {}); return detail::load_masked(lo); // zero-extend low half } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, half_arch {}); return detail::load_masked(hi, detail::high_tag {}); } else @@ -332,17 +333,19 @@ namespace xsimd Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 155338425..855870af3 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_pd(mem, mask.mask(), src); + } } // max diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index a5ea546bc..c0b4a568e 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm256_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm256_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_pd(mem, mask.mask(), src); + } } // max From 5a40538fd60ef3ad04a18f9263d2863571747130 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 1 Jun 2026 11:51:09 -0400 Subject: [PATCH 4/4] refactor(masked-memory): dispatch via overload conversion ranking Drop the cross-arch SFINAE/tag mechanism: a concrete requires_arch overload now beats the inherited avx2/avx2_128 one by overload conversion ranking, so no arch file knows about another. xsimd_common_memory.hpp keeps only requires_arch and dispatches on the arch-agnostic trait masked_memory_uses_fp_bitcast (integral with a same-width float register -> reuse that float vmaskmov* path, else a scalar buffer). avx/avx2/avx2_128 drop every is_base_of guard; avx2_128 routes native 128-bit integer masked memory through vpmaskmov* (long long* cast for 64-bit) and tags int64/uint64 on avx2_128 (those intrinsics need AVX2). detail::maskstore takes a bool mask and casts internally; xsimd_batch.hpp keeps a make_sized_batch fwd-decl and simplifies the store_masked call; xsimd_isa.hpp documents the _128-first include order; sse2.hpp adapts to the new store_masked(common) signature. --- .../xsimd/arch/common/xsimd_common_memory.hpp | 136 +++++++++--------- include/xsimd/arch/xsimd_avx.hpp | 54 ++++--- include/xsimd/arch/xsimd_avx2.hpp | 25 ++-- include/xsimd/arch/xsimd_avx2_128.hpp | 28 ++-- include/xsimd/arch/xsimd_common_fwd.hpp | 19 +-- include/xsimd/arch/xsimd_isa.hpp | 9 +- include/xsimd/arch/xsimd_sse2.hpp | 2 +- include/xsimd/types/xsimd_batch.hpp | 10 +- 8 files changed, 152 insertions(+), 131 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 7a1ed73a3..bd2d14f93 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_MEMORY_HPP #include "../../types/xsimd_batch_constant.hpp" +#include "../../utils/xsimd_type_traits.hpp" #include "./xsimd_common_details.hpp" #include @@ -360,88 +361,87 @@ namespace xsimd return load_unaligned(mem, convert {}, A {}); } - template - XSIMD_INLINE batch - load_masked(T_in const* mem, batch_bool_constant, convert, alignment, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - alignas(A::alignment()) std::array buffer {}; - constexpr bool mask[size] = { Values... }; - - for (std::size_t i = 0; i < size; ++i) - buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - - return batch::load(buffer.data(), aligned_mode {}); - } - - template - XSIMD_INLINE void - store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept + // Masked-memory dispatch idiom. To give an arch a native masked path, add a + // `requires_arch` overload in its arch file; conversion ranking makes + // it beat the inherited one. Keep this base layer arch-agnostic: + // (a) specialize via a concrete `requires_arch` overload -- no register + // tag, no `enable_if` on `A`; + // (b) base overloads use the `requires_arch` tag only; a generic + // `requires_arch` here ties with an arch's own overload (gcc-10 ambiguity); + // (c) capability decisions go through arch-agnostic traits (see below). + namespace detail { - constexpr std::size_t size = batch::size; - constexpr bool mask[size] = { Values... }; + // True when an integer access can borrow the same-width float `vmaskmov*` path + // (integral type, same-size float exists, arch has that float register); + // otherwise the scalar-buffer fallback is used. Names no architecture. + template + using masked_memory_uses_fp_bitcast = std::integral_constant::value + && std::is_integral::value + && !std::is_void>::value + && types::has_simd_register, A>::value>; - for (std::size_t i = 0; i < size; ++i) - if (mask[i]) - { - mem[i] = static_cast(src.get(i)); - } - } + // Scalar-buffer fallback: materialize masked-off lanes as zero, then load. + template + XSIMD_INLINE batch + load_masked_common(T_in const* mem, batch_bool_constant, convert, alignment, std::false_type /* uses_fp_bitcast */) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer {}; + constexpr bool mask[size] = { Values... }; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + return batch::load(buffer.data(), aligned_mode {}); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + // Integer-via-float path: reinterpret to the same-width float type, reuse the + // floating-point masked load (e.g. `vmaskmovps`), then bitcast the result back. + template + XSIMD_INLINE batch + load_masked_common(T const* mem, batch_bool_constant, convert, Mode, std::true_type /* uses_fp_bitcast */) noexcept + { + using fp_t = sized_fp_t; + const auto f = ::xsimd::kernel::load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); + return bitwise_cast(f); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + template + XSIMD_INLINE void + store_masked_common(T_out* mem, batch const& src, batch_bool_constant, alignment, std::false_type /* uses_fp_bitcast */) noexcept + { + constexpr std::size_t size = batch::size; + constexpr bool mask[size] = { Values... }; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); - } + for (std::size_t i = 0; i < size; ++i) + if (mask[i]) + { + mem[i] = static_cast(src.get(i)); + } + } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + template + XSIMD_INLINE void + store_masked_common(T* mem, batch const& src, batch_bool_constant, Mode, std::true_type /* uses_fp_bitcast */) noexcept + { + using fp_t = sized_fp_t; + ::xsimd::kernel::store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + } } - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch + load_masked(T_in const* mem, batch_bool_constant mask, convert cvt, alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast {}); } - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE void + store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast {}); } template diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 1ee0c5b89..a542d3f31 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -993,19 +993,20 @@ namespace xsimd { using int_t = as_integer_t; constexpr size_t half_size = batch::size / 2; + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); - const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, avx_128 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); + const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, half_arch {}); return bitwise_cast(batch(_mm256_zextsi128_si256(lo))); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, avx_128 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, half_arch {}); return detail::zero_extend(hi); } else @@ -1018,41 +1019,54 @@ namespace xsimd // store_masked namespace detail { - template + // True when batch_bool is the legacy VEX vector mask, i.e. it is stored + // in the same register as the data (__m256 / __m256d) rather than in an EVEX + // k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256 + // path below is only well-formed for the vector-mask representation. This names + // no architecture — it tests the mask's representation, in the spirit of + // detail::masked_memory_uses_fp_bitcast. + template + using uses_vector_mask = std::is_same::register_type, + typename batch::register_type>; + + template ::value>> XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept { - _mm256_maskstore_ps(mem, mask, src); + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src); } - template + template ::value>> XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept { - _mm256_maskstore_pd(mem, mask, src); + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src); } } - template + template ::value && detail::uses_vector_mask::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half_size, hi, mhi, Mode {}, half_arch {}); } else { - detail::maskstore(mem, mask.as_batch(), src); + detail::maskstore(mem, mask.as_batch_bool(), src); } } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index e2c223cc7..5cb47f908 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -190,24 +190,27 @@ namespace xsimd } } - template + template ::value && (sizeof(T) >= 4)>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to SSE + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to SSE + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); } else { @@ -216,10 +219,10 @@ namespace xsimd } template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s32 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s32, mask, Mode {}, avx2 {}); + store_masked(reinterpret_cast(mem), s32, batch_bool_constant {}, Mode {}, avx2 {}); } template diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7a590c74f..c0f119e4e 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,7 +89,11 @@ namespace xsimd } } - // load_masked + // load_masked — native 128-bit integer masked loads. Tagged on avx2_128 + // because the vpmaskmov* intrinsics require AVX2; an AVX1-only build routes + // integer masked memory through the float path in xsimd_common_memory.hpp. + // Any arch with a native masked path provides its own exact-tag overload that + // out-ranks this one, so no cross-arch exclusion is needed here. template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { @@ -98,20 +102,20 @@ namespace xsimd template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); + return _mm_maskload_epi32(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64(mem, mask.as_batch()); + return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); } template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); + return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); } - // store_masked + // store_masked — native 128-bit integer masked stores (see load note above). template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { @@ -120,17 +124,17 @@ namespace xsimd template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); + return _mm_maskstore_epi32(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(mem, mask.as_batch(), src); + return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); } template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); + return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); } // gather diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index f5a7f4ffe..8c4818176 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -13,6 +13,9 @@ #ifndef XSIMD_COMMON_FWD_HPP #define XSIMD_COMMON_FWD_HPP +#include "../config/xsimd_macros.hpp" +#include "../types/xsimd_common_arch.hpp" + #include #include #include @@ -87,22 +90,6 @@ namespace xsimd XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index cf88f64d7..06edfa98f 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -48,8 +48,11 @@ #endif #if XSIMD_WITH_AVX -#include "./xsimd_avx.hpp" +// clang-format off +// _128 first: avx half-fold recursive call needs avx_128 visible at parse time. #include "./xsimd_avx_128.hpp" +#include "./xsimd_avx.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX @@ -61,8 +64,10 @@ #endif #if XSIMD_WITH_AVX2 -#include "./xsimd_avx2.hpp" +// clang-format off #include "./xsimd_avx2_128.hpp" +#include "./xsimd_avx2.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX2 diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c6cfb5f07..0a95aae8b 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -2331,7 +2331,7 @@ namespace xsimd } else { - store_masked(mem, src, mask, requires_arch {}); + store_masked(mem, src, mask, aligned_mode {}, common {}); } } diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 970483150..8b44491f2 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -544,6 +544,14 @@ namespace xsimd "Please use batch, A> initialized from xtl::xcomplex instead"); }; #endif + + // Forward declarations: the AVX/AVX2 masked load/store kernels (pulled in + // by xsimd_isa.hpp below) reference make_sized_batch_t::arch_type + // before xsimd_traits.hpp — which carries the full definition — is included. + template + struct make_sized_batch; + template + using make_sized_batch_t = typename make_sized_batch::type; } #include "../arch/xsimd_isa.hpp" @@ -763,7 +771,7 @@ namespace xsimd } else { - kernel::store_masked(mem, *this, mask, mode, A {}); + kernel::store_masked(mem, *this, mask, mode, A {}); } }