From b5047a68308bd98febb2cbbb2df68247674ec9ec Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 9 Apr 2026 13:33:37 +0200 Subject: [PATCH 1/2] Reduce x86_cpu_feature size on non-x86 platforms --- .../xsimd/config/xsimd_cpu_features_x86.hpp | 300 +++++++++++------- 1 file changed, 180 insertions(+), 120 deletions(-) diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index f0ddf7941..b94c7c8cd 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -566,6 +566,99 @@ namespace xsimd xcr0_reg_t m_low {}; }; + /** + * Orchestrator for `CPUID` calls. + * + * This class orchestrate `CPUID` and `XCR0` calls so that they are made in the appropriate + * order. It also implements lazy calling and cache mechanism around those calls. + * Works on all platforms, and return all zeros on non `x86` platforms. + */ + class x86_cpu_features_backend_cpuid + { + public: + x86_cpu_features_backend_cpuid() noexcept = default; + + inline x86_xcr0 const& xcr0() const noexcept; + inline x86_cpuid_leaf0 const& leaf0() const; + inline x86_cpuid_leaf80000000 const& leaf80000000() const; + inline x86_cpuid_leaf1 const& leaf1() const; + inline x86_cpuid_leaf7 const& leaf7() const; + inline x86_cpuid_leaf7sub1 const& leaf7sub1() const; + inline x86_cpuid_leaf80000001 const& leaf80000001() const; + + private: + enum class status + { + leaf0_valid = 0, + leaf1_valid = 1, + leaf7_valid = 2, + leaf7sub1_valid = 3, + leaf80000000_valid = 4, + leaf80000001_valid = 5, + xcr0_valid = 6, + }; + + using status_bitset = utils::uint_bitset; + + mutable x86_cpuid_leaf0 m_leaf0 {}; + mutable x86_cpuid_leaf1 m_leaf1 {}; + mutable x86_cpuid_leaf7 m_leaf7 {}; + mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {}; + mutable x86_cpuid_leaf80000000 m_leaf80000000 {}; + mutable x86_cpuid_leaf80000001 m_leaf80000001 {}; + mutable x86_xcr0 m_xcr0 {}; + mutable status_bitset m_status {}; + + inline bool osxsave() const noexcept; + + /** + * Internal utility to lazily read and cache a CPUID leaf. + * + * @tparam status_id The status bit tracking whether this leaf has been read and cached. + * @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7). + * @param leaf_cache A non-const reference to the class member that stores the leaf + * value. It must be non-const because this function may write to it on first + * call. It is passed explicitly (rather than accessed via `this`) to allow + * factoring the caching logic across different leaf members. + * @return A const reference to `leaf_cache`. The non-const input / const-ref output + * asymmetry is intentional: callers must not modify the cached value, but + * this function needs write access to populate it. + * + * On first call, checks whether the leaf number is within the range advertised as + * supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the + * extended range). If supported, reads the leaf from the CPU; otherwise leaves + * `leaf_cache` at its zero-initialized default (all feature bits false). Either + * way, `status_id` is set so subsequent calls return immediately. + */ + template + inline auto const& safe_read_leaf(L& leaf_cache) const; + }; + + /** + * No-Op orchestrator for `CPUID` calls + * + * This does nothing and return zero-constructed objects on all calls. + * This is meant as an optimization on non `x86` platforms as the + * `x86_cpu_features_backend_cpuid` can be slightly large (hundred of bytes). + */ + class x86_cpu_features_backend_noop + { + public: + constexpr x86_xcr0 xcr0() const noexcept { return {}; } + constexpr x86_cpuid_leaf0 leaf0() const { return {}; } + constexpr x86_cpuid_leaf80000000 leaf80000000() const { return {}; } + constexpr x86_cpuid_leaf1 leaf1() const { return {}; } + constexpr x86_cpuid_leaf7 leaf7() const { return {}; } + constexpr x86_cpuid_leaf7sub1 leaf7sub1() const { return {}; } + constexpr x86_cpuid_leaf80000001 leaf80000001() const { return {}; } + }; + +#if XSIMD_TARGET_X86 + using x86_cpu_features_backend_default = x86_cpu_features_backend_cpuid; +#else + using x86_cpu_features_backend_default = x86_cpu_features_backend_noop; +#endif + /** * An opiniated CPU feature detection utility for x86. * @@ -576,7 +669,7 @@ namespace xsimd * This is well defined on all architectures. It will always return false on * non-x86 architectures. */ - class x86_cpu_features + class x86_cpu_features : private x86_cpu_features_backend_default { public: x86_cpu_features() noexcept = default; @@ -681,155 +774,122 @@ namespace xsimd inline bool avxvnni() const noexcept { return avx_enabled() && leaf7sub1().all_bits_set(); } inline bool fma4() const noexcept { return avx_enabled() && leaf80000001().all_bits_set(); } + }; - private: - enum class status - { - leaf0_valid = 0, - leaf1_valid = 1, - leaf7_valid = 2, - leaf7sub1_valid = 3, - leaf80000000_valid = 4, - leaf80000001_valid = 5, - xcr0_valid = 6, - }; - - using status_bitset = utils::uint_bitset; + /******************** + * Implementation * + ********************/ - mutable x86_cpuid_leaf0 m_leaf0 {}; - mutable x86_cpuid_leaf1 m_leaf1 {}; - mutable x86_cpuid_leaf7 m_leaf7 {}; - mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {}; - mutable x86_cpuid_leaf80000000 m_leaf80000000 {}; - mutable x86_cpuid_leaf80000001 m_leaf80000001 {}; - mutable x86_xcr0 m_xcr0 {}; - mutable status_bitset m_status {}; - - inline x86_xcr0 const& xcr0() const noexcept + template + inline auto const& x86_cpu_features_backend_cpuid::safe_read_leaf(L& leaf_cache) const + { + // Check if already initialized + if (m_status.bit_is_set()) { - if (!m_status.bit_is_set()) - { - m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default(); - m_status.set_bit(); - } - return m_xcr0; + return leaf_cache; } - inline x86_cpuid_leaf0 const& leaf0() const - { - if (!m_status.bit_is_set()) - { - m_leaf0 = x86_cpuid_leaf0::read(); - m_status.set_bit(); - } - return m_leaf0; - } + // Limit where we need to check leaf0 or leaf 80000000. + constexpr auto extended_threshold = x86_cpuid_leaf80000000::leaf; - inline x86_cpuid_leaf80000000 const& leaf80000000() const + // Check if it is safe to call CPUID with this value. + // First we identify if the leaf is in the regular or extended range. + // TODO(C++17): if constexpr + if (L::leaf < extended_threshold) { - if (!m_status.bit_is_set()) + // Check leaf0 in regular range + if (L::leaf <= leaf0().highest_leaf()) { - m_leaf80000000 = x86_cpuid_leaf80000000::read(); - m_status.set_bit(); + leaf_cache = L::read(); } - return m_leaf80000000; } - - /** - * Internal utility to lazily read and cache a CPUID leaf. - * - * @tparam status_id The status bit tracking whether this leaf has been read and cached. - * @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7). - * @param leaf_cache A non-const reference to the class member that stores the leaf - * value. It must be non-const because this function may write to it on first - * call. It is passed explicitly (rather than accessed via `this`) to allow - * factoring the caching logic across different leaf members. - * @return A const reference to `leaf_cache`. The non-const input / const-ref output - * asymmetry is intentional: callers must not modify the cached value, but - * this function needs write access to populate it. - * - * On first call, checks whether the leaf number is within the range advertised as - * supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the - * extended range). If supported, reads the leaf from the CPU; otherwise leaves - * `leaf_cache` at its zero-initialized default (all feature bits false). Either - * way, `status_id` is set so subsequent calls return immediately. - */ - template - inline auto const& safe_read_leaf(L& leaf_cache) const + else { - // Check if already initialized - if (m_status.bit_is_set()) + // Check leaf80000000 in extended range + if (L::leaf <= leaf80000000().highest_leaf()) { - return leaf_cache; + leaf_cache = L::read(); } + } - // Limit where we need to check leaf0 or leaf 80000000. - constexpr auto extended_threshold = x86_cpuid_leaf80000000::leaf; - - // Check if it is safe to call CPUID with this value. - // First we identify if the leaf is in the regular or extended range. - // TODO(C++17): if constexpr - if (L::leaf < extended_threshold) - { - // Check leaf0 in regular range - if (L::leaf <= leaf0().highest_leaf()) - { - leaf_cache = L::read(); - } - } - else - { - // Check leaf80000000 in extended range - if (L::leaf <= leaf80000000().highest_leaf()) - { - leaf_cache = L::read(); - } - } + // Mark as valid in all cases, including if it was not read. + // In this case it will be filled with zeros (all false). + m_status.set_bit(); + return leaf_cache; + } - // Mark as valid in all cases, including if it was not read. - // In this case it will be filled with zeros (all false). - m_status.set_bit(); - return leaf_cache; + inline x86_xcr0 const& x86_cpu_features_backend_cpuid::xcr0() const noexcept + { + if (!m_status.bit_is_set()) + { + m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default(); + m_status.set_bit(); } + return m_xcr0; + } - inline x86_cpuid_leaf1 const& leaf1() const + inline x86_cpuid_leaf0 const& x86_cpu_features_backend_cpuid::leaf0() const + { + if (!m_status.bit_is_set()) { - return safe_read_leaf(m_leaf1); + m_leaf0 = x86_cpuid_leaf0::read(); + m_status.set_bit(); } + return m_leaf0; + } - inline x86_cpuid_leaf7 const& leaf7() const + inline x86_cpuid_leaf80000000 const& x86_cpu_features_backend_cpuid::leaf80000000() const + { + if (!m_status.bit_is_set()) { - return safe_read_leaf(m_leaf7); + m_leaf80000000 = x86_cpuid_leaf80000000::read(); + m_status.set_bit(); } + return m_leaf80000000; + } - inline x86_cpuid_leaf7sub1 const& leaf7sub1() const - { - // Check if already initialized - if (m_status.bit_is_set()) - { - return m_leaf7sub1; - } + inline x86_cpuid_leaf1 const& x86_cpu_features_backend_cpuid::leaf1() const + { + return safe_read_leaf(m_leaf1); + } - // Check if safe to call CPUID with this value as subleaf. - constexpr auto start = x86_cpuid_leaf7::eax::highest_subleaf_start; - constexpr auto end = x86_cpuid_leaf7::eax::highest_subleaf_end; - const auto highest_subleaf7 = leaf7().get_range(); - if (x86_cpuid_leaf7sub1::subleaf <= highest_subleaf7) - { - m_leaf7sub1 = x86_cpuid_leaf7sub1::read(); - } + inline x86_cpuid_leaf7 const& x86_cpu_features_backend_cpuid::leaf7() const + { + return safe_read_leaf(m_leaf7); + } - // Mark as valid in all cases, including if it was not read. - // In this case it will be filled with zeros (all false). - m_status.set_bit(); + inline x86_cpuid_leaf7sub1 const& x86_cpu_features_backend_cpuid::leaf7sub1() const + { + // Check if already initialized + if (m_status.bit_is_set()) + { return m_leaf7sub1; } - inline x86_cpuid_leaf80000001 const& leaf80000001() const + // Check if safe to call CPUID with this value as subleaf. + constexpr auto start = x86_cpuid_leaf7::eax::highest_subleaf_start; + constexpr auto end = x86_cpuid_leaf7::eax::highest_subleaf_end; + const auto highest_subleaf7 = leaf7().get_range(); + if (x86_cpuid_leaf7sub1::subleaf <= highest_subleaf7) { - return safe_read_leaf(m_leaf80000001); + m_leaf7sub1 = x86_cpuid_leaf7sub1::read(); } - }; + + // Mark as valid in all cases, including if it was not read. + // In this case it will be filled with zeros (all false). + m_status.set_bit(); + return m_leaf7sub1; + } + + inline x86_cpuid_leaf80000001 const& x86_cpu_features_backend_cpuid::leaf80000001() const + { + return safe_read_leaf(m_leaf80000001); + } + + inline bool x86_cpu_features_backend_cpuid::osxsave() const noexcept + { + return leaf1().all_bits_set(); + } namespace detail { From a2ae28808efe7b10c4484f8366a97832b0c80df8 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 9 Apr 2026 14:42:09 +0200 Subject: [PATCH 2/2] Reduce x86_cpu_feature size --- .../xsimd/config/xsimd_cpu_features_x86.hpp | 203 ++++++++++++------ 1 file changed, 138 insertions(+), 65 deletions(-) diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index b94c7c8cd..b98e0e7dc 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #if __cplusplus >= 201703L #include #endif @@ -52,40 +53,70 @@ namespace xsimd inline x86_reg32_t x86_xcr0_low() noexcept; + /** A strongly type bitset for a 32 bits register. */ template using x86_reg32_bitset = utils::uint_bitset; - template - class x86_cpuid_regs - : private x86_reg32_bitset, - private x86_reg32_bitset, - private x86_reg32_bitset, - private x86_reg32_bitset + /** A wrapper to attach a register bitfield descriptor and its CPUID index. */ + template + struct x86_reg_id + { + static constexpr x86_reg32_t index = I; + using bits = E; + + static_assert(index >= 0 && index < 4, "At most 4 register in CPUID"); + }; + + /** Find the register id with index k. */ + template + struct find_reg_k; + + /** Find the register id with index k (empty / nothing found case). */ + template + struct find_reg_k + { + using type = x86_reg_id; + }; + + /** Find the register id with index k (recursive case). */ + template + struct find_reg_k + { + using type = std::conditional_t< + reg_id_head::index == K, + reg_id_head, + typename find_reg_k::type>; + }; + + /** + * A class with strongly typed bitfield for `CPUID` registers. + * + * The class stores a variable number of register (up to four) from the CPUID + * output. This is a space optimization to avoid storing many zeros in the + * final `x86_cpu_features`. + * As a result, some of the type aliases `eax`, `ebx`, `ecx`, `edx` may be `void`. + */ + template + class x86_cpuid_regs : private x86_reg32_bitset... { private: - using eax_bitset = x86_reg32_bitset; - using ebx_bitset = x86_reg32_bitset; - using ecx_bitset = x86_reg32_bitset; - using edx_bitset = x86_reg32_bitset; + static_assert(sizeof...(reg_ids) <= 4, "At most 4 register in CPUID"); /* Parse CPUINFO register value into individual bit components.*/ constexpr explicit x86_cpuid_regs(const cpuid_reg_t& regs) noexcept - : eax_bitset(regs[0]) - , ebx_bitset(regs[1]) - , ecx_bitset(regs[2]) - , edx_bitset(regs[3]) + : x86_reg32_bitset(regs[reg_ids::index])... { } public: - using eax = A; - using ebx = B; - using ecx = C; - using edx = D; static constexpr x86_reg32_t leaf = leaf_num; static constexpr x86_reg32_t subleaf = subleaf_num; + using eax = typename find_reg_k<0, reg_ids...>::type::bits; + using ebx = typename find_reg_k<1, reg_ids...>::type::bits; + using ecx = typename find_reg_k<2, reg_ids...>::type::bits; + using edx = typename find_reg_k<3, reg_ids...>::type::bits; + inline static x86_cpuid_regs read() { return x86_cpuid_regs(detail::x86_cpuid(leaf, subleaf)); @@ -93,22 +124,76 @@ namespace xsimd constexpr x86_cpuid_regs() noexcept = default; - using eax_bitset::all_bits_set; - using eax_bitset::get_range; - using ebx_bitset::all_bits_set; - using ebx_bitset::get_range; - using ecx_bitset::all_bits_set; - using ecx_bitset::get_range; - using edx_bitset::all_bits_set; - using edx_bitset::get_range; - }; + // TODO(C++17) compact version for which this was designed. + // The else clause contains a very verbose port. +#if 0 + using x86_reg32_bitset::all_bits_set...; + using x86_reg32_bitset::get_range...; +#else + + private: + template + struct m_empty_reg + { + enum class type {}; + }; + + using eax_or_empty = typename std::conditional::value, typename m_empty_reg<0>::type, eax>::type; + using ebx_or_empty = typename std::conditional::value, typename m_empty_reg<1>::type, ebx>::type; + using ecx_or_empty = typename std::conditional::value, typename m_empty_reg<2>::type, ecx>::type; + using edx_or_empty = typename std::conditional::value, typename m_empty_reg<3>::type, edx>::type; + + public: + template ::value, int>::type = 0> + constexpr bool all_bits_set() const noexcept + { + return x86_reg32_bitset::template all_bits_set(); + } + + template ::value, int>::type = 0> + constexpr x86_reg32_t get_range() const noexcept + { + return x86_reg32_bitset::template get_range(); + } + + template ::value, int>::type = 0> + constexpr bool all_bits_set() const noexcept + { + return x86_reg32_bitset::template all_bits_set(); + } + + template ::value, int>::type = 0> + constexpr x86_reg32_t get_range() const noexcept + { + return x86_reg32_bitset::template get_range(); + } + + template ::value, int>::type = 0> + constexpr bool all_bits_set() const noexcept + { + return x86_reg32_bitset::template all_bits_set(); + } + + template ::value, int>::type = 0> + constexpr x86_reg32_t get_range() const noexcept + { + return x86_reg32_bitset::template get_range(); + } + + template ::value, int>::type = 0> + constexpr bool all_bits_set() const noexcept + { + return x86_reg32_bitset::template all_bits_set(); + } + + template ::value, int>::type = 0> + constexpr x86_reg32_t get_range() const noexcept + { + return x86_reg32_bitset::template get_range(); + } - template - using make_x86_cpuid_regs = x86_cpuid_regs; +#endif // C++17 + }; template struct x86_cpuid_highest_func @@ -298,12 +383,6 @@ namespace xsimd static constexpr detail::x86_reg32_t leaf = 1; static constexpr detail::x86_reg32_t subleaf = 0; - enum class eax - { - }; - enum class ebx - { - }; enum class ecx { /* Streaming SIMD Extensions 3. */ @@ -328,6 +407,10 @@ namespace xsimd /* Streaming SIMD Extensions 2. */ sse2 = 26, }; + + using regs_t = detail::x86_cpuid_regs, + detail::x86_reg_id>; }; /** @@ -340,7 +423,7 @@ namespace xsimd * * @see https://en.wikipedia.org/wiki/CPUID */ - using x86_cpuid_leaf1 = detail::make_x86_cpuid_regs; + using x86_cpuid_leaf1 = typename x86_cpuid_leaf1_traits::regs_t; struct x86_cpuid_leaf7_traits { @@ -386,9 +469,11 @@ namespace xsimd /* AVX-512 Vector Neural Network instructions. */ avx512vnni_bw = 11, }; - enum class edx - { - }; + + using regs_t = detail::x86_cpuid_regs, + detail::x86_reg_id, + detail::x86_reg_id>; }; /** @@ -401,7 +486,7 @@ namespace xsimd * * @see https://en.wikipedia.org/wiki/CPUID */ - using x86_cpuid_leaf7 = detail::make_x86_cpuid_regs; + using x86_cpuid_leaf7 = typename x86_cpuid_leaf7_traits::regs_t; struct x86_cpuid_leaf7sub1_traits { @@ -413,15 +498,9 @@ namespace xsimd /* AVX (VEX-encoded) Vector Neural Network instructions. */ avxvnni = 4, }; - enum class ebx - { - }; - enum class ecx - { - }; - enum class edx - { - }; + + using regs_t = detail::x86_cpuid_regs>; }; /** @@ -434,7 +513,7 @@ namespace xsimd * * @see https://en.wikipedia.org/wiki/CPUID */ - using x86_cpuid_leaf7sub1 = detail::make_x86_cpuid_regs; + using x86_cpuid_leaf7sub1 = typename x86_cpuid_leaf7sub1_traits::regs_t; /** * Highest Extended CPUID Function Parameter (EAX=0x80000000). @@ -451,20 +530,14 @@ namespace xsimd static constexpr detail::x86_reg32_t leaf = 0x80000001; static constexpr detail::x86_reg32_t subleaf = 0; - enum class eax - { - }; - enum class ebx - { - }; enum class ecx { /* AMD Fused multiply-add with 4 operands (FMA4). */ fma4 = 16, }; - enum class edx - { - }; + + using regs_t = detail::x86_cpuid_regs>; }; /** @@ -477,7 +550,7 @@ namespace xsimd * * @see https://en.wikipedia.org/wiki/CPUID */ - using x86_cpuid_leaf80000001 = detail::make_x86_cpuid_regs; + using x86_cpuid_leaf80000001 = typename x86_cpuid_leaf80000001_traits::regs_t; /* * Extended Control Register 0 (XCR0).