From e7f623c2126d83ab7e550d7806f91fc7e9bca734 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 28 May 2026 13:25:46 +0200 Subject: [PATCH 01/31] add MPI to CMake configuration --- CMakeLists.txt | 31 ++++++++++++++++++++++++++++++- README.md | 5 +++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf96cc6..ff588aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ add_library(${HWS_LIBRARY_NAME} SHARED ${HWS_SOURCES}) add_library(hws::hws ALIAS ${HWS_LIBRARY_NAME}) # set install target -set(HWS_TARGETS_TO_INSTALL ) +set(HWS_TARGETS_TO_INSTALL) # use C++17 target_compile_features(${HWS_LIBRARY_NAME} PUBLIC cxx_std_17) @@ -156,6 +156,35 @@ if (HWS_ENABLE_PYTHON_BINDINGS) endif () +#################################################################################################################### +## enable MPI support ## +#################################################################################################################### +set(HWS_ENABLE_MPI_SUPPORT AUTO CACHE STRING "Enable MPI support.") +set_property(CACHE HWS_ENABLE_MPI_SUPPORT PROPERTY STRINGS AUTO ON OFF) + +if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT) + # try finding MPI + find_package(MPI QUIET) + + # check if MPI could be found + if (NOT MPI_FOUND) + if (HWS_ENABLE_MPI_SUPPORT MATCHES "ON") + message(SEND_ERROR "Cannot find MPI but MPI support was explicitly requested!") + else () + message(STATUS "Cannot find MPI. 
MPI support disabled.") + endif () + else () + message(STATUS "Enable MPI support (${MPI_CXX_VERSION}).") + + # link against necessary libraries + target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC MPI::MPI_CXX) + + # add compile definition + target_compile_definitions(${HWS_LIBRARY_NAME} PUBLIC HWS_MPI_SUPPORT_ENABLED) + endif () +endif () + + ######################################################################################################################## ## add documentation ## ######################################################################################################################## diff --git a/README.md b/README.md index 7846580..1bd62df 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,11 @@ The `[optional_options]` can be one or multiple of: - `HWS_SAMPLING_INTERVAL=100ms` (default: `100ms`): set the sampling interval in milliseconds - `HWS_ENABLE_PYTHON_BINDINGS=ON|OFF` (default: `ON`): enable Python bindings +- `HWS_ENABLE_MPI_SUPPORT=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check whether MPI is available and fail if this is not the case + - `AUTO`: check whether MPI is available but **do not** fail if this is not the case + - `OFF`: do not check whether MPI is available + ### Installing via CMake The library supports the `install` target: From 673917ccc53932aaac4540f7611f25f49225188a Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 28 May 2026 15:27:38 +0200 Subject: [PATCH 02/31] add global yaml dump with data from all MPI ranks to system_hardware_sampler --- include/hws/system_hardware_sampler.hpp | 21 +++++++ include/hws/utility.hpp | 25 ++++++++ src/hws/system_hardware_sampler.cpp | 46 ++++++++++++++ src/hws/utility.cpp | 79 +++++++++++++++++++++++++ 4 files changed, 171 insertions(+) diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index 42924ac..593a0d5 100644 --- a/include/hws/system_hardware_sampler.hpp +++ 
b/include/hws/system_hardware_sampler.hpp @@ -22,6 +22,10 @@ #include // std::string #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include // MPI_Comm +#endif + namespace hws { /** @@ -175,6 +179,23 @@ class system_hardware_sampler { */ void dump_yaml(const std::filesystem::path &filename) const; +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the YAML file with @p filename. + * @param[in] filename the YAML file to append the hardware samples to + * @param[in] communicator the MPI communicator to use + */ + void dump_yaml_global(const char *filename, MPI_Comm communicator) const; + /** + * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const + */ + void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const; + /** + * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const + */ + void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; +#endif + /** * @brief Return the hardware samples as YAML string. * @return the YAML content as string (`[[nodiscard]]`) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index aacaf23..cbb1075 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -27,6 +27,10 @@ #include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include // MPI_Comm +#endif + namespace hws::detail { /** @@ -247,6 +251,14 @@ template return quoted; } +/** + * @brief Prefix all lines in a string with the given indentation. 
+ * @param[in] text the input text + * @param[in] prefix the prefix (indentation) added to each line + * @return the indented string + */ +[[nodiscard]] std::string indent_lines(const std::string &text, const std::string &prefix); + /*****************************************************************************************************/ /** other free functions **/ /*****************************************************************************************************/ @@ -312,6 +324,19 @@ template } } +#if defined(HWS_MPI_SUPPORT_ENABLED) +/** + * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0. + * + * @param[in] local_yaml the local YAML string contribution + * @param[in] communicator the MPI communicator + * + * @return concatenated YAML string on rank 0, empty string on all other ranks + */ +[[nodiscard]] +std::string gather_yaml_strings_mpi(const std::string& local_yaml, MPI_Comm communicator); +#endif + } // namespace hws::detail #endif // HWS_UTILITY_HPP_ diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 1c08762..e07d7a7 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -41,6 +41,10 @@ #include // std::out_of_range #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include // MPI_Comm +#endif + namespace hws { system_hardware_sampler::system_hardware_sampler(const sample_category category) : @@ -201,6 +205,48 @@ void system_hardware_sampler::dump_yaml(const std::filesystem::path &filename) c std::for_each(samplers_.cbegin(), samplers_.cend(), [&filename](const auto &ptr) { ptr->dump_yaml(filename); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error("MPI must already be initialized"); + } + + // MPI rank and world size for 
identification and communication + int rank = 0; + MPI_Comm_rank(communicator, &rank); + + std::string rank_yaml_output; // yaml file as string per rank + + rank_yaml_output += "---\n\n"; + rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n"; + + // accumulate string from each sampler + std::size_t sampler_idx = 0; + std::for_each(samplers_.cbegin(), samplers_.cend(), [&rank_yaml_output, &sampler_idx](const auto &ptr) { + rank_yaml_output += "sampler_" + std::to_string(sampler_idx++) + ":\n"; + rank_yaml_output += detail::indent_lines(ptr->as_yaml_string(), " "); + }); + + const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator); + + if (rank == 0) { + std::ofstream file(filename, std::ios_base::app); // append instead of truncating: the documented contract says the samples are appended to the file + file << global_yaml_output; + } +} + +void system_hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.c_str(), communicator); +} + +void system_hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.string().c_str(), communicator); +} +#endif + std::string system_hardware_sampler::as_yaml_string() const { return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->as_yaml_string(); }); } diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index 054db8d..c5d1dcd 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -9,10 +9,15 @@ #include // std::min, std::transform, std::all_of #include // std::tolower, std::isdigit +#include // std::stringstream #include // std::string #include // std::string_view #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size +#endif + namespace hws::detail { bool starts_with(const std::string_view sv, const std::string_view start) noexcept { @@ -61,4 +66,78 
@@ bool is_integer(std::string_view str) { return std::all_of(str.cbegin(), str.cend(), [](const char c) { return std::isdigit(static_cast(c)); }); } +std::string indent_lines(const std::string &text, const std::string &prefix) { + std::stringstream ss{ text }; + + std::string line; + std::string out; + + while (std::getline(ss, line)) { + out += prefix + line + '\n'; + } + + return out; +} + +#if defined(HWS_MPI_SUPPORT_ENABLED) +std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error("MPI must already be initialized"); + } + + // MPI rank and world size for identification and communication + int rank = 0, world_size = 0; + MPI_Comm_rank(communicator, &rank); + MPI_Comm_size(communicator, &world_size); + + // gather the size of the yaml string from each rank + int local_size = static_cast(local_yaml.size()); + + std::vector recv_sizes; + + if (rank == 0) { + recv_sizes.resize(world_size); + } + + MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator); + + // compute the displacements from the rank string sizes + std::vector displacements; + int total_size = 0; + + if (rank == 0) { + displacements.resize(world_size); + + for (int i = 0; i < world_size; ++i) { + displacements[i] = total_size; + total_size += recv_sizes[i]; + } + } + + // gather the local yaml strings from all ranks + std::vector recv_buffer; + + if (rank == 0) { + recv_buffer.resize(total_size); + } + + MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator); + + // build final yaml string on rank 0 + std::string global_yaml; + + if (rank == 0) { + for (int r = 0; r < world_size; ++r) { + global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]); + global_yaml += '\n'; + } + } + + return global_yaml; +} +#endif + } // namespace 
hws::detail From 1f113ef23c256d114bf5810d0e4f80f9904abb5d Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 28 May 2026 16:58:51 +0200 Subject: [PATCH 03/31] update documentation --- include/hws/system_hardware_sampler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index 593a0d5..2003b28 100644 --- a/include/hws/system_hardware_sampler.hpp +++ b/include/hws/system_hardware_sampler.hpp @@ -187,11 +187,11 @@ class system_hardware_sampler { */ void dump_yaml_global(const char *filename, MPI_Comm communicator) const; /** - * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const + * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const */ void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const; /** - * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const + * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const */ void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; #endif From 772d22642b16b3cff899851500fa19e80c5b44f9 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 28 May 2026 17:00:42 +0200 Subject: [PATCH 04/31] add global yaml output for individual hardware sampler on all MPI ranks --- include/hws/hardware_sampler.hpp | 21 ++++++++++++++++ src/hws/hardware_sampler.cpp | 42 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/hws/hardware_sampler.hpp b/include/hws/hardware_sampler.hpp index 326eb7e..1d1f64c 100644 --- a/include/hws/hardware_sampler.hpp +++ b/include/hws/hardware_sampler.hpp @@ -23,6 +23,10 @@ #include // std::thread #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include // MPI_Comm +#endif + namespace hws { /** @@ -162,6 +166,23 @@ class hardware_sampler 
{ */ void dump_yaml(const std::filesystem::path &filename) const; + #if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the YAML file with @p filename. + * @param[in] filename the YAML file to append the hardware samples to + * @param[in] communicator the MPI communicator to use + */ + void dump_yaml_global(const char *filename, MPI_Comm communicator) const; + /** + * @copydoc hws::hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const + */ + void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const; + /** + * @copydoc hws::hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const + */ + void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; + #endif + /** * @brief Return the unique device identification. Can be used as unique key in the YAML string. * @return the unique device identification (`[[nodiscard]]`) diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp index abd907d..4c5aa1d 100644 --- a/src/hws/hardware_sampler.cpp +++ b/src/hws/hardware_sampler.cpp @@ -24,6 +24,10 @@ #include // std::thread #include // std::move +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include // MPI_Comm +#endif + namespace hws { hardware_sampler::hardware_sampler(const std::chrono::milliseconds sampling_interval, const sample_category category) : @@ -144,6 +148,44 @@ void hardware_sampler::dump_yaml(const std::filesystem::path &filename) const { this->dump_yaml(filename.string().c_str()); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error("MPI must already be initialized"); + } + + // MPI rank of the calling process: labels this rank's YAML document and selects rank 0 as the writer + int rank = 0; + MPI_Comm_rank(communicator, &rank); + + std::string rank_yaml_output; // yaml 
file as string per rank + + rank_yaml_output += "---\n\n"; + rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n"; + + // add yaml string of this hardware sampler + rank_yaml_output += this->as_yaml_string(); + + const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator); + + if (rank == 0) { + std::ofstream file(filename, std::ios_base::app); // append instead of truncating: the documented contract says the samples are appended to the file + file << global_yaml_output; + } +} + +void hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.c_str(), communicator); +} + +void hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.string().c_str(), communicator); +} +#endif + std::string hardware_sampler::as_yaml_string() const { if (!this->has_sampling_stopped()) { throw std::runtime_error{ "Can return samples as string only after the sampling has been stopped!" }; From 9ae4c80ada4cc5b63f8b84f8d2ff13ad1ac3c3ac Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Fri, 29 May 2026 13:01:39 +0200 Subject: [PATCH 05/31] add mpi4py compatibility for python bindings --- CMakeLists.txt | 23 +++++++++++-------- bindings/CMakeLists.txt | 24 ++++++++++++++++++++ bindings/main.cpp | 39 ++++++++++++++++++++++++++++++++ bindings/mpi4py_communicator.hpp | 28 +++++++++++++++++++++++ pyproject.toml | 3 +++ 5 files changed, 108 insertions(+), 9 deletions(-) create mode 100644 bindings/mpi4py_communicator.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ff588aa..cb049c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,20 +147,13 @@ else () endif () -#################################################################################################################### -## enable Python bindings ## -#################################################################################################################### 
-option(HWS_ENABLE_PYTHON_BINDINGS "Build language bindings for Python." ON) -if (HWS_ENABLE_PYTHON_BINDINGS) - add_subdirectory(bindings) -endif () - - #################################################################################################################### ## enable MPI support ## #################################################################################################################### set(HWS_ENABLE_MPI_SUPPORT AUTO CACHE STRING "Enable MPI support.") set_property(CACHE HWS_ENABLE_MPI_SUPPORT PROPERTY STRINGS AUTO ON OFF) +# Default: assume MPI support inactive (FORCE is required, otherwise a TRUE cached by an earlier configure run would survive a reconfigure without MPI) +set(HWS_MPI_SUPPORT_ACTIVE FALSE CACHE BOOL "MPI support enabled in core library" FORCE) if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT) # try finding MPI @@ -181,10 +174,22 @@ if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT) # add compile definition target_compile_definitions(${HWS_LIBRARY_NAME} PUBLIC HWS_MPI_SUPPORT_ENABLED) + + # Expose that MPI is really enabled for the Python bindings (and potentially other submodules) via a cache variable. + set(HWS_MPI_SUPPORT_ACTIVE TRUE CACHE BOOL "MPI support enabled in core library" FORCE) endif () endif () +#################################################################################################################### +## enable Python bindings ## +#################################################################################################################### +option(HWS_ENABLE_PYTHON_BINDINGS "Build language bindings for Python." 
ON) +if (HWS_ENABLE_PYTHON_BINDINGS) + add_subdirectory(bindings) +endif () + + ######################################################################################################################## ## add documentation ## ######################################################################################################################## diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index 93d8e98..fd4f564 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -64,6 +64,30 @@ target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${CMAKE_C target_link_libraries(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_LIBRARY_NAME}) target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE PYBIND11_DETAILED_ERROR_MESSAGES) +if(HWS_MPI_SUPPORT_ACTIVE) + message(STATUS "MPI support enabled. Adding mpi4py include directory and linking against MPI.") + # Get mpi4py's C header location, simultaneously checking if mpi4py is importable in the current Python environment + execute_process( + COMMAND "${Python_EXECUTABLE}" -c + "import mpi4py, sys; sys.stdout.write(mpi4py.get_include())" + RESULT_VARIABLE MPI4PY_IMPORT_RESULT + OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if(MPI4PY_IMPORT_RESULT) + message(FATAL_ERROR + "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) " + "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. 
" + "Install mpi4py in this environment or disable python bindings.") + endif() + + target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR}) + + # Propagate the same macro used on the C++ side into the Python module + target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE HWS_MPI_SUPPORT_ENABLED) +endif() + include(GNUInstallDirs) # install Python bindings install(TARGETS ${HWS_PYTHON_BINDINGS_LIBRARY_NAME} diff --git a/bindings/main.cpp b/bindings/main.cpp index 3f062e7..2b1a161 100644 --- a/bindings/main.cpp +++ b/bindings/main.cpp @@ -11,6 +11,12 @@ #include // std::string_view +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include +#include +#include "mpi4py_communicator.hpp" +#endif + #define HWS_IS_DEFINED_HELPER(x) #x #define HWS_IS_DEFINED(x) (std::string_view{ #x } != std::string_view{ HWS_IS_DEFINED_HELPER(x) }) @@ -32,6 +38,15 @@ PYBIND11_MODULE(HardwareSampling, m) { m.doc() = "Hardware Sampling for CPUs and GPUs"; m.attr("__version__") = hws::version::version; + // MPI support +#if defined(HWS_MPI_SUPPORT_ENABLED) + // Initialize mpi4py C-API so PyMPIComm_* are usable + if (import_mpi4py() < 0) { + throw py::error_already_set(); + } +#endif + m.def("has_mpi_support", []() { return HWS_IS_DEFINED(HWS_MPI_SUPPORT_ENABLED); }); + init_event(m); init_sample_category(m); init_relative_event(m); @@ -64,3 +79,27 @@ PYBIND11_MODULE(HardwareSampling, m) { init_version(m); } + + + +#if defined(HWS_MPI_SUPPORT_ENABLED) +/** + * Extracts an MPI_Comm from a python mpi4py.MPI.Comm object. + * Has to be in same translation unit as the import_mpi4py() call to ensure that the mpi4py C-API is initialized and the PyMPIComm_Type is available. 
+ * + * @param py_comm a Python object that is expected to be an mpi4py.MPI.Comm instance + * @return the extracted MPI_Comm + */ +MPI_Comm mpi_comm_from_python(py::object py_comm) { + if (!PyObject_TypeCheck(py_comm.ptr(), &PyMPIComm_Type)) { + throw std::runtime_error("expected mpi4py.MPI.Comm as communicator argument"); + } + + MPI_Comm *comm_ptr = PyMPIComm_Get(py_comm.ptr()); + if (comm_ptr == nullptr) { + throw std::runtime_error("could not extract MPI_Comm from mpi4py communicator"); + } + + return *comm_ptr; +} +#endif \ No newline at end of file diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp new file mode 100644 index 0000000..9ba2749 --- /dev/null +++ b/bindings/mpi4py_communicator.hpp @@ -0,0 +1,28 @@ +/** +* @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Utility functions for transforming mpi4py communicators into C++ MPI communicators + */ + +#ifndef HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP +#define HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP +#pragma once + +#include "pybind11/pybind11.h" + +#if defined(HWS_MPI_SUPPORT_ENABLED) +# include +#endif + +namespace py = pybind11; + +#if defined(HWS_MPI_SUPPORT_ENABLED) +MPI_Comm mpi_comm_from_python(py::object py_comm); +#endif + + +#endif // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP diff --git a/pyproject.toml b/pyproject.toml index 45c4cfa..910c2b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,9 @@ classifiers = [ "Programming Language :: C++", "Programming Language :: Python :: 3" ] +# optional dependencies +[project.optional-dependencies] +mpi = ["mpi4py>=4"] # project specific URLs [project.urls] documentation = "https://sc-sgs.github.io/hardware_sampling/" From b3912c23747f982f0852c7859a7337f156defab1 Mon Sep 17 
00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Fri, 29 May 2026 13:02:12 +0200 Subject: [PATCH 06/31] add dump_yaml_global to system_hardware_sampler python bindings --- bindings/system_hardware_sampler.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp index d9af622..6633444 100644 --- a/bindings/system_hardware_sampler.cpp +++ b/bindings/system_hardware_sampler.cpp @@ -19,6 +19,11 @@ #include "relative_event.hpp" // hws::detail::relative_event #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include +#include "mpi4py_communicator.hpp" +#endif + namespace py = pybind11; void init_system_hardware_sampler(py::module_ &m) { @@ -67,5 +72,18 @@ void init_system_hardware_sampler(py::module_ &m) { .def("sampler", [](hws::system_hardware_sampler &self, const std::size_t idx) { return self.sampler(idx).get(); }, "get the i-th hardware sampler available for the whole system") .def("dump_yaml", py::overload_cast(&hws::system_hardware_sampler::dump_yaml, py::const_), "dump all hardware samples for all hardware samplers to the given YAML file") .def("as_yaml_string", &hws::system_hardware_sampler::as_yaml_string, "return all hardware samples for all hardware samplers as YAML string") +#if defined(HWS_MPI_SUPPORT_ENABLED) + .def("dump_yaml_global", + [](const hws::system_hardware_sampler &self, + const std::string &filename, + py::object py_comm) { + const MPI_Comm comm = mpi_comm_from_python(py_comm); + self.dump_yaml_global(filename, comm); + }, + py::arg("filename"), + py::arg("comm"), + "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator." 
+ ) +#endif .def("__repr__", [](const hws::system_hardware_sampler &self) { return fmt::format("", self.num_samplers()); }); } From 59caf70c46f0eee3c66648ff1c49266a49aa9359 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Fri, 29 May 2026 13:18:19 +0200 Subject: [PATCH 07/31] add dump_yaml_global to hardware_sampler python bindings --- bindings/hardware_sampler.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bindings/hardware_sampler.cpp b/bindings/hardware_sampler.cpp index 5a12141..f8f5251 100644 --- a/bindings/hardware_sampler.cpp +++ b/bindings/hardware_sampler.cpp @@ -31,6 +31,11 @@ #include "relative_event.hpp" // hws::detail::relative_event #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) +#include +#include "mpi4py_communicator.hpp" +#endif + namespace py = pybind11; void init_hardware_sampler(py::module_ &m) { @@ -62,6 +67,19 @@ void init_hardware_sampler(py::module_ &m) { .def("relative_time_points", [](const hws::hardware_sampler &self) { return hws::detail::durations_from_reference_time(self.sampling_time_points(), self.get_event(0).time_point); }, "get the relative durations of the respective hardware samples in seconds (as \"normal\" number)") .def("sampling_interval", &hws::hardware_sampler::sampling_interval, "get the sampling interval of this hardware sampler (in ms)") .def("dump_yaml", py::overload_cast(&hws::hardware_sampler::dump_yaml, py::const_), "dump all hardware samples to the given YAML file") +#if defined(HWS_MPI_SUPPORT_ENABLED) + .def("dump_yaml_global", + [](const hws::hardware_sampler &self, + const std::string &filename, + py::object py_comm) { + const MPI_Comm comm = mpi_comm_from_python(py_comm); + self.dump_yaml_global(filename, comm); + }, + py::arg("filename"), + py::arg("comm"), + "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator." 
+ ) +#endif .def("as_yaml_string", &hws::hardware_sampler::as_yaml_string, "return all hardware samples including additional information like events as YAML string") .def("samples_only_as_yaml_string", &hws::hardware_sampler::samples_only_as_yaml_string, "return all hardware samples as YAML string") .def("__repr__", [](const hws::hardware_sampler &self) { From 0cfe52785d4ebb3581df0cb6fdeab562660a4464 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:03:39 +0200 Subject: [PATCH 08/31] added system hardware sampler creation which avoids duplicates for MPI with NVIDIA and AMD GPUs --- include/hws/system_hardware_sampler.hpp | 28 ++ include/hws/utility.hpp | 327 +++++++++++++++++++++++- src/hws/gpu_amd/CMakeLists.txt | 2 +- src/hws/gpu_nvidia/CMakeLists.txt | 2 +- src/hws/system_hardware_sampler.cpp | 154 +++++++---- 5 files changed, 452 insertions(+), 61 deletions(-) diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index 2003b28..a5a1265 100644 --- a/include/hws/system_hardware_sampler.hpp +++ b/include/hws/system_hardware_sampler.hpp @@ -14,6 +14,7 @@ #include "hws/event.hpp" // hws::event #include "hws/hardware_sampler.hpp" // hws::hardware_sampler #include "hws/sample_category.hpp" // hws::sample_category +#include "hws/utility.hpp" // hws::detail::mpi_sampling_mode #include // std::chrono::{milliseconds, steady_clock::time_point} #include // std::size_t @@ -45,6 +46,24 @@ class system_hardware_sampler { * @param[in] category the sample categories that are enabled for hardware sampling (default: all) */ explicit system_hardware_sampler(std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Construct hardware samplers with the default sampling interval and MPI support. 
+ * @param[in] communicator the MPI communicator + * @param[in] mode the MPI sampling mode + * @param[in] category the sample categories that are enabled for hardware sampling (default: all) + */ + explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category = sample_category::all); + /** + * @brief Construct hardware samplers with the provided @p sampling_interval and MPI support. + * @param[in] communicator the MPI communicator + * @param[in] mode the MPI sampling mode + * @param[in] sampling_interval the used sampling interval + * @param[in] category the sample categories that are enabled for hardware sampling (default: all) + */ + explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all); +#endif + /** * @brief Delete the copy-constructor. @@ -211,6 +230,15 @@ class system_hardware_sampler { private: /// The different hardware sampler for the current system. std::vector> samplers_; + + /** + * Creates hardware samplers for all visible devices. Used by non-MPI class constructor. 
+ * @param sampling_interval the used sampling interval + * @param category the sample category + */ + void create_local_samplers(std::chrono::milliseconds sampling_interval, hws::sample_category category); + + }; } // namespace hws diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index cbb1075..6abc3d8 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -15,20 +15,30 @@ #include "fmt/format.h" // fmt::format #include "fmt/ranges.h" // fmt::join -#include // std::from_chars -#include // std::chrono::duration -#include // std::trunc -#include // std::size_t -#include // std::optional -#include // std::runtime_error -#include // std::string, std::stof, std::stod, std::stold -#include // std::string_view -#include // std::errc -#include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type -#include // std::vector +#include // std::from_chars +#include // std::chrono::duration +#include // std::trunc +#include // std::size_t +#include // std::optional +#include // std::runtime_error +#include // std::string, std::stof, std::stod, std::stold +#include // std::string_view +#include // std::errc +#include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type +#include // std::vector +#include // std::unordered_map #if defined(HWS_MPI_SUPPORT_ENABLED) -#include // MPI_Comm + #include // MPI_Comm +#endif + +#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + #include "hws/gpu_nvidia/utility.hpp" // HWS_CUDA_ERROR_CHECK + #include "cuda_runtime.h" // cuda functions +#endif +#if defined(HWS_FOR_AMD_GPUS_ENABLED) + #include "hws/gpu_amd/utility.hpp" // HWS_HIP_ERROR_CHECK + #include "hip/hip_runtime.h" // hip functions #endif namespace hws::detail { @@ -333,8 +343,297 @@ template * * @return concatenated YAML string on rank 0, empty string on all other ranks */ -[[nodiscard]] -std::string gather_yaml_strings_mpi(const 
std::string& local_yaml, MPI_Comm communicator); +[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator); + +/** + * @brief The mode to use for MPI sampling. + * per_rank: each rank creates hardware samplers for all devices visible to that rank + * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device + */ +enum class mpi_sampling_mode { + per_rank, + whole_node +}; + +/** + * @brief Information about a node-local MPI communicator for whole-node sampling. + */ +struct hostname_comm_info { + MPI_Comm node_comm = MPI_COMM_NULL; + int node_rank = 0; + int node_size = 1; +}; + +/** + * @brief Create a node-local MPI communicator for whole-node sampling based on node hostnames. + * @param comm the parent MPI communicator to split into node-local communicators + * @return the node-local MPI communicator information + */ +inline hostname_comm_info make_hostname_comm(MPI_Comm comm) { + int world_rank = 0, world_size = 0; + MPI_Comm_rank(comm, &world_rank); + MPI_Comm_size(comm, &world_size); + + // Gather all hostnames + char name[MPI_MAX_PROCESSOR_NAME]; + int name_len = 0; + MPI_Get_processor_name(name, &name_len); + + std::vector name_lengths(world_size); + MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); + + // Build displacements and total byte count + std::vector displs(world_size); + int total = 0; + for (int i = 0; i < world_size; ++i) { + displs[i] = total; + total += name_lengths[i]; + } + + std::vector all_names(total); + MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); + + // Assign colors locally on every rank + // + // All ranks hold identical copies of all_names, name_lengths, and displs, + // so they can each compute the same deterministic color map independently. 
+ + std::unordered_map host_to_color; + host_to_color.reserve(world_size); + std::vector colors(world_size); + int next_color = 0; + for (int r = 0; r < world_size; ++r) { + // get host name of rank r + std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); + + // try to insert it into the host_to_color map + auto [it, inserted] = host_to_color.emplace(host, next_color); + + // check if host was new, if yes, increment color + if (inserted) { + ++next_color; + } + // save color of current rank, either from newly created or existing entry + colors[r] = it->second; + } + + // Split communicator + + hostname_comm_info info{}; + MPI_Comm_split(comm, colors[world_rank], world_rank, &info.node_comm); + MPI_Comm_rank(info.node_comm, &info.node_rank); + MPI_Comm_size(info.node_comm, &info.node_size); + return info; +} + +/** + * @brief Free a node-local MPI communicator for whole-node sampling. + * @param info the node-local MPI communicator information to free + */ +inline void free_hostname_comm(hostname_comm_info &info) { + if (info.node_comm != MPI_COMM_NULL) { + MPI_Comm_free(&info.node_comm); + } +} + +enum class device_backend_kind { + nvidia, + amd, + intel +}; + +struct visible_gpu_device { + device_backend_kind backend; + int local_index; // device index for that backend on this rank + std::string physical_id; // stable per-node identifier +}; + +#endif + +#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) +/** + * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. 
+ * + * @param local_index the local index of the NVIDIA GPU device + * @return the physical ID of the NVIDIA GPU device + */ +inline std::string nvidia_physical_id(int local_index) { + char bus_id[64] = {}; + HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "nvidia:" } + bus_id; +} + +/** + * @brief creates a list of all visible nvidia GPU devices + * + * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID + */ +inline std::vector enumerate_local_nvidia_devices() { + std::vector out; + int count = 0; + HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::nvidia; + d.local_index = i; + d.physical_id = nvidia_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} + +#endif + +#if defined(HWS_FOR_AMD_GPUS_ENABLED) +inline std::string amd_physical_id(int local_index) { + char bus_id[64] = {}; + HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "amd:" } + bus_id; +} + +inline std::vector enumerate_local_amd_devices() { + std::vector out; + int count = 0; + HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::amd; + d.local_index = i; + d.physical_id = amd_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} +#endif + +#if defined(HWS_FOR_INTEL_GPUS_ENABLED) +inline std::string intel_physical_id(ze_device_handle_t device) { + ze_device_properties_t props{}; + props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + props.pNext = nullptr; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props)); + + char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {}; + for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { + snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]); + } + + return std::string{ 
"intel:" } + buf; +} + +inline std::vector enumerate_local_intel_devices() { + std::vector out; + + // get the GPU driver + ze_driver_handle_t driver{}; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)); + + // Discover devices for this driver + std::uint32_t device_count = 0; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)); + if (device_count == 0) { + return out; // no Intel GPUs visible + } + + std::vector devices(device_count); + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data())); + + // Fill visible_gpu_device list + for (std::uint32_t i = 0; i < device_count; ++i) { + ze_device_handle_t dev = devices[i]; + + visible_gpu_device d; + d.backend = device_backend_kind::intel; + d.local_index = static_cast(i); + d.physical_id = intel_physical_id(dev); + + out.push_back(std::move(d)); + } + + return out; +} +#endif + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that + * each device is sampled by exactly one rank. 
+ * + * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID + * @param node_comm a node local MPI communicator + * @return all device indices that have to be sampled by this rank + */ +inline std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm) { + int node_rank = 0, node_size = 0; + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + + // Pack physical IDs into a newline-separated string + std::string packed; + for (const auto &d : local_devices) { + packed += d.physical_id; + packed += '\n'; + } + const int local_size = static_cast(packed.size()); + + // Allgather sizes + std::vector sizes(node_size); + MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm); + + // Displacements and total length + std::vector displs(node_size); + int total = 0; + for (int r = 0; r < node_size; ++r) { + displs[r] = total; + total += sizes[r]; + } + + // Allgatherv packed physical IDs + std::vector all_data(total); + MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm); + + // Build owner map: physical_id -> first node_rank that reports it + std::unordered_map owner_rank_for_id; + owner_rank_for_id.reserve(local_devices.size() * 2 + 1); + + for (int r = 0; r < node_size; ++r) { + if (sizes[r] == 0) { + continue; + } + + const char *base = all_data.data() + displs[r]; + const int len = sizes[r]; + + int line_start = 0; + while (line_start < len) { + int line_end = line_start; + while (line_end < len && base[line_end] != '\n') { + ++line_end; + } + if (line_end > line_start) { + const std::string id(base + line_start, base + line_end); // copy just this ID + owner_rank_for_id.emplace(id, r); // first insertion wins + } + line_start = line_end + 1; + } + } + + // Decide which local indices we own: those whose physical_id is mapped to node_rank + std::vector 
owned_indices; + owned_indices.reserve(local_devices.size()); + + for (const auto &d : local_devices) { + auto it = owner_rank_for_id.find(d.physical_id); + if (it != owner_rank_for_id.end() && it->second == node_rank) { + owned_indices.push_back(d.local_index); + } + } + + return owned_indices; +} + #endif } // namespace hws::detail diff --git a/src/hws/gpu_amd/CMakeLists.txt b/src/hws/gpu_amd/CMakeLists.txt index 0b5f104..16aecb2 100644 --- a/src/hws/gpu_amd/CMakeLists.txt +++ b/src/hws/gpu_amd/CMakeLists.txt @@ -22,7 +22,7 @@ message(STATUS "Enable sampling of AMD GPU information using ROCm SMI (${rocm_sm find_package(HIP REQUIRED) # link against necessary libraries -target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE rocm_smi64 hip::host) +target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC rocm_smi64 hip::host) target_include_directories(${HWS_LIBRARY_NAME} PRIVATE ${ROCM_SMI_INCLUDE_DIR}) # add source file to source file list diff --git a/src/hws/gpu_nvidia/CMakeLists.txt b/src/hws/gpu_nvidia/CMakeLists.txt index 437e063..f8a5749 100644 --- a/src/hws/gpu_nvidia/CMakeLists.txt +++ b/src/hws/gpu_nvidia/CMakeLists.txt @@ -19,7 +19,7 @@ endif () message(STATUS "Enable sampling of NVIDIA GPU information using NVML (${CUDAToolkit_VERSION}).") # link against necessary libraries -target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE CUDA::nvml CUDA::cudart) +target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC CUDA::nvml CUDA::cudart) # add source file to source file list target_sources(${HWS_LIBRARY_NAME} PRIVATE diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index e07d7a7..eb06924 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -42,7 +42,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) -#include // MPI_Comm + #include // MPI_Comm #endif namespace hws { @@ -52,56 +52,68 @@ system_hardware_sampler::system_hardware_sampler(const sample_category category) 
system_hardware_sampler::system_hardware_sampler(const std::chrono::milliseconds sampling_interval, sample_category category) { // create the hardware samplers based on the available hardware -#if defined(HWS_FOR_CPUS_ENABLED) - { - samplers_.push_back(std::make_unique(sampling_interval, category)); - } -#endif -#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) - { - int device_count{}; - HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + create_local_samplers(sampling_interval, category); +} + +#if defined(HWS_MPI_SUPPORT_ENABLED) +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category) : + system_hardware_sampler(communicator, mode, HWS_SAMPLING_INTERVAL, category) { } + +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category) { + if (mode == detail::mpi_sampling_mode::per_rank) { + // each rank creates samplers for all devices visible to him + create_local_samplers(sampling_interval, category); + } else if (mode == detail::mpi_sampling_mode::whole_node) { + // create a custom, node-local MPI communicator + auto nc = detail::make_hostname_comm(communicator); + + // CPU: one sampler per node --> node leader only + #if defined(HWS_FOR_CPUS_ENABLED) + if (nc.node_rank == 0) { + samplers_.push_back(std::make_unique(sampling_interval, category)); } - } -#endif -#if defined(HWS_FOR_AMD_GPUS_ENABLED) - { - int device_count{}; - HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + #endif + + // NVIDIA + #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + { + const auto local = 
detail::enumerate_local_nvidia_devices(); + const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int idx : owned) { + samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); + } } - } -#endif -#if defined(HWS_FOR_INTEL_GPUS_ENABLED) - { - // init level zero driver - HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) - - // discover the number of drivers - std::uint32_t driver_count{ 0 }; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) - - // check if only the single GPU driver has been found - if (driver_count > 1) { - throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + #endif + + // AMD + #if defined(HWS_FOR_AMD_GPUS_ENABLED) + { + const auto local = detail::enumerate_local_amd_devices(); + const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int idx : owned) { + samplers_.push_back(std::make_unique( + static_cast(idx), sampling_interval, category)); + } } - - // get the GPU driver - ze_driver_handle_t driver{}; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)) - - // get all GPUs for the current driver - std::uint32_t device_count{ 0 }; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)) - for (std::uint32_t device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + #endif + + // Intel + #if defined(HWS_FOR_INTEL_GPUS_ENABLED) + { + const auto local = detail::enumerate_local_intel_devices(); + const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int idx : owned) { + samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); + } } + #endif + + detail::free_hostname_comm(nc); + } else { + throw std::runtime_error{ fmt::format("Unknown MPI sampling mode {}!", static_cast(mode)) }; } -#endif } +#endif void 
system_hardware_sampler::start_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); }); @@ -255,4 +267,56 @@ std::string system_hardware_sampler::samples_only_as_yaml_string() const { return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->samples_only_as_yaml_string(); }); } +void system_hardware_sampler::create_local_samplers(std::chrono::milliseconds sampling_interval, sample_category category) { +#if defined(HWS_FOR_CPUS_ENABLED) + { + samplers_.push_back(std::make_unique(sampling_interval, category)); + } +#endif +#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + { + int device_count{}; + HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count)); + for (int device = 0; device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +#if defined(HWS_FOR_AMD_GPUS_ENABLED) + { + int device_count{}; + HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count)); + for (int device = 0; device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +#if defined(HWS_FOR_INTEL_GPUS_ENABLED) + { + // init level zero driver + HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) + + // discover the number of drivers + std::uint32_t driver_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) + + // check if only the single GPU driver has been found + if (driver_count > 1) { + throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + } + + // get the GPU driver + ze_driver_handle_t driver{}; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)) + + // get all GPUs for the current driver + std::uint32_t device_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)) + for (std::uint32_t device = 0; 
device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +} + } // namespace hws From bea8838b6b8f0c127e1d98d81b45ddbe2879827c Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 8 Jun 2026 09:36:12 +0200 Subject: [PATCH 09/31] added synchronous start and stop sampling for MPI --- include/hws/system_hardware_sampler.hpp | 17 ++++++++++++++--- src/hws/system_hardware_sampler.cpp | 13 +++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index a5a1265..ba160a2 100644 --- a/include/hws/system_hardware_sampler.hpp +++ b/include/hws/system_hardware_sampler.hpp @@ -64,7 +64,6 @@ class system_hardware_sampler { explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all); #endif - /** * @brief Delete the copy-constructor. */ @@ -91,10 +90,24 @@ class system_hardware_sampler { * @brief Start hardware sampling for all wrapped hardware samplers. */ void start_sampling(); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Start hardware sampling for all wrapped hardware samplers. Executes an MPI barrier before starting sampling to synchronize all MPI ranks. + * @param[in] communicator the MPI communicator to use + */ + void start_sampling(MPI_Comm communicator); +#endif /** * @brief Stop hardware sampling for all wrapped hardware samplers. */ void stop_sampling(); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Stop hardware sampling for all wrapped hardware samplers. Executes an MPI barrier after stopping sampling to synchronize all MPI ranks. 
+ * @param[in] communicator the MPI communicator to use + */ + void stop_sampling(MPI_Comm communicator); +#endif /** * @brief Pause hardware sampling for all wrapped hardware samplers. */ @@ -237,8 +250,6 @@ class system_hardware_sampler { * @param category the sample category */ void create_local_samplers(std::chrono::milliseconds sampling_interval, hws::sample_category category); - - }; } // namespace hws diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index eb06924..416a19d 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -119,10 +119,23 @@ void system_hardware_sampler::start_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::start_sampling(MPI_Comm communicator) { + MPI_Barrier(communicator); + std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); }); +} +#endif void system_hardware_sampler::stop_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::stop_sampling(MPI_Comm communicator) { + std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); }); + MPI_Barrier(communicator); +} +#endif + void system_hardware_sampler::pause_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->pause_sampling(); }); } From 04210fb64f642c252b9cdfc3501de22da17d0979 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:48:58 +0200 Subject: [PATCH 10/31] python bindings for MPI-aware constructor and functions --- bindings/system_hardware_sampler.cpp | 67 ++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/bindings/system_hardware_sampler.cpp 
b/bindings/system_hardware_sampler.cpp index 6633444..948f8e1 100644 --- a/bindings/system_hardware_sampler.cpp +++ b/bindings/system_hardware_sampler.cpp @@ -20,21 +20,70 @@ #include // std::string #if defined(HWS_MPI_SUPPORT_ENABLED) -#include -#include "mpi4py_communicator.hpp" + #include "mpi4py_communicator.hpp" + #include #endif namespace py = pybind11; void init_system_hardware_sampler(py::module_ &m) { +#if defined(HWS_MPI_SUPPORT_ENABLED) + // bind mpi sampling mode enum + py::enum_(m, "MPISamplingMode") + .value("PER_RANK", hws::detail::mpi_sampling_mode::per_rank) + .value("WHOLE_NODE", hws::detail::mpi_sampling_mode::whole_node) + .export_values(); +#endif // bind the pure virtual hardware sampler base class py::class_(m, "SystemHardwareSampler") .def(py::init<>(), "construct a new system hardware sampler with the default sampling interval") .def(py::init(), "construct a new system hardware sampler with the default sampling interval sampling only the provided sample_category samples") .def(py::init(), "construct a new system hardware sampler for with the specified sampling interval") .def(py::init(), "construct a new system hardware sampler for with the specified sampling interval sampling only the provided sample_category samples") +#if defined(HWS_MPI_SUPPORT_ENABLED) + // MPI-aware constructors + + // (MPI_Comm, mode, category=all) + .def(py::init([](py::object py_comm, + hws::detail::mpi_sampling_mode mode, + hws::sample_category category) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + return std::make_unique(comm, mode, category); + }), + py::arg("comm"), + py::arg("mode"), + py::arg("category") = hws::sample_category::all, + "construct a new system hardware sampler with the default sampling interval and MPI support using the given mpi4py communicator and sampling mode") + + // (MPI_Comm, mode, sampling_interval, category=all) + .def(py::init([](py::object py_comm, + hws::detail::mpi_sampling_mode mode, + std::chrono::milliseconds 
sampling_interval, + hws::sample_category category) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + return std::make_unique(comm, mode, sampling_interval, category); + }), + py::arg("comm"), + py::arg("mode"), + py::arg("sampling_interval"), + py::arg("category") = hws::sample_category::all, + "construct a new system hardware sampler with the specified sampling interval and MPI support using the given mpi4py communicator and sampling mode") + + // Non-MPI overloads + .def("start", py::overload_cast<>(&hws::system_hardware_sampler::start_sampling), "start hardware sampling for all available hardware samplers") + .def("stop", py::overload_cast<>(&hws::system_hardware_sampler::stop_sampling), "stop hardware sampling for all available hardware samplers") + // MPI-aware overloads + .def("start", [](hws::system_hardware_sampler &self, py::object py_comm) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + self.start_sampling(comm); }, py::arg("comm"), "start hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator before starting") + .def("stop", [](hws::system_hardware_sampler &self, py::object py_comm) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + self.stop_sampling(comm); }, py::arg("comm"), "stop hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator after stopping") +#else + // No MPI support: only the simple overloads exist, no ambiguity .def("start", &hws::system_hardware_sampler::start_sampling, "start hardware sampling for all available hardware samplers") - .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers") + .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers"); +#endif .def("pause", &hws::system_hardware_sampler::pause_sampling, "pause hardware sampling for all available hardware samplers") .def("resume", 
&hws::system_hardware_sampler::resume_sampling, "resume hardware sampling for all available hardware samplers") .def("has_started", &hws::system_hardware_sampler::has_sampling_started, "check whether hardware sampling has already been started for all hardware samplers") @@ -73,17 +122,9 @@ void init_system_hardware_sampler(py::module_ &m) { .def("dump_yaml", py::overload_cast(&hws::system_hardware_sampler::dump_yaml, py::const_), "dump all hardware samples for all hardware samplers to the given YAML file") .def("as_yaml_string", &hws::system_hardware_sampler::as_yaml_string, "return all hardware samples for all hardware samplers as YAML string") #if defined(HWS_MPI_SUPPORT_ENABLED) - .def("dump_yaml_global", - [](const hws::system_hardware_sampler &self, - const std::string &filename, - py::object py_comm) { + .def("dump_yaml_global", [](const hws::system_hardware_sampler &self, const std::string &filename, py::object py_comm) { const MPI_Comm comm = mpi_comm_from_python(py_comm); - self.dump_yaml_global(filename, comm); - }, - py::arg("filename"), - py::arg("comm"), - "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator." 
- ) + self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator.") #endif .def("__repr__", [](const hws::system_hardware_sampler &self) { return fmt::format("", self.num_samplers()); }); } From 89116b46d06a33fe9c1953b838cd5ea058375416 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Tue, 9 Jun 2026 09:51:25 +0200 Subject: [PATCH 11/31] fixes for mpi backend with intel GPUs --- include/hws/utility.hpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 6abc3d8..6edc69d 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -25,8 +25,8 @@ #include // std::string_view #include // std::errc #include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type -#include // std::vector #include // std::unordered_map +#include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) #include // MPI_Comm @@ -40,6 +40,10 @@ #include "hws/gpu_amd/utility.hpp" // HWS_HIP_ERROR_CHECK #include "hip/hip_runtime.h" // hip functions #endif +#if defined(HWS_FOR_INTEL_GPUS_ENABLED) + #include "hws/gpu_intel/utility.hpp" // HWS_LEVEL_ZERO_ERROR_CHECK + #include "level_zero/ze_api.h" // Level Zero runtime functions +#endif namespace hws::detail { @@ -525,6 +529,18 @@ inline std::string intel_physical_id(ze_device_handle_t device) { inline std::vector enumerate_local_intel_devices() { std::vector out; + // init level zero driver + HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) + + // discover the number of drivers + std::uint32_t driver_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) + + // check if only the single GPU driver has been found + if (driver_count > 1) { + 
throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + } + // get the GPU driver ze_driver_handle_t driver{}; HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)); @@ -533,7 +549,7 @@ inline std::vector enumerate_local_intel_devices() { std::uint32_t device_count = 0; HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)); if (device_count == 0) { - return out; // no Intel GPUs visible + return out; // no Intel GPUs visible } std::vector devices(device_count); @@ -544,7 +560,7 @@ inline std::vector enumerate_local_intel_devices() { ze_device_handle_t dev = devices[i]; visible_gpu_device d; - d.backend = device_backend_kind::intel; + d.backend = device_backend_kind::intel; d.local_index = static_cast(i); d.physical_id = intel_physical_id(dev); @@ -614,7 +630,7 @@ inline std::vector owned_local_indices_for_backend(const std::vector line_start) { const std::string id(base + line_start, base + line_end); // copy just this ID - owner_rank_for_id.emplace(id, r); // first insertion wins + owner_rank_for_id.emplace(id, r); // first insertion wins } line_start = line_end + 1; } From a7c6543938fc207b060f35345f45c4172fee02dc Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:02:14 +0200 Subject: [PATCH 12/31] fix for python bindings --- bindings/main.cpp | 2 +- bindings/system_hardware_sampler.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/main.cpp b/bindings/main.cpp index 2b1a161..a629ee0 100644 --- a/bindings/main.cpp +++ b/bindings/main.cpp @@ -102,4 +102,4 @@ MPI_Comm mpi_comm_from_python(py::object py_comm) { return *comm_ptr; } -#endif \ No newline at end of file +#endif diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp index 948f8e1..4941fdf 100644 --- a/bindings/system_hardware_sampler.cpp +++ b/bindings/system_hardware_sampler.cpp @@ -82,7 +82,7 @@ void 
init_system_hardware_sampler(py::module_ &m) { #else // No MPI support: only the simple overloads exist, no ambiguity .def("start", &hws::system_hardware_sampler::start_sampling, "start hardware sampling for all available hardware samplers") - .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers"); + .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers") #endif .def("pause", &hws::system_hardware_sampler::pause_sampling, "pause hardware sampling for all available hardware samplers") .def("resume", &hws::system_hardware_sampler::resume_sampling, "resume hardware sampling for all available hardware samplers") From 45d50996834aefa9b7936cdf85c22b8d69639e9b Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Fri, 12 Jun 2026 13:10:51 +0200 Subject: [PATCH 13/31] fix for non-mpi mode --- include/hws/utility.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 6edc69d..257ed3c 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -452,7 +452,6 @@ struct visible_gpu_device { std::string physical_id; // stable per-node identifier }; -#endif #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) /** @@ -571,7 +570,6 @@ inline std::vector enumerate_local_intel_devices() { } #endif -#if defined(HWS_MPI_SUPPORT_ENABLED) /** * Computes for each MPI rank a list of devices that have to be sampled by this rank. 
Ensures that From 5532369ca4b07a61b776bfc738d414bf2cb74ce4 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Wed, 17 Jun 2026 08:45:48 +0200 Subject: [PATCH 14/31] restructure MPI related utility functions --- include/hws/utility.hpp | 131 ++++-------------------------- src/hws/gpu_amd/CMakeLists.txt | 2 +- src/hws/gpu_amd/utility.cpp | 42 ++++++++++ src/hws/gpu_intel/utility.cpp | 81 ++++++++++++++++++ src/hws/gpu_nvidia/CMakeLists.txt | 2 +- src/hws/gpu_nvidia/utility.cpp | 39 +++++++++ 6 files changed, 178 insertions(+), 119 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 257ed3c..2915a18 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -32,19 +32,6 @@ #include // MPI_Comm #endif -#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) - #include "hws/gpu_nvidia/utility.hpp" // HWS_CUDA_ERROR_CHECK - #include "cuda_runtime.h" // cuda functions -#endif -#if defined(HWS_FOR_AMD_GPUS_ENABLED) - #include "hws/gpu_amd/utility.hpp" // HWS_HIP_ERROR_CHECK - #include "hip/hip_runtime.h" // hip functions -#endif -#if defined(HWS_FOR_INTEL_GPUS_ENABLED) - #include "hws/gpu_intel/utility.hpp" // HWS_LEVEL_ZERO_ERROR_CHECK - #include "level_zero/ze_api.h" // Level Zero runtime functions -#endif - namespace hws::detail { /** @@ -455,119 +442,29 @@ struct visible_gpu_device { #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) /** - * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index - * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. 
- * - * @param local_index the local index of the NVIDIA GPU device - * @return the physical ID of the NVIDIA GPU device - */ -inline std::string nvidia_physical_id(int local_index) { - char bus_id[64] = {}; - HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); - return std::string{ "nvidia:" } + bus_id; -} - -/** - * @brief creates a list of all visible nvidia GPU devices + * @brief creates a list of all visible NVIDIA GPU devices * * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID */ -inline std::vector enumerate_local_nvidia_devices() { - std::vector out; - int count = 0; - HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count)); - for (int i = 0; i < count; ++i) { - visible_gpu_device d; - d.backend = device_backend_kind::nvidia; - d.local_index = i; - d.physical_id = nvidia_physical_id(i); - out.push_back(std::move(d)); - } - return out; -} - +[[nodiscard]] std::vector enumerate_local_nvidia_devices(); #endif #if defined(HWS_FOR_AMD_GPUS_ENABLED) -inline std::string amd_physical_id(int local_index) { - char bus_id[64] = {}; - HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); - return std::string{ "amd:" } + bus_id; -} - -inline std::vector enumerate_local_amd_devices() { - std::vector out; - int count = 0; - HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count)); - for (int i = 0; i < count; ++i) { - visible_gpu_device d; - d.backend = device_backend_kind::amd; - d.local_index = i; - d.physical_id = amd_physical_id(i); - out.push_back(std::move(d)); - } - return out; -} +/** + * @brief creates a list of all visible AMD GPU devices + * + * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_amd_devices(); #endif #if defined(HWS_FOR_INTEL_GPUS_ENABLED) -inline std::string intel_physical_id(ze_device_handle_t device) { - ze_device_properties_t props{}; 
- props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - props.pNext = nullptr; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props)); - - char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {}; - for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { - snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]); - } - - return std::string{ "intel:" } + buf; -} - -inline std::vector enumerate_local_intel_devices() { - std::vector out; - - // init level zero driver - HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) - - // discover the number of drivers - std::uint32_t driver_count{ 0 }; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) - - // check if only the single GPU driver has been found - if (driver_count > 1) { - throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; - } - - // get the GPU driver - ze_driver_handle_t driver{}; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)); - - // Discover devices for this driver - std::uint32_t device_count = 0; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)); - if (device_count == 0) { - return out; // no Intel GPUs visible - } - - std::vector devices(device_count); - HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data())); - - // Fill visible_gpu_device list - for (std::uint32_t i = 0; i < device_count; ++i) { - ze_device_handle_t dev = devices[i]; - - visible_gpu_device d; - d.backend = device_backend_kind::intel; - d.local_index = static_cast(i); - d.physical_id = intel_physical_id(dev); - - out.push_back(std::move(d)); - } - - return out; -} +/** + * @brief creates a list of all visible Intel GPU devices + * + * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_intel_devices(); #endif diff --git a/src/hws/gpu_amd/CMakeLists.txt b/src/hws/gpu_amd/CMakeLists.txt index 16aecb2..0b5f104 
100644 --- a/src/hws/gpu_amd/CMakeLists.txt +++ b/src/hws/gpu_amd/CMakeLists.txt @@ -22,7 +22,7 @@ message(STATUS "Enable sampling of AMD GPU information using ROCm SMI (${rocm_sm find_package(HIP REQUIRED) # link against necessary libraries -target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC rocm_smi64 hip::host) +target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE rocm_smi64 hip::host) target_include_directories(${HWS_LIBRARY_NAME} PRIVATE ${ROCM_SMI_INCLUDE_DIR}) # add source file to source file list diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp index a88969a..d0bb038 100644 --- a/src/hws/gpu_amd/utility.cpp +++ b/src/hws/gpu_amd/utility.cpp @@ -10,6 +10,13 @@ #include "rocm_smi/rocm_smi.h" // ROCm SMI runtime functions #include // std::string +#include // std::vector + +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED) + #include "hws/utility.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind + + #include "hip/hip_runtime_api.h" // hipGetDeviceCount, hipDeviceGetPCIBusId +#endif namespace hws::detail { @@ -39,4 +46,39 @@ std::string performance_level_to_string(const rsmi_dev_perf_level_t perf_level) } } +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the AMD GPU device with the given local index + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. 
+ * + * @param local_index the local index of the AMD GPU device + * @return the physical ID of the AMD GPU device + */ +std::string amd_physical_id(int local_index) { + char bus_id[64] = {}; + HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "amd:" } + bus_id; +} + +} // namespace + +std::vector enumerate_local_amd_devices() { + std::vector out; + int count = 0; + HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::amd; + d.local_index = i; + d.physical_id = amd_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} + +#endif + } // namespace hws::detail diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp index 5a29eee..2d50ad8 100644 --- a/src/hws/gpu_intel/utility.cpp +++ b/src/hws/gpu_intel/utility.cpp @@ -12,10 +12,18 @@ #include "level_zero/ze_api.h" // Level Zero runtime functions #include "level_zero/zes_api.h" // Level Zero runtime functions +#include // std::size_t +#include // std::uint32_t +#include // snprintf +#include // std::runtime_error #include // std::string #include // std::string_view #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED) + #include "hws/utility.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind +#endif + namespace hws::detail { std::vector property_flags_to_vector(const ze_device_property_flags_t flags) { @@ -227,4 +235,77 @@ std::string memory_location_to_name(const zes_mem_loc_t mem_loc) { } } +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the Intel GPU @p device + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. 
+ * + * @param device the Level Zero device handle of the Intel GPU device + * @return the physical ID of the Intel GPU device + */ +std::string intel_physical_id(ze_device_handle_t device) { + ze_device_properties_t props{}; + props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + props.pNext = nullptr; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props)); + + char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {}; + for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { + snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]); + } + + return std::string{ "intel:" } + buf; +} + +} // namespace + +std::vector enumerate_local_intel_devices() { + std::vector out; + + // init level zero driver + HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) + + // discover the number of drivers + std::uint32_t driver_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) + + // check if only the single GPU driver has been found + if (driver_count > 1) { + throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + } + + // get the GPU driver + ze_driver_handle_t driver{}; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)); + + // Discover devices for this driver + std::uint32_t device_count = 0; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)); + if (device_count == 0) { + return out; // no Intel GPUs visible + } + + std::vector devices(device_count); + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data())); + + // Fill visible_gpu_device list + for (std::uint32_t i = 0; i < device_count; ++i) { + ze_device_handle_t dev = devices[i]; + + visible_gpu_device d; + d.backend = device_backend_kind::intel; + d.local_index = static_cast(i); + d.physical_id = intel_physical_id(dev); + + out.push_back(std::move(d)); + } + + return out; +} + +#endif + } // namespace hws::detail diff --git a/src/hws/gpu_nvidia/CMakeLists.txt b/src/hws/gpu_nvidia/CMakeLists.txt 
index f8a5749..437e063 100644 --- a/src/hws/gpu_nvidia/CMakeLists.txt +++ b/src/hws/gpu_nvidia/CMakeLists.txt @@ -19,7 +19,7 @@ endif () message(STATUS "Enable sampling of NVIDIA GPU information using NVML (${CUDAToolkit_VERSION}).") # link against necessary libraries -target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC CUDA::nvml CUDA::cudart) +target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE CUDA::nvml CUDA::cudart) # add source file to source file list target_sources(${HWS_LIBRARY_NAME} PRIVATE diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp index a81feea..3237113 100644 --- a/src/hws/gpu_nvidia/utility.cpp +++ b/src/hws/gpu_nvidia/utility.cpp @@ -14,6 +14,10 @@ #include // std::string #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + #include "hws/utility.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind +#endif + namespace hws::detail { #if CUDA_VERSION >= 12000 @@ -56,4 +60,39 @@ std::string throttle_event_reason_to_string(const unsigned long long clocks_even #endif +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. 
+ * + * @param local_index the local index of the NVIDIA GPU device + * @return the physical ID of the NVIDIA GPU device + */ +std::string nvidia_physical_id(int local_index) { + char bus_id[64] = {}; + HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "nvidia:" } + bus_id; +} + +} // namespace + +std::vector enumerate_local_nvidia_devices() { + std::vector out; + int count = 0; + HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::nvidia; + d.local_index = i; + d.physical_id = nvidia_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} + +#endif + } // namespace hws::detail From 99d7f10bdceebe9d2643bc7a39ef9cfb19159e54 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Wed, 17 Jun 2026 09:25:55 +0200 Subject: [PATCH 15/31] consistent formatting --- bindings/hardware_sampler.cpp | 16 ++++------------ bindings/main.cpp | 10 ++++------ bindings/mpi4py_communicator.hpp | 5 ++--- include/hws/hardware_sampler.hpp | 4 ++-- include/hws/utility.hpp | 14 ++++++-------- src/hws/hardware_sampler.cpp | 2 +- src/hws/utility.cpp | 2 +- 7 files changed, 20 insertions(+), 33 deletions(-) diff --git a/bindings/hardware_sampler.cpp b/bindings/hardware_sampler.cpp index f8f5251..f29c4a3 100644 --- a/bindings/hardware_sampler.cpp +++ b/bindings/hardware_sampler.cpp @@ -32,8 +32,8 @@ #include // std::string #if defined(HWS_MPI_SUPPORT_ENABLED) -#include -#include "mpi4py_communicator.hpp" + #include "mpi4py_communicator.hpp" + #include #endif namespace py = pybind11; @@ -68,17 +68,9 @@ void init_hardware_sampler(py::module_ &m) { .def("sampling_interval", &hws::hardware_sampler::sampling_interval, "get the sampling interval of this hardware sampler (in ms)") .def("dump_yaml", py::overload_cast(&hws::hardware_sampler::dump_yaml, py::const_), "dump all hardware samples to the given 
YAML file") #if defined(HWS_MPI_SUPPORT_ENABLED) - .def("dump_yaml_global", - [](const hws::hardware_sampler &self, - const std::string &filename, - py::object py_comm) { + .def("dump_yaml_global", [](const hws::hardware_sampler &self, const std::string &filename, py::object py_comm) { const MPI_Comm comm = mpi_comm_from_python(py_comm); - self.dump_yaml_global(filename, comm); - }, - py::arg("filename"), - py::arg("comm"), - "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator." - ) + self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator.") #endif .def("as_yaml_string", &hws::hardware_sampler::as_yaml_string, "return all hardware samples including additional information like events as YAML string") .def("samples_only_as_yaml_string", &hws::hardware_sampler::samples_only_as_yaml_string, "return all hardware samples as YAML string") diff --git a/bindings/main.cpp b/bindings/main.cpp index a629ee0..51825ae 100644 --- a/bindings/main.cpp +++ b/bindings/main.cpp @@ -5,16 +5,16 @@ * See the LICENSE.md file in the project root for full license information. */ -#include "hws/version.hpp" // hws::version::version +#include "hws/version.hpp" // hws::version::version #include "pybind11/pybind11.h" // PYBIND11_MODULE, py::module_ #include // std::string_view #if defined(HWS_MPI_SUPPORT_ENABLED) -#include -#include -#include "mpi4py_communicator.hpp" + #include "mpi4py_communicator.hpp" + #include + #include #endif #define HWS_IS_DEFINED_HELPER(x) #x @@ -80,8 +80,6 @@ PYBIND11_MODULE(HardwareSampling, m) { init_version(m); } - - #if defined(HWS_MPI_SUPPORT_ENABLED) /** * Extracts an MPI_Comm from a python mpi4py.MPI.Comm object. 
diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp index 9ba2749..b7cdefd 100644 --- a/bindings/mpi4py_communicator.hpp +++ b/bindings/mpi4py_communicator.hpp @@ -1,5 +1,5 @@ /** -* @file + * @file * @author Tim Thüring * @copyright 2024-today All Rights Reserved * @license This file is released under the MIT license. @@ -15,7 +15,7 @@ #include "pybind11/pybind11.h" #if defined(HWS_MPI_SUPPORT_ENABLED) -# include + #include #endif namespace py = pybind11; @@ -24,5 +24,4 @@ namespace py = pybind11; MPI_Comm mpi_comm_from_python(py::object py_comm); #endif - #endif // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP diff --git a/include/hws/hardware_sampler.hpp b/include/hws/hardware_sampler.hpp index 1d1f64c..6c31a75 100644 --- a/include/hws/hardware_sampler.hpp +++ b/include/hws/hardware_sampler.hpp @@ -166,7 +166,7 @@ class hardware_sampler { */ void dump_yaml(const std::filesystem::path &filename) const; - #if defined(HWS_MPI_SUPPORT_ENABLED) +#if defined(HWS_MPI_SUPPORT_ENABLED) /** * @brief Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the YAML file with @p filename. * @param[in] filename the YAML file to append the hardware samples to @@ -181,7 +181,7 @@ class hardware_sampler { * @copydoc hws::hardware_sampler::dump_yaml_global(const char *) const */ void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; - #endif +#endif /** * @brief Return the unique device identification. Can be used as unique key in the YAML string. 
diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 2915a18..02369d7 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -439,34 +439,32 @@ struct visible_gpu_device { std::string physical_id; // stable per-node identifier }; - -#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) /** * @brief creates a list of all visible NVIDIA GPU devices * * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID */ [[nodiscard]] std::vector enumerate_local_nvidia_devices(); -#endif + #endif -#if defined(HWS_FOR_AMD_GPUS_ENABLED) + #if defined(HWS_FOR_AMD_GPUS_ENABLED) /** * @brief creates a list of all visible AMD GPU devices * * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID */ [[nodiscard]] std::vector enumerate_local_amd_devices(); -#endif + #endif -#if defined(HWS_FOR_INTEL_GPUS_ENABLED) + #if defined(HWS_FOR_INTEL_GPUS_ENABLED) /** * @brief creates a list of all visible Intel GPU devices * * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID */ [[nodiscard]] std::vector enumerate_local_intel_devices(); -#endif - + #endif /** * Computes for each MPI rank a list of devices that have to be sampled by this rank. 
Ensures that diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp index 4c5aa1d..4647adb 100644 --- a/src/hws/hardware_sampler.cpp +++ b/src/hws/hardware_sampler.cpp @@ -25,7 +25,7 @@ #include // std::move #if defined(HWS_MPI_SUPPORT_ENABLED) -#include // MPI_Comm + #include // MPI_Comm #endif namespace hws { diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index c5d1dcd..970b625 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -15,7 +15,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) -#include // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size + #include // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size #endif namespace hws::detail { From 1d3accf0f2f7f38581ade36f038bfc4c45974115 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Wed, 17 Jun 2026 12:44:58 +0200 Subject: [PATCH 16/31] add additional cmake check for mpi4py include path --- bindings/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index fd4f564..c58dd20 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -82,6 +82,13 @@ if(HWS_MPI_SUPPORT_ACTIVE) "Install mpi4py in this environment or disable python bindings.") endif() + if(NOT EXISTS "${MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h") + message(FATAL_ERROR + "mpi4py include path '${MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. " + "The mpi4py installation appears to be broken. 
" + "Reinstall mpi4py in this environment or disable python bindings.") + endif() + target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR}) # Propagate the same macro used on the C++ side into the Python module From 9fab189e16dbcfcdde4dfd5a1999d3afb3cc5f04 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Wed, 17 Jun 2026 13:17:59 +0200 Subject: [PATCH 17/31] add mpi4py version check to cmake --- bindings/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index c58dd20..569aa77 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -89,6 +89,18 @@ if(HWS_MPI_SUPPORT_ACTIVE) "Reinstall mpi4py in this environment or disable python bindings.") endif() + execute_process( + COMMAND "${Python_EXECUTABLE}" -c + "import mpi4py, sys; sys.stdout.write(mpi4py.__version__)" + OUTPUT_VARIABLE MPI4PY_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(MPI4PY_VERSION VERSION_LESS "4.0") + message(FATAL_ERROR + "mpi4py>=4.0 is required but found ${MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. 
" + "Upgrade mpi4py or disable python bindings.") + endif() + target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR}) # Propagate the same macro used on the C++ side into the Python module From f43800f72ada8662c79cc36bd1ced507479ffd8f Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 08:38:06 +0200 Subject: [PATCH 18/31] cmake variable renamings --- bindings/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index 569aa77..e344dec 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -70,21 +70,21 @@ if(HWS_MPI_SUPPORT_ACTIVE) execute_process( COMMAND "${Python_EXECUTABLE}" -c "import mpi4py, sys; sys.stdout.write(mpi4py.get_include())" - RESULT_VARIABLE MPI4PY_IMPORT_RESULT - OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR + RESULT_VARIABLE HWS_MPI4PY_IMPORT_RESULT + OUTPUT_VARIABLE HWS_MPI4PY_INCLUDE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE ) - if(MPI4PY_IMPORT_RESULT) + if(HWS_MPI4PY_IMPORT_RESULT) message(FATAL_ERROR "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) " "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. " "Install mpi4py in this environment or disable python bindings.") endif() - if(NOT EXISTS "${MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h") + if(NOT EXISTS "${HWS_MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h") message(FATAL_ERROR - "mpi4py include path '${MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. " + "mpi4py include path '${HWS_MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. " "The mpi4py installation appears to be broken. 
" "Reinstall mpi4py in this environment or disable python bindings.") endif() @@ -92,16 +92,16 @@ if(HWS_MPI_SUPPORT_ACTIVE) execute_process( COMMAND "${Python_EXECUTABLE}" -c "import mpi4py, sys; sys.stdout.write(mpi4py.__version__)" - OUTPUT_VARIABLE MPI4PY_VERSION + OUTPUT_VARIABLE HWS_MPI4PY_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE ) - if(MPI4PY_VERSION VERSION_LESS "4.0") + if(HWS_MPI4PY_VERSION VERSION_LESS "4.0") message(FATAL_ERROR - "mpi4py>=4.0 is required but found ${MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. " + "mpi4py>=4.0 is required but found ${HWS_MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. " "Upgrade mpi4py or disable python bindings.") endif() - target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR}) + target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_MPI4PY_INCLUDE_DIR}) # Propagate the same macro used on the C++ side into the Python module target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE HWS_MPI_SUPPORT_ENABLED) From 41d20292b2b4ed232eb6bc2bb7277fd27cd73ab0 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 08:44:52 +0200 Subject: [PATCH 19/31] update std::runtime_error constructor calls --- bindings/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bindings/main.cpp b/bindings/main.cpp index 51825ae..932a897 100644 --- a/bindings/main.cpp +++ b/bindings/main.cpp @@ -90,12 +90,12 @@ PYBIND11_MODULE(HardwareSampling, m) { */ MPI_Comm mpi_comm_from_python(py::object py_comm) { if (!PyObject_TypeCheck(py_comm.ptr(), &PyMPIComm_Type)) { - throw std::runtime_error("expected mpi4py.MPI.Comm as communicator argument"); + throw std::runtime_error{"expected mpi4py.MPI.Comm as communicator argument"}; } MPI_Comm *comm_ptr = PyMPIComm_Get(py_comm.ptr()); if (comm_ptr == nullptr) { - throw std::runtime_error("could not extract MPI_Comm from 
mpi4py communicator"); + throw std::runtime_error{"could not extract MPI_Comm from mpi4py communicator"}; } return *comm_ptr; From 666130e260726cc7cd604390e083547423380d9d Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 08:47:47 +0200 Subject: [PATCH 20/31] update include guard name in mpi4py_communicator.hpp --- bindings/mpi4py_communicator.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp index b7cdefd..f325423 100644 --- a/bindings/mpi4py_communicator.hpp +++ b/bindings/mpi4py_communicator.hpp @@ -8,8 +8,8 @@ * @brief Utility functions for transforming mpi4py communicators into C++ MPI communicators */ -#ifndef HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP -#define HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP +#ifndef HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP +#define HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP #pragma once #include "pybind11/pybind11.h" @@ -24,4 +24,4 @@ namespace py = pybind11; MPI_Comm mpi_comm_from_python(py::object py_comm); #endif -#endif // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP +#endif // HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP From 5be4fed131cd01f78e4a22e9efcdd9822cf4e718 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 08:55:19 +0200 Subject: [PATCH 21/31] split variable declaration into separate lines --- include/hws/utility.hpp | 6 ++++-- src/hws/utility.cpp | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 02369d7..3f5cde8 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -361,7 +361,8 @@ struct hostname_comm_info { * @return the node-local MPI communicator information */ inline hostname_comm_info make_hostname_comm(MPI_Comm comm) { - int world_rank = 0, world_size = 0; + 
int world_rank = 0; + int world_size = 0; MPI_Comm_rank(comm, &world_rank); MPI_Comm_size(comm, &world_size); @@ -475,7 +476,8 @@ struct visible_gpu_device { * @return all device indices that have to be sampled by this rank */ inline std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm) { - int node_rank = 0, node_size = 0; + int node_rank = 0; + int node_size = 0; MPI_Comm_rank(node_comm, &node_rank); MPI_Comm_size(node_comm, &node_size); diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index 970b625..e1a171f 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -89,7 +89,8 @@ std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm } // MPI rank and world size for identification and communication - int rank = 0, world_size = 0; + int rank = 0; + int world_size = 0; MPI_Comm_rank(communicator, &rank); MPI_Comm_size(communicator, &world_size); From e4716ff45fd00260c6b3134b3ede34dbe2971a16 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:04:08 +0200 Subject: [PATCH 22/31] change std::runtime_error constructor calls --- src/hws/hardware_sampler.cpp | 2 +- src/hws/utility.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp index 4647adb..9066920 100644 --- a/src/hws/hardware_sampler.cpp +++ b/src/hws/hardware_sampler.cpp @@ -154,7 +154,7 @@ void hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communica MPI_Initialized(&initialized); if (!initialized) { - throw std::runtime_error("MPI must already be initialized"); + throw std::runtime_error{"MPI must already be initialized"}; } // MPI rank and world size for identification and communication diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index e1a171f..a5d51b8 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -85,7 +85,7 @@ std::string 
gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm MPI_Initialized(&initialized); if (!initialized) { - throw std::runtime_error("MPI must already be initialized"); + throw std::runtime_error{"MPI must already be initialized"}; } // MPI rank and world size for identification and communication From c66b51041b41ba9e8ccce811cce5ce955d68a89c Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:12:59 +0200 Subject: [PATCH 23/31] add [[nodiscard]] and const to gpu_*/utility.cpp files --- src/hws/gpu_amd/utility.cpp | 4 ++-- src/hws/gpu_intel/utility.cpp | 4 ++-- src/hws/gpu_nvidia/utility.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp index d0bb038..614a7c7 100644 --- a/src/hws/gpu_amd/utility.cpp +++ b/src/hws/gpu_amd/utility.cpp @@ -57,7 +57,7 @@ namespace { * @param local_index the local index of the AMD GPU device * @return the physical ID of the AMD GPU device */ -std::string amd_physical_id(int local_index) { +[[nodiscard]] std::string amd_physical_id(const int local_index) { char bus_id[64] = {}; HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); return std::string{ "amd:" } + bus_id; @@ -65,7 +65,7 @@ std::string amd_physical_id(int local_index) { } // namespace -std::vector enumerate_local_amd_devices() { +[[nodiscard]] std::vector enumerate_local_amd_devices() { std::vector out; int count = 0; HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count)); diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp index 2d50ad8..0b5f0e9 100644 --- a/src/hws/gpu_intel/utility.cpp +++ b/src/hws/gpu_intel/utility.cpp @@ -246,7 +246,7 @@ namespace { * @param device the Level Zero device handle of the Intel GPU device * @return the physical ID of the Intel GPU device */ -std::string intel_physical_id(ze_device_handle_t device) { +[[nodiscard]] std::string 
intel_physical_id(const ze_device_handle_t device) { ze_device_properties_t props{}; props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; props.pNext = nullptr; @@ -262,7 +262,7 @@ std::string intel_physical_id(ze_device_handle_t device) { } // namespace -std::vector enumerate_local_intel_devices() { +[[nodiscard]] std::vector enumerate_local_intel_devices() { std::vector out; // init level zero driver diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp index 3237113..437699d 100644 --- a/src/hws/gpu_nvidia/utility.cpp +++ b/src/hws/gpu_nvidia/utility.cpp @@ -71,7 +71,7 @@ namespace { * @param local_index the local index of the NVIDIA GPU device * @return the physical ID of the NVIDIA GPU device */ -std::string nvidia_physical_id(int local_index) { +[[nodiscard]] std::string nvidia_physical_id(const int local_index) { char bus_id[64] = {}; HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); return std::string{ "nvidia:" } + bus_id; @@ -79,7 +79,7 @@ std::string nvidia_physical_id(int local_index) { } // namespace -std::vector enumerate_local_nvidia_devices() { +[[nodiscard]] std::vector enumerate_local_nvidia_devices() { std::vector out; int count = 0; HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count)); From 5ec40727e42ade0d0ca259d0edef98e80544b30b Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:31:11 +0200 Subject: [PATCH 24/31] added missing const --- src/hws/system_hardware_sampler.cpp | 10 +++++----- src/hws/utility.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 416a19d..0e93f0a 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -56,10 +56,10 @@ system_hardware_sampler::system_hardware_sampler(const std::chrono::milliseconds } #if defined(HWS_MPI_SUPPORT_ENABLED) 
-system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category) : +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const sample_category category) : system_hardware_sampler(communicator, mode, HWS_SAMPLING_INTERVAL, category) { } -system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category) { +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const std::chrono::milliseconds sampling_interval, const sample_category category) { if (mode == detail::mpi_sampling_mode::per_rank) { // each rank creates samplers for all devices visible to him create_local_samplers(sampling_interval, category); @@ -79,7 +79,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail:: { const auto local = detail::enumerate_local_nvidia_devices(); const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); - for (int idx : owned) { + for (int const idx : owned) { samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); } } @@ -90,7 +90,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail:: { const auto local = detail::enumerate_local_amd_devices(); const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); - for (int idx : owned) { + for (int const idx : owned) { samplers_.push_back(std::make_unique( static_cast(idx), sampling_interval, category)); } @@ -102,7 +102,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail:: { const auto local = detail::enumerate_local_intel_devices(); const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); - for (int idx : owned) { + for (int const idx : owned) { 
samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); } } diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index a5d51b8..201a0ef 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -95,7 +95,7 @@ std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm MPI_Comm_size(communicator, &world_size); // gather the size of the yaml string from each rank - int local_size = static_cast(local_yaml.size()); + const int local_size = static_cast(local_yaml.size()); std::vector recv_sizes; From 45b75872f29cbc57fdb5a773419c33b84b0937c7 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:36:31 +0200 Subject: [PATCH 25/31] change std::runtime_error constructor call --- src/hws/system_hardware_sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 0e93f0a..738b784 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -236,7 +236,7 @@ void system_hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm co MPI_Initialized(&initialized); if (!initialized) { - throw std::runtime_error("MPI must already be initialized"); + throw std::runtime_error{"MPI must already be initialized"}; } // MPI rank and world size for identification and communication From 7459d192741b5527dac5b6be362c48434aad16e4 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:00:38 +0200 Subject: [PATCH 26/31] changed prefix from std::string to std::string_view --- include/hws/utility.hpp | 2 +- src/hws/utility.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 3f5cde8..8b2332b 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -258,7 +258,7 @@ template * @param[in] prefix 
the prefix (indentation) added to each line * @return the indented string */ -[[nodiscard]] std::string indent_lines(const std::string &text, const std::string &prefix); +[[nodiscard]] std::string indent_lines(const std::string &text, std::string_view prefix); /*****************************************************************************************************/ /** other free functions **/ diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index 201a0ef..f593521 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -66,14 +66,16 @@ bool is_integer(std::string_view str) { return std::all_of(str.cbegin(), str.cend(), [](const char c) { return std::isdigit(static_cast(c)); }); } -std::string indent_lines(const std::string &text, const std::string &prefix) { +std::string indent_lines(const std::string &text, const std::string_view prefix) { std::stringstream ss{ text }; std::string line; std::string out; while (std::getline(ss, line)) { - out += prefix + line + '\n'; + out += prefix; + out += line; + out += '\n'; } return out; From f3bbb01de0780044a3e9ae33dd8739341b83c7dc Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:11:41 +0200 Subject: [PATCH 27/31] change auto to actual types --- src/hws/system_hardware_sampler.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 738b784..2dda396 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -65,7 +65,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de create_local_samplers(sampling_interval, category); } else if (mode == detail::mpi_sampling_mode::whole_node) { // create a custom, node-local MPI communicator - auto nc = detail::make_hostname_comm(communicator); + detail::hostname_comm_info nc = detail::make_hostname_comm(communicator); // CPU: one sampler per 
node --> node leader only #if defined(HWS_FOR_CPUS_ENABLED) @@ -77,8 +77,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de // NVIDIA #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) { - const auto local = detail::enumerate_local_nvidia_devices(); - const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + const std::vector local = detail::enumerate_local_nvidia_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); for (int const idx : owned) { samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); } @@ -88,8 +88,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de // AMD #if defined(HWS_FOR_AMD_GPUS_ENABLED) { - const auto local = detail::enumerate_local_amd_devices(); - const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + const std::vector local = detail::enumerate_local_amd_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); for (int const idx : owned) { samplers_.push_back(std::make_unique( static_cast(idx), sampling_interval, category)); @@ -100,8 +100,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de // Intel #if defined(HWS_FOR_INTEL_GPUS_ENABLED) { - const auto local = detail::enumerate_local_intel_devices(); - const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + const std::vector local = detail::enumerate_local_intel_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); for (int const idx : owned) { samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); } From c75de3149dabbbe79e2569a7a10c66ac89d15098 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:02:29 +0200 Subject: [PATCH 28/31] add constructor/destructor 
to hostname_comm_info --- include/hws/utility.hpp | 119 +++++++++++++--------------- src/hws/system_hardware_sampler.cpp | 3 +- 2 files changed, 56 insertions(+), 66 deletions(-) diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index 8b2332b..b2bc77c 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -347,86 +347,77 @@ enum class mpi_sampling_mode { }; /** - * @brief Information about a node-local MPI communicator for whole-node sampling. + * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling. */ struct hostname_comm_info { MPI_Comm node_comm = MPI_COMM_NULL; int node_rank = 0; int node_size = 1; -}; -/** - * @brief Create a node-local MPI communicator for whole-node sampling based on node hostnames. - * @param comm the parent MPI communicator to split into node-local communicators - * @return the node-local MPI communicator information - */ -inline hostname_comm_info make_hostname_comm(MPI_Comm comm) { - int world_rank = 0; - int world_size = 0; - MPI_Comm_rank(comm, &world_rank); - MPI_Comm_size(comm, &world_size); - - // Gather all hostnames - char name[MPI_MAX_PROCESSOR_NAME]; - int name_len = 0; - MPI_Get_processor_name(name, &name_len); - - std::vector name_lengths(world_size); - MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); - - // Build displacements and total byte count - std::vector displs(world_size); - int total = 0; - for (int i = 0; i < world_size; ++i) { - displs[i] = total; - total += name_lengths[i]; - } + explicit hostname_comm_info(MPI_Comm comm) { + int world_rank = 0; + int world_size = 0; + MPI_Comm_rank(comm, &world_rank); + MPI_Comm_size(comm, &world_size); + + // Gather all hostnames + char name[MPI_MAX_PROCESSOR_NAME]; + int name_len = 0; + MPI_Get_processor_name(name, &name_len); + + std::vector name_lengths(world_size); + MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); + + // Build displacements and total byte count + 
std::vector displs(world_size); + int total = 0; + for (int i = 0; i < world_size; ++i) { + displs[i] = total; + total += name_lengths[i]; + } - std::vector all_names(total); - MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); + std::vector all_names(total); + MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); - // Assign colors locally on every rank - // - // All ranks hold identical copies of all_names, name_lengths, and displs, - // so they can each compute the same deterministic color map independently. + // Assign colors locally on every rank + // + // All ranks hold identical copies of all_names, name_lengths, and displs, + // so they can each compute the same deterministic color map independently. - std::unordered_map host_to_color; - host_to_color.reserve(world_size); - std::vector colors(world_size); - int next_color = 0; - for (int r = 0; r < world_size; ++r) { - // get host name of rank r - std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); + std::unordered_map host_to_color; + host_to_color.reserve(world_size); + std::vector colors(world_size); + int next_color = 0; + for (int r = 0; r < world_size; ++r) { + // get host name of rank r + std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); - // try to insert it into the host_to_color map - auto [it, inserted] = host_to_color.emplace(host, next_color); + // try to insert it into the host_to_color map + auto [it, inserted] = host_to_color.emplace(host, next_color); - // check if host was new, if yes, increment color - if (inserted) { - ++next_color; + // check if host was new, if yes, increment color + if (inserted) { + ++next_color; + } + // save color of current rank, either from newly created or existing entry + colors[r] = it->second; } - // save color of current rank, either from newly created or existing entry - colors[r] = 
it->second; - } - // Split communicator + // Split communicator + MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm); + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + } - hostname_comm_info info{}; - MPI_Comm_split(comm, colors[world_rank], world_rank, &info.node_comm); - MPI_Comm_rank(info.node_comm, &info.node_rank); - MPI_Comm_size(info.node_comm, &info.node_size); - return info; -} + hostname_comm_info(const hostname_comm_info &) = delete; + hostname_comm_info &operator=(const hostname_comm_info &) = delete; -/** - * @brief Free a node-local MPI communicator for whole-node sampling. - * @param info the node-local MPI communicator information to free - */ -inline void free_hostname_comm(hostname_comm_info &info) { - if (info.node_comm != MPI_COMM_NULL) { - MPI_Comm_free(&info.node_comm); + ~hostname_comm_info() { + if (node_comm != MPI_COMM_NULL) { + MPI_Comm_free(&node_comm); + } } -} +}; enum class device_backend_kind { nvidia, diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 2dda396..5e30dee 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -65,7 +65,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de create_local_samplers(sampling_interval, category); } else if (mode == detail::mpi_sampling_mode::whole_node) { // create a custom, node-local MPI communicator - detail::hostname_comm_info nc = detail::make_hostname_comm(communicator); + detail::hostname_comm_info nc{ communicator }; // CPU: one sampler per node --> node leader only #if defined(HWS_FOR_CPUS_ENABLED) @@ -108,7 +108,6 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de } #endif - detail::free_hostname_comm(nc); } else { throw std::runtime_error{ fmt::format("Unknown MPI sampling mode {}!", static_cast(mode)) }; } From 8d7011ea50bee9c8c39b768131922c2c241dd922 Mon Sep 17 00:00:00 2001 From: TimThuering 
<56958925+TimThuering@users.noreply.github.com> Date: Tue, 23 Jun 2026 09:13:06 +0200 Subject: [PATCH 29/31] refactor MPI related functions --- CMakeLists.txt | 2 + include/hws/gpu_amd/utility.hpp | 18 ++ include/hws/gpu_intel/utility.hpp | 15 ++ include/hws/gpu_nvidia/utility.hpp | 17 ++ include/hws/mpi_sampling_mode.hpp | 33 ++++ include/hws/mpi_utility.hpp | 65 +++++++ include/hws/system_hardware_sampler.hpp | 5 +- include/hws/utility.hpp | 218 ------------------------ include/hws/visible_gpu_device.hpp | 37 ++++ src/hws/gpu_amd/utility.cpp | 2 +- src/hws/gpu_intel/utility.cpp | 2 +- src/hws/gpu_nvidia/utility.cpp | 2 +- src/hws/hardware_sampler.cpp | 2 +- src/hws/mpi_utility.cpp | 210 +++++++++++++++++++++++ src/hws/system_hardware_sampler.cpp | 2 +- src/hws/utility.cpp | 66 ------- 16 files changed, 405 insertions(+), 291 deletions(-) create mode 100644 include/hws/mpi_sampling_mode.hpp create mode 100644 include/hws/mpi_utility.hpp create mode 100644 include/hws/visible_gpu_device.hpp create mode 100644 src/hws/mpi_utility.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index cb049c2..6ab2b4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,6 +177,8 @@ if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT) # Expose that MPI is really enabled for the Python bindings (and potentially other submodules) via a cache variable. 
set(HWS_MPI_SUPPORT_ACTIVE TRUE CACHE BOOL "MPI support enabled in core library" FORCE) + + target_sources(${HWS_LIBRARY_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/hws/mpi_utility.cpp) endif () endif () diff --git a/include/hws/gpu_amd/utility.hpp b/include/hws/gpu_amd/utility.hpp index 716ff8c..def0937 100644 --- a/include/hws/gpu_amd/utility.hpp +++ b/include/hws/gpu_amd/utility.hpp @@ -18,6 +18,12 @@ #include // std::runtime_error #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // std::vector +#endif + namespace hws::detail { /** @@ -68,6 +74,18 @@ namespace hws::detail { */ [[nodiscard]] std::string performance_level_to_string(rsmi_dev_perf_level_t perf_level); + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible AMD GPU devices + * + * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_amd_devices(); + +#endif + } // namespace hws::detail #endif // HWS_GPU_AMD_UTILITY_HPP_ diff --git a/include/hws/gpu_intel/utility.hpp b/include/hws/gpu_intel/utility.hpp index 76e15a1..6e7afe7 100644 --- a/include/hws/gpu_intel/utility.hpp +++ b/include/hws/gpu_intel/utility.hpp @@ -21,6 +21,10 @@ #include // std::string_view #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device +#endif + namespace hws::detail { /** @@ -75,6 +79,17 @@ namespace hws::detail { */ [[nodiscard]] std::string memory_location_to_name(zes_mem_loc_t mem_loc); +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible Intel GPU devices + * + * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_intel_devices(); + +#endif + } // namespace hws::detail 
#endif // HWS_GPU_INTEL_UTILITY_HPP_ diff --git a/include/hws/gpu_nvidia/utility.hpp b/include/hws/gpu_nvidia/utility.hpp index 348f74b..b0b3811 100644 --- a/include/hws/gpu_nvidia/utility.hpp +++ b/include/hws/gpu_nvidia/utility.hpp @@ -19,6 +19,12 @@ #include // std::runtime_error #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // std::vector +#endif + namespace hws::detail { /** @@ -63,6 +69,17 @@ namespace hws::detail { #endif +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible NVIDIA GPU devices + * + * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_nvidia_devices(); + +#endif + } // namespace hws::detail #endif // HWS_GPU_NVIDIA_UTILITY_HPP_ diff --git a/include/hws/mpi_sampling_mode.hpp b/include/hws/mpi_sampling_mode.hpp new file mode 100644 index 0000000..b53cfeb --- /dev/null +++ b/include/hws/mpi_sampling_mode.hpp @@ -0,0 +1,33 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the MPI sampling mode. + */ + +#ifndef HWS_MPI_SAMPLING_MODE_HPP_ +#define HWS_MPI_SAMPLING_MODE_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +namespace hws::detail { + +/** + * @brief The mode to use for MPI sampling. 
+ * per_rank: each rank creates hardware samplers for all devices visible to that rank + * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device + */ +enum class mpi_sampling_mode { + per_rank, + whole_node +}; + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_MPI_SAMPLING_MODE_HPP_ diff --git a/include/hws/mpi_utility.hpp b/include/hws/mpi_utility.hpp new file mode 100644 index 0000000..673c2da --- /dev/null +++ b/include/hws/mpi_utility.hpp @@ -0,0 +1,65 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief MPI utility functions for hardware sampling. + */ + +#ifndef HWS_MPI_UTILITY_HPP_ +#define HWS_MPI_UTILITY_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // MPI_Comm, MPI_COMM_NULL + #include // std::string + #include // std::vector + +namespace hws::detail { + +/** + * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0. + * + * @param[in] local_yaml the local YAML string contribution + * @param[in] communicator the MPI communicator + * + * @return concatenated YAML string on rank 0, empty string on all other ranks + */ +[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator); + +/** + * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling. 
+ */ +struct hostname_comm_info { + MPI_Comm node_comm = MPI_COMM_NULL; + int node_rank = 0; + int node_size = 1; + + explicit hostname_comm_info(MPI_Comm comm); + + hostname_comm_info(const hostname_comm_info &) = delete; + hostname_comm_info &operator=(const hostname_comm_info &) = delete; + + ~hostname_comm_info(); +}; + +/** + * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that + * each device is sampled by exactly one rank. + * + * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID + * @param node_comm a node local MPI communicator + * @return all device indices that have to be sampled by this rank + */ +[[nodiscard]] std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm); + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_MPI_UTILITY_HPP_ diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index ba160a2..21b3029 100644 --- a/include/hws/system_hardware_sampler.hpp +++ b/include/hws/system_hardware_sampler.hpp @@ -14,7 +14,7 @@ #include "hws/event.hpp" // hws::event #include "hws/hardware_sampler.hpp" // hws::hardware_sampler #include "hws/sample_category.hpp" // hws::sample_category -#include "hws/utility.hpp" // hws::detail::mpi_sampling_mode +#include "hws/utility.hpp" // hws::detail::indent_lines #include // std::chrono::{milliseconds, steady_clock::time_point} #include // std::size_t @@ -24,7 +24,8 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) - #include // MPI_Comm + #include "hws/mpi_sampling_mode.hpp" // hws::detail::mpi_sampling_mode + #include // MPI_Comm #endif namespace hws { diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index b2bc77c..2737418 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -25,13 +25,8 @@ #include // std::string_view #include // std::errc 
#include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type -#include // std::unordered_map #include // std::vector -#if defined(HWS_MPI_SUPPORT_ENABLED) - #include // MPI_Comm -#endif - namespace hws::detail { /** @@ -325,219 +320,6 @@ template } } -#if defined(HWS_MPI_SUPPORT_ENABLED) -/** - * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0. - * - * @param[in] local_yaml the local YAML string contribution - * @param[in] communicator the MPI communicator - * - * @return concatenated YAML string on rank 0, empty string on all other ranks - */ -[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator); - -/** - * @brief The mode to use for MPI sampling. - * per_rank: each rank creates hardware samplers for all devices visible to that rank - * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device - */ -enum class mpi_sampling_mode { - per_rank, - whole_node -}; - -/** - * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling. 
- */ -struct hostname_comm_info { - MPI_Comm node_comm = MPI_COMM_NULL; - int node_rank = 0; - int node_size = 1; - - explicit hostname_comm_info(MPI_Comm comm) { - int world_rank = 0; - int world_size = 0; - MPI_Comm_rank(comm, &world_rank); - MPI_Comm_size(comm, &world_size); - - // Gather all hostnames - char name[MPI_MAX_PROCESSOR_NAME]; - int name_len = 0; - MPI_Get_processor_name(name, &name_len); - - std::vector name_lengths(world_size); - MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); - - // Build displacements and total byte count - std::vector displs(world_size); - int total = 0; - for (int i = 0; i < world_size; ++i) { - displs[i] = total; - total += name_lengths[i]; - } - - std::vector all_names(total); - MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); - - // Assign colors locally on every rank - // - // All ranks hold identical copies of all_names, name_lengths, and displs, - // so they can each compute the same deterministic color map independently. 
- - std::unordered_map host_to_color; - host_to_color.reserve(world_size); - std::vector colors(world_size); - int next_color = 0; - for (int r = 0; r < world_size; ++r) { - // get host name of rank r - std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); - - // try to insert it into the host_to_color map - auto [it, inserted] = host_to_color.emplace(host, next_color); - - // check if host was new, if yes, increment color - if (inserted) { - ++next_color; - } - // save color of current rank, either from newly created or existing entry - colors[r] = it->second; - } - - // Split communicator - MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm); - MPI_Comm_rank(node_comm, &node_rank); - MPI_Comm_size(node_comm, &node_size); - } - - hostname_comm_info(const hostname_comm_info &) = delete; - hostname_comm_info &operator=(const hostname_comm_info &) = delete; - - ~hostname_comm_info() { - if (node_comm != MPI_COMM_NULL) { - MPI_Comm_free(&node_comm); - } - } -}; - -enum class device_backend_kind { - nvidia, - amd, - intel -}; - -struct visible_gpu_device { - device_backend_kind backend; - int local_index; // device index for that backend on this rank - std::string physical_id; // stable per-node identifier -}; - - #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) -/** - * @brief creates a list of all visible NVIDIA GPU devices - * - * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID - */ -[[nodiscard]] std::vector enumerate_local_nvidia_devices(); - #endif - - #if defined(HWS_FOR_AMD_GPUS_ENABLED) -/** - * @brief creates a list of all visible AMD GPU devices - * - * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID - */ -[[nodiscard]] std::vector enumerate_local_amd_devices(); - #endif - - #if defined(HWS_FOR_INTEL_GPUS_ENABLED) -/** - * @brief creates a list of all visible Intel GPU devices - * - * @return a vector of all 
visible Intel GPU devices on the local node, each with its local index and physical ID - */ -[[nodiscard]] std::vector enumerate_local_intel_devices(); - #endif - -/** - * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that - * each device is sampled by exactly one rank. - * - * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID - * @param node_comm a node local MPI communicator - * @return all device indices that have to be sampled by this rank - */ -inline std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm) { - int node_rank = 0; - int node_size = 0; - MPI_Comm_rank(node_comm, &node_rank); - MPI_Comm_size(node_comm, &node_size); - - // Pack physical IDs into a newline-separated string - std::string packed; - for (const auto &d : local_devices) { - packed += d.physical_id; - packed += '\n'; - } - const int local_size = static_cast(packed.size()); - - // Allgather sizes - std::vector sizes(node_size); - MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm); - - // Displacements and total length - std::vector displs(node_size); - int total = 0; - for (int r = 0; r < node_size; ++r) { - displs[r] = total; - total += sizes[r]; - } - - // Allgatherv packed physical IDs - std::vector all_data(total); - MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm); - - // Build owner map: physical_id -> first node_rank that reports it - std::unordered_map owner_rank_for_id; - owner_rank_for_id.reserve(local_devices.size() * 2 + 1); - - for (int r = 0; r < node_size; ++r) { - if (sizes[r] == 0) { - continue; - } - - const char *base = all_data.data() + displs[r]; - const int len = sizes[r]; - - int line_start = 0; - while (line_start < len) { - int line_end = line_start; - while (line_end < len && base[line_end] != '\n') { - ++line_end; - 
} - if (line_end > line_start) { - const std::string id(base + line_start, base + line_end); // copy just this ID - owner_rank_for_id.emplace(id, r); // first insertion wins - } - line_start = line_end + 1; - } - } - - // Decide which local indices we own: those whose physical_id is mapped to node_rank - std::vector owned_indices; - owned_indices.reserve(local_devices.size()); - - for (const auto &d : local_devices) { - auto it = owner_rank_for_id.find(d.physical_id); - if (it != owner_rank_for_id.end() && it->second == node_rank) { - owned_indices.push_back(d.local_index); - } - } - - return owned_indices; -} - -#endif - } // namespace hws::detail #endif // HWS_UTILITY_HPP_ diff --git a/include/hws/visible_gpu_device.hpp b/include/hws/visible_gpu_device.hpp new file mode 100644 index 0000000..787776a --- /dev/null +++ b/include/hws/visible_gpu_device.hpp @@ -0,0 +1,37 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines types for identifying visible GPU devices. 
+ */ + +#ifndef HWS_VISIBLE_GPU_DEVICE_HPP_ +#define HWS_VISIBLE_GPU_DEVICE_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +#include // std::string + +namespace hws::detail { + +enum class device_backend_kind { + nvidia, + amd, + intel +}; + +struct visible_gpu_device { + device_backend_kind backend; + int local_index; // device index for that backend on this rank + std::string physical_id; // stable per-node identifier +}; + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_VISIBLE_GPU_DEVICE_HPP_ diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp index 614a7c7..55d6932 100644 --- a/src/hws/gpu_amd/utility.cpp +++ b/src/hws/gpu_amd/utility.cpp @@ -13,7 +13,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED) - #include "hws/utility.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind #include "hip/hip_runtime_api.h" // hipGetDeviceCount, hipDeviceGetPCIBusId #endif diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp index 0b5f0e9..6d82160 100644 --- a/src/hws/gpu_intel/utility.cpp +++ b/src/hws/gpu_intel/utility.cpp @@ -21,7 +21,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED) - #include "hws/utility.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind #endif namespace hws::detail { diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp index 437699d..97d8c1e 100644 --- a/src/hws/gpu_nvidia/utility.cpp +++ b/src/hws/gpu_nvidia/utility.cpp @@ -15,7 +15,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED) - #include "hws/utility.hpp" // 
hws::detail::visible_gpu_device, hws::detail::device_backend_kind + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind #endif namespace hws::detail { diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp index 9066920..3ebaf4c 100644 --- a/src/hws/hardware_sampler.cpp +++ b/src/hws/hardware_sampler.cpp @@ -25,7 +25,7 @@ #include // std::move #if defined(HWS_MPI_SUPPORT_ENABLED) - #include // MPI_Comm + #include "hws/mpi_utility.hpp" // hws::detail::gather_yaml_strings_mpi #endif namespace hws { diff --git a/src/hws/mpi_utility.cpp b/src/hws/mpi_utility.cpp new file mode 100644 index 0000000..4d8b3af --- /dev/null +++ b/src/hws/mpi_utility.cpp @@ -0,0 +1,210 @@ +/** + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "hws/mpi_utility.hpp" + +#include // std::size_t +#include // std::runtime_error +#include // std::string +#include // std::string_view +#include // std::unordered_map +#include // std::vector + +namespace hws::detail { + +hostname_comm_info::hostname_comm_info(MPI_Comm comm) { + int world_rank = 0; + int world_size = 0; + MPI_Comm_rank(comm, &world_rank); + MPI_Comm_size(comm, &world_size); + + // Gather all hostnames + char name[MPI_MAX_PROCESSOR_NAME]; + int name_len = 0; + MPI_Get_processor_name(name, &name_len); + + std::vector name_lengths(world_size); + MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); + + // Build displacements and total byte count + std::vector displs(world_size); + int total = 0; + for (int i = 0; i < world_size; ++i) { + displs[i] = total; + total += name_lengths[i]; + } + + std::vector all_names(total); + MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); + + // Assign colors locally on every rank 
+ // + // All ranks hold identical copies of all_names, name_lengths, and displs, + // so they can each compute the same deterministic color map independently. + + std::unordered_map host_to_color; + host_to_color.reserve(world_size); + std::vector colors(world_size); + int next_color = 0; + for (int r = 0; r < world_size; ++r) { + // get host name of rank r + std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); + + // try to insert it into the host_to_color map + auto [it, inserted] = host_to_color.emplace(host, next_color); + + // check if host was new, if yes, increment color + if (inserted) { + ++next_color; + } + // save color of current rank, either from newly created or existing entry + colors[r] = it->second; + } + + // Split communicator + MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm); + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); +} + +hostname_comm_info::~hostname_comm_info() { + if (node_comm != MPI_COMM_NULL) { + MPI_Comm_free(&node_comm); + } +} + +std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error{"MPI must already be initialized"}; + } + + // MPI rank and world size for identification and communication + int rank = 0; + int world_size = 0; + MPI_Comm_rank(communicator, &rank); + MPI_Comm_size(communicator, &world_size); + + // gather the size of the yaml string from each rank + const int local_size = static_cast(local_yaml.size()); + + std::vector recv_sizes; + + if (rank == 0) { + recv_sizes.resize(world_size); + } + + MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator); + + // compute the displacements from the rank string sizes + std::vector displacements; + int total_size = 0; + + if (rank == 0) { + displacements.resize(world_size); + + for (int i = 0; i < world_size; ++i) { + displacements[i] 
= total_size; + total_size += recv_sizes[i]; + } + } + + // gather the local yaml strings from all ranks + std::vector recv_buffer; + + if (rank == 0) { + recv_buffer.resize(total_size); + } + + MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator); + + // build final yaml string on rank 0 + std::string global_yaml; + + if (rank == 0) { + for (int r = 0; r < world_size; ++r) { + global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]); + global_yaml += '\n'; + } + } + + return global_yaml; +} + +std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm) { + int node_rank = 0; + int node_size = 0; + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + + // Pack physical IDs into a newline-separated string + std::string packed; + for (const auto &d : local_devices) { + packed += d.physical_id; + packed += '\n'; + } + const int local_size = static_cast(packed.size()); + + // Allgather sizes + std::vector sizes(node_size); + MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm); + + // Displacements and total length + std::vector displs(node_size); + int total = 0; + for (int r = 0; r < node_size; ++r) { + displs[r] = total; + total += sizes[r]; + } + + // Allgatherv packed physical IDs + std::vector all_data(total); + MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm); + + // Build owner map: physical_id -> first node_rank that reports it + std::unordered_map owner_rank_for_id; + owner_rank_for_id.reserve(local_devices.size() * 2 + 1); + + for (int r = 0; r < node_size; ++r) { + if (sizes[r] == 0) { + continue; + } + + const char *base = all_data.data() + displs[r]; + const int len = sizes[r]; + + int line_start = 0; + while (line_start < len) { + int line_end = line_start; + while (line_end < len && 
base[line_end] != '\n') { + ++line_end; + } + if (line_end > line_start) { + const std::string id(base + line_start, base + line_end); // copy just this ID + owner_rank_for_id.emplace(id, r); // first insertion wins + } + line_start = line_end + 1; + } + } + + // Decide which local indices we own: those whose physical_id is mapped to node_rank + std::vector owned_indices; + owned_indices.reserve(local_devices.size()); + + for (const auto &d : local_devices) { + auto it = owner_rank_for_id.find(d.physical_id); + if (it != owner_rank_for_id.end() && it->second == node_rank) { + owned_indices.push_back(d.local_index); + } + } + + return owned_indices; +} + +} // namespace hws::detail diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 5e30dee..4dff468 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -42,7 +42,7 @@ #include // std::vector #if defined(HWS_MPI_SUPPORT_ENABLED) - #include // MPI_Comm + #include "hws/mpi_utility.hpp" // hws::detail::hostname_comm_info, hws::detail::owned_local_indices_for_backend #endif namespace hws { diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index f593521..406089a 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -14,10 +14,6 @@ #include // std::string_view #include // std::vector -#if defined(HWS_MPI_SUPPORT_ENABLED) - #include // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size -#endif - namespace hws::detail { bool starts_with(const std::string_view sv, const std::string_view start) noexcept { @@ -81,66 +77,4 @@ std::string indent_lines(const std::string &text, const std::string_view prefix) return out; } -#if defined(HWS_MPI_SUPPORT_ENABLED) -std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) { - int initialized = 0; - MPI_Initialized(&initialized); - - if (!initialized) { - throw std::runtime_error{"MPI must already be initialized"}; - } - - // MPI rank 
and world size for identification and communication - int rank = 0; - int world_size = 0; - MPI_Comm_rank(communicator, &rank); - MPI_Comm_size(communicator, &world_size); - - // gather the size of the yaml string from each rank - const int local_size = static_cast(local_yaml.size()); - - std::vector recv_sizes; - - if (rank == 0) { - recv_sizes.resize(world_size); - } - - MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator); - - // compute the displacements from the rank string sizes - std::vector displacements; - int total_size = 0; - - if (rank == 0) { - displacements.resize(world_size); - - for (int i = 0; i < world_size; ++i) { - displacements[i] = total_size; - total_size += recv_sizes[i]; - } - } - - // gather the local yaml strings from all ranks - std::vector recv_buffer; - - if (rank == 0) { - recv_buffer.resize(total_size); - } - - MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator); - - // build final yaml string on rank 0 - std::string global_yaml; - - if (rank == 0) { - for (int r = 0; r < world_size; ++r) { - global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]); - global_yaml += '\n'; - } - } - - return global_yaml; -} -#endif - } // namespace hws::detail From 9701c70ad840cc6bbf98e1a75a2858f8992dbda5 Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:32:51 +0200 Subject: [PATCH 30/31] add documentation in visible_gpu_device header --- include/hws/visible_gpu_device.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/hws/visible_gpu_device.hpp b/include/hws/visible_gpu_device.hpp index 787776a..7b6d581 100644 --- a/include/hws/visible_gpu_device.hpp +++ b/include/hws/visible_gpu_device.hpp @@ -18,12 +18,20 @@ namespace hws::detail { +/** + * @brief Enum class representing the backend kind of visible GPU device. 
+ * @details The backend kind can be NVIDIA, AMD, or Intel. + */ enum class device_backend_kind { nvidia, amd, intel }; +/** + * @brief Represents a visible GPU device on the local rank. + * @details Contains the backend kind, the local index of the device for that backend on this rank, and a stable per-node identifier (physical ID) for the device. + */ struct visible_gpu_device { device_backend_kind backend; int local_index; // device index for that backend on this rank From 62c31dfd82dac1c622a577a8b59b482360fb43fe Mon Sep 17 00:00:00 2001 From: TimThuering <56958925+TimThuering@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:14:18 +0200 Subject: [PATCH 31/31] update CMake error message if mpi4py causes problems --- bindings/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index e344dec..89357b6 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -79,14 +79,20 @@ if(HWS_MPI_SUPPORT_ACTIVE) message(FATAL_ERROR "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) " "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. " - "Install mpi4py in this environment or disable python bindings.") + "To fix this, either:\n" + " 1. Reinstall mpi4py in this environment \n" + " 2. Disable Python bindings: -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n" + " 3. Disable MPI support: -DHWS_ENABLE_MPI_SUPPORT=OFF") endif() if(NOT EXISTS "${HWS_MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h") message(FATAL_ERROR "mpi4py include path '${HWS_MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. " "The mpi4py installation appears to be broken. " - "Reinstall mpi4py in this environment or disable python bindings.") + "To fix this, either:\n" + " 1. Reinstall mpi4py in this environment \n" + " 2. Disable Python bindings: -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n" + " 3. Disable MPI support: -DHWS_ENABLE_MPI_SUPPORT=OFF") endif() execute_process(