From e7f623c2126d83ab7e550d7806f91fc7e9bca734 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 28 May 2026 13:25:46 +0200
Subject: [PATCH 01/31] add MPI to CMake configuration

---
 CMakeLists.txt | 31 ++++++++++++++++++++++++++++++-
 README.md      |  5 +++++
 2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf96cc6..ff588aa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ add_library(${HWS_LIBRARY_NAME} SHARED ${HWS_SOURCES})
 add_library(hws::hws ALIAS ${HWS_LIBRARY_NAME})
 
 # set install target
-set(HWS_TARGETS_TO_INSTALL )
+set(HWS_TARGETS_TO_INSTALL)
 
 # use C++17
 target_compile_features(${HWS_LIBRARY_NAME} PUBLIC cxx_std_17)
@@ -156,6 +156,35 @@ if (HWS_ENABLE_PYTHON_BINDINGS)
 endif ()
 
 
+####################################################################################################################
+##                                               enable MPI support                                               ##
+####################################################################################################################
+set(HWS_ENABLE_MPI_SUPPORT AUTO CACHE STRING "Enable MPI support.")
+set_property(CACHE HWS_ENABLE_MPI_SUPPORT PROPERTY STRINGS AUTO ON OFF)
+
+if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT)
+    # try finding MPI
+    find_package(MPI QUIET)
+
+    # check if MPI could be found
+    if (NOT MPI_FOUND)
+        if (HWS_ENABLE_MPI_SUPPORT MATCHES "ON")
+            message(SEND_ERROR "Cannot find MPI but MPI support was explicitly requested!")
+        else ()
+            message(STATUS "Cannot find MPI. MPI support disabled.")
+        endif ()
+    else ()
+        message(STATUS "Enable MPI support (${MPI_CXX_VERSION}).")
+
+        # link against necessary libraries
+        target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC MPI::MPI_CXX)
+
+        # add compile definition
+        target_compile_definitions(${HWS_LIBRARY_NAME} PUBLIC HWS_MPI_SUPPORT_ENABLED)
+    endif ()
+endif ()
+
+
 ########################################################################################################################
 ##                                                  add documentation                                                 ##
 ########################################################################################################################
diff --git a/README.md b/README.md
index 7846580..1bd62df 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,11 @@ The `[optional_options]` can be one or multiple of:
 - `HWS_SAMPLING_INTERVAL=100ms` (default: `100ms`): set the sampling interval in milliseconds
 - `HWS_ENABLE_PYTHON_BINDINGS=ON|OFF` (default: `ON`): enable Python bindings
 
+- `HWS_ENABLE_MPI_SUPPORT=ON|OFF|AUTO` (default: `AUTO`):
+  - `ON`: check whether MPI is available and fail if this is not the case
+  - `AUTO`: check whether MPI is available but **do not** fail if this is not the case
+  - `OFF`: do not check whether MPI is available
+
 ### Installing via CMake
 
 The library supports the `install` target:

From 673917ccc53932aaac4540f7611f25f49225188a Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 28 May 2026 15:27:38 +0200
Subject: [PATCH 02/31] add global yaml dump with data from all MPI ranks to
 system_hardware_sampler

---
 include/hws/system_hardware_sampler.hpp | 21 +++++++
 include/hws/utility.hpp                 | 25 ++++++++
 src/hws/system_hardware_sampler.cpp     | 46 ++++++++++++++
 src/hws/utility.cpp                     | 79 +++++++++++++++++++++++++
 4 files changed, 171 insertions(+)

diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp
index 42924ac..593a0d5 100644
--- a/include/hws/system_hardware_sampler.hpp
+++ b/include/hws/system_hardware_sampler.hpp
@@ -22,6 +22,10 @@
 #include <string>      // std::string
 #include <vector>      // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    #include <mpi.h>  // MPI_Comm
+#endif
+
 namespace hws {
 
 /**
@@ -175,6 +179,23 @@ class system_hardware_sampler {
      */
     void dump_yaml(const std::filesystem::path &filename) const;
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    /**
+     * @brief Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the YAML file with @p filename.
+     * @param[in] filename the YAML file to write the hardware samples to (an existing file is overwritten)
+     * @param[in] communicator the MPI communicator to use
+     */
+    void dump_yaml_global(const char *filename, MPI_Comm communicator) const;
+    /**
+     * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const
+     */
+    void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const;
+    /**
+     * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const
+     */
+    void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const;
+#endif
+
     /**
      * @brief Return the hardware samples as YAML string.
      * @return the YAML content as string (`[[nodiscard]]`)
diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index aacaf23..cbb1075 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -27,6 +27,10 @@
 #include <type_traits>   // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type
 #include <vector>        // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>        // MPI_Comm
+#endif
+
 namespace hws::detail {
 
 /**
@@ -247,6 +251,14 @@ template <typename T>
     return quoted;
 }
 
+/**
+ * @brief Prefix all lines in a string with the given indentation.
+ * @param[in] text the input text
+ * @param[in] prefix the prefix (indentation) added to each line
+ * @return the indented string
+ */
+[[nodiscard]] std::string indent_lines(const std::string &text, const std::string &prefix);
+
 /*****************************************************************************************************/
 /**                                      other free functions                                       **/
 /*****************************************************************************************************/
@@ -312,6 +324,19 @@ template <typename T>
     }
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+/**
+ * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0.
+ *
+ * @param[in] local_yaml the local YAML string contribution
+ * @param[in] communicator the MPI communicator
+ *
+ * @return concatenated YAML string on rank 0, empty string on all other ranks
+ */
+[[nodiscard]]
+std::string gather_yaml_strings_mpi(const std::string& local_yaml, MPI_Comm communicator);
+#endif
+
 }  // namespace hws::detail
 
 #endif  // HWS_UTILITY_HPP_
diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 1c08762..e07d7a7 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -41,6 +41,10 @@
 #include <stdexcept>  // std::out_of_range
 #include <vector>     // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>        // MPI_Comm
+#endif
+
 namespace hws {
 
 system_hardware_sampler::system_hardware_sampler(const sample_category category) :
@@ -201,6 +205,48 @@ void system_hardware_sampler::dump_yaml(const std::filesystem::path &filename) c
     std::for_each(samplers_.cbegin(), samplers_.cend(), [&filename](const auto &ptr) { ptr->dump_yaml(filename); });
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+void system_hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+
+    if (!initialized) {
+        throw std::runtime_error("MPI must already be initialized");
+    }
+
+    // MPI rank and world size for identification and communication
+    int rank = 0;
+    MPI_Comm_rank(communicator, &rank);
+
+    std::string rank_yaml_output;  // yaml file as string per rank
+
+    rank_yaml_output += "---\n\n";
+    rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n";
+
+    // accumulate string from each sampler
+    std::size_t sampler_idx = 0;
+    std::for_each(samplers_.cbegin(), samplers_.cend(), [&rank_yaml_output, &sampler_idx](const auto &ptr) {
+        rank_yaml_output += "sampler_" + std::to_string(sampler_idx++) + ":\n";
+        rank_yaml_output += detail::indent_lines(ptr->as_yaml_string(), "  ");
+    });
+
+    const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator);
+
+    if (rank == 0) {
+        std::ofstream file(filename);
+        file << global_yaml_output;
+    }
+}
+
+void system_hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const {
+    this->dump_yaml_global(filename.c_str(), communicator);
+}
+
+void system_hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const {
+    this->dump_yaml_global(filename.string().c_str(), communicator);
+}
+#endif
+
 std::string system_hardware_sampler::as_yaml_string() const {
     return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->as_yaml_string(); });
 }
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index 054db8d..c5d1dcd 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -9,10 +9,15 @@
 
 #include <algorithm>    // std::min, std::transform, std::all_of
 #include <cctype>       // std::tolower, std::isdigit
+#include <sstream>      // std::stringstream
 #include <string>       // std::string
 #include <string_view>  // std::string_view
 #include <vector>       // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>        // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size
+#endif
+
 namespace hws::detail {
 
 bool starts_with(const std::string_view sv, const std::string_view start) noexcept {
@@ -61,4 +66,78 @@ bool is_integer(std::string_view str) {
     return std::all_of(str.cbegin(), str.cend(), [](const char c) { return std::isdigit(static_cast<unsigned char>(c)); });
 }
 
+std::string indent_lines(const std::string &text, const std::string &prefix) {
+    std::stringstream ss{ text };
+
+    std::string line;
+    std::string out;
+
+    while (std::getline(ss, line)) {
+        out += prefix + line + '\n';
+    }
+
+    return out;
+}
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+
+    if (!initialized) {
+        throw std::runtime_error("MPI must already be initialized");
+    }
+
+    // MPI rank and world size for identification and communication
+    int rank = 0, world_size = 0;
+    MPI_Comm_rank(communicator, &rank);
+    MPI_Comm_size(communicator, &world_size);
+
+    // gather the size of the yaml string from each rank
+    int local_size = static_cast<int>(local_yaml.size());
+
+    std::vector<int> recv_sizes;
+
+    if (rank == 0) {
+        recv_sizes.resize(world_size);
+    }
+
+    MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator);
+
+    // compute the displacements from the rank string sizes
+    std::vector<int> displacements;
+    int total_size = 0;
+
+    if (rank == 0) {
+        displacements.resize(world_size);
+
+        for (int i = 0; i < world_size; ++i) {
+            displacements[i] = total_size;
+            total_size += recv_sizes[i];
+        }
+    }
+
+    // gather the local yaml strings from all ranks
+    std::vector<char> recv_buffer;
+
+    if (rank == 0) {
+        recv_buffer.resize(total_size);
+    }
+
+    MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator);
+
+    // build final yaml string on rank 0
+    std::string global_yaml;
+
+    if (rank == 0) {
+        for (int r = 0; r < world_size; ++r) {
+            global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]);
+            global_yaml += '\n';
+        }
+    }
+
+    return global_yaml;
+}
+#endif
+
 }  // namespace hws::detail

From 1f113ef23c256d114bf5810d0e4f80f9904abb5d Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 28 May 2026 16:58:51 +0200
Subject: [PATCH 03/31] update documentation

---
 include/hws/system_hardware_sampler.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp
index 593a0d5..2003b28 100644
--- a/include/hws/system_hardware_sampler.hpp
+++ b/include/hws/system_hardware_sampler.hpp
@@ -187,11 +187,11 @@ class system_hardware_sampler {
      */
     void dump_yaml_global(const char *filename, MPI_Comm communicator) const;
     /**
-     * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const
+     * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const
      */
     void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const;
     /**
-     * @copydoc hws::system_hardware_sampler::dump_yaml(const char *) const
+     * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const
      */
     void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const;
 #endif

From 772d22642b16b3cff899851500fa19e80c5b44f9 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 28 May 2026 17:00:42 +0200
Subject: [PATCH 04/31] add global yaml output for individual hardware sampler
 on all MPI ranks

---
 include/hws/hardware_sampler.hpp | 21 ++++++++++++++++
 src/hws/hardware_sampler.cpp     | 42 ++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/include/hws/hardware_sampler.hpp b/include/hws/hardware_sampler.hpp
index 326eb7e..1d1f64c 100644
--- a/include/hws/hardware_sampler.hpp
+++ b/include/hws/hardware_sampler.hpp
@@ -23,6 +23,10 @@
 #include <thread>      // std::thread
 #include <vector>      // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    #include <mpi.h>  // MPI_Comm
+#endif
+
 namespace hws {
 
 /**
@@ -162,6 +166,23 @@ class hardware_sampler {
      */
     void dump_yaml(const std::filesystem::path &filename) const;
 
+    #if defined(HWS_MPI_SUPPORT_ENABLED)
+    /**
+     * @brief Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the YAML file with @p filename.
+     * @param[in] filename the YAML file to write the hardware samples to (an existing file is overwritten)
+     * @param[in] communicator the MPI communicator to use
+     */
+    void dump_yaml_global(const char *filename, MPI_Comm communicator) const;
+    /**
+     * @copydoc hws::hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const
+     */
+    void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const;
+    /**
+     * @copydoc hws::hardware_sampler::dump_yaml_global(const char *, MPI_Comm) const
+     */
+    void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const;
+    #endif
+
     /**
      * @brief Return the unique device identification. Can be used as unique key in the YAML string.
      * @return the unique device identification (`[[nodiscard]]`)
diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp
index abd907d..4c5aa1d 100644
--- a/src/hws/hardware_sampler.cpp
+++ b/src/hws/hardware_sampler.cpp
@@ -24,6 +24,10 @@
 #include <thread>     // std::thread
 #include <utility>    // std::move
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>        // MPI_Comm
+#endif
+
 namespace hws {
 
 hardware_sampler::hardware_sampler(const std::chrono::milliseconds sampling_interval, const sample_category category) :
@@ -144,6 +148,44 @@ void hardware_sampler::dump_yaml(const std::filesystem::path &filename) const {
     this->dump_yaml(filename.string().c_str());
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+void hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+
+    if (!initialized) {
+        throw std::runtime_error("MPI must already be initialized");
+    }
+
+    // MPI rank and world size for identification and communication
+    int rank = 0;
+    MPI_Comm_rank(communicator, &rank);
+
+    std::string rank_yaml_output;  // yaml file as string per rank
+
+    rank_yaml_output += "---\n\n";
+    rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n";
+
+    // add yaml string of this hardware sampler
+    rank_yaml_output += this->as_yaml_string();
+
+    const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator);
+
+    if (rank == 0) {
+        std::ofstream file(filename);
+        file << global_yaml_output;
+    }
+}
+
+void hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const {
+    this->dump_yaml_global(filename.c_str(), communicator);
+}
+
+void hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const {
+    this->dump_yaml_global(filename.string().c_str(), communicator);
+}
+#endif
+
 std::string hardware_sampler::as_yaml_string() const {
     if (!this->has_sampling_stopped()) {
         throw std::runtime_error{ "Can return samples as string only after the sampling has been stopped!" };

From 9ae4c80ada4cc5b63f8b84f8d2ff13ad1ac3c3ac Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Fri, 29 May 2026 13:01:39 +0200
Subject: [PATCH 05/31] add mpi4py compatibility for python bindings

---
 CMakeLists.txt                   | 23 +++++++++++--------
 bindings/CMakeLists.txt          | 24 ++++++++++++++++++++
 bindings/main.cpp                | 39 ++++++++++++++++++++++++++++++++
 bindings/mpi4py_communicator.hpp | 28 +++++++++++++++++++++++
 pyproject.toml                   |  3 +++
 5 files changed, 108 insertions(+), 9 deletions(-)
 create mode 100644 bindings/mpi4py_communicator.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff588aa..cb049c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,20 +147,13 @@ else ()
 endif ()
 
 
-####################################################################################################################
-##                                             enable Python bindings                                             ##
-####################################################################################################################
-option(HWS_ENABLE_PYTHON_BINDINGS "Build language bindings for Python." ON)
-if (HWS_ENABLE_PYTHON_BINDINGS)
-    add_subdirectory(bindings)
-endif ()
-
-
 ####################################################################################################################
 ##                                               enable MPI support                                               ##
 ####################################################################################################################
 set(HWS_ENABLE_MPI_SUPPORT AUTO CACHE STRING "Enable MPI support.")
 set_property(CACHE HWS_ENABLE_MPI_SUPPORT PROPERTY STRINGS AUTO ON OFF)
+# Reset to FALSE on every configure run (FORCE): otherwise a TRUE cached by an earlier run would survive disabling/losing MPI
+set(HWS_MPI_SUPPORT_ACTIVE FALSE CACHE BOOL "MPI support enabled in core library" FORCE)
 
 if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT)
     # try finding MPI
@@ -181,10 +174,22 @@ if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT)
 
         # add compile definition
         target_compile_definitions(${HWS_LIBRARY_NAME} PUBLIC HWS_MPI_SUPPORT_ENABLED)
+
+        # Expose that MPI is really enabled for the Python bindings (and potentially other submodules) via a cache variable.
+        set(HWS_MPI_SUPPORT_ACTIVE TRUE CACHE BOOL "MPI support enabled in core library" FORCE)
     endif ()
 endif ()
 
 
+####################################################################################################################
+##                                             enable Python bindings                                             ##
+####################################################################################################################
+option(HWS_ENABLE_PYTHON_BINDINGS "Build language bindings for Python." ON)
+if (HWS_ENABLE_PYTHON_BINDINGS)
+    add_subdirectory(bindings)
+endif ()
+
+
 ########################################################################################################################
 ##                                                  add documentation                                                 ##
 ########################################################################################################################
diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
index 93d8e98..fd4f564 100644
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@@ -64,6 +64,30 @@ target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${CMAKE_C
 target_link_libraries(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_LIBRARY_NAME})
 target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE PYBIND11_DETAILED_ERROR_MESSAGES)
 
+if(HWS_MPI_SUPPORT_ACTIVE)
+    message(STATUS "MPI support enabled. Adding mpi4py include directory and linking against MPI.")
+    # Get mpi4py's C header location, simultaneously checking if mpi4py is importable in the current Python environment
+    execute_process(
+            COMMAND "${Python_EXECUTABLE}" -c
+            "import mpi4py, sys; sys.stdout.write(mpi4py.get_include())"
+            RESULT_VARIABLE MPI4PY_IMPORT_RESULT
+            OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(MPI4PY_IMPORT_RESULT)  # non-zero exit code of the Python call above: mpi4py could not be imported
+        message(FATAL_ERROR
+                "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) "
+                "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
+                "Install mpi4py in this environment, set HWS_ENABLE_MPI_SUPPORT=OFF, or disable the Python bindings (HWS_ENABLE_PYTHON_BINDINGS=OFF).")
+    endif()
+
+    target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE "${MPI4PY_INCLUDE_DIR}")  # quoted: keep the path a single argument
+
+    # Propagate the same macro used on the C++ side into the Python module
+    target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE HWS_MPI_SUPPORT_ENABLED)
+endif()
+
 include(GNUInstallDirs)
 # install Python bindings
 install(TARGETS ${HWS_PYTHON_BINDINGS_LIBRARY_NAME}
diff --git a/bindings/main.cpp b/bindings/main.cpp
index 3f062e7..2b1a161 100644
--- a/bindings/main.cpp
+++ b/bindings/main.cpp
@@ -11,6 +11,12 @@
 
 #include <string_view>  // std::string_view
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>
+#include <mpi4py/mpi4py.h>
+#include "mpi4py_communicator.hpp"
+#endif
+
 #define HWS_IS_DEFINED_HELPER(x) #x
 #define HWS_IS_DEFINED(x) (std::string_view{ #x } != std::string_view{ HWS_IS_DEFINED_HELPER(x) })
 
@@ -32,6 +38,15 @@ PYBIND11_MODULE(HardwareSampling, m) {
     m.doc() = "Hardware Sampling for CPUs and GPUs";
     m.attr("__version__") = hws::version::version;
 
+    // MPI support
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    // Initialize mpi4py C-API so PyMPIComm_* are usable
+    if (import_mpi4py() < 0) {
+        throw py::error_already_set();
+    }
+#endif
+    m.def("has_mpi_support", []() { return HWS_IS_DEFINED(HWS_MPI_SUPPORT_ENABLED); });
+
     init_event(m);
     init_sample_category(m);
     init_relative_event(m);
@@ -64,3 +79,27 @@ PYBIND11_MODULE(HardwareSampling, m) {
 
     init_version(m);
 }
+
+
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+/**
+ * Extracts an MPI_Comm from a python mpi4py.MPI.Comm object.
+ * Has to be in same translation unit as the import_mpi4py() call to ensure that the mpi4py C-API is initialized and the PyMPIComm_Type is available.
+ *
+ * @param py_comm a Python object that is expected to be an mpi4py.MPI.Comm instance
+ * @return the extracted MPI_Comm
+ */
+MPI_Comm mpi_comm_from_python(py::object py_comm) {
+    if (!PyObject_TypeCheck(py_comm.ptr(), &PyMPIComm_Type)) {
+        throw std::runtime_error("expected mpi4py.MPI.Comm as communicator argument");
+    }
+
+    MPI_Comm *comm_ptr = PyMPIComm_Get(py_comm.ptr());
+    if (comm_ptr == nullptr) {
+        throw std::runtime_error("could not extract MPI_Comm from mpi4py communicator");
+    }
+
+    return *comm_ptr;
+}
+#endif
\ No newline at end of file
diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp
new file mode 100644
index 0000000..9ba2749
--- /dev/null
+++ b/bindings/mpi4py_communicator.hpp
@@ -0,0 +1,28 @@
+/**
+ * @file
+ * @author Tim Thüring
+ * @copyright 2024-today All Rights Reserved
+ * @license This file is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Utility functions for transforming mpi4py communicators into C++ MPI communicators
+ */
+
+#ifndef HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
+#define HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
+#pragma once
+
+#include "pybind11/pybind11.h"
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#  include <mpi.h>
+#endif
+
+namespace py = pybind11;
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+MPI_Comm mpi_comm_from_python(py::object py_comm);
+#endif
+
+
+#endif  // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
diff --git a/pyproject.toml b/pyproject.toml
index 45c4cfa..910c2b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,9 @@ classifiers = [
     "Programming Language :: C++",
     "Programming Language :: Python :: 3"
 ]
+# optional dependencies
+[project.optional-dependencies]
+mpi = ["mpi4py>=4"]
 # project specific URLs
 [project.urls]
 documentation = "https://sc-sgs.github.io/hardware_sampling/"

From b3912c23747f982f0852c7859a7337f156defab1 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Fri, 29 May 2026 13:02:12 +0200
Subject: [PATCH 06/31] add dump_yaml_global to system_hardware_sampler python
 bindings

---
 bindings/system_hardware_sampler.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp
index d9af622..6633444 100644
--- a/bindings/system_hardware_sampler.cpp
+++ b/bindings/system_hardware_sampler.cpp
@@ -19,6 +19,11 @@
 #include "relative_event.hpp"  // hws::detail::relative_event
 #include <string>              // std::string
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>
+#include "mpi4py_communicator.hpp"
+#endif
+
 namespace py = pybind11;
 
 void init_system_hardware_sampler(py::module_ &m) {
@@ -67,5 +72,18 @@ void init_system_hardware_sampler(py::module_ &m) {
         .def("sampler", [](hws::system_hardware_sampler &self, const std::size_t idx) { return self.sampler(idx).get(); }, "get the i-th hardware sampler available for the whole system")
         .def("dump_yaml", py::overload_cast<const std::string &>(&hws::system_hardware_sampler::dump_yaml, py::const_), "dump all hardware samples for all hardware samplers to the given YAML file")
         .def("as_yaml_string", &hws::system_hardware_sampler::as_yaml_string, "return all hardware samples for all hardware samplers as YAML string")
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+        .def("dump_yaml_global",
+            [](const hws::system_hardware_sampler &self,
+               const std::string &filename,
+               py::object py_comm) {
+                const MPI_Comm comm = mpi_comm_from_python(py_comm);
+                self.dump_yaml_global(filename, comm);
+            },
+            py::arg("filename"),
+            py::arg("comm"),
+            "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator."
+        )
+#endif
         .def("__repr__", [](const hws::system_hardware_sampler &self) { return fmt::format("<hws.SystemHardwareSampler with {} samples>", self.num_samplers()); });
 }

From 59caf70c46f0eee3c66648ff1c49266a49aa9359 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Fri, 29 May 2026 13:18:19 +0200
Subject: [PATCH 07/31] add dump_yaml_global to hardware_sampler python
 bindings

---
 bindings/hardware_sampler.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/bindings/hardware_sampler.cpp b/bindings/hardware_sampler.cpp
index 5a12141..f8f5251 100644
--- a/bindings/hardware_sampler.cpp
+++ b/bindings/hardware_sampler.cpp
@@ -31,6 +31,11 @@
 #include "relative_event.hpp"  // hws::detail::relative_event
 #include <string>              // std::string
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+#include <mpi.h>
+#include "mpi4py_communicator.hpp"
+#endif
+
 namespace py = pybind11;
 
 void init_hardware_sampler(py::module_ &m) {
@@ -62,6 +67,19 @@ void init_hardware_sampler(py::module_ &m) {
         .def("relative_time_points", [](const hws::hardware_sampler &self) { return hws::detail::durations_from_reference_time(self.sampling_time_points(), self.get_event(0).time_point); }, "get the relative durations of the respective hardware samples in seconds (as \"normal\" number)")
         .def("sampling_interval", &hws::hardware_sampler::sampling_interval, "get the sampling interval of this hardware sampler (in ms)")
         .def("dump_yaml", py::overload_cast<const std::string &>(&hws::hardware_sampler::dump_yaml, py::const_), "dump all hardware samples to the given YAML file")
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+        .def("dump_yaml_global",
+            [](const hws::hardware_sampler &self,
+               const std::string &filename,
+               py::object py_comm) {
+                const MPI_Comm comm = mpi_comm_from_python(py_comm);
+                self.dump_yaml_global(filename, comm);
+            },
+            py::arg("filename"),
+            py::arg("comm"),
+            "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator."
+        )
+#endif
         .def("as_yaml_string", &hws::hardware_sampler::as_yaml_string, "return all hardware samples including additional information like events as YAML string")
         .def("samples_only_as_yaml_string", &hws::hardware_sampler::samples_only_as_yaml_string, "return all hardware samples as YAML string")
         .def("__repr__", [](const hws::hardware_sampler &self) {

From 0cfe52785d4ebb3581df0cb6fdeab562660a4464 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 8 Jun 2026 09:03:39 +0200
Subject: [PATCH 08/31] added system hardware sampler creation which avoids
 duplicates for MPI with NVIDIA and AMD GPUs

---
 include/hws/system_hardware_sampler.hpp |  28 ++
 include/hws/utility.hpp                 | 327 +++++++++++++++++++++++-
 src/hws/gpu_amd/CMakeLists.txt          |   2 +-
 src/hws/gpu_nvidia/CMakeLists.txt       |   2 +-
 src/hws/system_hardware_sampler.cpp     | 154 +++++++----
 5 files changed, 452 insertions(+), 61 deletions(-)

diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp
index 2003b28..a5a1265 100644
--- a/include/hws/system_hardware_sampler.hpp
+++ b/include/hws/system_hardware_sampler.hpp
@@ -14,6 +14,7 @@
 #include "hws/event.hpp"             // hws::event
 #include "hws/hardware_sampler.hpp"  // hws::hardware_sampler
 #include "hws/sample_category.hpp"   // hws::sample_category
+#include "hws/utility.hpp"           // hws::detail::mpi_sampling_mode
 
 #include <chrono>      // std::chrono::{milliseconds, steady_clock::time_point}
 #include <cstddef>     // std::size_t
@@ -45,6 +46,24 @@ class system_hardware_sampler {
      * @param[in] category the sample categories that are enabled for hardware sampling (default: all)
      */
     explicit system_hardware_sampler(std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all);
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    /**
+     * @brief Construct hardware samplers with the default sampling interval and MPI support.
+     * @param[in] communicator the MPI communicator
+     * @param[in] mode the MPI sampling mode
+     * @param[in] category the sample categories that are enabled for hardware sampling (default: all)
+     */
+    explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category = sample_category::all);
+    /**
+     * @brief Construct hardware samplers with the provided @p sampling_interval and MPI support.
+     * @param[in] communicator the MPI communicator
+     * @param[in] mode the MPI sampling mode
+     * @param[in] sampling_interval the used sampling interval
+     * @param[in] category the sample categories that are enabled for hardware sampling (default: all)
+     */
+    explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all);
+#endif
+
 
     /**
      * @brief Delete the copy-constructor.
@@ -211,6 +230,15 @@ class system_hardware_sampler {
   private:
     /// The different hardware sampler for the current system.
     std::vector<std::unique_ptr<hardware_sampler>> samplers_;
+
+    /**
+     * @brief Create hardware samplers for all devices visible to this process. Used by the non-MPI constructors and by the MPI constructor in per_rank mode.
+     * @param sampling_interval the used sampling interval
+     * @param category the sample category
+     */
+    void create_local_samplers(std::chrono::milliseconds sampling_interval, hws::sample_category category);
+
+
 };
 
 }  // namespace hws
diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index cbb1075..6abc3d8 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -15,20 +15,30 @@
 #include "fmt/format.h"  // fmt::format
 #include "fmt/ranges.h"  // fmt::join
 
-#include <charconv>      // std::from_chars
-#include <chrono>        // std::chrono::duration
-#include <cmath>         // std::trunc
-#include <cstddef>       // std::size_t
-#include <optional>      // std::optional
-#include <stdexcept>     // std::runtime_error
-#include <string>        // std::string, std::stof, std::stod, std::stold
-#include <string_view>   // std::string_view
-#include <system_error>  // std::errc
-#include <type_traits>   // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type
-#include <vector>        // std::vector
+#include <charconv>       // std::from_chars
+#include <chrono>         // std::chrono::duration
+#include <cmath>          // std::trunc
+#include <cstddef>        // std::size_t
+#include <optional>       // std::optional
+#include <stdexcept>      // std::runtime_error
+#include <string>         // std::string, std::stof, std::stod, std::stold
+#include <string_view>    // std::string_view
+#include <system_error>   // std::errc
+#include <type_traits>    // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type
+#include <vector>         // std::vector
+#include <unordered_map>  // std::unordered_map
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>        // MPI_Comm
+    #include <mpi.h>  // MPI_Comm
+#endif
+
+#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+    #include "hws/gpu_nvidia/utility.hpp"  // HWS_CUDA_ERROR_CHECK
+    #include "cuda_runtime.h"  // cuda functions
+#endif
+#if defined(HWS_FOR_AMD_GPUS_ENABLED)
+    #include "hws/gpu_amd/utility.hpp"  // HWS_HIP_ERROR_CHECK
+    #include "hip/hip_runtime.h"  // hip functions
 #endif
 
 namespace hws::detail {
@@ -333,8 +343,297 @@ template <typename T>
  *
  * @return concatenated YAML string on rank 0, empty string on all other ranks
  */
-[[nodiscard]]
-std::string gather_yaml_strings_mpi(const std::string& local_yaml, MPI_Comm communicator);
+[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator);
+
+/**
+ * @brief The mode to use for MPI sampling.
+ * per_rank: each rank creates hardware samplers for all devices visible to that rank
+ * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device
+ */
+enum class mpi_sampling_mode {
+    per_rank,
+    whole_node
+};
+
+/**
+ * @brief Information about a node-local MPI communicator for whole-node sampling.
+ */
+struct hostname_comm_info {
+    MPI_Comm node_comm = MPI_COMM_NULL;
+    int node_rank = 0;
+    int node_size = 1;
+};
+
+/**
+ * @brief Create a node-local MPI communicator for whole-node sampling based on node hostnames.
+ * @param comm the parent MPI communicator to split into node-local communicators
+ * @return the node-local MPI communicator information
+ */
+inline hostname_comm_info make_hostname_comm(MPI_Comm comm) {
+    int world_rank = 0, world_size = 0;
+    MPI_Comm_rank(comm, &world_rank);
+    MPI_Comm_size(comm, &world_size);
+
+    // Gather all hostnames
+    char name[MPI_MAX_PROCESSOR_NAME];
+    int name_len = 0;
+    MPI_Get_processor_name(name, &name_len);
+
+    std::vector<int> name_lengths(world_size);
+    MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm);
+
+    // Build displacements and total byte count
+    std::vector<int> displs(world_size);
+    int total = 0;
+    for (int i = 0; i < world_size; ++i) {
+        displs[i] = total;
+        total += name_lengths[i];
+    }
+
+    std::vector<char> all_names(total);
+    MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm);
+
+    // Assign colors locally on every rank
+    //
+    // All ranks hold identical copies of all_names, name_lengths, and displs,
+    // so they can each compute the same deterministic color map independently.
+
+    std::unordered_map<std::string_view, int> host_to_color;
+    host_to_color.reserve(world_size);
+    std::vector<int> colors(world_size);
+    int next_color = 0;
+    for (int r = 0; r < world_size; ++r) {
+        // get host name of rank r
+        std::string_view host(&all_names[displs[r]], static_cast<std::size_t>(name_lengths[r]));
+
+        // try to insert it into the host_to_color map
+        auto [it, inserted] = host_to_color.emplace(host, next_color);
+
+        // check if host was new, if yes, increment color
+        if (inserted) {
+            ++next_color;
+        }
+        // save color of current rank, either from newly created or existing entry
+        colors[r] = it->second;
+    }
+
+    // Split communicator
+
+    hostname_comm_info info{};
+    MPI_Comm_split(comm, colors[world_rank], world_rank, &info.node_comm);
+    MPI_Comm_rank(info.node_comm, &info.node_rank);
+    MPI_Comm_size(info.node_comm, &info.node_size);
+    return info;
+}
+
+/**
+ * @brief Free a node-local MPI communicator for whole-node sampling.
+ * @param info the node-local MPI communicator information to free
+ */
+inline void free_hostname_comm(hostname_comm_info &info) {
+    if (info.node_comm != MPI_COMM_NULL) {
+        MPI_Comm_free(&info.node_comm);
+    }
+}
+
+#endif
+
+enum class device_backend_kind {  // GPU vendor backend of a device; declared outside the MPI guard since the per-backend helpers below need it without MPI too
+    nvidia,
+    amd,
+    intel
+};
+
+struct visible_gpu_device {  // one GPU visible to the calling process; used by the enumerate_local_*_devices() helpers
+    device_backend_kind backend;
+    int local_index;          // device index for that backend on this rank
+    std::string physical_id;  // stable per-node identifier
+};
+
+#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+/**
+ * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index
+ * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node.
+ *
+ * @param local_index the local index of the NVIDIA GPU device
+ * @return the physical ID of the NVIDIA GPU device
+ */
+inline std::string nvidia_physical_id(int local_index) {
+    char bus_id[64] = {};
+    HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
+    return std::string{ "nvidia:" } + bus_id;
+}
+
+/**
+ * @brief creates a list of all visible nvidia GPU devices
+ *
+ * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID
+ */
+inline std::vector<visible_gpu_device> enumerate_local_nvidia_devices() {
+    std::vector<visible_gpu_device> out;
+    int count = 0;
+    HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count));
+    for (int i = 0; i < count; ++i) {
+        visible_gpu_device d;
+        d.backend = device_backend_kind::nvidia;
+        d.local_index = i;
+        d.physical_id = nvidia_physical_id(i);
+        out.push_back(std::move(d));
+    }
+    return out;
+}
+
+#endif
+
+#if defined(HWS_FOR_AMD_GPUS_ENABLED)
+inline std::string amd_physical_id(int local_index) {
+    char bus_id[64] = {};
+    HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
+    return std::string{ "amd:" } + bus_id;
+}
+
+inline std::vector<visible_gpu_device> enumerate_local_amd_devices() {
+    std::vector<visible_gpu_device> out;
+    int count = 0;
+    HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count));
+    for (int i = 0; i < count; ++i) {
+        visible_gpu_device d;
+        d.backend = device_backend_kind::amd;
+        d.local_index = i;
+        d.physical_id = amd_physical_id(i);
+        out.push_back(std::move(d));
+    }
+    return out;
+}
+#endif
+
+#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
+inline std::string intel_physical_id(ze_device_handle_t device) {
+    ze_device_properties_t props{};
+    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    props.pNext = nullptr;
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props));
+
+    char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {};
+    for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
+        snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]);
+    }
+
+    return std::string{ "intel:" } + buf;
+}
+
+inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
+    std::vector<visible_gpu_device> out;
+
+    // get the GPU driver
+    ze_driver_handle_t driver{};
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver));
+
+    // Discover devices for this driver
+    std::uint32_t device_count = 0;
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr));
+    if (device_count == 0) {
+        return out; // no Intel GPUs visible
+    }
+
+    std::vector<ze_device_handle_t> devices(device_count);
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data()));
+
+    // Fill visible_gpu_device list
+    for (std::uint32_t i = 0; i < device_count; ++i) {
+        ze_device_handle_t dev = devices[i];
+
+        visible_gpu_device d;
+        d.backend     = device_backend_kind::intel;
+        d.local_index = static_cast<int>(i);
+        d.physical_id = intel_physical_id(dev);
+
+        out.push_back(std::move(d));
+    }
+
+    return out;
+}
+#endif
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+/**
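+ * @brief Compute the local device indices that the calling MPI rank has to sample. A device (identified by its physical ID) that is
+ * visible to several ranks of @p node_comm is assigned to the lowest of those ranks, so each device is sampled by exactly one rank.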
+ *
+ * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID
+ * @param node_comm a node local MPI communicator
+ * @return all device indices that have to be sampled by this rank
+ */
+inline std::vector<int> owned_local_indices_for_backend(const std::vector<visible_gpu_device> &local_devices, MPI_Comm node_comm) {
+    int node_rank = 0, node_size = 0;
+    MPI_Comm_rank(node_comm, &node_rank);
+    MPI_Comm_size(node_comm, &node_size);
+
+    // Pack physical IDs into a newline-separated string
+    std::string packed;
+    for (const auto &d : local_devices) {
+        packed += d.physical_id;
+        packed += '\n';
+    }
+    const int local_size = static_cast<int>(packed.size());
+
+    // Allgather sizes
+    std::vector<int> sizes(node_size);
+    MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm);
+
+    // Displacements and total length
+    std::vector<int> displs(node_size);
+    int total = 0;
+    for (int r = 0; r < node_size; ++r) {
+        displs[r] = total;
+        total += sizes[r];
+    }
+
+    // Allgatherv packed physical IDs
+    std::vector<char> all_data(total);
+    MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm);
+
+    // Build owner map: physical_id -> first node_rank that reports it
+    std::unordered_map<std::string, int> owner_rank_for_id;
+    owner_rank_for_id.reserve(local_devices.size() * 2 + 1);
+
+    for (int r = 0; r < node_size; ++r) {
+        if (sizes[r] == 0) {
+            continue;
+        }
+
+        const char *base = all_data.data() + displs[r];
+        const int len = sizes[r];
+
+        int line_start = 0;
+        while (line_start < len) {
+            int line_end = line_start;
+            while (line_end < len && base[line_end] != '\n') {
+                ++line_end;
+            }
+            if (line_end > line_start) {
+                const std::string id(base + line_start, base + line_end);  // copy just this ID
+                owner_rank_for_id.emplace(id, r);                           // first insertion wins
+            }
+            line_start = line_end + 1;
+        }
+    }
+
+    // Decide which local indices we own: those whose physical_id is mapped to node_rank
+    std::vector<int> owned_indices;
+    owned_indices.reserve(local_devices.size());
+
+    for (const auto &d : local_devices) {
+        auto it = owner_rank_for_id.find(d.physical_id);
+        if (it != owner_rank_for_id.end() && it->second == node_rank) {
+            owned_indices.push_back(d.local_index);
+        }
+    }
+
+    return owned_indices;
+}
+
 #endif
 
 }  // namespace hws::detail
diff --git a/src/hws/gpu_amd/CMakeLists.txt b/src/hws/gpu_amd/CMakeLists.txt
index 0b5f104..16aecb2 100644
--- a/src/hws/gpu_amd/CMakeLists.txt
+++ b/src/hws/gpu_amd/CMakeLists.txt
@@ -22,7 +22,7 @@ message(STATUS "Enable sampling of AMD GPU information using ROCm SMI (${rocm_sm
 find_package(HIP REQUIRED)
 
 # link against necessary libraries
-target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE rocm_smi64 hip::host)
+target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC rocm_smi64 hip::host)
 target_include_directories(${HWS_LIBRARY_NAME} PRIVATE ${ROCM_SMI_INCLUDE_DIR})
 
 # add source file to source file list
diff --git a/src/hws/gpu_nvidia/CMakeLists.txt b/src/hws/gpu_nvidia/CMakeLists.txt
index 437e063..f8a5749 100644
--- a/src/hws/gpu_nvidia/CMakeLists.txt
+++ b/src/hws/gpu_nvidia/CMakeLists.txt
@@ -19,7 +19,7 @@ endif ()
 message(STATUS "Enable sampling of NVIDIA GPU information using NVML (${CUDAToolkit_VERSION}).")
 
 # link against necessary libraries
-target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE CUDA::nvml CUDA::cudart)
+target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC CUDA::nvml CUDA::cudart)
 
 # add source file to source file list
 target_sources(${HWS_LIBRARY_NAME} PRIVATE
diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index e07d7a7..eb06924 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -42,7 +42,7 @@
 #include <vector>     // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>        // MPI_Comm
+    #include <mpi.h>  // MPI_Comm
 #endif
 
 namespace hws {
@@ -52,56 +52,68 @@ system_hardware_sampler::system_hardware_sampler(const sample_category category)
 
 system_hardware_sampler::system_hardware_sampler(const std::chrono::milliseconds sampling_interval, sample_category category) {
     // create the hardware samplers based on the available hardware
-#if defined(HWS_FOR_CPUS_ENABLED)
-    {
-        samplers_.push_back(std::make_unique<cpu_hardware_sampler>(sampling_interval, category));
-    }
-#endif
-#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
-    {
-        int device_count{};
-        HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count));
-        for (int device = 0; device < device_count; ++device) {
-            samplers_.push_back(std::make_unique<gpu_nvidia_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+    create_local_samplers(sampling_interval, category);
+}
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category) :
+    system_hardware_sampler(communicator, mode, HWS_SAMPLING_INTERVAL, category) { }
+
+system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category) {
+    if (mode == detail::mpi_sampling_mode::per_rank) {
+        // each rank creates samplers for all devices visible to it
+        create_local_samplers(sampling_interval, category);
+    } else if (mode == detail::mpi_sampling_mode::whole_node) {
+        // create a custom, node-local MPI communicator
+        auto nc = detail::make_hostname_comm(communicator);
+
+    // CPU: one sampler per node --> node leader only
+    #if defined(HWS_FOR_CPUS_ENABLED)
+        if (nc.node_rank == 0) {
+            samplers_.push_back(std::make_unique<cpu_hardware_sampler>(sampling_interval, category));
         }
-    }
-#endif
-#if defined(HWS_FOR_AMD_GPUS_ENABLED)
-    {
-        int device_count{};
-        HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count));
-        for (int device = 0; device < device_count; ++device) {
-            samplers_.push_back(std::make_unique<gpu_amd_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+    #endif
+
+    // NVIDIA
+    #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+        {
+            const auto local = detail::enumerate_local_nvidia_devices();
+            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            for (int idx : owned) {
+                samplers_.push_back(std::make_unique<gpu_nvidia_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
+            }
         }
-    }
-#endif
-#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
-    {
-        // init level zero driver
-        HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY))
-
-        // discover the number of drivers
-        std::uint32_t driver_count{ 0 };
-        HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr))
-
-        // check if only the single GPU driver has been found
-        if (driver_count > 1) {
-            throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) };
+    #endif
+
+    // AMD
+    #if defined(HWS_FOR_AMD_GPUS_ENABLED)
+        {
+            const auto local = detail::enumerate_local_amd_devices();
+            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            for (int idx : owned) {
+                samplers_.push_back(std::make_unique<gpu_amd_hardware_sampler>(
+                    static_cast<std::size_t>(idx), sampling_interval, category));
+            }
         }
-
-        // get the GPU driver
-        ze_driver_handle_t driver{};
-        HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver))
-
-        // get all GPUs for the current driver
-        std::uint32_t device_count{ 0 };
-        HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr))
-        for (std::uint32_t device = 0; device < device_count; ++device) {
-            samplers_.push_back(std::make_unique<gpu_intel_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+    #endif
+
+    // Intel
+    #if defined(HWS_FOR_INTEL_GPUS_ENABLED)
+        {
+            const auto local = detail::enumerate_local_intel_devices();
+            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            for (int idx : owned) {
+                samplers_.push_back(std::make_unique<gpu_intel_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
+            }
         }
+    #endif
+
+        detail::free_hostname_comm(nc);
+    } else {
+        throw std::runtime_error{ fmt::format("Unknown MPI sampling mode {}!", static_cast<int>(mode)) };
     }
-#endif
 }
+#endif
 
 void system_hardware_sampler::start_sampling() {
     std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); });
@@ -255,4 +267,56 @@ std::string system_hardware_sampler::samples_only_as_yaml_string() const {
     return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->samples_only_as_yaml_string(); });
 }
 
+void system_hardware_sampler::create_local_samplers(std::chrono::milliseconds sampling_interval, sample_category category) {
+#if defined(HWS_FOR_CPUS_ENABLED)
+    {
+        samplers_.push_back(std::make_unique<cpu_hardware_sampler>(sampling_interval, category));
+    }
+#endif
+#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+    {
+        int device_count{};
+        HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count));
+        for (int device = 0; device < device_count; ++device) {
+            samplers_.push_back(std::make_unique<gpu_nvidia_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+        }
+    }
+#endif
+#if defined(HWS_FOR_AMD_GPUS_ENABLED)
+    {
+        int device_count{};
+        HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count));
+        for (int device = 0; device < device_count; ++device) {
+            samplers_.push_back(std::make_unique<gpu_amd_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+        }
+    }
+#endif
+#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
+    {
+        // init level zero driver
+        HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY))
+
+        // discover the number of drivers
+        std::uint32_t driver_count{ 0 };
+        HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr))
+
+        // check if only the single GPU driver has been found
+        if (driver_count > 1) {
+            throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) };
+        }
+
+        // get the GPU driver
+        ze_driver_handle_t driver{};
+        HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver))
+
+        // get all GPUs for the current driver
+        std::uint32_t device_count{ 0 };
+        HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr))
+        for (std::uint32_t device = 0; device < device_count; ++device) {
+            samplers_.push_back(std::make_unique<gpu_intel_hardware_sampler>(static_cast<std::size_t>(device), sampling_interval, category));
+        }
+    }
+#endif
+}
+
 }  // namespace hws

From bea8838b6b8f0c127e1d98d81b45ddbe2879827c Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 8 Jun 2026 09:36:12 +0200
Subject: [PATCH 09/31] added synchronous start and stop sampling for MPI

---
 include/hws/system_hardware_sampler.hpp | 17 ++++++++++++++---
 src/hws/system_hardware_sampler.cpp     | 13 +++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp
index a5a1265..ba160a2 100644
--- a/include/hws/system_hardware_sampler.hpp
+++ b/include/hws/system_hardware_sampler.hpp
@@ -64,7 +64,6 @@ class system_hardware_sampler {
     explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all);
 #endif
 
-
     /**
      * @brief Delete the copy-constructor.
      */
@@ -91,10 +90,24 @@ class system_hardware_sampler {
      * @brief Start hardware sampling for all wrapped hardware samplers.
      */
     void start_sampling();
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    /**
+     * @brief Start hardware sampling for all wrapped hardware samplers. Executes an MPI barrier before starting sampling to synchronize all MPI ranks.
+     * @param[in] communicator the MPI communicator to use
+     */
+    void start_sampling(MPI_Comm communicator);
+#endif
     /**
      * @brief Stop hardware sampling for all wrapped hardware samplers.
      */
     void stop_sampling();
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    /**
+     * @brief Stop hardware sampling for all wrapped hardware samplers. Executes an MPI barrier after stopping sampling to synchronize all MPI ranks.
+     * @param[in] communicator the MPI communicator to use
+     */
+    void stop_sampling(MPI_Comm communicator);
+#endif
     /**
      * @brief Pause hardware sampling for all wrapped hardware samplers.
      */
@@ -237,8 +250,6 @@ class system_hardware_sampler {
      * @param category the sample category
      */
     void create_local_samplers(std::chrono::milliseconds sampling_interval, hws::sample_category category);
-
-
 };
 
 }  // namespace hws
diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index eb06924..416a19d 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -119,10 +119,23 @@ void system_hardware_sampler::start_sampling() {
     std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); });
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+void system_hardware_sampler::start_sampling(MPI_Comm communicator) {
+    MPI_Barrier(communicator);
+    std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); });
+}
+#endif
 void system_hardware_sampler::stop_sampling() {
     std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); });
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+void system_hardware_sampler::stop_sampling(MPI_Comm communicator) {
+    std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); });
+    MPI_Barrier(communicator);
+}
+#endif
+
 void system_hardware_sampler::pause_sampling() {
     std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->pause_sampling(); });
 }

From 04210fb64f642c252b9cdfc3501de22da17d0979 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 8 Jun 2026 15:48:58 +0200
Subject: [PATCH 10/31] python bindings for MPI-aware constructor and functions

---
 bindings/system_hardware_sampler.cpp | 67 ++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 13 deletions(-)

diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp
index 6633444..948f8e1 100644
--- a/bindings/system_hardware_sampler.cpp
+++ b/bindings/system_hardware_sampler.cpp
@@ -20,21 +20,70 @@
 #include <string>              // std::string
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>
-#include "mpi4py_communicator.hpp"
+    #include "mpi4py_communicator.hpp"
+    #include <mpi.h>
 #endif
 
 namespace py = pybind11;
 
 void init_system_hardware_sampler(py::module_ &m) {
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    // bind mpi sampling mode enum
+    py::enum_<hws::detail::mpi_sampling_mode>(m, "MPISamplingMode")
+        .value("PER_RANK", hws::detail::mpi_sampling_mode::per_rank)
+        .value("WHOLE_NODE", hws::detail::mpi_sampling_mode::whole_node)
+        .export_values();
+#endif
     // bind the pure virtual hardware sampler base class
     py::class_<hws::system_hardware_sampler>(m, "SystemHardwareSampler")
         .def(py::init<>(), "construct a new system hardware sampler with the default sampling interval")
         .def(py::init<hws::sample_category>(), "construct a new system hardware sampler with the default sampling interval sampling only the provided sample_category samples")
         .def(py::init<std::chrono::milliseconds>(), "construct a new system hardware sampler for with the specified sampling interval")
         .def(py::init<std::chrono::milliseconds, hws::sample_category>(), "construct a new system hardware sampler for with the specified sampling interval sampling only the provided sample_category samples")
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+        // MPI-aware constructors
+
+        // (MPI_Comm, mode, category=all)
+        .def(py::init([](py::object py_comm,
+                         hws::detail::mpi_sampling_mode mode,
+                         hws::sample_category category) {
+                 MPI_Comm comm = mpi_comm_from_python(py_comm);
+                 return std::make_unique<hws::system_hardware_sampler>(comm, mode, category);
+             }),
+             py::arg("comm"),
+             py::arg("mode"),
+             py::arg("category") = hws::sample_category::all,
+             "construct a new system hardware sampler with the default sampling interval and MPI support using the given mpi4py communicator and sampling mode")
+
+        // (MPI_Comm, mode, sampling_interval, category=all)
+        .def(py::init([](py::object py_comm,
+                         hws::detail::mpi_sampling_mode mode,
+                         std::chrono::milliseconds sampling_interval,
+                         hws::sample_category category) {
+                 MPI_Comm comm = mpi_comm_from_python(py_comm);
+                 return std::make_unique<hws::system_hardware_sampler>(comm, mode, sampling_interval, category);
+             }),
+             py::arg("comm"),
+             py::arg("mode"),
+             py::arg("sampling_interval"),
+             py::arg("category") = hws::sample_category::all,
+             "construct a new system hardware sampler with the specified sampling interval and MPI support using the given mpi4py communicator and sampling mode")
+
+        // Non-MPI overloads
+        .def("start", py::overload_cast<>(&hws::system_hardware_sampler::start_sampling), "start hardware sampling for all available hardware samplers")
+        .def("stop", py::overload_cast<>(&hws::system_hardware_sampler::stop_sampling), "stop hardware sampling for all available hardware samplers")
+        // MPI-aware overloads
+        .def("start", [](hws::system_hardware_sampler &self, py::object py_comm) {
+                 MPI_Comm comm = mpi_comm_from_python(py_comm);
+                 self.start_sampling(comm); }, py::arg("comm"), "start hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator before starting")
+        .def("stop", [](hws::system_hardware_sampler &self, py::object py_comm) {
+                 MPI_Comm comm = mpi_comm_from_python(py_comm);
+                 self.stop_sampling(comm); }, py::arg("comm"), "stop hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator after stopping")
+#else
+        // No MPI support: only the simple overloads exist, no ambiguity; do NOT end the chain with ';' here, more .def() calls follow after the #endif
         .def("start", &hws::system_hardware_sampler::start_sampling, "start hardware sampling for all available hardware samplers")
-        .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers")
+        .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers")
+#endif
         .def("pause", &hws::system_hardware_sampler::pause_sampling, "pause hardware sampling for all available hardware samplers")
         .def("resume", &hws::system_hardware_sampler::resume_sampling, "resume hardware sampling for all available hardware samplers")
         .def("has_started", &hws::system_hardware_sampler::has_sampling_started, "check whether hardware sampling has already been started for all hardware samplers")
@@ -73,17 +122,9 @@ void init_system_hardware_sampler(py::module_ &m) {
         .def("dump_yaml", py::overload_cast<const std::string &>(&hws::system_hardware_sampler::dump_yaml, py::const_), "dump all hardware samples for all hardware samplers to the given YAML file")
         .def("as_yaml_string", &hws::system_hardware_sampler::as_yaml_string, "return all hardware samples for all hardware samplers as YAML string")
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-        .def("dump_yaml_global",
-            [](const hws::system_hardware_sampler &self,
-               const std::string &filename,
-               py::object py_comm) {
+        .def("dump_yaml_global", [](const hws::system_hardware_sampler &self, const std::string &filename, py::object py_comm) {
                 const MPI_Comm comm = mpi_comm_from_python(py_comm);
-                self.dump_yaml_global(filename, comm);
-            },
-            py::arg("filename"),
-            py::arg("comm"),
-            "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator."
-        )
+                self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator.")
 #endif
         .def("__repr__", [](const hws::system_hardware_sampler &self) { return fmt::format("<hws.SystemHardwareSampler with {} samples>", self.num_samplers()); });
 }

From 89116b46d06a33fe9c1953b838cd5ea058375416 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Tue, 9 Jun 2026 09:51:25 +0200
Subject: [PATCH 11/31] fixes for mpi backend with intel GPUs

---
 include/hws/utility.hpp | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 6abc3d8..6edc69d 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -25,8 +25,8 @@
 #include <string_view>    // std::string_view
 #include <system_error>   // std::errc
 #include <type_traits>    // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type
-#include <vector>         // std::vector
 #include <unordered_map>  // std::unordered_map
+#include <vector>         // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
     #include <mpi.h>  // MPI_Comm
@@ -40,6 +40,10 @@
     #include "hws/gpu_amd/utility.hpp"  // HWS_HIP_ERROR_CHECK
     #include "hip/hip_runtime.h"  // hip functions
 #endif
+#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
+    #include "hws/gpu_intel/utility.hpp"  // HWS_LEVEL_ZERO_ERROR_CHECK
+    #include "level_zero/ze_api.h"   // Level Zero runtime functions
+#endif
 
 namespace hws::detail {
 
@@ -525,6 +529,18 @@ inline std::string intel_physical_id(ze_device_handle_t device) {
 inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
     std::vector<visible_gpu_device> out;
 
+    // init level zero driver
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY))
+
+    // discover the number of drivers
+    std::uint32_t driver_count{ 0 };
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr))
+
+    // check if only the single GPU driver has been found
+    if (driver_count > 1) {
+        throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) };
+    }
+
     // get the GPU driver
     ze_driver_handle_t driver{};
     HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver));
@@ -533,7 +549,7 @@ inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
     std::uint32_t device_count = 0;
     HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr));
     if (device_count == 0) {
-        return out; // no Intel GPUs visible
+        return out;  // no Intel GPUs visible
     }
 
     std::vector<ze_device_handle_t> devices(device_count);
@@ -544,7 +560,7 @@ inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
         ze_device_handle_t dev = devices[i];
 
         visible_gpu_device d;
-        d.backend     = device_backend_kind::intel;
+        d.backend = device_backend_kind::intel;
         d.local_index = static_cast<int>(i);
         d.physical_id = intel_physical_id(dev);
 
@@ -614,7 +630,7 @@ inline std::vector<int> owned_local_indices_for_backend(const std::vector<visibl
             }
             if (line_end > line_start) {
                 const std::string id(base + line_start, base + line_end);  // copy just this ID
-                owner_rank_for_id.emplace(id, r);                           // first insertion wins
+                owner_rank_for_id.emplace(id, r);                          // first insertion wins
             }
             line_start = line_end + 1;
         }

From a7c6543938fc207b060f35345f45c4172fee02dc Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:02:14 +0200
Subject: [PATCH 12/31] fix for python bindings

---
 bindings/main.cpp                    | 2 +-
 bindings/system_hardware_sampler.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bindings/main.cpp b/bindings/main.cpp
index 2b1a161..a629ee0 100644
--- a/bindings/main.cpp
+++ b/bindings/main.cpp
@@ -102,4 +102,4 @@ MPI_Comm mpi_comm_from_python(py::object py_comm) {
 
     return *comm_ptr;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp
index 948f8e1..4941fdf 100644
--- a/bindings/system_hardware_sampler.cpp
+++ b/bindings/system_hardware_sampler.cpp
@@ -82,7 +82,7 @@ void init_system_hardware_sampler(py::module_ &m) {
 #else
         // No MPI support: only the simple overloads exist, no ambiguity
         .def("start", &hws::system_hardware_sampler::start_sampling, "start hardware sampling for all available hardware samplers")
-        .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers");
+        .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers")
 #endif
         .def("pause", &hws::system_hardware_sampler::pause_sampling, "pause hardware sampling for all available hardware samplers")
         .def("resume", &hws::system_hardware_sampler::resume_sampling, "resume hardware sampling for all available hardware samplers")

From 45d50996834aefa9b7936cdf85c22b8d69639e9b Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Fri, 12 Jun 2026 13:10:51 +0200
Subject: [PATCH 13/31] fix for non-mpi mode

---
 include/hws/utility.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 6edc69d..257ed3c 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -452,7 +452,6 @@ struct visible_gpu_device {
     std::string physical_id;  // stable per-node identifier
 };
 
-#endif
 
 #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
 /**
@@ -571,7 +570,6 @@ inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
 }
 #endif
 
-#if defined(HWS_MPI_SUPPORT_ENABLED)
 
 /**
  * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that

From 5532369ca4b07a61b776bfc738d414bf2cb74ce4 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Wed, 17 Jun 2026 08:45:48 +0200
Subject: [PATCH 14/31] restructure MPI related utility functions

---
 include/hws/utility.hpp           | 131 ++++--------------------------
 src/hws/gpu_amd/CMakeLists.txt    |   2 +-
 src/hws/gpu_amd/utility.cpp       |  42 ++++++++++
 src/hws/gpu_intel/utility.cpp     |  81 ++++++++++++++++++
 src/hws/gpu_nvidia/CMakeLists.txt |   2 +-
 src/hws/gpu_nvidia/utility.cpp    |  39 +++++++++
 6 files changed, 178 insertions(+), 119 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 257ed3c..2915a18 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -32,19 +32,6 @@
     #include <mpi.h>  // MPI_Comm
 #endif
 
-#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
-    #include "hws/gpu_nvidia/utility.hpp"  // HWS_CUDA_ERROR_CHECK
-    #include "cuda_runtime.h"  // cuda functions
-#endif
-#if defined(HWS_FOR_AMD_GPUS_ENABLED)
-    #include "hws/gpu_amd/utility.hpp"  // HWS_HIP_ERROR_CHECK
-    #include "hip/hip_runtime.h"  // hip functions
-#endif
-#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
-    #include "hws/gpu_intel/utility.hpp"  // HWS_LEVEL_ZERO_ERROR_CHECK
-    #include "level_zero/ze_api.h"   // Level Zero runtime functions
-#endif
-
 namespace hws::detail {
 
 /**
@@ -455,119 +442,29 @@ struct visible_gpu_device {
 
 #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
 /**
- * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index
- * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node.
- *
- * @param local_index the local index of the NVIDIA GPU device
- * @return the physical ID of the NVIDIA GPU device
- */
-inline std::string nvidia_physical_id(int local_index) {
-    char bus_id[64] = {};
-    HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
-    return std::string{ "nvidia:" } + bus_id;
-}
-
-/**
- * @brief creates a list of all visible nvidia GPU devices
+ * @brief creates a list of all visible NVIDIA GPU devices
  *
  * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID
  */
-inline std::vector<visible_gpu_device> enumerate_local_nvidia_devices() {
-    std::vector<visible_gpu_device> out;
-    int count = 0;
-    HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count));
-    for (int i = 0; i < count; ++i) {
-        visible_gpu_device d;
-        d.backend = device_backend_kind::nvidia;
-        d.local_index = i;
-        d.physical_id = nvidia_physical_id(i);
-        out.push_back(std::move(d));
-    }
-    return out;
-}
-
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_nvidia_devices();
 #endif
 
 #if defined(HWS_FOR_AMD_GPUS_ENABLED)
-inline std::string amd_physical_id(int local_index) {
-    char bus_id[64] = {};
-    HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
-    return std::string{ "amd:" } + bus_id;
-}
-
-inline std::vector<visible_gpu_device> enumerate_local_amd_devices() {
-    std::vector<visible_gpu_device> out;
-    int count = 0;
-    HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count));
-    for (int i = 0; i < count; ++i) {
-        visible_gpu_device d;
-        d.backend = device_backend_kind::amd;
-        d.local_index = i;
-        d.physical_id = amd_physical_id(i);
-        out.push_back(std::move(d));
-    }
-    return out;
-}
+/**
+ * @brief creates a list of all visible AMD GPU devices
+ *
+ * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID
+ */
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_amd_devices();
 #endif
 
 #if defined(HWS_FOR_INTEL_GPUS_ENABLED)
-inline std::string intel_physical_id(ze_device_handle_t device) {
-    ze_device_properties_t props{};
-    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-    props.pNext = nullptr;
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props));
-
-    char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {};
-    for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
-        snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]);
-    }
-
-    return std::string{ "intel:" } + buf;
-}
-
-inline std::vector<visible_gpu_device> enumerate_local_intel_devices() {
-    std::vector<visible_gpu_device> out;
-
-    // init level zero driver
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY))
-
-    // discover the number of drivers
-    std::uint32_t driver_count{ 0 };
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr))
-
-    // check if only the single GPU driver has been found
-    if (driver_count > 1) {
-        throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) };
-    }
-
-    // get the GPU driver
-    ze_driver_handle_t driver{};
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver));
-
-    // Discover devices for this driver
-    std::uint32_t device_count = 0;
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr));
-    if (device_count == 0) {
-        return out;  // no Intel GPUs visible
-    }
-
-    std::vector<ze_device_handle_t> devices(device_count);
-    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data()));
-
-    // Fill visible_gpu_device list
-    for (std::uint32_t i = 0; i < device_count; ++i) {
-        ze_device_handle_t dev = devices[i];
-
-        visible_gpu_device d;
-        d.backend = device_backend_kind::intel;
-        d.local_index = static_cast<int>(i);
-        d.physical_id = intel_physical_id(dev);
-
-        out.push_back(std::move(d));
-    }
-
-    return out;
-}
+/**
+ * @brief creates a list of all visible Intel GPU devices
+ *
+ * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID
+ */
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_intel_devices();
 #endif
 
 
diff --git a/src/hws/gpu_amd/CMakeLists.txt b/src/hws/gpu_amd/CMakeLists.txt
index 16aecb2..0b5f104 100644
--- a/src/hws/gpu_amd/CMakeLists.txt
+++ b/src/hws/gpu_amd/CMakeLists.txt
@@ -22,7 +22,7 @@ message(STATUS "Enable sampling of AMD GPU information using ROCm SMI (${rocm_sm
 find_package(HIP REQUIRED)
 
 # link against necessary libraries
-target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC rocm_smi64 hip::host)
+target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE rocm_smi64 hip::host)
 target_include_directories(${HWS_LIBRARY_NAME} PRIVATE ${ROCM_SMI_INCLUDE_DIR})
 
 # add source file to source file list
diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp
index a88969a..d0bb038 100644
--- a/src/hws/gpu_amd/utility.cpp
+++ b/src/hws/gpu_amd/utility.cpp
@@ -10,6 +10,13 @@
 #include "rocm_smi/rocm_smi.h"  // ROCm SMI runtime functions
 
 #include <string>  // std::string
+#include <vector>  // std::vector
+
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED)
+    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+
+    #include "hip/hip_runtime_api.h"  // hipGetDeviceCount, hipDeviceGetPCIBusId
+#endif
 
 namespace hws::detail {
 
@@ -39,4 +46,39 @@ std::string performance_level_to_string(const rsmi_dev_perf_level_t perf_level)
     }
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED)
+
+namespace {
+
+/**
+ * @brief Build the physical ID of the AMD GPU with the given local index from its PCI bus ID.
+ * @details The result is the PCI bus ID reported by HIP prefixed with "amd:"; it is used to recognize the same device across different MPI ranks on the same node.
+ *
+ * @param[in] local_index the device index passed to `hipDeviceGetPCIBusId`
+ * @return the string "amd:" followed by the PCI bus ID of the device
+ */
+std::string amd_physical_id(int local_index) {
+    char pci_bus_id[64] = {};
+    HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), local_index));
+    return "amd:" + std::string{ pci_bus_id };
+}
+
+}  // namespace
+
+std::vector<visible_gpu_device> enumerate_local_amd_devices() {
+    int num_devices = 0;  // number of AMD GPUs reported by hipGetDeviceCount
+    HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&num_devices));
+    // create one entry per device; the local index is the index passed to the HIP runtime
+    std::vector<visible_gpu_device> devices;
+    for (int idx = 0; idx < num_devices; ++idx) {
+        visible_gpu_device &entry = devices.emplace_back();
+        entry.backend = device_backend_kind::amd;
+        entry.local_index = idx;
+        entry.physical_id = amd_physical_id(idx);
+    }
+    return devices;
+}
+
+#endif
+
 }  // namespace hws::detail
diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp
index 5a29eee..2d50ad8 100644
--- a/src/hws/gpu_intel/utility.cpp
+++ b/src/hws/gpu_intel/utility.cpp
@@ -12,10 +12,18 @@
 #include "level_zero/ze_api.h"   // Level Zero runtime functions
 #include "level_zero/zes_api.h"  // Level Zero runtime functions
 
+#include <cstddef>      // std::size_t
+#include <cstdint>      // std::uint32_t
+#include <cstdio>       // snprintf
+#include <stdexcept>    // std::runtime_error
 #include <string>       // std::string
 #include <string_view>  // std::string_view
 #include <vector>       // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED)
+    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+#endif
+
 namespace hws::detail {
 
 std::vector<std::string> property_flags_to_vector(const ze_device_property_flags_t flags) {
@@ -227,4 +235,77 @@ std::string memory_location_to_name(const zes_mem_loc_t mem_loc) {
     }
 }
 
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED)
+
+namespace {
+
+/**
+ * @brief Build the physical ID of the Intel GPU @p device from its Level Zero device UUID.
+ * @details The result is "intel:" followed by the UUID bytes as lowercase hex; it is used to recognize the same device across different MPI ranks on the same node.
+ *
+ * @param[in] device the Level Zero device handle of the Intel GPU device
+ * @return the string "intel:" followed by two hex digits per UUID byte
+ */
+std::string intel_physical_id(ze_device_handle_t device) {
+    ze_device_properties_t props{};
+    props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    props.pNext = nullptr;
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props));
+
+    char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {};  // two hex digits per UUID byte plus the terminating '\0'
+    for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) {
+        std::snprintf(buf + 2 * i, 3, "%02x", static_cast<unsigned int>(props.uuid.id[i]));  // writes 2 digits + '\0'
+    }
+
+    return std::string{ "intel:" } + buf;
+}
+
+}  // namespace
+
+std::vector<visible_gpu_device> enumerate_local_intel_devices() {
+    // collects one entry (backend, local index, physical ID) per Intel GPU found via Level Zero
+    std::vector<visible_gpu_device> out;
+
+    // init level zero driver
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY));
+
+    // discover the number of drivers
+    std::uint32_t driver_count{ 0 };
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr));
+    if (driver_count == 0) {
+        return out;  // no GPU driver reported: nothing to enumerate, and `driver` below would never be filled in
+    }
+
+    // more than one GPU driver is not supported
+    if (driver_count > 1) {
+        throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) };
+    }
+
+    // get the GPU driver
+    ze_driver_handle_t driver{};
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver));
+
+    // discover the devices of this driver; the first call only queries their number
+    std::uint32_t device_count = 0;
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr));
+    if (device_count == 0) {
+        return out;  // no Intel GPUs visible
+    }
+
+    std::vector<ze_device_handle_t> devices(device_count);
+    HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data()));
+
+    // fill the visible_gpu_device list; the local index is the position in the returned device list
+    for (std::uint32_t i = 0; i < device_count; ++i) {
+        visible_gpu_device d;
+        d.backend = device_backend_kind::intel;
+        d.local_index = static_cast<int>(i);
+        d.physical_id = intel_physical_id(devices[i]);
+        out.push_back(std::move(d));
+    }
+    return out;
+}
+
+#endif
+
 }  // namespace hws::detail
diff --git a/src/hws/gpu_nvidia/CMakeLists.txt b/src/hws/gpu_nvidia/CMakeLists.txt
index f8a5749..437e063 100644
--- a/src/hws/gpu_nvidia/CMakeLists.txt
+++ b/src/hws/gpu_nvidia/CMakeLists.txt
@@ -19,7 +19,7 @@ endif ()
 message(STATUS "Enable sampling of NVIDIA GPU information using NVML (${CUDAToolkit_VERSION}).")
 
 # link against necessary libraries
-target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC CUDA::nvml CUDA::cudart)
+target_link_libraries(${HWS_LIBRARY_NAME} PRIVATE CUDA::nvml CUDA::cudart)
 
 # add source file to source file list
 target_sources(${HWS_LIBRARY_NAME} PRIVATE
diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp
index a81feea..3237113 100644
--- a/src/hws/gpu_nvidia/utility.cpp
+++ b/src/hws/gpu_nvidia/utility.cpp
@@ -14,6 +14,10 @@
 #include <string>  // std::string
 #include <vector>  // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+#endif
+
 namespace hws::detail {
 
 #if CUDA_VERSION >= 12000
@@ -56,4 +60,39 @@ std::string throttle_event_reason_to_string(const unsigned long long clocks_even
 
 #endif
 
+#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+
+namespace {
+
+/**
+ * @brief Build the physical ID of the NVIDIA GPU with the given local index from its PCI bus ID.
+ * @details The result is the PCI bus ID reported by the CUDA runtime prefixed with "nvidia:"; it is used to recognize the same device across different MPI ranks on the same node.
+ *
+ * @param[in] local_index the device index passed to `cudaDeviceGetPCIBusId`
+ * @return the string "nvidia:" followed by the PCI bus ID of the device
+ */
+std::string nvidia_physical_id(int local_index) {
+    char pci_bus_id[64] = {};
+    HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), local_index));
+    return "nvidia:" + std::string{ pci_bus_id };
+}
+
+}  // namespace
+
+std::vector<visible_gpu_device> enumerate_local_nvidia_devices() {
+    int num_devices = 0;  // number of NVIDIA GPUs reported by cudaGetDeviceCount
+    HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&num_devices));
+    // create one entry per device; the local index is the index passed to the CUDA runtime
+    std::vector<visible_gpu_device> devices;
+    for (int idx = 0; idx < num_devices; ++idx) {
+        visible_gpu_device &entry = devices.emplace_back();
+        entry.backend = device_backend_kind::nvidia;
+        entry.local_index = idx;
+        entry.physical_id = nvidia_physical_id(idx);
+    }
+    return devices;
+}
+
+#endif
+
 }  // namespace hws::detail

From 99d7f10bdceebe9d2643bc7a39ef9cfb19159e54 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Wed, 17 Jun 2026 09:25:55 +0200
Subject: [PATCH 15/31] consistent formatting

---
 bindings/hardware_sampler.cpp    | 16 ++++------------
 bindings/main.cpp                | 10 ++++------
 bindings/mpi4py_communicator.hpp |  5 ++---
 include/hws/hardware_sampler.hpp |  4 ++--
 include/hws/utility.hpp          | 14 ++++++--------
 src/hws/hardware_sampler.cpp     |  2 +-
 src/hws/utility.cpp              |  2 +-
 7 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/bindings/hardware_sampler.cpp b/bindings/hardware_sampler.cpp
index f8f5251..f29c4a3 100644
--- a/bindings/hardware_sampler.cpp
+++ b/bindings/hardware_sampler.cpp
@@ -32,8 +32,8 @@
 #include <string>              // std::string
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>
-#include "mpi4py_communicator.hpp"
+    #include "mpi4py_communicator.hpp"
+    #include <mpi.h>
 #endif
 
 namespace py = pybind11;
@@ -68,17 +68,9 @@ void init_hardware_sampler(py::module_ &m) {
         .def("sampling_interval", &hws::hardware_sampler::sampling_interval, "get the sampling interval of this hardware sampler (in ms)")
         .def("dump_yaml", py::overload_cast<const std::string &>(&hws::hardware_sampler::dump_yaml, py::const_), "dump all hardware samples to the given YAML file")
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-        .def("dump_yaml_global",
-            [](const hws::hardware_sampler &self,
-               const std::string &filename,
-               py::object py_comm) {
+        .def("dump_yaml_global", [](const hws::hardware_sampler &self, const std::string &filename, py::object py_comm) {
                 const MPI_Comm comm = mpi_comm_from_python(py_comm);
-                self.dump_yaml_global(filename, comm);
-            },
-            py::arg("filename"),
-            py::arg("comm"),
-            "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator."
-        )
+                self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator.")
 #endif
         .def("as_yaml_string", &hws::hardware_sampler::as_yaml_string, "return all hardware samples including additional information like events as YAML string")
         .def("samples_only_as_yaml_string", &hws::hardware_sampler::samples_only_as_yaml_string, "return all hardware samples as YAML string")
diff --git a/bindings/main.cpp b/bindings/main.cpp
index a629ee0..51825ae 100644
--- a/bindings/main.cpp
+++ b/bindings/main.cpp
@@ -5,16 +5,16 @@
  *          See the LICENSE.md file in the project root for full license information.
  */
 
-#include "hws/version.hpp" // hws::version::version
+#include "hws/version.hpp"  // hws::version::version
 
 #include "pybind11/pybind11.h"  // PYBIND11_MODULE, py::module_
 
 #include <string_view>  // std::string_view
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>
-#include <mpi4py/mpi4py.h>
-#include "mpi4py_communicator.hpp"
+    #include "mpi4py_communicator.hpp"
+    #include <mpi.h>
+    #include <mpi4py/mpi4py.h>
 #endif
 
 #define HWS_IS_DEFINED_HELPER(x) #x
@@ -80,8 +80,6 @@ PYBIND11_MODULE(HardwareSampling, m) {
     init_version(m);
 }
 
-
-
 #if defined(HWS_MPI_SUPPORT_ENABLED)
 /**
  * Extracts an MPI_Comm from a python mpi4py.MPI.Comm object.
diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp
index 9ba2749..b7cdefd 100644
--- a/bindings/mpi4py_communicator.hpp
+++ b/bindings/mpi4py_communicator.hpp
@@ -1,5 +1,5 @@
 /**
-* @file
+ * @file
  * @author Tim Thüring
  * @copyright 2024-today All Rights Reserved
  * @license This file is released under the MIT license.
@@ -15,7 +15,7 @@
 #include "pybind11/pybind11.h"
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#  include <mpi.h>
+    #include <mpi.h>
 #endif
 
 namespace py = pybind11;
@@ -24,5 +24,4 @@ namespace py = pybind11;
 MPI_Comm mpi_comm_from_python(py::object py_comm);
 #endif
 
-
 #endif  // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
diff --git a/include/hws/hardware_sampler.hpp b/include/hws/hardware_sampler.hpp
index 1d1f64c..6c31a75 100644
--- a/include/hws/hardware_sampler.hpp
+++ b/include/hws/hardware_sampler.hpp
@@ -166,7 +166,7 @@ class hardware_sampler {
      */
     void dump_yaml(const std::filesystem::path &filename) const;
 
-    #if defined(HWS_MPI_SUPPORT_ENABLED)
+#if defined(HWS_MPI_SUPPORT_ENABLED)
     /**
      * @brief Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the YAML file with @p filename.
      * @param[in] filename the YAML file to append the hardware samples to
@@ -181,7 +181,7 @@ class hardware_sampler {
      * @copydoc hws::hardware_sampler::dump_yaml_global(const char *) const
      */
     void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const;
-    #endif
+#endif
 
     /**
      * @brief Return the unique device identification. Can be used as unique key in the YAML string.
diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 2915a18..02369d7 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -439,34 +439,32 @@ struct visible_gpu_device {
     std::string physical_id;  // stable per-node identifier
 };
 
-
-#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
+    #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
 /**
  * @brief creates a list of all visible NVIDIA GPU devices
  *
  * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID
  */
 [[nodiscard]] std::vector<visible_gpu_device> enumerate_local_nvidia_devices();
-#endif
+    #endif
 
-#if defined(HWS_FOR_AMD_GPUS_ENABLED)
+    #if defined(HWS_FOR_AMD_GPUS_ENABLED)
 /**
  * @brief creates a list of all visible AMD GPU devices
  *
  * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID
  */
 [[nodiscard]] std::vector<visible_gpu_device> enumerate_local_amd_devices();
-#endif
+    #endif
 
-#if defined(HWS_FOR_INTEL_GPUS_ENABLED)
+    #if defined(HWS_FOR_INTEL_GPUS_ENABLED)
 /**
  * @brief creates a list of all visible Intel GPU devices
  *
  * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID
  */
 [[nodiscard]] std::vector<visible_gpu_device> enumerate_local_intel_devices();
-#endif
-
+    #endif
 
 /**
  * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that
diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp
index 4c5aa1d..4647adb 100644
--- a/src/hws/hardware_sampler.cpp
+++ b/src/hws/hardware_sampler.cpp
@@ -25,7 +25,7 @@
 #include <utility>    // std::move
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>        // MPI_Comm
+    #include <mpi.h>  // MPI_Comm
 #endif
 
 namespace hws {
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index c5d1dcd..970b625 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -15,7 +15,7 @@
 #include <vector>       // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-#include <mpi.h>        // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size
+    #include <mpi.h>  // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size
 #endif
 
 namespace hws::detail {

From 1d3accf0f2f7f38581ade36f038bfc4c45974115 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Wed, 17 Jun 2026 12:44:58 +0200
Subject: [PATCH 16/31] add additional cmake check for mpi4py include path

---
 bindings/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
index fd4f564..c58dd20 100644
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@@ -82,6 +82,13 @@ if(HWS_MPI_SUPPORT_ACTIVE)
                 "Install mpi4py in this environment or disable python bindings.")
     endif()
 
+    if(NOT EXISTS "${MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h")
+        message(FATAL_ERROR
+                "mpi4py include path '${MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. "
+                "The mpi4py installation appears to be broken. "
+                "Reinstall mpi4py in this environment or disable python bindings.")
+    endif()
+
     target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR})
 
     # Propagate the same macro used on the C++ side into the Python module

From 9fab189e16dbcfcdde4dfd5a1999d3afb3cc5f04 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Wed, 17 Jun 2026 13:17:59 +0200
Subject: [PATCH 17/31] add mpi4py version check to cmake

---
 bindings/CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
index c58dd20..569aa77 100644
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@@ -89,6 +89,18 @@ if(HWS_MPI_SUPPORT_ACTIVE)
                 "Reinstall mpi4py in this environment or disable python bindings.")
     endif()
 
+    execute_process(
+            COMMAND "${Python_EXECUTABLE}" -c
+            "import mpi4py, sys; sys.stdout.write(mpi4py.__version__)"
+            OUTPUT_VARIABLE MPI4PY_VERSION
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(MPI4PY_VERSION VERSION_LESS "4.0")
+        message(FATAL_ERROR
+                "mpi4py>=4.0 is required but found ${MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
+                "Upgrade mpi4py or disable python bindings.")
+    endif()
+
     target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR})
 
     # Propagate the same macro used on the C++ side into the Python module

From f43800f72ada8662c79cc36bd1ced507479ffd8f Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:38:06 +0200
Subject: [PATCH 18/31] cmake variable renamings

---
 bindings/CMakeLists.txt | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
index 569aa77..e344dec 100644
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@@ -70,21 +70,21 @@ if(HWS_MPI_SUPPORT_ACTIVE)
     execute_process(
             COMMAND "${Python_EXECUTABLE}" -c
             "import mpi4py, sys; sys.stdout.write(mpi4py.get_include())"
-            RESULT_VARIABLE MPI4PY_IMPORT_RESULT
-            OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR
+            RESULT_VARIABLE HWS_MPI4PY_IMPORT_RESULT
+            OUTPUT_VARIABLE HWS_MPI4PY_INCLUDE_DIR
             OUTPUT_STRIP_TRAILING_WHITESPACE
     )
 
-    if(MPI4PY_IMPORT_RESULT)
+    if(HWS_MPI4PY_IMPORT_RESULT)
         message(FATAL_ERROR
                 "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) "
                 "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
                 "Install mpi4py in this environment or disable python bindings.")
     endif()
 
-    if(NOT EXISTS "${MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h")
+    if(NOT EXISTS "${HWS_MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h")
         message(FATAL_ERROR
-                "mpi4py include path '${MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. "
+                "mpi4py include path '${HWS_MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. "
                 "The mpi4py installation appears to be broken. "
                 "Reinstall mpi4py in this environment or disable python bindings.")
     endif()
@@ -92,16 +92,16 @@ if(HWS_MPI_SUPPORT_ACTIVE)
     execute_process(
             COMMAND "${Python_EXECUTABLE}" -c
             "import mpi4py, sys; sys.stdout.write(mpi4py.__version__)"
-            OUTPUT_VARIABLE MPI4PY_VERSION
+            OUTPUT_VARIABLE HWS_MPI4PY_VERSION
             OUTPUT_STRIP_TRAILING_WHITESPACE
     )
-    if(MPI4PY_VERSION VERSION_LESS "4.0")
+    if(HWS_MPI4PY_VERSION VERSION_LESS "4.0")
         message(FATAL_ERROR
-                "mpi4py>=4.0 is required but found ${MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
+                "mpi4py>=4.0 is required but found ${HWS_MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
                 "Upgrade mpi4py or disable python bindings.")
     endif()
 
-    target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${MPI4PY_INCLUDE_DIR})
+    target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_MPI4PY_INCLUDE_DIR})
 
     # Propagate the same macro used on the C++ side into the Python module
     target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE HWS_MPI_SUPPORT_ENABLED)

From 41d20292b2b4ed232eb6bc2bb7277fd27cd73ab0 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:44:52 +0200
Subject: [PATCH 19/31] update std::runtime_error constructor calls

---
 bindings/main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bindings/main.cpp b/bindings/main.cpp
index 51825ae..932a897 100644
--- a/bindings/main.cpp
+++ b/bindings/main.cpp
@@ -90,12 +90,12 @@ PYBIND11_MODULE(HardwareSampling, m) {
  */
 MPI_Comm mpi_comm_from_python(py::object py_comm) {
     if (!PyObject_TypeCheck(py_comm.ptr(), &PyMPIComm_Type)) {
-        throw std::runtime_error("expected mpi4py.MPI.Comm as communicator argument");
+        throw std::runtime_error{"expected mpi4py.MPI.Comm as communicator argument"};
     }
 
     MPI_Comm *comm_ptr = PyMPIComm_Get(py_comm.ptr());
     if (comm_ptr == nullptr) {
-        throw std::runtime_error("could not extract MPI_Comm from mpi4py communicator");
+        throw std::runtime_error{"could not extract MPI_Comm from mpi4py communicator"};
     }
 
     return *comm_ptr;

From 666130e260726cc7cd604390e083547423380d9d Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:47:47 +0200
Subject: [PATCH 20/31] update include guard name in mpi4py_communicator.hpp

---
 bindings/mpi4py_communicator.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp
index b7cdefd..f325423 100644
--- a/bindings/mpi4py_communicator.hpp
+++ b/bindings/mpi4py_communicator.hpp
@@ -8,8 +8,8 @@
  * @brief Utility functions for transforming mpi4py communicators into C++ MPI communicators
  */
 
-#ifndef HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
-#define HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
+#ifndef HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP
+#define HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP
 #pragma once
 
 #include "pybind11/pybind11.h"
@@ -24,4 +24,4 @@ namespace py = pybind11;
 MPI_Comm mpi_comm_from_python(py::object py_comm);
 #endif
 
-#endif  // HWS_HARDWARE_SAMPLING_FOR_GPUS_AND_CPUS_MPI4PY_COMMUNICATOR_HPP
+#endif  // HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP

From 5be4fed131cd01f78e4a22e9efcdd9822cf4e718 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 08:55:19 +0200
Subject: [PATCH 21/31] split variable declaration into separate lines

---
 include/hws/utility.hpp | 6 ++++--
 src/hws/utility.cpp     | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 02369d7..3f5cde8 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -361,7 +361,8 @@ struct hostname_comm_info {
  * @return the node-local MPI communicator information
  */
 inline hostname_comm_info make_hostname_comm(MPI_Comm comm) {
-    int world_rank = 0, world_size = 0;
+    int world_rank = 0;
+    int world_size = 0;
     MPI_Comm_rank(comm, &world_rank);
     MPI_Comm_size(comm, &world_size);
 
@@ -475,7 +476,8 @@ struct visible_gpu_device {
  * @return all device indices that have to be sampled by this rank
  */
 inline std::vector<int> owned_local_indices_for_backend(const std::vector<visible_gpu_device> &local_devices, MPI_Comm node_comm) {
-    int node_rank = 0, node_size = 0;
+    int node_rank = 0;
+    int node_size = 0;
     MPI_Comm_rank(node_comm, &node_rank);
     MPI_Comm_size(node_comm, &node_size);
 
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index 970b625..e1a171f 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -89,7 +89,8 @@ std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm
     }
 
     // MPI rank and world size for identification and communication
-    int rank = 0, world_size = 0;
+    int rank = 0;
+    int world_size = 0;
     MPI_Comm_rank(communicator, &rank);
     MPI_Comm_size(communicator, &world_size);
 

From e4716ff45fd00260c6b3134b3ede34dbe2971a16 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:04:08 +0200
Subject: [PATCH 22/31] change std::runtime_error constructor calls

---
 src/hws/hardware_sampler.cpp | 2 +-
 src/hws/utility.cpp          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp
index 4647adb..9066920 100644
--- a/src/hws/hardware_sampler.cpp
+++ b/src/hws/hardware_sampler.cpp
@@ -154,7 +154,7 @@ void hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communica
     MPI_Initialized(&initialized);
 
     if (!initialized) {
-        throw std::runtime_error("MPI must already be initialized");
+        throw std::runtime_error{"MPI must already be initialized"};
     }
 
     // MPI rank and world size for identification and communication
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index e1a171f..a5d51b8 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -85,7 +85,7 @@ std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm
     MPI_Initialized(&initialized);
 
     if (!initialized) {
-        throw std::runtime_error("MPI must already be initialized");
+        throw std::runtime_error{"MPI must already be initialized"};
     }
 
     // MPI rank and world size for identification and communication

From c66b51041b41ba9e8ccce811cce5ce955d68a89c Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:12:59 +0200
Subject: [PATCH 23/31] add [[nodiscard]] and const to gpu_*/utility.cpp files

---
 src/hws/gpu_amd/utility.cpp    | 4 ++--
 src/hws/gpu_intel/utility.cpp  | 4 ++--
 src/hws/gpu_nvidia/utility.cpp | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp
index d0bb038..614a7c7 100644
--- a/src/hws/gpu_amd/utility.cpp
+++ b/src/hws/gpu_amd/utility.cpp
@@ -57,7 +57,7 @@ namespace {
  * @param local_index the local index of the AMD GPU device
  * @return the physical ID of the AMD GPU device
  */
-std::string amd_physical_id(int local_index) {
+[[nodiscard]] std::string amd_physical_id(const int local_index) {
     char bus_id[64] = {};
     HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
     return std::string{ "amd:" } + bus_id;
@@ -65,7 +65,7 @@ std::string amd_physical_id(int local_index) {
 
 }  // namespace
 
-std::vector<visible_gpu_device> enumerate_local_amd_devices() {
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_amd_devices() {
     std::vector<visible_gpu_device> out;
     int count = 0;
     HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count));
diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp
index 2d50ad8..0b5f0e9 100644
--- a/src/hws/gpu_intel/utility.cpp
+++ b/src/hws/gpu_intel/utility.cpp
@@ -246,7 +246,7 @@ namespace {
  * @param device the Level Zero device handle of the Intel GPU device
  * @return the physical ID of the Intel GPU device
  */
-std::string intel_physical_id(ze_device_handle_t device) {
+[[nodiscard]] std::string intel_physical_id(const ze_device_handle_t device) {
     ze_device_properties_t props{};
     props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
     props.pNext = nullptr;
@@ -262,7 +262,7 @@ std::string intel_physical_id(ze_device_handle_t device) {
 
 }  // namespace
 
-std::vector<visible_gpu_device> enumerate_local_intel_devices() {
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_intel_devices() {
     std::vector<visible_gpu_device> out;
 
     // init level zero driver
diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp
index 3237113..437699d 100644
--- a/src/hws/gpu_nvidia/utility.cpp
+++ b/src/hws/gpu_nvidia/utility.cpp
@@ -71,7 +71,7 @@ namespace {
  * @param local_index the local index of the NVIDIA GPU device
  * @return the physical ID of the NVIDIA GPU device
  */
-std::string nvidia_physical_id(int local_index) {
+[[nodiscard]] std::string nvidia_physical_id(const int local_index) {
     char bus_id[64] = {};
     HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index));
     return std::string{ "nvidia:" } + bus_id;
@@ -79,7 +79,7 @@ std::string nvidia_physical_id(int local_index) {
 
 }  // namespace
 
-std::vector<visible_gpu_device> enumerate_local_nvidia_devices() {
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_nvidia_devices() {
     std::vector<visible_gpu_device> out;
     int count = 0;
     HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count));

From 5ec40727e42ade0d0ca259d0edef98e80544b30b Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 09:31:11 +0200
Subject: [PATCH 24/31] added missing const

---
 src/hws/system_hardware_sampler.cpp | 10 +++++-----
 src/hws/utility.cpp                 |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 416a19d..0e93f0a 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -56,10 +56,10 @@ system_hardware_sampler::system_hardware_sampler(const std::chrono::milliseconds
 }
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category) :
+system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const sample_category category) :
     system_hardware_sampler(communicator, mode, HWS_SAMPLING_INTERVAL, category) { }
 
-system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category) {
+system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const std::chrono::milliseconds sampling_interval, const sample_category category) {
     if (mode == detail::mpi_sampling_mode::per_rank) {
         // each rank creates samplers for all devices visible to him
         create_local_samplers(sampling_interval, category);
@@ -79,7 +79,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::
         {
             const auto local = detail::enumerate_local_nvidia_devices();
             const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
-            for (int idx : owned) {
+            for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_nvidia_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
             }
         }
@@ -90,7 +90,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::
         {
             const auto local = detail::enumerate_local_amd_devices();
             const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
-            for (int idx : owned) {
+            for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_amd_hardware_sampler>(
                     static_cast<std::size_t>(idx), sampling_interval, category));
             }
@@ -102,7 +102,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, detail::
         {
             const auto local = detail::enumerate_local_intel_devices();
             const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
-            for (int idx : owned) {
+            for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_intel_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
             }
         }
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index a5d51b8..201a0ef 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -95,7 +95,7 @@ std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm comm
     MPI_Comm_size(communicator, &world_size);
 
     // gather the size of the yaml string from each rank
-    int local_size = static_cast<int>(local_yaml.size());
+    const int local_size = static_cast<int>(local_yaml.size());
 
     std::vector<int> recv_sizes;
 

From 45b75872f29cbc57fdb5a773419c33b84b0937c7 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:36:31 +0200
Subject: [PATCH 25/31] change std::runtime_error constructor call

---
 src/hws/system_hardware_sampler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 0e93f0a..738b784 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -236,7 +236,7 @@ void system_hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm co
     MPI_Initialized(&initialized);
 
     if (!initialized) {
-        throw std::runtime_error("MPI must already be initialized");
+        throw std::runtime_error{"MPI must already be initialized"};
     }
 
     // MPI rank and world size for identification and communication

From 7459d192741b5527dac5b6be362c48434aad16e4 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:00:38 +0200
Subject: [PATCH 26/31] changed prefix from std::string to std::string_view

---
 include/hws/utility.hpp | 2 +-
 src/hws/utility.cpp     | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 3f5cde8..8b2332b 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -258,7 +258,7 @@ template <typename T>
  * @param[in] prefix the prefix (indentation) added to each line
  * @return the indented string
  */
-[[nodiscard]] std::string indent_lines(const std::string &text, const std::string &prefix);
+[[nodiscard]] std::string indent_lines(const std::string &text, std::string_view prefix);
 
 /*****************************************************************************************************/
 /**                                      other free functions                                       **/
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index 201a0ef..f593521 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -66,14 +66,16 @@ bool is_integer(std::string_view str) {
     return std::all_of(str.cbegin(), str.cend(), [](const char c) { return std::isdigit(static_cast<unsigned char>(c)); });
 }
 
-std::string indent_lines(const std::string &text, const std::string &prefix) {
+std::string indent_lines(const std::string &text, const std::string_view prefix) {
     std::stringstream ss{ text };
 
     std::string line;
     std::string out;
 
     while (std::getline(ss, line)) {
-        out += prefix + line + '\n';
+        out += prefix;
+        out += line;
+        out += '\n';
     }
 
     return out;

From f3bbb01de0780044a3e9ae33dd8739341b83c7dc Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 14:11:41 +0200
Subject: [PATCH 27/31] change auto to actual types

---
 src/hws/system_hardware_sampler.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 738b784..2dda396 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -65,7 +65,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
         create_local_samplers(sampling_interval, category);
     } else if (mode == detail::mpi_sampling_mode::whole_node) {
         // create a custom, node-local MPI communicator
-        auto nc = detail::make_hostname_comm(communicator);
+        detail::hostname_comm_info nc = detail::make_hostname_comm(communicator);
 
     // CPU: one sampler per node --> node leader only
     #if defined(HWS_FOR_CPUS_ENABLED)
@@ -77,8 +77,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
     // NVIDIA
     #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
         {
-            const auto local = detail::enumerate_local_nvidia_devices();
-            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            const std::vector<detail::visible_gpu_device> local = detail::enumerate_local_nvidia_devices();
+            const std::vector<int> owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
             for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_nvidia_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
             }
@@ -88,8 +88,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
     // AMD
     #if defined(HWS_FOR_AMD_GPUS_ENABLED)
         {
-            const auto local = detail::enumerate_local_amd_devices();
-            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            const std::vector<detail::visible_gpu_device> local = detail::enumerate_local_amd_devices();
+            const std::vector<int> owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
             for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_amd_hardware_sampler>(
                     static_cast<std::size_t>(idx), sampling_interval, category));
@@ -100,8 +100,8 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
     // Intel
     #if defined(HWS_FOR_INTEL_GPUS_ENABLED)
         {
-            const auto local = detail::enumerate_local_intel_devices();
-            const auto owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
+            const std::vector<detail::visible_gpu_device> local = detail::enumerate_local_intel_devices();
+            const std::vector<int> owned = detail::owned_local_indices_for_backend(local, nc.node_comm);
             for (int const idx : owned) {
                 samplers_.push_back(std::make_unique<gpu_intel_hardware_sampler>(static_cast<std::size_t>(idx), sampling_interval, category));
             }

From c75de3149dabbbe79e2569a7a10c66ac89d15098 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Mon, 22 Jun 2026 16:02:29 +0200
Subject: [PATCH 28/31] add constructor/destructor to hostname_comm_info

---
 include/hws/utility.hpp             | 119 +++++++++++++---------------
 src/hws/system_hardware_sampler.cpp |   3 +-
 2 files changed, 56 insertions(+), 66 deletions(-)

diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index 8b2332b..b2bc77c 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -347,86 +347,77 @@ enum class mpi_sampling_mode {
 };
 
 /**
- * @brief Information about a node-local MPI communicator for whole-node sampling.
+ * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling.
  */
 struct hostname_comm_info {
     MPI_Comm node_comm = MPI_COMM_NULL;
     int node_rank = 0;
     int node_size = 1;
-};
 
-/**
- * @brief Create a node-local MPI communicator for whole-node sampling based on node hostnames.
- * @param comm the parent MPI communicator to split into node-local communicators
- * @return the node-local MPI communicator information
- */
-inline hostname_comm_info make_hostname_comm(MPI_Comm comm) {
-    int world_rank = 0;
-    int world_size = 0;
-    MPI_Comm_rank(comm, &world_rank);
-    MPI_Comm_size(comm, &world_size);
-
-    // Gather all hostnames
-    char name[MPI_MAX_PROCESSOR_NAME];
-    int name_len = 0;
-    MPI_Get_processor_name(name, &name_len);
-
-    std::vector<int> name_lengths(world_size);
-    MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm);
-
-    // Build displacements and total byte count
-    std::vector<int> displs(world_size);
-    int total = 0;
-    for (int i = 0; i < world_size; ++i) {
-        displs[i] = total;
-        total += name_lengths[i];
-    }
+    explicit hostname_comm_info(MPI_Comm comm) {
+        int world_rank = 0;
+        int world_size = 0;
+        MPI_Comm_rank(comm, &world_rank);
+        MPI_Comm_size(comm, &world_size);
+
+        // Gather all hostnames
+        char name[MPI_MAX_PROCESSOR_NAME];
+        int name_len = 0;
+        MPI_Get_processor_name(name, &name_len);
+
+        std::vector<int> name_lengths(world_size);
+        MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm);
+
+        // Build displacements and total byte count
+        std::vector<int> displs(world_size);
+        int total = 0;
+        for (int i = 0; i < world_size; ++i) {
+            displs[i] = total;
+            total += name_lengths[i];
+        }
 
-    std::vector<char> all_names(total);
-    MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm);
+        std::vector<char> all_names(total);
+        MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm);
 
-    // Assign colors locally on every rank
-    //
-    // All ranks hold identical copies of all_names, name_lengths, and displs,
-    // so they can each compute the same deterministic color map independently.
+        // Assign colors locally on every rank
+        //
+        // All ranks hold identical copies of all_names, name_lengths, and displs,
+        // so they can each compute the same deterministic color map independently.
 
-    std::unordered_map<std::string_view, int> host_to_color;
-    host_to_color.reserve(world_size);
-    std::vector<int> colors(world_size);
-    int next_color = 0;
-    for (int r = 0; r < world_size; ++r) {
-        // get host name of rank r
-        std::string_view host(&all_names[displs[r]], static_cast<std::size_t>(name_lengths[r]));
+        std::unordered_map<std::string_view, int> host_to_color;
+        host_to_color.reserve(world_size);
+        std::vector<int> colors(world_size);
+        int next_color = 0;
+        for (int r = 0; r < world_size; ++r) {
+            // get host name of rank r
+            std::string_view host(&all_names[displs[r]], static_cast<std::size_t>(name_lengths[r]));
 
-        // try to insert it into the host_to_color map
-        auto [it, inserted] = host_to_color.emplace(host, next_color);
+            // try to insert it into the host_to_color map
+            auto [it, inserted] = host_to_color.emplace(host, next_color);
 
-        // check if host was new, if yes, increment color
-        if (inserted) {
-            ++next_color;
+            // check if host was new, if yes, increment color
+            if (inserted) {
+                ++next_color;
+            }
+            // save color of current rank, either from newly created or existing entry
+            colors[r] = it->second;
         }
-        // save color of current rank, either from newly created or existing entry
-        colors[r] = it->second;
-    }
 
-    // Split communicator
+        // Split communicator
+        MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm);
+        MPI_Comm_rank(node_comm, &node_rank);
+        MPI_Comm_size(node_comm, &node_size);
+    }
 
-    hostname_comm_info info{};
-    MPI_Comm_split(comm, colors[world_rank], world_rank, &info.node_comm);
-    MPI_Comm_rank(info.node_comm, &info.node_rank);
-    MPI_Comm_size(info.node_comm, &info.node_size);
-    return info;
-}
+    hostname_comm_info(const hostname_comm_info &) = delete;
+    hostname_comm_info &operator=(const hostname_comm_info &) = delete;
 
-/**
- * @brief Free a node-local MPI communicator for whole-node sampling.
- * @param info the node-local MPI communicator information to free
- */
-inline void free_hostname_comm(hostname_comm_info &info) {
-    if (info.node_comm != MPI_COMM_NULL) {
-        MPI_Comm_free(&info.node_comm);
+    ~hostname_comm_info() {
+        if (node_comm != MPI_COMM_NULL) {
+            MPI_Comm_free(&node_comm);
+        }
     }
-}
+};
 
 enum class device_backend_kind {
     nvidia,
diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 2dda396..5e30dee 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -65,7 +65,7 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
         create_local_samplers(sampling_interval, category);
     } else if (mode == detail::mpi_sampling_mode::whole_node) {
         // create a custom, node-local MPI communicator
-        detail::hostname_comm_info nc = detail::make_hostname_comm(communicator);
+        detail::hostname_comm_info nc{ communicator };
 
     // CPU: one sampler per node --> node leader only
     #if defined(HWS_FOR_CPUS_ENABLED)
@@ -108,7 +108,6 @@ system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const de
         }
     #endif
 
-        detail::free_hostname_comm(nc);
     } else {
         throw std::runtime_error{ fmt::format("Unknown MPI sampling mode {}!", static_cast<int>(mode)) };
     }

From 8d7011ea50bee9c8c39b768131922c2c241dd922 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Tue, 23 Jun 2026 09:13:06 +0200
Subject: [PATCH 29/31] refactor MPI related functions

---
 CMakeLists.txt                          |   2 +
 include/hws/gpu_amd/utility.hpp         |  18 ++
 include/hws/gpu_intel/utility.hpp       |  15 ++
 include/hws/gpu_nvidia/utility.hpp      |  17 ++
 include/hws/mpi_sampling_mode.hpp       |  33 ++++
 include/hws/mpi_utility.hpp             |  65 +++++++
 include/hws/system_hardware_sampler.hpp |   5 +-
 include/hws/utility.hpp                 | 218 ------------------------
 include/hws/visible_gpu_device.hpp      |  37 ++++
 src/hws/gpu_amd/utility.cpp             |   2 +-
 src/hws/gpu_intel/utility.cpp           |   2 +-
 src/hws/gpu_nvidia/utility.cpp          |   2 +-
 src/hws/hardware_sampler.cpp            |   2 +-
 src/hws/mpi_utility.cpp                 | 210 +++++++++++++++++++++++
 src/hws/system_hardware_sampler.cpp     |   2 +-
 src/hws/utility.cpp                     |  66 -------
 16 files changed, 405 insertions(+), 291 deletions(-)
 create mode 100644 include/hws/mpi_sampling_mode.hpp
 create mode 100644 include/hws/mpi_utility.hpp
 create mode 100644 include/hws/visible_gpu_device.hpp
 create mode 100644 src/hws/mpi_utility.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb049c2..6ab2b4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,6 +177,8 @@ if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT)
 
         # Expose that MPI is really enabled for the Python bindings (and potentially other submodules) via a cache variable.
         set(HWS_MPI_SUPPORT_ACTIVE TRUE CACHE BOOL "MPI support enabled in core library" FORCE)
+
+        target_sources(${HWS_LIBRARY_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/hws/mpi_utility.cpp)
     endif ()
 endif ()
 
diff --git a/include/hws/gpu_amd/utility.hpp b/include/hws/gpu_amd/utility.hpp
index 716ff8c..def0937 100644
--- a/include/hws/gpu_amd/utility.hpp
+++ b/include/hws/gpu_amd/utility.hpp
@@ -18,6 +18,12 @@
 #include <stdexcept>  // std::runtime_error
 #include <string>     // std::string
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device
+
+    #include <vector>  // std::vector
+#endif
+
 namespace hws::detail {
 
 /**
@@ -68,6 +74,18 @@ namespace hws::detail {
  */
 [[nodiscard]] std::string performance_level_to_string(rsmi_dev_perf_level_t perf_level);
 
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+/**
+ * @brief creates a list of all visible AMD GPU devices
+ *
+ * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID
+ */
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_amd_devices();
+
+#endif
+
 }  // namespace hws::detail
 
 #endif  // HWS_GPU_AMD_UTILITY_HPP_
diff --git a/include/hws/gpu_intel/utility.hpp b/include/hws/gpu_intel/utility.hpp
index 76e15a1..6e7afe7 100644
--- a/include/hws/gpu_intel/utility.hpp
+++ b/include/hws/gpu_intel/utility.hpp
@@ -21,6 +21,10 @@
 #include <string_view>  // std::string_view
 #include <vector>       // std::vector
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device
+#endif
+
 namespace hws::detail {
 
 /**
@@ -75,6 +79,17 @@ namespace hws::detail {
  */
 [[nodiscard]] std::string memory_location_to_name(zes_mem_loc_t mem_loc);
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+/**
+ * @brief creates a list of all visible Intel GPU devices
+ *
+ * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID
+ */
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_intel_devices();
+
+#endif
+
 }  // namespace hws::detail
 
 #endif  // HWS_GPU_INTEL_UTILITY_HPP_
diff --git a/include/hws/gpu_nvidia/utility.hpp b/include/hws/gpu_nvidia/utility.hpp
index 348f74b..b0b3811 100644
--- a/include/hws/gpu_nvidia/utility.hpp
+++ b/include/hws/gpu_nvidia/utility.hpp
@@ -19,6 +19,12 @@
 #include <stdexcept>  // std::runtime_error
 #include <string>     // std::string
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device
+
+    #include <vector>  // std::vector
+#endif
+
 namespace hws::detail {
 
 /**
@@ -63,6 +69,17 @@ namespace hws::detail {
 
 #endif
 
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+/**
+ * @brief creates a list of all visible NVIDIA GPU devices
+ *
+ * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID
+ */
+[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_nvidia_devices();
+
+#endif
+
 }  // namespace hws::detail
 
 #endif  // HWS_GPU_NVIDIA_UTILITY_HPP_
diff --git a/include/hws/mpi_sampling_mode.hpp b/include/hws/mpi_sampling_mode.hpp
new file mode 100644
index 0000000..b53cfeb
--- /dev/null
+++ b/include/hws/mpi_sampling_mode.hpp
@@ -0,0 +1,33 @@
+/**
+ * @file
+ * @author Tim Thüring
+ * @copyright 2024-today All Rights Reserved
+ * @license This file is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the MPI sampling mode.
+ */
+
+#ifndef HWS_MPI_SAMPLING_MODE_HPP_
+#define HWS_MPI_SAMPLING_MODE_HPP_
+#pragma once
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+namespace hws::detail {
+
+/**
+ * @brief The mode to use for MPI sampling.
+ * @details per_rank: each rank creates hardware samplers for all devices visible to that rank.
+ *          whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device.
+ */
+enum class mpi_sampling_mode {
+    per_rank,
+    whole_node
+};
+
+}  // namespace hws::detail
+
+#endif  // HWS_MPI_SUPPORT_ENABLED
+
+#endif  // HWS_MPI_SAMPLING_MODE_HPP_
diff --git a/include/hws/mpi_utility.hpp b/include/hws/mpi_utility.hpp
new file mode 100644
index 0000000..673c2da
--- /dev/null
+++ b/include/hws/mpi_utility.hpp
@@ -0,0 +1,65 @@
+/**
+ * @file
+ * @author Tim Thüring
+ * @copyright 2024-today All Rights Reserved
+ * @license This file is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief MPI utility functions for hardware sampling.
+ */
+
+#ifndef HWS_MPI_UTILITY_HPP_
+#define HWS_MPI_UTILITY_HPP_
+#pragma once
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device
+
+    #include <mpi.h>   // MPI_Comm, MPI_COMM_NULL
+    #include <string>  // std::string
+    #include <vector>  // std::vector
+
+namespace hws::detail {
+
+/**
+ * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0.
+ *
+ * @param[in] local_yaml the YAML string contributed by the calling rank
+ * @param[in] communicator the MPI communicator whose ranks take part in the gather
+ * @throws std::runtime_error if MPI has not been initialized
+ * @return on rank 0 of @p communicator: the strings of all ranks in rank order, each followed by a newline; an empty string on all other ranks
+ */
+[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator);
+
+/**
+ * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling: created by splitting a communicator by processor name, freed on destruction.
+ */
+struct hostname_comm_info {
+    MPI_Comm node_comm = MPI_COMM_NULL;  // communicator of all ranks that reported the same processor name as the calling rank
+    int node_rank = 0;                   // rank of the calling process within node_comm
+    int node_size = 1;                   // number of ranks in node_comm
+
+    explicit hostname_comm_info(MPI_Comm comm);  // uses collective MPI calls on comm, i.e., has to be called by all ranks of comm
+
+    hostname_comm_info(const hostname_comm_info &) = delete;             // non-copyable: the destructor frees node_comm
+    hostname_comm_info &operator=(const hostname_comm_info &) = delete;  // non-copyable: the destructor frees node_comm
+
+    ~hostname_comm_info();  // frees node_comm unless it is MPI_COMM_NULL; NOTE(review): must therefore run before MPI_Finalize - confirm at the call sites
+};
+
+/**
+ * @brief Computes the devices that have to be sampled by the calling rank such that each device is sampled by
+ *        exactly one rank: a device is owned by the lowest rank in @p node_comm that reports its physical ID.
+ *
+ * @param[in] local_devices the devices visible to the calling rank, each containing a local index and a physical ID (devices with an empty physical ID are never assigned to any rank)
+ * @param[in] node_comm a node local MPI communicator
+ * @return the local indices of all devices that have to be sampled by the calling rank
+ */
+[[nodiscard]] std::vector<int> owned_local_indices_for_backend(const std::vector<visible_gpu_device> &local_devices, MPI_Comm node_comm);
+
+}  // namespace hws::detail
+
+#endif  // HWS_MPI_SUPPORT_ENABLED
+
+#endif  // HWS_MPI_UTILITY_HPP_
diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp
index ba160a2..21b3029 100644
--- a/include/hws/system_hardware_sampler.hpp
+++ b/include/hws/system_hardware_sampler.hpp
@@ -14,7 +14,7 @@
 #include "hws/event.hpp"             // hws::event
 #include "hws/hardware_sampler.hpp"  // hws::hardware_sampler
 #include "hws/sample_category.hpp"   // hws::sample_category
-#include "hws/utility.hpp"           // hws::detail::mpi_sampling_mode
+#include "hws/utility.hpp"           // hws::detail::indent_lines
 
 #include <chrono>      // std::chrono::{milliseconds, steady_clock::time_point}
 #include <cstddef>     // std::size_t
@@ -24,7 +24,8 @@
 #include <vector>      // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-    #include <mpi.h>  // MPI_Comm
+    #include "hws/mpi_sampling_mode.hpp"  // hws::detail::mpi_sampling_mode
+    #include <mpi.h>                      // MPI_Comm
 #endif
 
 namespace hws {
diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp
index b2bc77c..2737418 100644
--- a/include/hws/utility.hpp
+++ b/include/hws/utility.hpp
@@ -25,13 +25,8 @@
 #include <string_view>    // std::string_view
 #include <system_error>   // std::errc
 #include <type_traits>    // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type
-#include <unordered_map>  // std::unordered_map
 #include <vector>         // std::vector
 
-#if defined(HWS_MPI_SUPPORT_ENABLED)
-    #include <mpi.h>  // MPI_Comm
-#endif
-
 namespace hws::detail {
 
 /**
@@ -325,219 +320,6 @@ template <typename T>
     }
 }
 
-#if defined(HWS_MPI_SUPPORT_ENABLED)
-/**
- * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0.
- *
- * @param[in] local_yaml the local YAML string contribution
- * @param[in] communicator the MPI communicator
- *
- * @return concatenated YAML string on rank 0, empty string on all other ranks
- */
-[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator);
-
-/**
- * @brief The mode to use for MPI sampling.
- * per_rank: each rank creates hardware samplers for all devices visible to that rank
- * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device
- */
-enum class mpi_sampling_mode {
-    per_rank,
-    whole_node
-};
-
-/**
- * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling.
- */
-struct hostname_comm_info {
-    MPI_Comm node_comm = MPI_COMM_NULL;
-    int node_rank = 0;
-    int node_size = 1;
-
-    explicit hostname_comm_info(MPI_Comm comm) {
-        int world_rank = 0;
-        int world_size = 0;
-        MPI_Comm_rank(comm, &world_rank);
-        MPI_Comm_size(comm, &world_size);
-
-        // Gather all hostnames
-        char name[MPI_MAX_PROCESSOR_NAME];
-        int name_len = 0;
-        MPI_Get_processor_name(name, &name_len);
-
-        std::vector<int> name_lengths(world_size);
-        MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm);
-
-        // Build displacements and total byte count
-        std::vector<int> displs(world_size);
-        int total = 0;
-        for (int i = 0; i < world_size; ++i) {
-            displs[i] = total;
-            total += name_lengths[i];
-        }
-
-        std::vector<char> all_names(total);
-        MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm);
-
-        // Assign colors locally on every rank
-        //
-        // All ranks hold identical copies of all_names, name_lengths, and displs,
-        // so they can each compute the same deterministic color map independently.
-
-        std::unordered_map<std::string_view, int> host_to_color;
-        host_to_color.reserve(world_size);
-        std::vector<int> colors(world_size);
-        int next_color = 0;
-        for (int r = 0; r < world_size; ++r) {
-            // get host name of rank r
-            std::string_view host(&all_names[displs[r]], static_cast<std::size_t>(name_lengths[r]));
-
-            // try to insert it into the host_to_color map
-            auto [it, inserted] = host_to_color.emplace(host, next_color);
-
-            // check if host was new, if yes, increment color
-            if (inserted) {
-                ++next_color;
-            }
-            // save color of current rank, either from newly created or existing entry
-            colors[r] = it->second;
-        }
-
-        // Split communicator
-        MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm);
-        MPI_Comm_rank(node_comm, &node_rank);
-        MPI_Comm_size(node_comm, &node_size);
-    }
-
-    hostname_comm_info(const hostname_comm_info &) = delete;
-    hostname_comm_info &operator=(const hostname_comm_info &) = delete;
-
-    ~hostname_comm_info() {
-        if (node_comm != MPI_COMM_NULL) {
-            MPI_Comm_free(&node_comm);
-        }
-    }
-};
-
-enum class device_backend_kind {
-    nvidia,
-    amd,
-    intel
-};
-
-struct visible_gpu_device {
-    device_backend_kind backend;
-    int local_index;          // device index for that backend on this rank
-    std::string physical_id;  // stable per-node identifier
-};
-
-    #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
-/**
- * @brief creates a list of all visible NVIDIA GPU devices
- *
- * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID
- */
-[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_nvidia_devices();
-    #endif
-
-    #if defined(HWS_FOR_AMD_GPUS_ENABLED)
-/**
- * @brief creates a list of all visible AMD GPU devices
- *
- * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID
- */
-[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_amd_devices();
-    #endif
-
-    #if defined(HWS_FOR_INTEL_GPUS_ENABLED)
-/**
- * @brief creates a list of all visible Intel GPU devices
- *
- * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID
- */
-[[nodiscard]] std::vector<visible_gpu_device> enumerate_local_intel_devices();
-    #endif
-
-/**
- * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that
- * each device is sampled by exactly one rank.
- *
- * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID
- * @param node_comm a node local MPI communicator
- * @return all device indices that have to be sampled by this rank
- */
-inline std::vector<int> owned_local_indices_for_backend(const std::vector<visible_gpu_device> &local_devices, MPI_Comm node_comm) {
-    int node_rank = 0;
-    int node_size = 0;
-    MPI_Comm_rank(node_comm, &node_rank);
-    MPI_Comm_size(node_comm, &node_size);
-
-    // Pack physical IDs into a newline-separated string
-    std::string packed;
-    for (const auto &d : local_devices) {
-        packed += d.physical_id;
-        packed += '\n';
-    }
-    const int local_size = static_cast<int>(packed.size());
-
-    // Allgather sizes
-    std::vector<int> sizes(node_size);
-    MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm);
-
-    // Displacements and total length
-    std::vector<int> displs(node_size);
-    int total = 0;
-    for (int r = 0; r < node_size; ++r) {
-        displs[r] = total;
-        total += sizes[r];
-    }
-
-    // Allgatherv packed physical IDs
-    std::vector<char> all_data(total);
-    MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm);
-
-    // Build owner map: physical_id -> first node_rank that reports it
-    std::unordered_map<std::string, int> owner_rank_for_id;
-    owner_rank_for_id.reserve(local_devices.size() * 2 + 1);
-
-    for (int r = 0; r < node_size; ++r) {
-        if (sizes[r] == 0) {
-            continue;
-        }
-
-        const char *base = all_data.data() + displs[r];
-        const int len = sizes[r];
-
-        int line_start = 0;
-        while (line_start < len) {
-            int line_end = line_start;
-            while (line_end < len && base[line_end] != '\n') {
-                ++line_end;
-            }
-            if (line_end > line_start) {
-                const std::string id(base + line_start, base + line_end);  // copy just this ID
-                owner_rank_for_id.emplace(id, r);                          // first insertion wins
-            }
-            line_start = line_end + 1;
-        }
-    }
-
-    // Decide which local indices we own: those whose physical_id is mapped to node_rank
-    std::vector<int> owned_indices;
-    owned_indices.reserve(local_devices.size());
-
-    for (const auto &d : local_devices) {
-        auto it = owner_rank_for_id.find(d.physical_id);
-        if (it != owner_rank_for_id.end() && it->second == node_rank) {
-            owned_indices.push_back(d.local_index);
-        }
-    }
-
-    return owned_indices;
-}
-
-#endif
-
 }  // namespace hws::detail
 
 #endif  // HWS_UTILITY_HPP_
diff --git a/include/hws/visible_gpu_device.hpp b/include/hws/visible_gpu_device.hpp
new file mode 100644
index 0000000..787776a
--- /dev/null
+++ b/include/hws/visible_gpu_device.hpp
@@ -0,0 +1,37 @@
+/**
+ * @file
+ * @author Tim Thüring
+ * @copyright 2024-today All Rights Reserved
+ * @license This file is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines types for identifying visible GPU devices.
+ */
+
+#ifndef HWS_VISIBLE_GPU_DEVICE_HPP_
+#define HWS_VISIBLE_GPU_DEVICE_HPP_
+#pragma once
+
+#if defined(HWS_MPI_SUPPORT_ENABLED)
+
+    #include <string>  // std::string
+
+namespace hws::detail {
+
+enum class device_backend_kind {
+    nvidia,
+    amd,
+    intel
+};
+
+struct visible_gpu_device {
+    device_backend_kind backend;
+    int local_index;          // device index for that backend on this rank
+    std::string physical_id;  // stable per-node identifier
+};
+
+}  // namespace hws::detail
+
+#endif  // HWS_MPI_SUPPORT_ENABLED
+
+#endif  // HWS_VISIBLE_GPU_DEVICE_HPP_
diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp
index 614a7c7..55d6932 100644
--- a/src/hws/gpu_amd/utility.cpp
+++ b/src/hws/gpu_amd/utility.cpp
@@ -13,7 +13,7 @@
 #include <vector>  // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED)
-    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
 
     #include "hip/hip_runtime_api.h"  // hipGetDeviceCount, hipDeviceGetPCIBusId
 #endif
diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp
index 0b5f0e9..6d82160 100644
--- a/src/hws/gpu_intel/utility.cpp
+++ b/src/hws/gpu_intel/utility.cpp
@@ -21,7 +21,7 @@
 #include <vector>       // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED)
-    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
 #endif
 
 namespace hws::detail {
diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp
index 437699d..97d8c1e 100644
--- a/src/hws/gpu_nvidia/utility.cpp
+++ b/src/hws/gpu_nvidia/utility.cpp
@@ -15,7 +15,7 @@
 #include <vector>  // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED)
-    #include "hws/utility.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
+    #include "hws/visible_gpu_device.hpp"  // hws::detail::visible_gpu_device, hws::detail::device_backend_kind
 #endif
 
 namespace hws::detail {
diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp
index 9066920..3ebaf4c 100644
--- a/src/hws/hardware_sampler.cpp
+++ b/src/hws/hardware_sampler.cpp
@@ -25,7 +25,7 @@
 #include <utility>    // std::move
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-    #include <mpi.h>  // MPI_Comm
+    #include "hws/mpi_utility.hpp"  // hws::detail::gather_yaml_strings_mpi
 #endif
 
 namespace hws {
diff --git a/src/hws/mpi_utility.cpp b/src/hws/mpi_utility.cpp
new file mode 100644
index 0000000..4d8b3af
--- /dev/null
+++ b/src/hws/mpi_utility.cpp
@@ -0,0 +1,210 @@
+/**
+ * @author Tim Thüring
+ * @copyright 2024-today All Rights Reserved
+ * @license This file is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ */
+
+#include "hws/mpi_utility.hpp"
+
+#include <cstddef>        // std::size_t
+#include <stdexcept>      // std::runtime_error
+#include <string>         // std::string
+#include <string_view>    // std::string_view
+#include <unordered_map>  // std::unordered_map
+#include <vector>         // std::vector
+
+namespace hws::detail {
+
+hostname_comm_info::hostname_comm_info(MPI_Comm comm) {
+    int world_rank = 0;  // rank of the calling process in comm
+    int world_size = 0;  // number of ranks in comm
+    MPI_Comm_rank(comm, &world_rank);
+    MPI_Comm_size(comm, &world_size);
+
+    // Query the processor name of this rank and gather the name lengths of all ranks
+    char name[MPI_MAX_PROCESSOR_NAME];
+    int name_len = 0;
+    MPI_Get_processor_name(name, &name_len);
+
+    std::vector<int> name_lengths(world_size);
+    MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm);
+
+    // Build displacements and total byte count (the names are packed back to back without separators)
+    std::vector<int> displs(world_size);
+    int total = 0;
+    for (int i = 0; i < world_size; ++i) {
+        displs[i] = total;
+        total += name_lengths[i];
+    }
+
+    std::vector<char> all_names(total);
+    MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm);
+
+    // Assign colors locally on every rank
+    //
+    // All ranks hold identical copies of all_names, name_lengths, and displs,
+    // so they can each compute the same deterministic color map independently.
+
+    std::unordered_map<std::string_view, int> host_to_color;  // the keys are views into all_names
+    host_to_color.reserve(world_size);
+    std::vector<int> colors(world_size);
+    int next_color = 0;
+    for (int r = 0; r < world_size; ++r) {
+        // get host name of rank r
+        std::string_view host(&all_names[displs[r]], static_cast<std::size_t>(name_lengths[r]));
+
+        // try to insert it into the host_to_color map
+        auto [it, inserted] = host_to_color.emplace(host, next_color);
+
+        // check if host was new, if yes, increment color
+        if (inserted) {
+            ++next_color;
+        }
+        // save color of current rank, either from newly created or existing entry
+        colors[r] = it->second;
+    }
+
+    // Split communicator: ranks with the same color (host name) share a node_comm; the key world_rank keeps their relative order from comm
+    MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm);
+    MPI_Comm_rank(node_comm, &node_rank);
+    MPI_Comm_size(node_comm, &node_size);
+}
+
+hostname_comm_info::~hostname_comm_info() {
+    if (node_comm != MPI_COMM_NULL) {  // only free a communicator that was actually created
+        MPI_Comm_free(&node_comm);
+    }
+}
+
+std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) {
+    int initialized = 0;
+    MPI_Initialized(&initialized);
+
+    if (!initialized) {
+        throw std::runtime_error{"MPI must already be initialized"};
+    }
+
+    // rank and size within communicator; rank 0 acts as the root that assembles the result
+    int rank = 0;
+    int world_size = 0;
+    MPI_Comm_rank(communicator, &rank);
+    MPI_Comm_size(communicator, &world_size);
+
+    // gather the size of the yaml string from each rank
+    const int local_size = static_cast<int>(local_yaml.size());  // NOTE(review): narrows std::size_t to int; assumes the string is smaller than INT_MAX bytes
+
+    std::vector<int> recv_sizes;  // only filled on rank 0; the receive arguments of MPI_Gather(v) are only significant at the root
+
+    if (rank == 0) {
+        recv_sizes.resize(world_size);
+    }
+
+    MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator);
+
+    // compute the displacements from the rank string sizes
+    std::vector<int> displacements;
+    int total_size = 0;
+
+    if (rank == 0) {
+        displacements.resize(world_size);
+
+        for (int i = 0; i < world_size; ++i) {
+            displacements[i] = total_size;
+            total_size += recv_sizes[i];
+        }
+    }
+
+    // gather the local yaml strings from all ranks
+    std::vector<char> recv_buffer;
+
+    if (rank == 0) {
+        recv_buffer.resize(total_size);
+    }
+
+    MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator);
+
+    // build final yaml string on rank 0 (the contribution of each rank is followed by a newline)
+    std::string global_yaml;
+
+    if (rank == 0) {
+        for (int r = 0; r < world_size; ++r) {
+            global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]);
+            global_yaml += '\n';
+        }
+    }
+
+    return global_yaml;
+}
+
+std::vector<int> owned_local_indices_for_backend(const std::vector<visible_gpu_device> &local_devices, MPI_Comm node_comm) {
+    int node_rank = 0;
+    int node_size = 0;
+    MPI_Comm_rank(node_comm, &node_rank);
+    MPI_Comm_size(node_comm, &node_size);
+
+    // Pack physical IDs into a newline-separated string (every ID, including the last one, is followed by '\n')
+    std::string packed;
+    for (const auto &d : local_devices) {
+        packed += d.physical_id;
+        packed += '\n';
+    }
+    const int local_size = static_cast<int>(packed.size());
+
+    // Allgather sizes
+    std::vector<int> sizes(node_size);
+    MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm);
+
+    // Displacements and total length
+    std::vector<int> displs(node_size);
+    int total = 0;
+    for (int r = 0; r < node_size; ++r) {
+        displs[r] = total;
+        total += sizes[r];
+    }
+
+    // Allgatherv packed physical IDs
+    std::vector<char> all_data(total);
+    MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm);
+
+    // Build owner map: physical_id -> first node_rank that reports it (ranks are visited in ascending order, so the lowest rank wins)
+    std::unordered_map<std::string, int> owner_rank_for_id;
+    owner_rank_for_id.reserve(local_devices.size() * 2 + 1);  // capacity hint only; other ranks may report additional IDs
+
+    for (int r = 0; r < node_size; ++r) {
+        if (sizes[r] == 0) {
+            continue;
+        }
+
+        const char *base = all_data.data() + displs[r];
+        const int len = sizes[r];
+
+        int line_start = 0;
+        while (line_start < len) {
+            int line_end = line_start;
+            while (line_end < len && base[line_end] != '\n') {
+                ++line_end;
+            }
+            if (line_end > line_start) {  // empty IDs are skipped and therefore never get an owner
+                const std::string id(base + line_start, base + line_end);  // copy just this ID
+                owner_rank_for_id.emplace(id, r);                          // first insertion wins
+            }
+            line_start = line_end + 1;
+        }
+    }
+
+    // Decide which local indices we own: those whose physical_id is mapped to node_rank
+    std::vector<int> owned_indices;
+    owned_indices.reserve(local_devices.size());
+
+    for (const auto &d : local_devices) {
+        auto it = owner_rank_for_id.find(d.physical_id);
+        if (it != owner_rank_for_id.end() && it->second == node_rank) {
+            owned_indices.push_back(d.local_index);
+        }
+    }
+
+    return owned_indices;
+}
+
+}  // namespace hws::detail
diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp
index 5e30dee..4dff468 100644
--- a/src/hws/system_hardware_sampler.cpp
+++ b/src/hws/system_hardware_sampler.cpp
@@ -42,7 +42,7 @@
 #include <vector>     // std::vector
 
 #if defined(HWS_MPI_SUPPORT_ENABLED)
-    #include <mpi.h>  // MPI_Comm
+    #include "hws/mpi_utility.hpp"  // hws::detail::hostname_comm_info, hws::detail::owned_local_indices_for_backend
 #endif
 
 namespace hws {
diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp
index f593521..406089a 100644
--- a/src/hws/utility.cpp
+++ b/src/hws/utility.cpp
@@ -14,10 +14,6 @@
 #include <string_view>  // std::string_view
 #include <vector>       // std::vector
 
-#if defined(HWS_MPI_SUPPORT_ENABLED)
-    #include <mpi.h>  // MPI_Comm, MPI_Gatherv, MPI_Gather, MPI_Initialized, MPI_Comm_rank, MPI_Comm_size
-#endif
-
 namespace hws::detail {
 
 bool starts_with(const std::string_view sv, const std::string_view start) noexcept {
@@ -81,66 +77,4 @@ std::string indent_lines(const std::string &text, const std::string_view prefix)
     return out;
 }
 
-#if defined(HWS_MPI_SUPPORT_ENABLED)
-std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) {
-    int initialized = 0;
-    MPI_Initialized(&initialized);
-
-    if (!initialized) {
-        throw std::runtime_error{"MPI must already be initialized"};
-    }
-
-    // MPI rank and world size for identification and communication
-    int rank = 0;
-    int world_size = 0;
-    MPI_Comm_rank(communicator, &rank);
-    MPI_Comm_size(communicator, &world_size);
-
-    // gather the size of the yaml string from each rank
-    const int local_size = static_cast<int>(local_yaml.size());
-
-    std::vector<int> recv_sizes;
-
-    if (rank == 0) {
-        recv_sizes.resize(world_size);
-    }
-
-    MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator);
-
-    // compute the displacements from the rank string sizes
-    std::vector<int> displacements;
-    int total_size = 0;
-
-    if (rank == 0) {
-        displacements.resize(world_size);
-
-        for (int i = 0; i < world_size; ++i) {
-            displacements[i] = total_size;
-            total_size += recv_sizes[i];
-        }
-    }
-
-    // gather the local yaml strings from all ranks
-    std::vector<char> recv_buffer;
-
-    if (rank == 0) {
-        recv_buffer.resize(total_size);
-    }
-
-    MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator);
-
-    // build final yaml string on rank 0
-    std::string global_yaml;
-
-    if (rank == 0) {
-        for (int r = 0; r < world_size; ++r) {
-            global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]);
-            global_yaml += '\n';
-        }
-    }
-
-    return global_yaml;
-}
-#endif
-
 }  // namespace hws::detail

From 9701c70ad840cc6bbf98e1a75a2858f8992dbda5 Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:32:51 +0200
Subject: [PATCH 30/31] add documentation in visible_gpu_device header

---
 include/hws/visible_gpu_device.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/hws/visible_gpu_device.hpp b/include/hws/visible_gpu_device.hpp
index 787776a..7b6d581 100644
--- a/include/hws/visible_gpu_device.hpp
+++ b/include/hws/visible_gpu_device.hpp
@@ -18,12 +18,20 @@
 
 namespace hws::detail {
 
+/**
+ * @brief Enum class representing the backend kind of visible GPU device.
+ * @details The backend kind can be NVIDIA, AMD, or Intel.
+ */
 enum class device_backend_kind {
     nvidia,
     amd,
     intel
 };
 
+/**
+ * @brief Represents a visible GPU device on the local rank.
+ * @details Contains the backend kind, the local index of the device for that backend on this rank, and a stable per-node identifier (physical ID) for the device.
+ */
 struct visible_gpu_device {
     device_backend_kind backend;
     int local_index;          // device index for that backend on this rank

From 62c31dfd82dac1c622a577a8b59b482360fb43fe Mon Sep 17 00:00:00 2001
From: TimThuering <56958925+TimThuering@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:14:18 +0200
Subject: [PATCH 31/31] update CMake error message if mpi4py causes problems

---
 bindings/CMakeLists.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
index e344dec..89357b6 100644
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@@ -79,14 +79,20 @@ if(HWS_MPI_SUPPORT_ACTIVE)
         message(FATAL_ERROR
                 "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) "
                 "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. "
-                "Install mpi4py in this environment or disable python bindings.")
+                "To fix this, either:\n"
+                "  1. Install (or reinstall) mpi4py in this environment\n"
+                "  2. Disable Python bindings:               -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n"
+                "  3. Disable MPI support:                   -DHWS_ENABLE_MPI_SUPPORT=OFF")
     endif()
 
     if(NOT EXISTS "${HWS_MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h")
         message(FATAL_ERROR
                 "mpi4py include path '${HWS_MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. "
                 "The mpi4py installation appears to be broken. "
-                "Reinstall mpi4py in this environment or disable python bindings.")
+                "To fix this, either:\n"
+                "  1. Reinstall mpi4py in this environment  \n"
+                "  2. Disable Python bindings:               -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n"
+                "  3. Disable MPI support:                   -DHWS_ENABLE_MPI_SUPPORT=OFF")
     endif()
 
     execute_process(