diff --git a/CMakeLists.txt b/CMakeLists.txt index cf96cc6..6ab2b4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ add_library(${HWS_LIBRARY_NAME} SHARED ${HWS_SOURCES}) add_library(hws::hws ALIAS ${HWS_LIBRARY_NAME}) # set install target -set(HWS_TARGETS_TO_INSTALL ) +set(HWS_TARGETS_TO_INSTALL) # use C++17 target_compile_features(${HWS_LIBRARY_NAME} PUBLIC cxx_std_17) @@ -147,6 +147,42 @@ else () endif () +#################################################################################################################### +## enable MPI support ## +#################################################################################################################### +set(HWS_ENABLE_MPI_SUPPORT AUTO CACHE STRING "Enable MPI support.") +set_property(CACHE HWS_ENABLE_MPI_SUPPORT PROPERTY STRINGS AUTO ON OFF) +# Default: assume MPI support inactive +set(HWS_MPI_SUPPORT_ACTIVE FALSE CACHE BOOL "MPI support enabled in core library") + +if (HWS_ENABLE_MPI_SUPPORT MATCHES "AUTO" OR HWS_ENABLE_MPI_SUPPORT) + # try finding MPI + find_package(MPI QUIET) + + # check if MPI could be found + if (NOT MPI_FOUND) + if (HWS_ENABLE_MPI_SUPPORT MATCHES "ON") + message(SEND_ERROR "Cannot find MPI but MPI support was explicitly requested!") + else () + message(STATUS "Cannot find MPI. MPI support disabled.") + endif () + else () + message(STATUS "Enable MPI support (${MPI_CXX_VERSION}).") + + # link against necessary libraries + target_link_libraries(${HWS_LIBRARY_NAME} PUBLIC MPI::MPI_CXX) + + # add compile definition + target_compile_definitions(${HWS_LIBRARY_NAME} PUBLIC HWS_MPI_SUPPORT_ENABLED) + + # Expose that MPI is really enabled for the Python bindings (and potentially other submodules) via a cache variable. + set(HWS_MPI_SUPPORT_ACTIVE TRUE CACHE BOOL "MPI support enabled in core library" FORCE) + + target_sources(${HWS_LIBRARY_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/hws/mpi_utility.cpp) + endif () +endif () + + #################################################################################################################### ## enable Python bindings ## #################################################################################################################### diff --git a/README.md b/README.md index 7846580..1bd62df 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,11 @@ The `[optional_options]` can be one or multiple of: - `HWS_SAMPLING_INTERVAL=100ms` (default: `100ms`): set the sampling interval in milliseconds - `HWS_ENABLE_PYTHON_BINDINGS=ON|OFF` (default: `ON`): enable Python bindings +- `HWS_ENABLE_MPI_SUPPORT=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check whether MPI is available and fail if this is not the case + - `AUTO`: check whether MPI is available but **do not** fail if this is not the case + - `OFF`: do not check whether MPI is available + ### Installing via CMake The library supports the `install` target: diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt index 93d8e98..89357b6 100644 --- a/bindings/CMakeLists.txt +++ b/bindings/CMakeLists.txt @@ -64,6 +64,55 @@ target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${CMAKE_C target_link_libraries(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_LIBRARY_NAME}) target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE PYBIND11_DETAILED_ERROR_MESSAGES) +if(HWS_MPI_SUPPORT_ACTIVE) + message(STATUS "MPI support enabled. Adding mpi4py include directory and linking against MPI.") + # Get mpi4py's C header location, simultaneously checking if mpi4py is importable in the current Python environment + execute_process( + COMMAND "${Python_EXECUTABLE}" -c + "import mpi4py, sys; sys.stdout.write(mpi4py.get_include())" + RESULT_VARIABLE HWS_MPI4PY_IMPORT_RESULT + OUTPUT_VARIABLE HWS_MPI4PY_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if(HWS_MPI4PY_IMPORT_RESULT) + message(FATAL_ERROR + "MPI support is enabled in hws (HWS_ENABLE_MPI_SUPPORT=AUTO/ON and MPI_FOUND) " + "but mpi4py is not importable in Python_EXECUTABLE='${Python_EXECUTABLE}'. " + "To fix this, either:\n" + " 1. Reinstall mpi4py in this environment \n" + " 2. Disable Python bindings: -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n" + " 3. Disable MPI support: -DHWS_ENABLE_MPI_SUPPORT=OFF") + endif() + + if(NOT EXISTS "${HWS_MPI4PY_INCLUDE_DIR}/mpi4py/mpi4py.h") + message(FATAL_ERROR + "mpi4py include path '${HWS_MPI4PY_INCLUDE_DIR}' does not contain mpi4py/mpi4py.h. " + "The mpi4py installation appears to be broken. " + "To fix this, either:\n" + " 1. Reinstall mpi4py in this environment \n" + " 2. Disable Python bindings: -DHWS_ENABLE_PYTHON_BINDINGS=OFF\n" + " 3. Disable MPI support: -DHWS_ENABLE_MPI_SUPPORT=OFF") + endif() + + execute_process( + COMMAND "${Python_EXECUTABLE}" -c + "import mpi4py, sys; sys.stdout.write(mpi4py.__version__)" + OUTPUT_VARIABLE HWS_MPI4PY_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(HWS_MPI4PY_VERSION VERSION_LESS "4.0") + message(FATAL_ERROR + "mpi4py>=4.0 is required but found ${HWS_MPI4PY_VERSION} in Python_EXECUTABLE='${Python_EXECUTABLE}'. " + "Upgrade mpi4py or disable python bindings.") + endif() + + target_include_directories(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE ${HWS_MPI4PY_INCLUDE_DIR}) + + # Propagate the same macro used on the C++ side into the Python module + target_compile_definitions(${HWS_PYTHON_BINDINGS_LIBRARY_NAME} PRIVATE HWS_MPI_SUPPORT_ENABLED) +endif() + include(GNUInstallDirs) # install Python bindings install(TARGETS ${HWS_PYTHON_BINDINGS_LIBRARY_NAME} diff --git a/bindings/hardware_sampler.cpp b/bindings/hardware_sampler.cpp index 5a12141..f29c4a3 100644 --- a/bindings/hardware_sampler.cpp +++ b/bindings/hardware_sampler.cpp @@ -31,6 +31,11 @@ #include "relative_event.hpp" // hws::detail::relative_event #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "mpi4py_communicator.hpp" + #include +#endif + namespace py = pybind11; void init_hardware_sampler(py::module_ &m) { @@ -62,6 +67,11 @@ void init_hardware_sampler(py::module_ &m) { .def("relative_time_points", [](const hws::hardware_sampler &self) { return hws::detail::durations_from_reference_time(self.sampling_time_points(), self.get_event(0).time_point); }, "get the relative durations of the respective hardware samples in seconds (as \"normal\" number)") .def("sampling_interval", &hws::hardware_sampler::sampling_interval, "get the sampling interval of this hardware sampler (in ms)") .def("dump_yaml", py::overload_cast(&hws::hardware_sampler::dump_yaml, py::const_), "dump all hardware samples to the given YAML file") +#if defined(HWS_MPI_SUPPORT_ENABLED) + .def("dump_yaml_global", [](const hws::hardware_sampler &self, const std::string &filename, py::object py_comm) { + const MPI_Comm comm = mpi_comm_from_python(py_comm); + self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the given YAML file using the provided mpi4py communicator.") +#endif .def("as_yaml_string", &hws::hardware_sampler::as_yaml_string, "return all hardware samples including additional information like events as YAML string") .def("samples_only_as_yaml_string", &hws::hardware_sampler::samples_only_as_yaml_string, "return all hardware samples as YAML string") .def("__repr__", [](const hws::hardware_sampler &self) { diff --git a/bindings/main.cpp b/bindings/main.cpp index 3f062e7..932a897 100644 --- a/bindings/main.cpp +++ b/bindings/main.cpp @@ -5,12 +5,18 @@ * See the LICENSE.md file in the project root for full license information. */ -#include "hws/version.hpp" // hws::version::version +#include "hws/version.hpp" // hws::version::version #include "pybind11/pybind11.h" // PYBIND11_MODULE, py::module_ #include // std::string_view +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "mpi4py_communicator.hpp" + #include + #include +#endif + #define HWS_IS_DEFINED_HELPER(x) #x #define HWS_IS_DEFINED(x) (std::string_view{ #x } != std::string_view{ HWS_IS_DEFINED_HELPER(x) }) @@ -32,6 +38,15 @@ PYBIND11_MODULE(HardwareSampling, m) { m.doc() = "Hardware Sampling for CPUs and GPUs"; m.attr("__version__") = hws::version::version; + // MPI support +#if defined(HWS_MPI_SUPPORT_ENABLED) + // Initialize mpi4py C-API so PyMPIComm_* are usable + if (import_mpi4py() < 0) { + throw py::error_already_set(); + } +#endif + m.def("has_mpi_support", []() { return HWS_IS_DEFINED(HWS_MPI_SUPPORT_ENABLED); }); + init_event(m); init_sample_category(m); init_relative_event(m); @@ -64,3 +79,25 @@ PYBIND11_MODULE(HardwareSampling, m) { init_version(m); } + +#if defined(HWS_MPI_SUPPORT_ENABLED) +/** + * Extracts an MPI_Comm from a python mpi4py.MPI.Comm object. + * Has to be in same translation unit as the import_mpi4py() call to ensure that the mpi4py C-API is initialized and the PyMPIComm_Type is available. + * + * @param py_comm a Python object that is expected to be an mpi4py.MPI.Comm instance + * @return the extracted MPI_Comm + */ +MPI_Comm mpi_comm_from_python(py::object py_comm) { + if (!PyObject_TypeCheck(py_comm.ptr(), &PyMPIComm_Type)) { + throw std::runtime_error{"expected mpi4py.MPI.Comm as communicator argument"}; + } + + MPI_Comm *comm_ptr = PyMPIComm_Get(py_comm.ptr()); + if (comm_ptr == nullptr) { + throw std::runtime_error{"could not extract MPI_Comm from mpi4py communicator"}; + } + + return *comm_ptr; +} +#endif diff --git a/bindings/mpi4py_communicator.hpp b/bindings/mpi4py_communicator.hpp new file mode 100644 index 0000000..f325423 --- /dev/null +++ b/bindings/mpi4py_communicator.hpp @@ -0,0 +1,27 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Utility functions for transforming mpi4py communicators into C++ MPI communicators + */ + +#ifndef HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP +#define HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP +#pragma once + +#include "pybind11/pybind11.h" + +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include +#endif + +namespace py = pybind11; + +#if defined(HWS_MPI_SUPPORT_ENABLED) +MPI_Comm mpi_comm_from_python(py::object py_comm); +#endif + +#endif // HWS_BINDINGS_MPI4PY_COMMUNICATOR_HPP diff --git a/bindings/system_hardware_sampler.cpp b/bindings/system_hardware_sampler.cpp index d9af622..4941fdf 100644 --- a/bindings/system_hardware_sampler.cpp +++ b/bindings/system_hardware_sampler.cpp @@ -19,17 +19,71 @@ #include "relative_event.hpp" // hws::detail::relative_event #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "mpi4py_communicator.hpp" + #include +#endif + namespace py = pybind11; void init_system_hardware_sampler(py::module_ &m) { +#if defined(HWS_MPI_SUPPORT_ENABLED) + // bind mpi sampling mode enum + py::enum_(m, "MPISamplingMode") + .value("PER_RANK", hws::detail::mpi_sampling_mode::per_rank) + .value("WHOLE_NODE", hws::detail::mpi_sampling_mode::whole_node) + .export_values(); +#endif // bind the pure virtual hardware sampler base class py::class_(m, "SystemHardwareSampler") .def(py::init<>(), "construct a new system hardware sampler with the default sampling interval") .def(py::init(), "construct a new system hardware sampler with the default sampling interval sampling only the provided sample_category samples") .def(py::init(), "construct a new system hardware sampler for with the specified sampling interval") .def(py::init(), "construct a new system hardware sampler for with the specified sampling interval sampling only the provided sample_category samples") +#if defined(HWS_MPI_SUPPORT_ENABLED) + // MPI-aware constructors + + // (MPI_Comm, mode, category=all) + .def(py::init([](py::object py_comm, + hws::detail::mpi_sampling_mode mode, + hws::sample_category category) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + return std::make_unique(comm, mode, category); + }), + py::arg("comm"), + py::arg("mode"), + py::arg("category") = hws::sample_category::all, + "construct a new system hardware sampler with the default sampling interval and MPI support using the given mpi4py communicator and sampling mode") + + // (MPI_Comm, mode, sampling_interval, category=all) + .def(py::init([](py::object py_comm, + hws::detail::mpi_sampling_mode mode, + std::chrono::milliseconds sampling_interval, + hws::sample_category category) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + return std::make_unique(comm, mode, sampling_interval, category); + }), + py::arg("comm"), + py::arg("mode"), + py::arg("sampling_interval"), + py::arg("category") = hws::sample_category::all, + "construct a new system hardware sampler with the specified sampling interval and MPI support using the given mpi4py communicator and sampling mode") + + // Non-MPI overloads + .def("start", py::overload_cast<>(&hws::system_hardware_sampler::start_sampling), "start hardware sampling for all available hardware samplers") + .def("stop", py::overload_cast<>(&hws::system_hardware_sampler::stop_sampling), "stop hardware sampling for all available hardware samplers") + // MPI-aware overloads + .def("start", [](hws::system_hardware_sampler &self, py::object py_comm) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + self.start_sampling(comm); }, py::arg("comm"), "start hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator before starting") + .def("stop", [](hws::system_hardware_sampler &self, py::object py_comm) { + MPI_Comm comm = mpi_comm_from_python(py_comm); + self.stop_sampling(comm); }, py::arg("comm"), "stop hardware sampling for all available hardware samplers; executes an MPI barrier on the given communicator after stopping") +#else + // No MPI support: only the simple overloads exist, no ambiguity .def("start", &hws::system_hardware_sampler::start_sampling, "start hardware sampling for all available hardware samplers") .def("stop", &hws::system_hardware_sampler::stop_sampling, "stop hardware sampling for all available hardware samplers") +#endif .def("pause", &hws::system_hardware_sampler::pause_sampling, "pause hardware sampling for all available hardware samplers") .def("resume", &hws::system_hardware_sampler::resume_sampling, "resume hardware sampling for all available hardware samplers") .def("has_started", &hws::system_hardware_sampler::has_sampling_started, "check whether hardware sampling has already been started for all hardware samplers") @@ -67,5 +121,10 @@ void init_system_hardware_sampler(py::module_ &m) { .def("sampler", [](hws::system_hardware_sampler &self, const std::size_t idx) { return self.sampler(idx).get(); }, "get the i-th hardware sampler available for the whole system") .def("dump_yaml", py::overload_cast(&hws::system_hardware_sampler::dump_yaml, py::const_), "dump all hardware samples for all hardware samplers to the given YAML file") .def("as_yaml_string", &hws::system_hardware_sampler::as_yaml_string, "return all hardware samples for all hardware samplers as YAML string") +#if defined(HWS_MPI_SUPPORT_ENABLED) + .def("dump_yaml_global", [](const hws::system_hardware_sampler &self, const std::string &filename, py::object py_comm) { + const MPI_Comm comm = mpi_comm_from_python(py_comm); + self.dump_yaml_global(filename, comm); }, py::arg("filename"), py::arg("comm"), "Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the given YAML file using the provided mpi4py communicator.") +#endif .def("__repr__", [](const hws::system_hardware_sampler &self) { return fmt::format("", self.num_samplers()); }); } diff --git a/include/hws/gpu_amd/utility.hpp b/include/hws/gpu_amd/utility.hpp index 716ff8c..def0937 100644 --- a/include/hws/gpu_amd/utility.hpp +++ b/include/hws/gpu_amd/utility.hpp @@ -18,6 +18,12 @@ #include // std::runtime_error #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // std::vector +#endif + namespace hws::detail { /** @@ -68,6 +74,18 @@ namespace hws::detail { */ [[nodiscard]] std::string performance_level_to_string(rsmi_dev_perf_level_t perf_level); + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible AMD GPU devices + * + * @return a vector of all visible AMD GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_amd_devices(); + +#endif + } // namespace hws::detail #endif // HWS_GPU_AMD_UTILITY_HPP_ diff --git a/include/hws/gpu_intel/utility.hpp b/include/hws/gpu_intel/utility.hpp index 76e15a1..6e7afe7 100644 --- a/include/hws/gpu_intel/utility.hpp +++ b/include/hws/gpu_intel/utility.hpp @@ -21,6 +21,10 @@ #include // std::string_view #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device +#endif + namespace hws::detail { /** @@ -75,6 +79,17 @@ namespace hws::detail { */ [[nodiscard]] std::string memory_location_to_name(zes_mem_loc_t mem_loc); +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible Intel GPU devices + * + * @return a vector of all visible Intel GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_intel_devices(); + +#endif + } // namespace hws::detail #endif // HWS_GPU_INTEL_UTILITY_HPP_ diff --git a/include/hws/gpu_nvidia/utility.hpp b/include/hws/gpu_nvidia/utility.hpp index 348f74b..b0b3811 100644 --- a/include/hws/gpu_nvidia/utility.hpp +++ b/include/hws/gpu_nvidia/utility.hpp @@ -19,6 +19,12 @@ #include // std::runtime_error #include // std::string +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // std::vector +#endif + namespace hws::detail { /** @@ -63,6 +69,17 @@ namespace hws::detail { #endif +#if defined(HWS_MPI_SUPPORT_ENABLED) + +/** + * @brief creates a list of all visible NVIDIA GPU devices + * + * @return a vector of all visible NVIDIA GPU devices on the local node, each with its local index and physical ID + */ +[[nodiscard]] std::vector enumerate_local_nvidia_devices(); + +#endif + } // namespace hws::detail #endif // HWS_GPU_NVIDIA_UTILITY_HPP_ diff --git a/include/hws/hardware_sampler.hpp b/include/hws/hardware_sampler.hpp index 326eb7e..6c31a75 100644 --- a/include/hws/hardware_sampler.hpp +++ b/include/hws/hardware_sampler.hpp @@ -23,6 +23,10 @@ #include // std::thread #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include // MPI_Comm +#endif + namespace hws { /** @@ -162,6 +166,23 @@ class hardware_sampler { */ void dump_yaml(const std::filesystem::path &filename) const; +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Let MPI rank 0 dump the hardware samples of this hardware sampler of all MPI ranks to the YAML file with @p filename. + * @param[in] filename the YAML file to append the hardware samples to + * @param[in] communicator the MPI communicator to use + */ + void dump_yaml_global(const char *filename, MPI_Comm communicator) const; + /** + * @copydoc hws::hardware_sampler::dump_yaml_global(const char *) const + */ + void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const; + /** + * @copydoc hws::hardware_sampler::dump_yaml_global(const char *) const + */ + void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; +#endif + /** * @brief Return the unique device identification. Can be used as unique key in the YAML string. * @return the unique device identification (`[[nodiscard]]`) diff --git a/include/hws/mpi_sampling_mode.hpp b/include/hws/mpi_sampling_mode.hpp new file mode 100644 index 0000000..b53cfeb --- /dev/null +++ b/include/hws/mpi_sampling_mode.hpp @@ -0,0 +1,33 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the MPI sampling mode. + */ + +#ifndef HWS_MPI_SAMPLING_MODE_HPP_ +#define HWS_MPI_SAMPLING_MODE_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +namespace hws::detail { + +/** + * @brief The mode to use for MPI sampling. + * per_rank: each rank creates hardware samplers for all devices visible to that rank + * whole_node: if the same device is visible to more than one rank, only one of those ranks creates a hardware sampler for that device + */ +enum class mpi_sampling_mode { + per_rank, + whole_node +}; + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_MPI_SAMPLING_MODE_HPP_ diff --git a/include/hws/mpi_utility.hpp b/include/hws/mpi_utility.hpp new file mode 100644 index 0000000..673c2da --- /dev/null +++ b/include/hws/mpi_utility.hpp @@ -0,0 +1,65 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief MPI utility functions for hardware sampling. + */ + +#ifndef HWS_MPI_UTILITY_HPP_ +#define HWS_MPI_UTILITY_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device + + #include // MPI_Comm, MPI_COMM_NULL + #include // std::string + #include // std::vector + +namespace hws::detail { + +/** + * @brief Gather YAML strings from all MPI ranks and assemble them in rank order on rank 0. + * + * @param[in] local_yaml the local YAML string contribution + * @param[in] communicator the MPI communicator + * + * @return concatenated YAML string on rank 0, empty string on all other ranks + */ +[[nodiscard]] std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator); + +/** + * @brief RAII wrapper around a node-local MPI communicator for whole-node sampling. + */ +struct hostname_comm_info { + MPI_Comm node_comm = MPI_COMM_NULL; + int node_rank = 0; + int node_size = 1; + + explicit hostname_comm_info(MPI_Comm comm); + + hostname_comm_info(const hostname_comm_info &) = delete; + hostname_comm_info &operator=(const hostname_comm_info &) = delete; + + ~hostname_comm_info(); +}; + +/** + * Computes for each MPI rank a list of devices that have to be sampled by this rank. Ensures that + * each device is sampled by exactly one rank. + * + * @param local_devices a vector of visible_gpu_device for the local rank, each containing a local index and a physical ID + * @param node_comm a node local MPI communicator + * @return all device indices that have to be sampled by this rank + */ +[[nodiscard]] std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm); + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_MPI_UTILITY_HPP_ diff --git a/include/hws/system_hardware_sampler.hpp b/include/hws/system_hardware_sampler.hpp index 42924ac..21b3029 100644 --- a/include/hws/system_hardware_sampler.hpp +++ b/include/hws/system_hardware_sampler.hpp @@ -14,6 +14,7 @@ #include "hws/event.hpp" // hws::event #include "hws/hardware_sampler.hpp" // hws::hardware_sampler #include "hws/sample_category.hpp" // hws::sample_category +#include "hws/utility.hpp" // hws::detail::indent_lines #include // std::chrono::{milliseconds, steady_clock::time_point} #include // std::size_t @@ -22,6 +23,11 @@ #include // std::string #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/mpi_sampling_mode.hpp" // hws::detail::mpi_sampling_mode + #include // MPI_Comm +#endif + namespace hws { /** @@ -41,6 +47,23 @@ class system_hardware_sampler { * @param[in] category the sample categories that are enabled for hardware sampling (default: all) */ explicit system_hardware_sampler(std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Construct hardware samplers with the default sampling interval and MPI support. + * @param[in] communicator the MPI communicator + * @param[in] mode the MPI sampling mode + * @param[in] category the sample categories that are enabled for hardware sampling (default: all) + */ + explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, sample_category category = sample_category::all); + /** + * @brief Construct hardware samplers with the provided @p sampling_interval and MPI support. + * @param[in] communicator the MPI communicator + * @param[in] mode the MPI sampling mode + * @param[in] sampling_interval the used sampling interval + * @param[in] category the sample categories that are enabled for hardware sampling (default: all) + */ + explicit system_hardware_sampler(MPI_Comm communicator, detail::mpi_sampling_mode mode, std::chrono::milliseconds sampling_interval, sample_category category = sample_category::all); +#endif /** * @brief Delete the copy-constructor. @@ -68,10 +91,24 @@ class system_hardware_sampler { * @brief Start hardware sampling for all wrapped hardware samplers. */ void start_sampling(); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Start hardware sampling for all wrapped hardware samplers. Executes an MPI barrier before starting sampling to synchronize all MPI ranks. + * @param[in] communicator the MPI communicator to use + */ + void start_sampling(MPI_Comm communicator); +#endif /** * @brief Stop hardware sampling for all wrapped hardware samplers. */ void stop_sampling(); +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Stop hardware sampling for all wrapped hardware samplers. Executes an MPI barrier after stopping sampling to synchronize all MPI ranks. + * @param[in] communicator the MPI communicator to use + */ + void stop_sampling(MPI_Comm communicator); +#endif /** * @brief Pause hardware sampling for all wrapped hardware samplers. */ @@ -175,6 +212,23 @@ class system_hardware_sampler { */ void dump_yaml(const std::filesystem::path &filename) const; +#if defined(HWS_MPI_SUPPORT_ENABLED) + /** + * @brief Let MPI rank 0 dump the hardware samples of all hardware samplers of all MPI ranks to the YAML file with @p filename. + * @param[in] filename the YAML file to append the hardware samples to + * @param[in] communicator the MPI communicator to use + */ + void dump_yaml_global(const char *filename, MPI_Comm communicator) const; + /** + * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *) const + */ + void dump_yaml_global(const std::string &filename, MPI_Comm communicator) const; + /** + * @copydoc hws::system_hardware_sampler::dump_yaml_global(const char *) const + */ + void dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const; +#endif + /** * @brief Return the hardware samples as YAML string. * @return the YAML content as string (`[[nodiscard]]`) @@ -190,6 +244,13 @@ class system_hardware_sampler { private: /// The different hardware sampler for the current system. std::vector> samplers_; + + /** + * Creates hardware samplers for all visible devices. Used by non-MPI class constructor. + * @param sampling_interval the used sampling interval + * @param category the sample category + */ + void create_local_samplers(std::chrono::milliseconds sampling_interval, hws::sample_category category); }; } // namespace hws diff --git a/include/hws/utility.hpp b/include/hws/utility.hpp index aacaf23..2737418 100644 --- a/include/hws/utility.hpp +++ b/include/hws/utility.hpp @@ -15,17 +15,17 @@ #include "fmt/format.h" // fmt::format #include "fmt/ranges.h" // fmt::join -#include // std::from_chars -#include // std::chrono::duration -#include // std::trunc -#include // std::size_t -#include // std::optional -#include // std::runtime_error -#include // std::string, std::stof, std::stod, std::stold -#include // std::string_view -#include // std::errc -#include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type -#include // std::vector +#include // std::from_chars +#include // std::chrono::duration +#include // std::trunc +#include // std::size_t +#include // std::optional +#include // std::runtime_error +#include // std::string, std::stof, std::stod, std::stold +#include // std::string_view +#include // std::errc +#include // std::is_same_v, std::is_floating_point_v, std::remove_cv_t, std::remove_reference_t, std::true_type, std::false_type +#include // std::vector namespace hws::detail { @@ -247,6 +247,14 @@ template return quoted; } +/** + * @brief Prefix all lines in a string with the given indentation. + * @param[in] text the input text + * @param[in] prefix the prefix (indentation) added to each line + * @return the indented string + */ +[[nodiscard]] std::string indent_lines(const std::string &text, std::string_view prefix); + /*****************************************************************************************************/ /** other free functions **/ /*****************************************************************************************************/ diff --git a/include/hws/visible_gpu_device.hpp b/include/hws/visible_gpu_device.hpp new file mode 100644 index 0000000..7b6d581 --- /dev/null +++ b/include/hws/visible_gpu_device.hpp @@ -0,0 +1,45 @@ +/** + * @file + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines types for identifying visible GPU devices. + */ + +#ifndef HWS_VISIBLE_GPU_DEVICE_HPP_ +#define HWS_VISIBLE_GPU_DEVICE_HPP_ +#pragma once + +#if defined(HWS_MPI_SUPPORT_ENABLED) + +#include // std::string + +namespace hws::detail { + +/** + * @brief Enum class representing the backend kind of visible GPU device. + * @details The backend kind can be NVIDIA, AMD, or Intel. + */ +enum class device_backend_kind { + nvidia, + amd, + intel +}; + +/** + * @brief Represents a visible GPU device on the local rank. + * @details Contains the backend kind, the local index of the device for that backend on this rank, and a stable per-node identifier (physical ID) for the device. + */ +struct visible_gpu_device { + device_backend_kind backend; + int local_index; // device index for that backend on this rank + std::string physical_id; // stable per-node identifier +}; + +} // namespace hws::detail + +#endif // HWS_MPI_SUPPORT_ENABLED + +#endif // HWS_VISIBLE_GPU_DEVICE_HPP_ diff --git a/pyproject.toml b/pyproject.toml index 45c4cfa..910c2b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,9 @@ classifiers = [ "Programming Language :: C++", "Programming Language :: Python :: 3" ] +# optional dependencies +[project.optional-dependencies] +mpi = ["mpi4py>=4"] # project specific URLs [project.urls] documentation = "https://sc-sgs.github.io/hardware_sampling/" diff --git a/src/hws/gpu_amd/utility.cpp b/src/hws/gpu_amd/utility.cpp index a88969a..55d6932 100644 --- a/src/hws/gpu_amd/utility.cpp +++ b/src/hws/gpu_amd/utility.cpp @@ -10,6 +10,13 @@ #include "rocm_smi/rocm_smi.h" // ROCm SMI runtime functions #include // std::string +#include // std::vector + +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind + + #include "hip/hip_runtime_api.h" // hipGetDeviceCount, hipDeviceGetPCIBusId +#endif namespace hws::detail { @@ -39,4 +46,39 @@ std::string performance_level_to_string(const rsmi_dev_perf_level_t perf_level) } } +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_AMD_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the AMD GPU device with the given local index + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. + * + * @param local_index the local index of the AMD GPU device + * @return the physical ID of the AMD GPU device + */ +[[nodiscard]] std::string amd_physical_id(const int local_index) { + char bus_id[64] = {}; + HWS_HIP_ERROR_CHECK(hipDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "amd:" } + bus_id; +} + +} // namespace + +[[nodiscard]] std::vector enumerate_local_amd_devices() { + std::vector out; + int count = 0; + HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::amd; + d.local_index = i; + d.physical_id = amd_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} + +#endif + } // namespace hws::detail diff --git a/src/hws/gpu_intel/utility.cpp b/src/hws/gpu_intel/utility.cpp index 5a29eee..6d82160 100644 --- a/src/hws/gpu_intel/utility.cpp +++ b/src/hws/gpu_intel/utility.cpp @@ -12,10 +12,18 @@ #include "level_zero/ze_api.h" // Level Zero runtime functions #include "level_zero/zes_api.h" // Level Zero runtime functions +#include // std::size_t +#include // std::uint32_t +#include // snprintf +#include // std::runtime_error #include // std::string #include // std::string_view #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind +#endif + namespace hws::detail { std::vector property_flags_to_vector(const ze_device_property_flags_t flags) { @@ -227,4 +235,77 @@ std::string memory_location_to_name(const zes_mem_loc_t mem_loc) { } } +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_INTEL_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the Intel GPU @p device + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. + * + * @param device the Level Zero device handle of the Intel GPU device + * @return the physical ID of the Intel GPU device + */ +[[nodiscard]] std::string intel_physical_id(const ze_device_handle_t device) { + ze_device_properties_t props{}; + props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + props.pNext = nullptr; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGetProperties(device, &props)); + + char buf[2 * ZE_MAX_DEVICE_UUID_SIZE + 1] = {}; + for (std::size_t i = 0; i < ZE_MAX_DEVICE_UUID_SIZE; ++i) { + snprintf(buf + 2 * i, 3, "%02x", props.uuid.id[i]); + } + + return std::string{ "intel:" } + buf; +} + +} // namespace + +[[nodiscard]] std::vector enumerate_local_intel_devices() { + std::vector out; + + // init level zero driver + HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) + + // discover the number of drivers + std::uint32_t driver_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) + + // check if only the single GPU driver has been found + if (driver_count > 1) { + throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + } + + // get the GPU driver + ze_driver_handle_t driver{}; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)); + + // Discover devices for this driver + std::uint32_t device_count = 0; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)); + if (device_count == 0) { + return out; // no Intel GPUs visible + } + + std::vector devices(device_count); + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, devices.data())); + + // Fill visible_gpu_device list + for (std::uint32_t i = 0; i < device_count; ++i) { + ze_device_handle_t dev = devices[i]; + + visible_gpu_device d; + d.backend = device_backend_kind::intel; + d.local_index = static_cast(i); + d.physical_id = intel_physical_id(dev); + + out.push_back(std::move(d)); + } + + return out; +} + +#endif + } // namespace hws::detail diff --git a/src/hws/gpu_nvidia/utility.cpp b/src/hws/gpu_nvidia/utility.cpp index a81feea..97d8c1e 100644 --- a/src/hws/gpu_nvidia/utility.cpp +++ b/src/hws/gpu_nvidia/utility.cpp @@ -14,6 +14,10 @@ #include // std::string #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + #include "hws/visible_gpu_device.hpp" // hws::detail::visible_gpu_device, hws::detail::device_backend_kind +#endif + namespace hws::detail { #if CUDA_VERSION >= 12000 @@ -56,4 +60,39 @@ std::string throttle_event_reason_to_string(const unsigned long long clocks_even #endif +#if defined(HWS_MPI_SUPPORT_ENABLED) && defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + +namespace { + +/** + * @brief returns a stable physical ID for the NVIDIA GPU device with the given local index + * The ID is at least unique per node and can be used to identify the same device across different MPI ranks on the same node. + * + * @param local_index the local index of the NVIDIA GPU device + * @return the physical ID of the NVIDIA GPU device + */ +[[nodiscard]] std::string nvidia_physical_id(const int local_index) { + char bus_id[64] = {}; + HWS_CUDA_ERROR_CHECK(cudaDeviceGetPCIBusId(bus_id, sizeof(bus_id), local_index)); + return std::string{ "nvidia:" } + bus_id; +} + +} // namespace + +[[nodiscard]] std::vector enumerate_local_nvidia_devices() { + std::vector out; + int count = 0; + HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&count)); + for (int i = 0; i < count; ++i) { + visible_gpu_device d; + d.backend = device_backend_kind::nvidia; + d.local_index = i; + d.physical_id = nvidia_physical_id(i); + out.push_back(std::move(d)); + } + return out; +} + +#endif + } // namespace hws::detail diff --git a/src/hws/hardware_sampler.cpp b/src/hws/hardware_sampler.cpp index abd907d..3ebaf4c 100644 --- a/src/hws/hardware_sampler.cpp +++ b/src/hws/hardware_sampler.cpp @@ -24,6 +24,10 @@ #include // std::thread #include // std::move +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/mpi_utility.hpp" // hws::detail::gather_yaml_strings_mpi +#endif + namespace hws { hardware_sampler::hardware_sampler(const std::chrono::milliseconds sampling_interval, const sample_category category) : @@ -144,6 +148,44 @@ void hardware_sampler::dump_yaml(const std::filesystem::path &filename) const { this->dump_yaml(filename.string().c_str()); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error{"MPI must already be initialized"}; + } + + // MPI rank and world size for identification and communication + int rank = 0; + MPI_Comm_rank(communicator, &rank); + + std::string rank_yaml_output; // yaml file as string per rank + + rank_yaml_output += "---\n\n"; + rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n"; + + // add yaml string of this hardware sampler + rank_yaml_output += this->as_yaml_string(); + + const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator); + + if (rank == 0) { + std::ofstream file(filename); + file << global_yaml_output; + } +} + +void hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.c_str(), communicator); +} + +void hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.string().c_str(), communicator); +} +#endif + std::string hardware_sampler::as_yaml_string() const { if (!this->has_sampling_stopped()) { throw std::runtime_error{ "Can return samples as string only after the sampling has been stopped!" }; diff --git a/src/hws/mpi_utility.cpp b/src/hws/mpi_utility.cpp new file mode 100644 index 0000000..4d8b3af --- /dev/null +++ b/src/hws/mpi_utility.cpp @@ -0,0 +1,210 @@ +/** + * @author Tim Thüring + * @copyright 2024-today All Rights Reserved + * @license This file is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + */ + +#include "hws/mpi_utility.hpp" + +#include // std::size_t +#include // std::runtime_error +#include // std::string +#include // std::string_view +#include // std::unordered_map +#include // std::vector + +namespace hws::detail { + +hostname_comm_info::hostname_comm_info(MPI_Comm comm) { + int world_rank = 0; + int world_size = 0; + MPI_Comm_rank(comm, &world_rank); + MPI_Comm_size(comm, &world_size); + + // Gather all hostnames + char name[MPI_MAX_PROCESSOR_NAME]; + int name_len = 0; + MPI_Get_processor_name(name, &name_len); + + std::vector name_lengths(world_size); + MPI_Allgather(&name_len, 1, MPI_INT, name_lengths.data(), 1, MPI_INT, comm); + + // Build displacements and total byte count + std::vector displs(world_size); + int total = 0; + for (int i = 0; i < world_size; ++i) { + displs[i] = total; + total += name_lengths[i]; + } + + std::vector all_names(total); + MPI_Allgatherv(name, name_len, MPI_CHAR, all_names.data(), name_lengths.data(), displs.data(), MPI_CHAR, comm); + + // Assign colors locally on every rank + // + // All ranks hold identical copies of all_names, name_lengths, and displs, + // so they can each compute the same deterministic color map independently. + + std::unordered_map host_to_color; + host_to_color.reserve(world_size); + std::vector colors(world_size); + int next_color = 0; + for (int r = 0; r < world_size; ++r) { + // get host name of rank r + std::string_view host(&all_names[displs[r]], static_cast(name_lengths[r])); + + // try to insert it into the host_to_color map + auto [it, inserted] = host_to_color.emplace(host, next_color); + + // check if host was new, if yes, increment color + if (inserted) { + ++next_color; + } + // save color of current rank, either from newly created or existing entry + colors[r] = it->second; + } + + // Split communicator + MPI_Comm_split(comm, colors[world_rank], world_rank, &node_comm); + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); +} + +hostname_comm_info::~hostname_comm_info() { + if (node_comm != MPI_COMM_NULL) { + MPI_Comm_free(&node_comm); + } +} + +std::string gather_yaml_strings_mpi(const std::string &local_yaml, MPI_Comm communicator) { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error{"MPI must already be initialized"}; + } + + // MPI rank and world size for identification and communication + int rank = 0; + int world_size = 0; + MPI_Comm_rank(communicator, &rank); + MPI_Comm_size(communicator, &world_size); + + // gather the size of the yaml string from each rank + const int local_size = static_cast(local_yaml.size()); + + std::vector recv_sizes; + + if (rank == 0) { + recv_sizes.resize(world_size); + } + + MPI_Gather(&local_size, 1, MPI_INT, recv_sizes.data(), 1, MPI_INT, 0, communicator); + + // compute the displacements from the rank string sizes + std::vector displacements; + int total_size = 0; + + if (rank == 0) { + displacements.resize(world_size); + + for (int i = 0; i < world_size; ++i) { + displacements[i] = total_size; + total_size += recv_sizes[i]; + } + } + + // gather the local yaml strings from all ranks + std::vector recv_buffer; + + if (rank == 0) { + recv_buffer.resize(total_size); + } + + MPI_Gatherv(local_yaml.data(), local_size, MPI_CHAR, recv_buffer.data(), recv_sizes.data(), displacements.data(), MPI_CHAR, 0, communicator); + + // build final yaml string on rank 0 + std::string global_yaml; + + if (rank == 0) { + for (int r = 0; r < world_size; ++r) { + global_yaml.append(recv_buffer.data() + displacements[r], recv_sizes[r]); + global_yaml += '\n'; + } + } + + return global_yaml; +} + +std::vector owned_local_indices_for_backend(const std::vector &local_devices, MPI_Comm node_comm) { + int node_rank = 0; + int node_size = 0; + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + + // Pack physical IDs into a newline-separated string + std::string packed; + for (const auto &d : local_devices) { + packed += d.physical_id; + packed += '\n'; + } + const int local_size = static_cast(packed.size()); + + // Allgather sizes + std::vector sizes(node_size); + MPI_Allgather(&local_size, 1, MPI_INT, sizes.data(), 1, MPI_INT, node_comm); + + // Displacements and total length + std::vector displs(node_size); + int total = 0; + for (int r = 0; r < node_size; ++r) { + displs[r] = total; + total += sizes[r]; + } + + // Allgatherv packed physical IDs + std::vector all_data(total); + MPI_Allgatherv(packed.data(), local_size, MPI_CHAR, all_data.data(), sizes.data(), displs.data(), MPI_CHAR, node_comm); + + // Build owner map: physical_id -> first node_rank that reports it + std::unordered_map owner_rank_for_id; + owner_rank_for_id.reserve(local_devices.size() * 2 + 1); + + for (int r = 0; r < node_size; ++r) { + if (sizes[r] == 0) { + continue; + } + + const char *base = all_data.data() + displs[r]; + const int len = sizes[r]; + + int line_start = 0; + while (line_start < len) { + int line_end = line_start; + while (line_end < len && base[line_end] != '\n') { + ++line_end; + } + if (line_end > line_start) { + const std::string id(base + line_start, base + line_end); // copy just this ID + owner_rank_for_id.emplace(id, r); // first insertion wins + } + line_start = line_end + 1; + } + } + + // Decide which local indices we own: those whose physical_id is mapped to node_rank + std::vector owned_indices; + owned_indices.reserve(local_devices.size()); + + for (const auto &d : local_devices) { + auto it = owner_rank_for_id.find(d.physical_id); + if (it != owner_rank_for_id.end() && it->second == node_rank) { + owned_indices.push_back(d.local_index); + } + } + + return owned_indices; +} + +} // namespace hws::detail diff --git a/src/hws/system_hardware_sampler.cpp b/src/hws/system_hardware_sampler.cpp index 1c08762..4dff468 100644 --- a/src/hws/system_hardware_sampler.cpp +++ b/src/hws/system_hardware_sampler.cpp @@ -41,6 +41,10 @@ #include // std::out_of_range #include // std::vector +#if defined(HWS_MPI_SUPPORT_ENABLED) + #include "hws/mpi_utility.hpp" // hws::detail::hostname_comm_info, hws::detail::owned_local_indices_for_backend +#endif + namespace hws { system_hardware_sampler::system_hardware_sampler(const sample_category category) : @@ -48,65 +52,89 @@ system_hardware_sampler::system_hardware_sampler(const sample_category category) system_hardware_sampler::system_hardware_sampler(const std::chrono::milliseconds sampling_interval, sample_category category) { // create the hardware samplers based on the available hardware -#if defined(HWS_FOR_CPUS_ENABLED) - { - samplers_.push_back(std::make_unique(sampling_interval, category)); - } -#endif -#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) - { - int device_count{}; - HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + create_local_samplers(sampling_interval, category); +} + +#if defined(HWS_MPI_SUPPORT_ENABLED) +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const sample_category category) : + system_hardware_sampler(communicator, mode, HWS_SAMPLING_INTERVAL, category) { } + +system_hardware_sampler::system_hardware_sampler(MPI_Comm communicator, const detail::mpi_sampling_mode mode, const std::chrono::milliseconds sampling_interval, const sample_category category) { + if (mode == detail::mpi_sampling_mode::per_rank) { + // each rank creates samplers for all devices visible to him + create_local_samplers(sampling_interval, category); + } else if (mode == detail::mpi_sampling_mode::whole_node) { + // create a custom, node-local MPI communicator + detail::hostname_comm_info nc{ communicator }; + + // CPU: one sampler per node --> node leader only + #if defined(HWS_FOR_CPUS_ENABLED) + if (nc.node_rank == 0) { + samplers_.push_back(std::make_unique(sampling_interval, category)); } - } -#endif -#if defined(HWS_FOR_AMD_GPUS_ENABLED) - { - int device_count{}; - HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count)); - for (int device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + #endif + + // NVIDIA + #if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + { + const std::vector local = detail::enumerate_local_nvidia_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int const idx : owned) { + samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); + } } - } -#endif -#if defined(HWS_FOR_INTEL_GPUS_ENABLED) - { - // init level zero driver - HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) - - // discover the number of drivers - std::uint32_t driver_count{ 0 }; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) - - // check if only the single GPU driver has been found - if (driver_count > 1) { - throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + #endif + + // AMD + #if defined(HWS_FOR_AMD_GPUS_ENABLED) + { + const std::vector local = detail::enumerate_local_amd_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int const idx : owned) { + samplers_.push_back(std::make_unique( + static_cast(idx), sampling_interval, category)); + } } - - // get the GPU driver - ze_driver_handle_t driver{}; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)) - - // get all GPUs for the current driver - std::uint32_t device_count{ 0 }; - HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)) - for (std::uint32_t device = 0; device < device_count; ++device) { - samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + #endif + + // Intel + #if defined(HWS_FOR_INTEL_GPUS_ENABLED) + { + const std::vector local = detail::enumerate_local_intel_devices(); + const std::vector owned = detail::owned_local_indices_for_backend(local, nc.node_comm); + for (int const idx : owned) { + samplers_.push_back(std::make_unique(static_cast(idx), sampling_interval, category)); + } } + #endif + + } else { + throw std::runtime_error{ fmt::format("Unknown MPI sampling mode {}!", static_cast(mode)) }; } -#endif } +#endif void system_hardware_sampler::start_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::start_sampling(MPI_Comm communicator) { + MPI_Barrier(communicator); + std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->start_sampling(); }); +} +#endif void system_hardware_sampler::stop_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::stop_sampling(MPI_Comm communicator) { + std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->stop_sampling(); }); + MPI_Barrier(communicator); +} +#endif + void system_hardware_sampler::pause_sampling() { std::for_each(samplers_.begin(), samplers_.end(), [](auto &ptr) { ptr->pause_sampling(); }); } @@ -201,6 +229,48 @@ void system_hardware_sampler::dump_yaml(const std::filesystem::path &filename) c std::for_each(samplers_.cbegin(), samplers_.cend(), [&filename](const auto &ptr) { ptr->dump_yaml(filename); }); } +#if defined(HWS_MPI_SUPPORT_ENABLED) +void system_hardware_sampler::dump_yaml_global(const char *filename, MPI_Comm communicator) const { + int initialized = 0; + MPI_Initialized(&initialized); + + if (!initialized) { + throw std::runtime_error{"MPI must already be initialized"}; + } + + // MPI rank and world size for identification and communication + int rank = 0; + MPI_Comm_rank(communicator, &rank); + + std::string rank_yaml_output; // yaml file as string per rank + + rank_yaml_output += "---\n\n"; + rank_yaml_output += "rank: " + std::to_string(rank) + "\n\n"; + + // accumulate string from each sampler + std::size_t sampler_idx = 0; + std::for_each(samplers_.cbegin(), samplers_.cend(), [&rank_yaml_output, &sampler_idx](const auto &ptr) { + rank_yaml_output += "sampler_" + std::to_string(sampler_idx++) + ":\n"; + rank_yaml_output += detail::indent_lines(ptr->as_yaml_string(), " "); + }); + + const std::string global_yaml_output = detail::gather_yaml_strings_mpi(rank_yaml_output, communicator); + + if (rank == 0) { + std::ofstream file(filename); + file << global_yaml_output; + } +} + +void system_hardware_sampler::dump_yaml_global(const std::string &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.c_str(), communicator); +} + +void system_hardware_sampler::dump_yaml_global(const std::filesystem::path &filename, MPI_Comm communicator) const { + this->dump_yaml_global(filename.string().c_str(), communicator); +} +#endif + std::string system_hardware_sampler::as_yaml_string() const { return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->as_yaml_string(); }); } @@ -209,4 +279,56 @@ std::string system_hardware_sampler::samples_only_as_yaml_string() const { return std::accumulate(samplers_.cbegin(), samplers_.cend(), std::string{}, [](const std::string str, const auto &ptr) { return str + ptr->samples_only_as_yaml_string(); }); } +void system_hardware_sampler::create_local_samplers(std::chrono::milliseconds sampling_interval, sample_category category) { +#if defined(HWS_FOR_CPUS_ENABLED) + { + samplers_.push_back(std::make_unique(sampling_interval, category)); + } +#endif +#if defined(HWS_FOR_NVIDIA_GPUS_ENABLED) + { + int device_count{}; + HWS_CUDA_ERROR_CHECK(cudaGetDeviceCount(&device_count)); + for (int device = 0; device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +#if defined(HWS_FOR_AMD_GPUS_ENABLED) + { + int device_count{}; + HWS_HIP_ERROR_CHECK(hipGetDeviceCount(&device_count)); + for (int device = 0; device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +#if defined(HWS_FOR_INTEL_GPUS_ENABLED) + { + // init level zero driver + HWS_LEVEL_ZERO_ERROR_CHECK(zeInit(ZE_INIT_FLAG_GPU_ONLY)) + + // discover the number of drivers + std::uint32_t driver_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, nullptr)) + + // check if only the single GPU driver has been found + if (driver_count > 1) { + throw std::runtime_error{ fmt::format("Found too many GPU drivers ({})!", driver_count) }; + } + + // get the GPU driver + ze_driver_handle_t driver{}; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDriverGet(&driver_count, &driver)) + + // get all GPUs for the current driver + std::uint32_t device_count{ 0 }; + HWS_LEVEL_ZERO_ERROR_CHECK(zeDeviceGet(driver, &device_count, nullptr)) + for (std::uint32_t device = 0; device < device_count; ++device) { + samplers_.push_back(std::make_unique(static_cast(device), sampling_interval, category)); + } + } +#endif +} + } // namespace hws diff --git a/src/hws/utility.cpp b/src/hws/utility.cpp index 054db8d..406089a 100644 --- a/src/hws/utility.cpp +++ b/src/hws/utility.cpp @@ -9,6 +9,7 @@ #include // std::min, std::transform, std::all_of #include // std::tolower, std::isdigit +#include // std::stringstream #include // std::string #include // std::string_view #include // std::vector @@ -61,4 +62,19 @@ bool is_integer(std::string_view str) { return std::all_of(str.cbegin(), str.cend(), [](const char c) { return std::isdigit(static_cast(c)); }); } +std::string indent_lines(const std::string &text, const std::string_view prefix) { + std::stringstream ss{ text }; + + std::string line; + std::string out; + + while (std::getline(ss, line)) { + out += prefix; + out += line; + out += '\n'; + } + + return out; +} + } // namespace hws::detail