From cf5cfa4bcdb9741330db0f39c5dcfb02a1fbd621 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 6 Apr 2026 16:10:05 +0000 Subject: [PATCH 1/5] fix uninit comm->mc_maxsize usage Signed-off-by: Phuong Nguyen --- .../common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index 6ff9d63a2d..9b83f068ed 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -240,7 +240,7 @@ int create_communicator_grouped2(communicator **comm, int myrank, int numranks, size_t gran; CUmulticastObjectProp mcProp = {}; mcProp.numDevices = (*comm)->ar2_nvsize; - mcProp.size = (*comm)->mc_maxsize; + mcProp.size = mc_maxsize; mcProp.handleTypes = mnnvl_fabric ? CU_MEM_HANDLE_TYPE_FABRIC : CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; From 1dad574e5e25c6f545743637c7e0eb7fc341f709 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 6 Apr 2026 16:21:25 +0000 Subject: [PATCH 2/5] Fix cuMulticastAddDevice parameter type Pass CUdevice instead of incorrectly casting device index to CUdeviceptr. Use cuDeviceGet to properly obtain the device handle before adding it to the multicast team. 
Signed-off-by: Phuong Nguyen --- .../userbuffers/userbuffers-host.cpp | 5 +++-- .../common/util/cuda_runtime.cpp | 22 ++++++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index 9b83f068ed..26dce904a6 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -323,8 +323,9 @@ int create_communicator_grouped2(communicator **comm, int myrank, int numranks, IPCCHECK(ipcSocketClose(&ipcSock)); close(fd); } - NVTE_CALL_CHECK_CUDA_DRIVER(cuMulticastAddDevice, (*comm)->mc_handle, - (CUdeviceptr)(*comm)->mydev); + CUdevice cudev; + NVTE_CALL_CHECK_CUDA_DRIVER(cuDeviceGet, &cudev, (*comm)->mydev); + NVTE_CALL_CHECK_CUDA_DRIVER(cuMulticastAddDevice, (*comm)->mc_handle, cudev); CUdeviceptr mc_va; NVTE_CALL_CHECK_CUDA_DRIVER(cuMemAddressReserve, &mc_va, mc_maxsize, (size_t)0, (CUdeviceptr)0U, diff --git a/transformer_engine/common/util/cuda_runtime.cpp b/transformer_engine/common/util/cuda_runtime.cpp index 4b43940a51..75b9b53de1 100644 --- a/transformer_engine/common/util/cuda_runtime.cpp +++ b/transformer_engine/common/util/cuda_runtime.cpp @@ -128,7 +128,27 @@ bool supports_multicast(int device_id) { NVTE_CALL_CHECK_CUDA_DRIVER(cuDeviceGetAttribute, &result, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, cudev); } - cache[device_id] = static_cast<bool>(result); + if (!result) { + cache[device_id] = false; + return; + } + + // Verify NVLink/NVSwitch topology by testing multicast granularity query + // This will fail if NVLink is not properly configured or devices are not in the same domain + CUmulticastObjectProp testProp = {}; + testProp.numDevices = 1; + testProp.size = 4096; // 4KB test size + testProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + size_t gran; + CUresult 
gran_result = cuMulticastGetGranularity( + &gran, &testProp, + CU_MULTICAST_GRANULARITY_RECOMMENDED); + if (gran_result != CUDA_SUCCESS) { + cache[device_id] = false; + return; + } + + cache[device_id] = true; }; std::call_once(flags[device_id], init); return cache[device_id]; From 0d1cc8132413a1bc0930810d8292e2bddfc6bca3 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 6 Apr 2026 16:10:38 +0000 Subject: [PATCH 3/5] added sm_arch check to supports_multicast() Signed-off-by: Phuong Nguyen --- transformer_engine/common/util/cuda_runtime.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/util/cuda_runtime.cpp b/transformer_engine/common/util/cuda_runtime.cpp index 75b9b53de1..fdea5d5cca 100644 --- a/transformer_engine/common/util/cuda_runtime.cpp +++ b/transformer_engine/common/util/cuda_runtime.cpp @@ -117,6 +117,11 @@ bool supports_multicast(int device_id) { } NVTE_CHECK(0 <= device_id && device_id < num_devices(), "invalid CUDA device ID"); auto init = [&]() { + // Multicast requires Hopper (SM 9.0) or newer + if (sm_arch(device_id) < 90) { + cache[device_id] = false; + return; + } CUdevice cudev; NVTE_CALL_CHECK_CUDA_DRIVER(cuDeviceGet, &cudev, device_id); // Multicast support requires both CUDA12.1 UMD + KMD @@ -140,8 +145,8 @@ bool supports_multicast(int device_id) { testProp.size = 4096; // 4KB test size testProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; size_t gran; - CUresult gran_result = cuMulticastGetGranularity( - &gran, &testProp, + CUresult gran_result = cuda_driver::call( + "cuMulticastGetGranularity", &gran, &testProp, CU_MULTICAST_GRANULARITY_RECOMMENDED); if (gran_result != CUDA_SUCCESS) { cache[device_id] = false; From 3c7d7281d5542a25adfa4a118bf45047b22b2d9f Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 6 Apr 2026 20:32:46 +0000 Subject: [PATCH 4/5] Fix peer-to-peer access check to verify all GPU pairs Previously the check would succeed if ANY peer was accessible, 
but later code attempts IPC operations with ALL peers. This caused illegal memory access errors on systems where some GPU pairs lack peer access. Now verify that ALL required GPU pairs have peer-to-peer access before attempting IPC memory handle operations. Signed-off-by: Phuong Nguyen --- .../comm_gemm_overlap/userbuffers/userbuffers-host.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index 26dce904a6..ef169b03e0 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -678,18 +678,18 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * NVTE_CHECK_CUDA(cudaGetDevice(&current_device)); cudaDeviceProp deviceProp; NVTE_CHECK_CUDA(cudaGetDeviceProperties(&deviceProp, current_device)); - bool peer_access_available = false; + bool all_peers_accessible = true; for (int i = 0; i < comm->nvsize; i++) { if (i != comm->nvrank) { int can_access_peer; cudaError_t peer_result = cudaDeviceCanAccessPeer(&can_access_peer, current_device, i); - if (peer_result == cudaSuccess && can_access_peer) { - peer_access_available = true; + if (peer_result != cudaSuccess || !can_access_peer) { + all_peers_accessible = false; break; } } } - if (!peer_access_available) { + if (!all_peers_accessible) { free(tmp); NVTE_ERROR( "No peer-to-peer access available between GPUs. 
This platform does not support the " From 3ed8d1864dbc91f74efc5fa8605cae93c0398048 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 00:16:05 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/util/cuda_runtime.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/util/cuda_runtime.cpp b/transformer_engine/common/util/cuda_runtime.cpp index fdea5d5cca..912d77baf4 100644 --- a/transformer_engine/common/util/cuda_runtime.cpp +++ b/transformer_engine/common/util/cuda_runtime.cpp @@ -145,9 +145,8 @@ bool supports_multicast(int device_id) { testProp.size = 4096; // 4KB test size testProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; size_t gran; - CUresult gran_result = cuda_driver::call( - "cuMulticastGetGranularity", &gran, &testProp, - CU_MULTICAST_GRANULARITY_RECOMMENDED); + CUresult gran_result = cuda_driver::call("cuMulticastGetGranularity", &gran, &testProp, + CU_MULTICAST_GRANULARITY_RECOMMENDED); if (gran_result != CUDA_SUCCESS) { cache[device_id] = false; return;