From bb0a76456e8f42d96897b4c1ecc6ee0578aad27e Mon Sep 17 00:00:00 2001 From: Shahrear Jahan Santho Date: Mon, 29 Jun 2026 06:56:17 -0700 Subject: [PATCH] Add CUDA-aware MPI halo exchange --- CMakeLists.txt | 2 +- src/CMakeLists.txt | 1 + src/pmpo_MPMesh.cpp | 2 +- src/pmpo_MPMesh.hpp | 820 +++++++++++++++++++++++++++++++---- src/pmpo_MPMesh_assembly.hpp | 6 +- src/pmpo_c.cpp | 2 +- 6 files changed, 736 insertions(+), 97 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 080a4186..3f627daa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,4 +50,4 @@ if(IS_TESTING) add_subdirectory (test) endif() -bob_end_package() \ No newline at end of file +bob_end_package() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 76059965..d0fb8e74 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,6 +21,7 @@ set(SOURCES ) add_library(polyMPO-core ${SOURCES}) +target_compile_definitions(polyMPO-core PUBLIC CUDA_AWARE_MPI) set_property(TARGET polyMPO-core PROPERTY CXX_STANDARD "17") set_property(TARGET polyMPO-core PROPERTY CXX_STANDARD_REQUIRED ON) set_property(TARGET polyMPO-core PROPERTY CXX_EXTENSIONS OFF) diff --git a/src/pmpo_MPMesh.cpp b/src/pmpo_MPMesh.cpp index 3801dcd9..837b896e 100644 --- a/src/pmpo_MPMesh.cpp +++ b/src/pmpo_MPMesh.cpp @@ -171,7 +171,7 @@ void MPMesh::calculateStressDivergence(){ timer.reset(); if(numProcsTot>1){ //Takes contribution of halo vertices and adds it in owner procs - communicate_and_take_halo_contributions1(stress_divUV, numVertices, 2, 0, 0); + communicate_and_take_halo_contributions1_improved(stress_divUV, numVertices, 2, 0, 0); //Transfer the correct values at owned vertices to halo vertices //communicate_and_take_halo_contributions(stress_divUV, numVertices, 2, 1, 1); } diff --git a/src/pmpo_MPMesh.hpp b/src/pmpo_MPMesh.hpp index d3aac1b7..dfd922e5 100644 --- a/src/pmpo_MPMesh.hpp +++ b/src/pmpo_MPMesh.hpp @@ -4,6 +4,9 @@ #include "pmpo_utils.hpp" #include "pmpo_mesh.hpp" #include "pmpo_materialPoints.hpp" +#include +#include +#include namespace polyMPO{ @@ -28,22 +31,27 @@ class MPMesh{ int numOwnersTot, numHalosTot; std::vector numOwnersOnOtherProcs; std::vector numHalosOnOtherProcs; - std::vectorhaloOwnerProcs; + std::vector haloOwnerProcs; std::vector> haloOwnerLocalIDs; std::vector> ownerOwnerLocalIDs; std::vector> ownerHaloLocalIDs; void startCommunication(); - void communicate_and_take_halo_contributions(const Kokkos::View& meshField, int nEntities, int numEntries, int mode, int op); + void communicate_and_take_halo_contributions( + const Kokkos::View& meshField, + int nEntities, + int numEntries, + int mode, + int op); - //Now Kokkos views are made 1D + // Original CPU-staging function template void communicate_and_take_halo_contributions1( const ViewType& meshField, int nEntities, int numEntries, - int mode , + int mode, int op){ int self; @@ -51,95 +59,155 @@ class MPMesh{ MPI_Comm_rank(comm, &self); Kokkos::Timer timer; - auto reconVals_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), meshField); + auto reconVals_host = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), meshField); + pumipic::RecordTime("SD: GPU-CPU copy-" + std::to_string(self), timer.seconds()); timer.reset(); - std::vector> recvIDVec; + std::vector> recvIDVec; std::vector> recvDataVec; + pumipic::RecordTime("SD: Recv Vec Allocation-" + std::to_string(self), timer.seconds()); timer.reset(); - //communicateFields1(fieldData1, nEntities, numEntries, mode, recvIDVec, recvDataVec); - communicateFields1(reconVals_host, nEntities, numEntries, mode, recvIDVec, recvDataVec); + + communicateFields1( + reconVals_host, + nEntities, + numEntries, + mode, + recvIDVec, + recvDataVec); + pumipic::RecordTime("SD: IP Comm-" + std::to_string(self), timer.seconds()); timer.reset(); + int numProcsTot = recvIDVec.size(); - //Flatten IDs + int totalSize = 0; - std::vector offsets(numProcsTot, 0); - for(int i=0; i offsets(numProcsTot, 0); + + for(int i = 0; i < numProcsTot; i++){ offsets[i] = totalSize; totalSize += recvIDVec[i].size(); } - std::vector flatIDVec(totalSize, 0); - for(int i=0; i recvIDGPU("recvIDGPU", totalSize); + auto hostView = + Kokkos::View("recvIDCPU", totalSize); + + for(int i = 0; i < numProcsTot; i++){ + std::copy( + recvIDVec[i].begin(), + recvIDVec[i].end(), + hostView.data() + offsets[i]); } + pumipic::RecordTime("SD: Flatten IDs-" + std::to_string(self), timer.seconds()); timer.reset(); - Kokkos::View recvIDGPU("recvIDGPU", totalSize); - auto hostView = Kokkos::View("recvIDCPU", totalSize); - std::copy(flatIDVec.begin(), flatIDVec.end(), hostView.data()); + Kokkos::deep_copy(recvIDGPU, hostView); Kokkos::fence(); + pumipic::RecordTime("SD: Copy CPU-GPU-" + std::to_string(self), timer.seconds()); - //Flatten Data timer.reset(); - int totalSize_data=0; + + int totalSize_data = 0; std::vector offsets_data(numProcsTot, 0); - for(int i=0; i flatDataVec(totalSize_data, 0); - for(int i=0; i recvDataGPU("recvDataGPU", totalSize_data); + auto hostView_data = + Kokkos::View("recvDataCPU", totalSize_data); + + for(int i = 0; i < numProcsTot; i++){ + std::copy( + recvDataVec[i].begin(), + recvDataVec[i].end(), + hostView_data.data() + offsets_data[i]); } + pumipic::RecordTime("SD: Flatten Data-" + std::to_string(self), timer.seconds()); timer.reset(); - Kokkos::View recvDataGPU("recvDataGPU", totalSize_data); - auto hostView_data= Kokkos::View("recvDataCPU", totalSize_data); - std::copy(flatDataVec.begin(), flatDataVec.end(), hostView_data.data()); + Kokkos::deep_copy(recvDataGPU, hostView_data); Kokkos::fence(); - assert(totalSize_data == totalSize*numEntries); - for (int i=0; i>& fieldData, const int numEntities, const int numEntries, int mode, - std::vector>& recvIDVec, std::vector>& recvDataVec); + void communicateFields( + const std::vector>& fieldData, + const int numEntities, + const int numEntries, + int mode, + std::vector>& recvIDVec, + std::vector>& recvDataVec); template void communicateFields1( - const ViewType& fieldData, - const int numEntities, const int numEntries, int mode, + const ViewType& fieldData, + const int numEntities, + const int numEntries, + int mode, std::vector>& recvIDVec, std::vector>& recvDataVec){ int self, numProcsTot; + MPI_Comm comm = p_MPs->getMPIComm(); + MPI_Comm_rank(comm, &self); MPI_Comm_size(comm, &numProcsTot); @@ -151,98 +219,191 @@ class MPMesh{ recvDataVec.resize(numProcsTot); for(int i = 0; i < numProcsTot; i++){ - if(i==self) continue; + if(i == self) continue; + + int numToSend = 0; + int numToRecv = 0; - int numToSend = 0, numToRecv = 0; - if(mode == 0) { - //gather (halos send to owners) + if(mode == 0){ numToSend = numOwnersOnOtherProcs[i]; numToRecv = numHalosOnOtherProcs[i]; } - else{ - //scatter (owners send to halos) + else{ numToSend = numHalosOnOtherProcs[i]; numToRecv = numOwnersOnOtherProcs[i]; } if(numToSend > 0){ - sendDataVec[i].reserve(numToSend*numEntries); + sendDataVec[i].reserve(numToSend * numEntries); } + if(numToRecv > 0){ - recvDataVec[i].resize(numToRecv*numEntries); + recvDataVec[i].resize(numToRecv * numEntries); recvIDVec[i].resize(numToRecv); } } if(mode == 0){ - // Halos sends to owners - for (int iEnt = 0; iEnt < numHalosTot; iEnt++){ + for(int iEnt = 0; iEnt < numHalosTot; iEnt++){ auto ownerProc = haloOwnerProcs[iEnt]; - for (int iDouble = 0; iDouble < numEntries; iDouble++) - sendDataVec[ownerProc].push_back(fieldData(numOwnersTot+iEnt, iDouble)); + + for(int iDouble = 0; iDouble < numEntries; iDouble++){ + sendDataVec[ownerProc].push_back( + fieldData(numOwnersTot + iEnt, iDouble)); + } } } else if(mode == 1){ - // Owner sends to halos - for (size_t iProc=0; iProc requests; - requests.reserve(4*numProcsTot); + requests.reserve(4 * numProcsTot); + for(int proc = 0; proc < numProcsTot; proc++){ - if(proc == self) continue; + if(proc == self) continue; + if(mode == 0 && numHalosOnOtherProcs[proc]){ - assert(recvIDVec[proc].size() == (size_t)numHalosOnOtherProcs[proc]); - assert(recvDataVec[proc].size() == recvIDVec[proc].size() * (size_t)numEntries); - MPI_Request req3, req4; - MPI_Irecv(recvIDVec[proc].data(), recvIDVec[proc].size(), MPI_INT, proc, 1, comm, &req3); - MPI_Irecv(recvDataVec[proc].data(), recvDataVec[proc].size(), MPI_DOUBLE, proc, 2, comm, &req4); + assert(recvIDVec[proc].size() == + static_cast(numHalosOnOtherProcs[proc])); + + assert(recvDataVec[proc].size() == + recvIDVec[proc].size() * static_cast(numEntries)); + + MPI_Request req3; + MPI_Request req4; + + MPI_Irecv( + recvIDVec[proc].data(), + recvIDVec[proc].size(), + MPI_INT, + proc, + 1, + comm, + &req3); + + MPI_Irecv( + recvDataVec[proc].data(), + recvDataVec[proc].size(), + MPI_DOUBLE, + proc, + 2, + comm, + &req4); + requests.push_back(req3); requests.push_back(req4); } - if(mode == 0 && numOwnersOnOtherProcs[proc]) { - assert(haloOwnerLocalIDs[proc].size() == (size_t)numOwnersOnOtherProcs[proc]); - assert(sendDataVec[proc].size() == haloOwnerLocalIDs[proc].size() * (size_t)numEntries); - MPI_Request req1, req2; - MPI_Isend(haloOwnerLocalIDs[proc].data(), haloOwnerLocalIDs[proc].size(), MPI_INT, proc, 1, comm, &req1); - MPI_Isend(sendDataVec[proc].data(), sendDataVec[proc].size(), MPI_DOUBLE, proc, 2, comm, &req2); + + if(mode == 0 && numOwnersOnOtherProcs[proc]){ + assert(haloOwnerLocalIDs[proc].size() == + static_cast(numOwnersOnOtherProcs[proc])); + + assert(sendDataVec[proc].size() == + haloOwnerLocalIDs[proc].size() * static_cast(numEntries)); + + MPI_Request req1; + MPI_Request req2; + + MPI_Isend( + haloOwnerLocalIDs[proc].data(), + haloOwnerLocalIDs[proc].size(), + MPI_INT, + proc, + 1, + comm, + &req1); + + MPI_Isend( + sendDataVec[proc].data(), + sendDataVec[proc].size(), + MPI_DOUBLE, + proc, + 2, + comm, + &req2); + requests.push_back(req1); requests.push_back(req2); } if(mode == 1 && numOwnersOnOtherProcs[proc]){ - MPI_Request req3, req4; - MPI_Irecv(recvIDVec[proc].data(), recvIDVec[proc].size(), MPI_INT, proc, 1, comm, &req3); - MPI_Irecv(recvDataVec[proc].data(), recvDataVec[proc].size(), MPI_DOUBLE, proc, 2, comm, &req4); + MPI_Request req3; + MPI_Request req4; + + MPI_Irecv( + recvIDVec[proc].data(), + recvIDVec[proc].size(), + MPI_INT, + proc, + 1, + comm, + &req3); + + MPI_Irecv( + recvDataVec[proc].data(), + recvDataVec[proc].size(), + MPI_DOUBLE, + proc, + 2, + comm, + &req4); + requests.push_back(req3); requests.push_back(req4); } - if(mode == 1 && numHalosOnOtherProcs[proc]) { - MPI_Request req1, req2; - MPI_Isend(ownerHaloLocalIDs[proc].data(), ownerHaloLocalIDs[proc].size(), MPI_INT, proc, 1, comm, &req1); - MPI_Isend(sendDataVec[proc].data(), sendDataVec[proc].size(), MPI_DOUBLE, proc, 2, comm, &req2); + + if(mode == 1 && numHalosOnOtherProcs[proc]){ + MPI_Request req1; + MPI_Request req2; + + MPI_Isend( + ownerHaloLocalIDs[proc].data(), + ownerHaloLocalIDs[proc].size(), + MPI_INT, + proc, + 1, + comm, + &req1); + + MPI_Isend( + sendDataVec[proc].data(), + sendDataVec[proc].size(), + MPI_DOUBLE, + proc, + 2, + comm, + &req2); + requests.push_back(req1); requests.push_back(req2); } } + MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE); } + MPMesh(Mesh* inMesh, MaterialPoints* inMPs): - p_mesh(inMesh), p_MPs(inMPs) { + p_mesh(inMesh), + p_MPs(inMPs) { }; - ~MPMesh() { + + ~MPMesh(){ delete p_mesh; delete p_MPs; } - //MP advection and tracking + + // MP advection and tracking void CVTTrackingEdgeCenterBased(Vec2dView dx); void CVTTrackingElmCenterBased(const int printVTPIndex = -1); void T2LTracking(Vec2dView dx); @@ -252,44 +413,521 @@ class MPMesh{ void push_swap_pos(); void push(); - //Used before advection to interpolate fields from mesh to MPs - //And also before reconstruction + + // Used before advection to interpolate fields from mesh to MPs + // And also before reconstruction void calcBasis(); - //Reconstruction + + // Reconstruction DoubleView assemblyV0(); + template void assemblyVtx0(); + template void assemblyElm0(); + template void assemblyVtx1(); + void reconstruct_coeff_full(); - void invertMatrix(const Kokkos::View& vtxMatrices, const double& radius); + + void invertMatrix( + const Kokkos::View& vtxMatrices, + const double& radius); + Kokkos::View precomputedVtxCoeffs_new; - Kokkos::View nearAnEdge; + Kokkos::View nearAnEdge; Kokkos::View vtxMatrixMass; - //Not used currently - std::map> reconstructSlice = std::map>(); + + // Not used currently + std::map> reconstructSlice = + std::map>(); + template DoubleView wtScaAssembly(); + template Vec2dView wtVec2Assembly(); + template - void assembly(int order, MeshFieldType type, bool basisWeightFlag, bool massWeightFlag); + void assembly( + int order, + MeshFieldType type, + bool basisWeightFlag, + bool massWeightFlag); + template - void setReconstructSlice(int order, MeshFieldType type); + void setReconstructSlice( + int order, + MeshFieldType type); + void reconstructSlices(); void printVTP_mesh(int printVTPIndex); - void writeMPTrackingVTP(int printVTPIndex, int numMPs, const Vec3dView& history, const Vec3dView& resultLeft, - const Vec3dView& resultRight, const Vec3dView& mpTgtPosArray); + void writeMPTrackingVTP( + int printVTPIndex, + int numMPs, + const Vec3dView& history, + const Vec3dView& resultLeft, + const Vec3dView& resultRight, + const Vec3dView& mpTgtPosArray); void calculateStrain(); void calculateStress(const int constitutive_relation); void calculateStressDivergence(); + + + +#ifdef CUDA_AWARE_MPI + + // Cached CUDA-aware MPI communication metadata and per-neighbor GPU buffers. + // Important change from the previous version: + // MPI is always given the base pointer of a Kokkos allocation, not + // "base pointer + offset". This avoids Cray MPICH/GTL CUDA IPC problems. + bool cudaAwareMPICacheValid = true; + bool cudaAwareMPIDisabled = false; + bool cudaAwareMPIEnvChecked = false; + bool cudaAwareMPIForceCPU = false; + bool cudaAwareMPILogged = false; + + struct CudaAwareMPIFieldCache{ + bool valid = false; + int cachedNumProcs = -1; + + std::vector sendCounts; + std::vector recvCounts; + + std::vector> sendEntityGPUPerProc; + std::vector> recvIDGPUPerProc; + + std::vector> sendDataGPUPerProc; + std::vector> recvDataGPUPerProc; + }; + + std::map, CudaAwareMPIFieldCache> cudaAwareMPICaches; + + bool cudaAwareMPIForceDisabled(){ + if(!cudaAwareMPIEnvChecked){ + const char* value = std::getenv("POLYMPO_DISABLE_CUDA_AWARE_MPI"); + cudaAwareMPIForceCPU = + value != nullptr && value[0] != '\0' && value[0] != '0'; + cudaAwareMPIEnvChecked = true; + } + + return cudaAwareMPIForceCPU; + } + + // Fully CUDA-aware MPI version: + // Field data is sent/received using GPU pointers. Receive IDs are cached + // once from the fixed halo/owner mapping and are not sent every call. + // + // Important: + // This function caches communication metadata and GPU buffers per + // (mode, numEntries). If the communication pattern changes, clear + // cudaAwareMPICaches before the next call. + template + void communicate_and_take_halo_contributions1_improved( + const ViewType& meshField, + int nEntities, + int numEntries, + int mode, + int op){ + + int self, numProcsTot; + + MPI_Comm comm = p_MPs->getMPIComm(); + + MPI_Comm_rank(comm, &self); + MPI_Comm_size(comm, &numProcsTot); + + assert(mode == 0 || mode == 1); + assert(op == 0 || op == 1); + assert(nEntities == numOwnersTot + numHalosTot); + + if(cudaAwareMPIDisabled || cudaAwareMPIForceDisabled()){ + communicate_and_take_halo_contributions1( + meshField, + nEntities, + numEntries, + mode, + op); + return; + } + +#ifdef POLYMPO_VERBOSE_MPI + if(self == 0 && !cudaAwareMPILogged){ + std::cout + << "[CUDA_AWARE_MPI] Using per-proc cached full GPU-aware MPI path in communicate_and_take_halo_contributions1_improved()" + << "\n"; + cudaAwareMPILogged = true; + } +#endif + + Kokkos::Timer timer; + + if(!cudaAwareMPICacheValid){ + cudaAwareMPICaches.clear(); + cudaAwareMPICacheValid = true; + } + + auto& cudaAwareCache = + cudaAwareMPICaches[std::make_pair(mode, numEntries)]; + + const bool needRebuild = + (!cudaAwareCache.valid) || + (cudaAwareCache.cachedNumProcs != numProcsTot); + + if(needRebuild){ + + cudaAwareCache.cachedNumProcs = numProcsTot; + + cudaAwareCache.sendCounts.assign(numProcsTot, 0); + cudaAwareCache.recvCounts.assign(numProcsTot, 0); + + cudaAwareCache.sendEntityGPUPerProc.clear(); + cudaAwareCache.recvIDGPUPerProc.clear(); + cudaAwareCache.sendDataGPUPerProc.clear(); + cudaAwareCache.recvDataGPUPerProc.clear(); + + cudaAwareCache.sendEntityGPUPerProc.resize(numProcsTot); + cudaAwareCache.recvIDGPUPerProc.resize(numProcsTot); + cudaAwareCache.sendDataGPUPerProc.resize(numProcsTot); + cudaAwareCache.recvDataGPUPerProc.resize(numProcsTot); + + for(int proc = 0; proc < numProcsTot; proc++){ + if(proc == self) continue; + + if(mode == 0){ + cudaAwareCache.sendCounts[proc] = numOwnersOnOtherProcs[proc]; + cudaAwareCache.recvCounts[proc] = numHalosOnOtherProcs[proc]; + } + else{ + cudaAwareCache.sendCounts[proc] = numHalosOnOtherProcs[proc]; + cudaAwareCache.recvCounts[proc] = numOwnersOnOtherProcs[proc]; + } + } + + for(int proc = 0; proc < numProcsTot; proc++){ + if(proc == self) continue; + + const int sendCount = cudaAwareCache.sendCounts[proc]; + const int recvCount = cudaAwareCache.recvCounts[proc]; + + if(sendCount > 0){ + cudaAwareCache.sendEntityGPUPerProc[proc] = + Kokkos::View( + "cudaAwareMPISendEntityGPUPerProc", + sendCount); + + cudaAwareCache.sendDataGPUPerProc[proc] = + Kokkos::View( + "cudaAwareMPISendDataGPUPerProc", + sendCount * numEntries); + + auto sendEntityCPU = + Kokkos::View( + "sendEntityCPU", + sendCount); + + if(mode == 0){ + assert(haloOwnerLocalIDs[proc].size() == + static_cast(sendCount)); + + int localIndex = 0; + + for(int iEnt = 0; iEnt < numHalosTot; iEnt++){ + int ownerProc = haloOwnerProcs[iEnt]; + + if(ownerProc != proc) continue; + + assert(localIndex < sendCount); + + sendEntityCPU(localIndex) = numOwnersTot + iEnt; + + localIndex++; + } + + assert(localIndex == sendCount); + } + else{ + assert(ownerOwnerLocalIDs[proc].size() == + static_cast(sendCount)); + + for(int i = 0; i < sendCount; i++){ + sendEntityCPU(i) = ownerOwnerLocalIDs[proc][i]; + } + } + + Kokkos::deep_copy( + cudaAwareCache.sendEntityGPUPerProc[proc], + sendEntityCPU); + } + + if(recvCount > 0){ + cudaAwareCache.recvIDGPUPerProc[proc] = + Kokkos::View( + "cudaAwareMPIRecvIDGPUPerProc", + recvCount); + + cudaAwareCache.recvDataGPUPerProc[proc] = + Kokkos::View( + "cudaAwareMPIRecvDataGPUPerProc", + recvCount * numEntries); + + auto recvIDCPU = + Kokkos::View( + "recvIDCPU", + recvCount); + + if(mode == 0){ + assert(ownerOwnerLocalIDs[proc].size() == + static_cast(recvCount)); + + for(int i = 0; i < recvCount; i++){ + recvIDCPU(i) = ownerOwnerLocalIDs[proc][i]; + } + } + else{ + int localIndex = 0; + + for(int iEnt = 0; iEnt < numHalosTot; iEnt++){ + if(haloOwnerProcs[iEnt] != proc) continue; + + assert(localIndex < recvCount); + + recvIDCPU(localIndex) = numOwnersTot + iEnt; + + localIndex++; + } + + assert(localIndex == recvCount); + } + + Kokkos::deep_copy( + cudaAwareCache.recvIDGPUPerProc[proc], + recvIDCPU); + } + } + + Kokkos::fence(); + + cudaAwareCache.valid = true; + + pumipic::RecordTime( + "SD: CUDA-aware MPI Cache Build m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + timer.seconds()); + + timer.reset(); + } + + for(int proc = 0; proc < numProcsTot; proc++){ + if(proc == self) continue; + if(cudaAwareCache.sendCounts[proc] <= 0) continue; + + auto sendEntityGPU = cudaAwareCache.sendEntityGPUPerProc[proc]; + auto sendDataGPU = cudaAwareCache.sendDataGPUPerProc[proc]; + int sendCount = cudaAwareCache.sendCounts[proc]; + + Kokkos::parallel_for( + "pack cached cuda-aware mpi send buffer per proc", + sendCount, + KOKKOS_LAMBDA(const int i){ + int entity = sendEntityGPU(i); + + for(int k = 0; k < numEntries; k++){ + sendDataGPU(i * numEntries + k) = + meshField(entity, k); + } + }); + } + + Kokkos::fence(); + + pumipic::RecordTime( + "SD: CUDA-aware MPI Pack m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + timer.seconds()); + + timer.reset(); + + Kokkos::Timer mpiTotalTimer; + std::vector requests; + requests.reserve(2 * numProcsTot); + int mpiError = MPI_SUCCESS; + + for(int proc = 0; proc < numProcsTot; proc++){ + if(proc == self) continue; + + if(cudaAwareCache.recvCounts[proc] > 0){ + MPI_Request reqData; + + mpiError = MPI_Irecv( + cudaAwareCache.recvDataGPUPerProc[proc].data(), + cudaAwareCache.recvCounts[proc] * numEntries, + MPI_DOUBLE, + proc, + 2, + comm, + &reqData); + if(mpiError != MPI_SUCCESS) break; + requests.push_back(reqData); + } + + if(cudaAwareCache.sendCounts[proc] > 0){ + MPI_Request reqData; + + mpiError = MPI_Isend( + cudaAwareCache.sendDataGPUPerProc[proc].data(), + cudaAwareCache.sendCounts[proc] * numEntries, + MPI_DOUBLE, + proc, + 2, + comm, + &reqData); + if(mpiError != MPI_SUCCESS) break; + requests.push_back(reqData); + } + } + + pumipic::RecordTime( + "SD: CUDA-aware MPI Post m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + timer.seconds()); + + timer.reset(); + + if(mpiError == MPI_SUCCESS && !requests.empty()){ + mpiError = MPI_Waitall( + static_cast(requests.size()), + requests.data(), + MPI_STATUSES_IGNORE); + } + + pumipic::RecordTime( + "SD: CUDA-aware MPI Wait m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + timer.seconds()); + + if(mpiError != MPI_SUCCESS){ + cudaAwareMPIDisabled = true; + + if(self == 0){ + std::cout + << "[CUDA_AWARE_MPI] Device-pointer MPI failed." + << std::endl; + } + + if(requests.empty()){ + if(self == 0){ + std::cout + << "[CUDA_AWARE_MPI] Falling back to CPU-staged communication." + << std::endl; + } + + communicate_and_take_halo_contributions1( + meshField, + nEntities, + numEntries, + mode, + op); + return; + } + + if(self == 0){ + std::cout + << "[CUDA_AWARE_MPI] Failure happened after MPI requests were posted. " + << "Set POLYMPO_DISABLE_CUDA_AWARE_MPI=1 before running to force the CPU-staged path." + << std::endl; + } + + MPI_Abort(comm, mpiError); + return; + } + + pumipic::RecordTime( + "SD: CUDA-aware MPI Comm m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + mpiTotalTimer.seconds()); + + timer.reset(); + + for(int proc = 0; proc < numProcsTot; proc++){ + if(proc == self) continue; + if(cudaAwareCache.recvCounts[proc] <= 0) continue; + + auto recvIDGPU = cudaAwareCache.recvIDGPUPerProc[proc]; + auto recvDataGPU = cudaAwareCache.recvDataGPUPerProc[proc]; + int recvCount = cudaAwareCache.recvCounts[proc]; + + if(op == 0){ + Kokkos::parallel_for( + "halo add cached cuda-aware mpi per proc", + recvCount, + KOKKOS_LAMBDA(const int i){ + const int vertex = recvIDGPU(i); + + for(int k = 0; k < numEntries; k++){ +#ifdef POLYMPO_ASSUME_UNIQUE_HALO_CONTRIBS + meshField(vertex, k) += + recvDataGPU(i * numEntries + k); +#else + Kokkos::atomic_add( + &meshField(vertex, k), + recvDataGPU(i * numEntries + k)); +#endif + } + }); + } + else{ + Kokkos::parallel_for( + "halo assign cached cuda-aware mpi per proc", + recvCount, + KOKKOS_LAMBDA(const int i){ + const int vertex = recvIDGPU(i); + + for(int k = 0; k < numEntries; k++){ + meshField(vertex, k) = + recvDataGPU(i * numEntries + k); + } + }); + } + } + + Kokkos::fence(); + + pumipic::RecordTime( + "SD: CUDA-aware MPI Contribution m" + std::to_string(mode) + + " e" + std::to_string(numEntries) + "-" + std::to_string(self), + timer.seconds()); + } + +#else + + // Fallback path: + // if CUDA_AWARE_MPI is not defined, use the original GPU-CPU staging function. + template + void communicate_and_take_halo_contributions1_improved( + const ViewType& meshField, + int nEntities, + int numEntries, + int mode, + int op){ + + communicate_and_take_halo_contributions1( + meshField, + nEntities, + numEntries, + mode, + op); + } + +#endif + }; }//namespace polyMPO end diff --git a/src/pmpo_MPMesh_assembly.hpp b/src/pmpo_MPMesh_assembly.hpp index 9f184e05..7705c8f7 100644 --- a/src/pmpo_MPMesh_assembly.hpp +++ b/src/pmpo_MPMesh_assembly.hpp @@ -161,10 +161,10 @@ void MPMesh::reconstruct_coeff_full(){ int mode = 0; int op = 0; if (numProcsTot >1){ - communicate_and_take_halo_contributions1(vtxMatrices, numVertices, numEntriesMatrix, mode, op); + communicate_and_take_halo_contributions1_improved(vtxMatrices, numVertices, numEntriesMatrix, mode, op); mode=1; op=1; - communicate_and_take_halo_contributions1(vtxMatrices, numVertices, numEntriesMatrix, mode, op); + communicate_and_take_halo_contributions1_improved(vtxMatrices, numVertices, numEntriesMatrix, mode, op); } pumipic::RecordTime("Communicate Matrix Values" + std::to_string(self), timer.seconds()); @@ -380,7 +380,7 @@ void MPMesh::assemblyVtx1(){ timer.reset(); if(numProcsTot>1){ - communicate_and_take_halo_contributions1(meshField, numVertices, numEntries, 0, 0); + communicate_and_take_halo_contributions1_improved(meshField, numVertices, numEntries, 0, 0); } pumipic::RecordTime("Communicate Field Values" + std::to_string(self), timer.seconds()); } diff --git a/src/pmpo_c.cpp b/src/pmpo_c.cpp index 53c330e1..cdefc2f5 100644 --- a/src/pmpo_c.cpp +++ b/src/pmpo_c.cpp @@ -1609,7 +1609,7 @@ void polympo_set_halo_vel_from_owner_f(MPMesh_ptr p_mpmesh){ auto p_mesh = ((polyMPO::MPMesh*)p_mpmesh)->p_mesh; int numVertices = p_mesh->getNumVertices(); auto vtxFieldVel = p_mesh->getMeshField(); - mpMesh->communicate_and_take_halo_contributions1(vtxFieldVel, numVertices, 2, 1, 1); + mpMesh->communicate_and_take_halo_contributions1_improved(vtxFieldVel, numVertices, 2, 1, 1); } //Advection Calcualtions