From 96341efb089f3d5cfbec36b9dd23ff1b819f7a3a Mon Sep 17 00:00:00 2001
From: JYMiracle305 <604951424@qq.com>
Date: Thu, 23 Apr 2026 17:19:59 +0800
Subject: [PATCH] fix: resolve GPU memory leak in pipeline parallel training

---
 infini_train/src/nn/parallel/pp/pipeline_schedule.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
index 1df2f1b5..090b7b15 100644
--- a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
+++ b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
@@ -50,6 +50,13 @@ std::vector<std::shared_ptr<Tensor>> PipelineSchedule::ReceiveFromPrev(int peer_
         // FIXME(jym): The data type between stages is not float32, which will cause a crash
         auto tensor = std::make_shared<Tensor>(shapes[i], DataType::kFLOAT32, stage_->device());
         tensor->set_requires_grad(true);
+
+        // Mark as non-leaf to prevent the autograd engine from creating an AccumulateGrad
+        // for this tensor. Otherwise, IRecv's next_functions_ would hold AccumulateGrad,
+        // which holds a shared_ptr back to this tensor, forming a reference cycle:
+        //   tensor -> grad_fn_(IRecv) -> next_functions_ -> AccumulateGrad -> tensor
+        // This cycle prevents the autograd graph from being released after backward.
+        tensor->set_is_leaf(false);
         recv_tensors.push_back(tensor);
     }
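For readers unfamiliar with why set_is_leaf(false) breaks the leak, below is a
minimal standalone sketch of the ownership cycle the comment describes. The
Tensor, Function, and AccumulateGrad types here are simplified stand-ins, not
the InfiniTrain API; the point is only that a strong shared_ptr back-reference
from AccumulateGrad to its tensor keeps the whole autograd chain alive after
the last external reference is dropped.

#include <iostream>
#include <memory>
#include <vector>

// Simplified stand-ins for autograd types (assumed for illustration,
// not the InfiniTrain definitions).
struct Function {
    std::vector<std::shared_ptr<Function>> next_functions_;
    virtual ~Function() { std::cout << "Function destroyed\n"; }
};

struct Tensor {
    std::shared_ptr<Function> grad_fn_;
    ~Tensor() { std::cout << "Tensor destroyed\n"; }
};

// AccumulateGrad keeps the tensor alive so it can write the gradient into it.
struct AccumulateGrad : Function {
    std::shared_ptr<Tensor> tensor_; // strong back-reference -> potential cycle
    explicit AccumulateGrad(std::shared_ptr<Tensor> t) : tensor_(std::move(t)) {}
};

int main() {
    {
        // Leaf tensor: the engine creates an AccumulateGrad pointing back at it.
        auto t = std::make_shared<Tensor>();
        auto irecv = std::make_shared<Function>(); // plays the role of IRecv
        irecv->next_functions_.push_back(std::make_shared<AccumulateGrad>(t));
        t->grad_fn_ = irecv;
        // Cycle: t -> grad_fn_ (irecv) -> next_functions_ -> AccumulateGrad -> t
    }
    // Nothing is printed here: both objects leak despite leaving scope.

    {
        // Non-leaf tensor: no AccumulateGrad is created, hence no back-reference.
        auto t = std::make_shared<Tensor>();
        t->grad_fn_ = std::make_shared<Function>();
    }
    // Prints "Tensor destroyed" then "Function destroyed": the graph is released.
}

Running this, the first scope prints nothing (the cycle pins both objects in
memory), while the second scope tears down cleanly, which mirrors the before
and after behavior of the patch.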