From 96341efb089f3d5cfbec36b9dd23ff1b819f7a3a Mon Sep 17 00:00:00 2001
From: JYMiracle305 <604951424@qq.com>
Date: Thu, 23 Apr 2026 17:19:59 +0800
Subject: [PATCH] fix: resolve GPU memory leak in pipeline parallel training

---
 infini_train/src/nn/parallel/pp/pipeline_schedule.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
index 1df2f1b5..090b7b15 100644
--- a/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
+++ b/infini_train/src/nn/parallel/pp/pipeline_schedule.cc
@@ -50,6 +50,13 @@ std::vector<std::shared_ptr<Tensor>> PipelineSchedule::ReceiveFromPrev(int peer_
         // FIXME(jym): The data type between stages is not float32, which will cause a crash
         auto tensor = std::make_shared<Tensor>(shapes[i], DataType::kFLOAT32, stage_->device());
         tensor->set_requires_grad(true);
+
+        // Mark as non-leaf to prevent the autograd engine from creating an AccumulateGrad
+        // for this tensor. Otherwise, IRecv's next_functions_ would hold AccumulateGrad,
+        // which holds a shared_ptr back to this tensor, forming a reference cycle:
+        //   tensor -> grad_fn_(IRecv) -> next_functions_ -> AccumulateGrad -> tensor
+        // This cycle prevents the autograd graph from being released after backward.
+        tensor->set_is_leaf(false);
         recv_tensors.push_back(tensor);
     }
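For readers unfamiliar with why set_is_leaf(false) breaks the leak, below is a
minimal standalone sketch of the ownership cycle the comment describes. The
Tensor, Function, and AccumulateGrad types here are simplified stand-ins, not
the InfiniTrain API; the point is only that a strong shared_ptr back-reference
from AccumulateGrad to its tensor keeps the whole autograd chain alive after
the last external reference is dropped.

#include <iostream>
#include <memory>
#include <vector>

// Simplified stand-ins for autograd types (assumed for illustration,
// not the InfiniTrain definitions).
struct Function {
    std::vector<std::shared_ptr<Function>> next_functions_;
    virtual ~Function() { std::cout << "Function destroyed\n"; }
};

struct Tensor {
    std::shared_ptr<Function> grad_fn_;
    ~Tensor() { std::cout << "Tensor destroyed\n"; }
};

// AccumulateGrad keeps the tensor alive so it can write the gradient into it.
struct AccumulateGrad : Function {
    std::shared_ptr<Tensor> tensor_; // strong back-reference -> potential cycle
    explicit AccumulateGrad(std::shared_ptr<Tensor> t) : tensor_(std::move(t)) {}
};

int main() {
    {
        // Leaf tensor: the engine creates an AccumulateGrad pointing back at it.
        auto t = std::make_shared<Tensor>();
        auto irecv = std::make_shared<Function>(); // plays the role of IRecv
        irecv->next_functions_.push_back(std::make_shared<AccumulateGrad>(t));
        t->grad_fn_ = irecv;
        // Cycle: t -> grad_fn_ (irecv) -> next_functions_ -> AccumulateGrad -> t
    }
    // Nothing is printed here: both objects leak despite leaving scope.

    {
        // Non-leaf tensor: no AccumulateGrad is created, hence no back-reference.
        auto t = std::make_shared<Tensor>();
        t->grad_fn_ = std::make_shared<Function>();
    }
    // Prints "Tensor destroyed" then "Function destroyed": the graph is released.
}

Running this, the first scope prints nothing (the cycle pins both objects in
memory), while the second scope tears down cleanly, which mirrors the before
and after behavior of the patch.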