From 34366cf8b864c764daa85ab99e6034e3b2365546 Mon Sep 17 00:00:00 2001 From: Chris O'Neil Date: Tue, 14 Apr 2026 14:22:44 +0100 Subject: [PATCH] fix: prevent replication neighbor sync from blocking shutdown under active traffic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The neighbor sync loop ran `run_neighbor_sync_round()` outside of its `tokio::select!` block. When the shutdown token was cancelled mid-round, the task couldn't notice until the entire sync round completed — which involves multiple network round-trips to peers that may themselves be shutting down, causing extended blocking. Wrap the sync round in a `tokio::select!` with `shutdown.cancelled()` so in-progress operations are cancelled immediately when shutdown fires. Also add a 10-second timeout to the replication engine's task joins in `shutdown()` as defense in depth, matching the pattern applied to `DhtNetworkManager::stop()`. Discovered during auto-upgrade testing on a 151-node testnet with active client uploads. The DHT shutdown fix (saorsa-core) resolved 98% of hangs; this fix resolved the remaining 2%. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/replication/mod.rs | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/replication/mod.rs b/src/replication/mod.rs index f9b567c4..336a3715 100644 --- a/src/replication/mod.rs +++ b/src/replication/mod.rs @@ -255,8 +255,16 @@ impl ReplicationEngine { /// released (e.g. before reopening the same LMDB environment). pub async fn shutdown(&mut self) { self.shutdown.cancel(); - for handle in self.task_handles.drain(..) 
{ - let _ = handle.await; + for (i, mut handle) in self.task_handles.drain(..).enumerate() { + match tokio::time::timeout(std::time::Duration::from_secs(10), &mut handle).await { + Ok(Ok(())) => {} + Ok(Err(e)) if e.is_cancelled() => {} + Ok(Err(e)) => warn!("Replication task {i} panicked during shutdown: {e}"), + Err(_) => { + warn!("Replication task {i} did not stop within 10s, aborting"); + handle.abort(); + } + } } } @@ -435,18 +443,23 @@ impl ReplicationEngine { debug!("Neighbor sync triggered by topology change"); } } - run_neighbor_sync_round( - &p2p, - &storage, - &paid_list, - &queues, - &config, - &sync_state, - &sync_history, - &is_bootstrapping, - &bootstrap_state, - ) - .await; + // Wrap the sync round in a select so shutdown cancels + // in-progress network operations rather than waiting for + // the full round to complete. + tokio::select! { + () = shutdown.cancelled() => break, + _ = run_neighbor_sync_round( + &p2p, + &storage, + &paid_list, + &queues, + &config, + &sync_state, + &sync_history, + &is_bootstrapping, + &bootstrap_state, + ) => {} + } } debug!("Neighbor sync loop shut down"); });