From 7d94cdbc44c41d7cc3512013b1957dc111d5643e Mon Sep 17 00:00:00 2001 From: Rinse Date: Sat, 13 Jun 2026 07:08:59 +0000 Subject: [PATCH 1/2] fix(daemon): run async kubo cleanup before process.exit on startup failure (issue #98) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The daemon registers an asyncExitHook (exit-hook) to kill kubo and destroy the RPC server on shutdown. exit-hook only runs async hooks for SIGINT/SIGTERM/ beforeExit; an explicit process.exit() runs only synchronous hooks and prints "SYNCHRONOUS TERMINATION NOTICE". The notice's real trigger is the startup-failure path, not the force-quit guard at daemon.ts:655 (by the second signal exit-hook's internal isCalled is already true, so its synchronous handler returns before the warning). When the hook is registered (daemon.ts:600), kubo is spawned by keepKuboUp() (:688), and then createOrConnectRpc() (:689) throws — e.g. the #87/#97 TOCTOU RPC-port race — the throw unwinds to oclif's error handler (@oclif/core errors/handle.js), which calls process.exit(1). That skips the async cleanup: the notice prints, kubo is orphaned, and this daemon's state file is left behind. Fix: - Extract the async exit-hook body into a named shutdownDaemon() and capture both the hook's unsubscribe handle and the fn (hoisted above the try). - In the startup-failure catch, await shutdownDaemon() (kills kubo, destroys the RPC server, deletes the state file) and drop the hook before re-throwing, so oclif's process.exit() runs clean. No-ops if we failed before the hook was registered. - In the force-quit guard, drop the hook before its deliberate process.exit() as belt-and-suspenders; kubo is still SIGKILLed by the emergency "exit" handler. Test: - Add a guarded test hook PKC_CLI_TEST_FAIL_AFTER_KUBO_START that throws at the top of createOrConnectRpc, after kubo is up, mirroring the real port race. - New daemon.test.ts case "kills kubo (and prints no exit-hook termination notice) when startup fails after kubo is up": asserts non-zero exit, no SYNCHRONOUS TERMINATION NOTICE, and kubo stopped (not orphaned). Verified red before the fix, green after. Full daemon.test.ts (23), update-install-restart-race, and daemon-kubo-restart-race suites pass. --- src/cli/commands/daemon.ts | 83 +++++++++++++++++++++++++++----------- test/cli/daemon.test.ts | 48 ++++++++++++++++++++++ 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index 4fd22ff..b441bfb 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -260,6 +260,13 @@ export default class Daemon extends Command { const { logFilePath, stdoutWrite } = await this._pipeDebugLogsToLogFile(flags.logPath, PKCLogger as PKCLoggerType); const log = PKCLogger("bitsocial-cli:daemon"); + // Captured once the async exit hook is registered (inside the try). The startup-failure + // catch and the force-quit signal guard use these to run kubo/daemon cleanup and drop the + // hook before the process exits via process.exit() — which would otherwise skip the async + // hook (exit-hook's "SYNCHRONOUS TERMINATION NOTICE") and orphan kubo. (issue #98) + let runDaemonShutdown: (() => Promise) | undefined; + let removeAsyncExitHook: (() => void) | undefined; + try { // Log debug info after pipe is set up so it goes to the log file, not terminal const envDebug: string | undefined = process.env["_PKC_DEBUG"] || process.env["DEBUG"]; @@ -480,6 +487,13 @@ export default class Daemon extends Command { const createOrConnectRpc = async () => { if (mainProcessExited) return; if (startedOwnRpc) return; + // Test hook (issue #98): force a startup failure *after* keepKuboUp() has already + // spawned kubo, reproducing the real TOCTOU port race (createOrConnectRpc throwing + // once kubo is up). Verifies the daemon tears kubo down — and doesn't print + // exit-hook's "SYNCHRONOUS TERMINATION NOTICE" — when oclif process.exit()s on a + // thrown startup error, instead of orphaning kubo. + if (process.env["PKC_CLI_TEST_FAIL_AFTER_KUBO_START"]) + throw new Error("Simulated startup failure after kubo start (PKC_CLI_TEST_FAIL_AFTER_KUBO_START)"); // Re-check the port: the early fail-fast at startup is a few ms before this runs, // so a TOCTOU race could let another process grab the port in between. If that // happens we must fail rather than silently leaving the daemon without an RPC. @@ -597,36 +611,44 @@ export default class Daemon extends Command { liveKuboPids.clear(); }; - asyncExitHook( - async () => { - if (keepKuboUpInterval) clearInterval(keepKuboUpInterval); - if (mainProcessExited) return; // we already exited - console.log( - "\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..." - ); - log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds"); + const shutdownDaemon = async () => { + if (keepKuboUpInterval) clearInterval(keepKuboUpInterval); + if (mainProcessExited) return; // we already exited + console.log( + "\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..." + ); + log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds"); - mainProcessExited = true; + mainProcessExited = true; - // Remove daemon state file so update install knows we're gone - await deleteDaemonState(process.pid).catch(() => {}); + // Remove daemon state file so update install knows we're gone + await deleteDaemonState(process.pid).catch(() => {}); - // Start killing Kubo immediately, in parallel with daemon server destroy. - // This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs. - const kuboKillPromise = killKuboProcess(); + // Start killing Kubo immediately, in parallel with daemon server destroy. + // This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs. + const kuboKillPromise = killKuboProcess(); - if (daemonServer) - try { - await daemonServer.destroy(); - log("Daemon server shut down"); - } catch (e) { - log.error("Error shutting down daemon server", e); - } + if (daemonServer) + try { + await daemonServer.destroy(); + log("Daemon server shut down"); + } catch (e) { + log.error("Error shutting down daemon server", e); + } - await kuboKillPromise; - }, + await kuboKillPromise; + }; + // Run kubo/daemon cleanup on SIGINT/SIGTERM/beforeExit. Capture the unsubscribe handle + // and the shutdown fn so the startup-failure catch (and the force-quit guard below) can + // run cleanup themselves and drop this hook: oclif calls process.exit() as it unwinds a + // thrown startup error, and process.exit() runs only synchronous exit hooks — exit-hook + // would skip this async one (printing its "SYNCHRONOUS TERMINATION NOTICE") and orphan a + // kubo we already spawned. (issue #98) + removeAsyncExitHook = asyncExitHook( + shutdownDaemon, { wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down ); + runDaemonShutdown = shutdownDaemon; // Emergency cleanup: if the process force-exits (e.g. double Ctrl+C), // synchronously SIGKILL every live kubo's process group. This is a no-op if @@ -652,6 +674,11 @@ export default class Daemon extends Command { terminationSignalCount++; if (terminationSignalCount >= 2) { log(`Received ${signal} again during shutdown, force-quitting`); + // Deliberate immediate exit: the first signal's async shutdown is still + // running but the user wants out now. Drop the async exit hook first so the + // imminent process.exit() doesn't trip exit-hook's "SYNCHRONOUS TERMINATION + // NOTICE"; kubo is still SIGKILLed by the emergency "exit" handler. (issue #98) + removeAsyncExitHook?.(); process.exit(signal === "SIGINT" ? 130 : 143); } }); @@ -717,6 +744,16 @@ export default class Daemon extends Command { stdoutWrite(`Full log: ${logFilePath}\n`); stdoutWrite(`Or run: bitsocial logs\n`); + + // oclif's error handler calls process.exit() as it unwinds this throw, which runs only + // synchronous exit hooks — exit-hook would skip the async kubo/daemon cleanup (printing + // "SYNCHRONOUS TERMINATION NOTICE") and orphan a kubo we may have already spawned, and + // leave this daemon's state file behind. Run that cleanup now and drop the now-redundant + // hook so the process exits clean. No-ops if we failed before the hook was registered. + // (issue #98) + if (runDaemonShutdown) await runDaemonShutdown().catch(() => {}); + removeAsyncExitHook?.(); + throw err; } } diff --git a/test/cli/daemon.test.ts b/test/cli/daemon.test.ts index 1dcf8c8..e23bb20 100644 --- a/test/cli/daemon.test.ts +++ b/test/cli/daemon.test.ts @@ -863,6 +863,54 @@ describe("bitsocial daemon kills kubo on its own shutdown (no backup /shutdown c if (kuboApiUrl) await ensureKuboNodeStopped(kuboApiUrl); } }); + + it.skipIf(process.platform === "win32")( + "kills kubo (and prints no exit-hook termination notice) when startup fails after kubo is up", + { timeout: 60000 }, + async () => { + // Reproduces issue #98: when the daemon throws during startup *after* kubo has spawned + // (the real TOCTOU RPC-port race, simulated here via PKC_CLI_TEST_FAIL_AFTER_KUBO_START), + // oclif's error handler calls process.exit(). process.exit() runs only synchronous exit + // hooks, so exit-hook would skip the async kubo cleanup — printing "SYNCHRONOUS TERMINATION + // NOTICE" and orphaning kubo. The daemon must instead tear kubo down itself and exit quietly. + const endpoints = await allocateKuboEndpoints(); + const dataPath = randomDirectory(); + await preInitKuboWithEphemeralSwarm( + path.join(dataPath, ".bitsocial-cli.ipfs"), + new URL(endpoints.kuboRpcUrl), + new URL(endpoints.gatewayUrl) + ); + try { + const result = await runPkcDaemonExpectFailure(["--pkcOptions.dataPath", dataPath, "--pkcRpcUrl", endpoints.rpcWsUrl], { + KUBO_RPC_URL: endpoints.kuboRpcUrl, + IPFS_GATEWAY_URL: endpoints.gatewayUrl, + PKC_CLI_TEST_FAIL_AFTER_KUBO_START: "1" + }); + + // The daemon exited non-zero on the injected startup failure... + expect(result.exitCode).not.toBe(0); + // ...without exit-hook's synchronous-termination warning (proves the async cleanup ran). + const combinedOutput = `${result.stdout}\n${result.stderr}`; + expect(combinedOutput).not.toContain("SYNCHRONOUS TERMINATION NOTICE"); + // ...and kubo was killed by the daemon's own cleanup, not orphaned. + const kuboStopped = await waitForCondition( + async () => { + try { + const res = await fetch(`${endpoints.kuboApiUrl}/bitswap/stat`, { method: "POST" }); + return !res.ok; + } catch { + return true; // connection refused = stopped + } + }, + 20000, + 500 + ); + expect(kuboStopped).toBe(true); + } finally { + await ensureKuboNodeStopped(endpoints.kuboApiUrl); + } + } + ); }); describe("bitsocial daemon DEBUG env var", () => { From bf4778e2a3f26b12ae7331eafdfb4b7ecc131c47 Mon Sep 17 00:00:00 2001 From: Rinse Date: Sat, 13 Jun 2026 07:38:28 +0000 Subject: [PATCH 2/2] fix(daemon): cap startup-failure shutdown + pin its test to the injected path (issue #98) Address CodeRabbit review on #99: - Cap the manual startup-failure shutdown with DAEMON_SHUTDOWN_TIMEOUT_MS (Promise.race with an unref'd timer), matching the timeout contract the signal path gets from exit-hook's `wait`, so a hung daemonServer.destroy() can't swallow the original startup error forever. Low live risk (daemonServer is ~never set on the catch path and killKuboProcess is already bounded), but it keeps both exit paths consistent. - Assert the test's combined output contains the injected failure message so the case stays pinned to the PKC_CLI_TEST_FAIL_AFTER_KUBO_START path instead of passing on any non-zero failure. Wrap the spawn in withKuboBindRetry so a lost kubo bind race (issue #87) retries with fresh ports rather than flaking the new message assertion. daemon.test.ts (23), update-install-restart-race, and daemon-kubo-restart-race all pass. --- src/cli/commands/daemon.ts | 10 ++++++++-- test/cli/daemon.test.ts | 38 ++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index b441bfb..7b7e38e 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -750,8 +750,14 @@ export default class Daemon extends Command { // "SYNCHRONOUS TERMINATION NOTICE") and orphan a kubo we may have already spawned, and // leave this daemon's state file behind. Run that cleanup now and drop the now-redundant // hook so the process exits clean. No-ops if we failed before the hook was registered. - // (issue #98) - if (runDaemonShutdown) await runDaemonShutdown().catch(() => {}); + // Cap the wait with the same DAEMON_SHUTDOWN_TIMEOUT_MS the signal path gets (via the + // exit-hook `wait` option) so a hung daemonServer.destroy() can't swallow the original + // startup error forever; the unref'd timer never keeps the loop alive. (issue #98) + if (runDaemonShutdown) + await Promise.race([ + runDaemonShutdown().catch(() => {}), + new Promise((resolve) => setTimeout(resolve, DAEMON_SHUTDOWN_TIMEOUT_MS).unref()) + ]); removeAsyncExitHook?.(); throw err; diff --git a/test/cli/daemon.test.ts b/test/cli/daemon.test.ts index e23bb20..bbc91aa 100644 --- a/test/cli/daemon.test.ts +++ b/test/cli/daemon.test.ts @@ -21,7 +21,9 @@ import { requestKuboShutdown, waitForWebSocketOpen, waitForKuboReady, - waitForPortFree + waitForPortFree, + withKuboBindRetry, + isAddressInUseError } from "../helpers/daemon-helpers.js"; import { preInitKuboWithEphemeralSwarm } from "../helpers/kubo-helpers.js"; dns.setDefaultResultOrder("ipv4first"); // to be able to resolve localhost @@ -866,31 +868,39 @@ describe("bitsocial daemon kills kubo on its own shutdown (no backup /shutdown c it.skipIf(process.platform === "win32")( "kills kubo (and prints no exit-hook termination notice) when startup fails after kubo is up", - { timeout: 60000 }, + { timeout: 90000 }, async () => { // Reproduces issue #98: when the daemon throws during startup *after* kubo has spawned // (the real TOCTOU RPC-port race, simulated here via PKC_CLI_TEST_FAIL_AFTER_KUBO_START), // oclif's error handler calls process.exit(). process.exit() runs only synchronous exit // hooks, so exit-hook would skip the async kubo cleanup — printing "SYNCHRONOUS TERMINATION // NOTICE" and orphaning kubo. The daemon must instead tear kubo down itself and exit quietly. - const endpoints = await allocateKuboEndpoints(); const dataPath = randomDirectory(); - await preInitKuboWithEphemeralSwarm( - path.join(dataPath, ".bitsocial-cli.ipfs"), - new URL(endpoints.kuboRpcUrl), - new URL(endpoints.gatewayUrl) - ); - try { - const result = await runPkcDaemonExpectFailure(["--pkcOptions.dataPath", dataPath, "--pkcRpcUrl", endpoints.rpcWsUrl], { - KUBO_RPC_URL: endpoints.kuboRpcUrl, - IPFS_GATEWAY_URL: endpoints.gatewayUrl, + // Retry on a lost kubo bind race (issue #87): the injected failure only fires once kubo is + // up, so if kubo's freshly-allocated ports are stolen in the TOCTOU window the daemon fails + // early instead — a different path. Retrying keeps the assertions below pinned to the + // intended path (the message check would otherwise flake on a bind race). + const { result, endpoints } = await withKuboBindRetry(async (e) => { + await preInitKuboWithEphemeralSwarm(path.join(dataPath, ".bitsocial-cli.ipfs"), new URL(e.kuboRpcUrl), new URL(e.gatewayUrl)); + const res = await runPkcDaemonExpectFailure(["--pkcOptions.dataPath", dataPath, "--pkcRpcUrl", e.rpcWsUrl], { + KUBO_RPC_URL: e.kuboRpcUrl, + IPFS_GATEWAY_URL: e.gatewayUrl, PKC_CLI_TEST_FAIL_AFTER_KUBO_START: "1" }); + const combined = `${res.stdout}\n${res.stderr}`; + // The daemon resolves (doesn't reject) on any failure; throw an address-in-use error so + // withKuboBindRetry retries only when we lost the bind race before the injected failure. + if (!combined.includes("Simulated startup failure after kubo start") && isAddressInUseError(combined)) + throw new Error(`address already in use (retry with fresh ports): ${combined}`); + return res; + }); - // The daemon exited non-zero on the injected startup failure... + try { + // The daemon exited non-zero on the injected startup failure (after kubo was up)... expect(result.exitCode).not.toBe(0); - // ...without exit-hook's synchronous-termination warning (proves the async cleanup ran). const combinedOutput = `${result.stdout}\n${result.stderr}`; + expect(combinedOutput).toContain("Simulated startup failure after kubo start"); + // ...without exit-hook's synchronous-termination warning (proves the async cleanup ran)... expect(combinedOutput).not.toContain("SYNCHRONOUS TERMINATION NOTICE"); // ...and kubo was killed by the daemon's own cleanup, not orphaned. const kuboStopped = await waitForCondition(