diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index 4fd22ff..7b7e38e 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -260,6 +260,13 @@ export default class Daemon extends Command { const { logFilePath, stdoutWrite } = await this._pipeDebugLogsToLogFile(flags.logPath, PKCLogger as PKCLoggerType); const log = PKCLogger("bitsocial-cli:daemon"); + // Captured once the async exit hook is registered (inside the try). The startup-failure + // catch and the force-quit signal guard use these to run kubo/daemon cleanup and drop the + // hook before the process exits via process.exit() — which would otherwise skip the async + // hook (exit-hook's "SYNCHRONOUS TERMINATION NOTICE") and orphan kubo. (issue #98) + let runDaemonShutdown: (() => Promise) | undefined; + let removeAsyncExitHook: (() => void) | undefined; + try { // Log debug info after pipe is set up so it goes to the log file, not terminal const envDebug: string | undefined = process.env["_PKC_DEBUG"] || process.env["DEBUG"]; @@ -480,6 +487,13 @@ export default class Daemon extends Command { const createOrConnectRpc = async () => { if (mainProcessExited) return; if (startedOwnRpc) return; + // Test hook (issue #98): force a startup failure *after* keepKuboUp() has already + // spawned kubo, reproducing the real TOCTOU port race (createOrConnectRpc throwing + // once kubo is up). Verifies the daemon tears kubo down — and doesn't print + // exit-hook's "SYNCHRONOUS TERMINATION NOTICE" — when oclif process.exit()s on a + // thrown startup error, instead of orphaning kubo. + if (process.env["PKC_CLI_TEST_FAIL_AFTER_KUBO_START"]) + throw new Error("Simulated startup failure after kubo start (PKC_CLI_TEST_FAIL_AFTER_KUBO_START)"); // Re-check the port: the early fail-fast at startup is a few ms before this runs, // so a TOCTOU race could let another process grab the port in between. If that // happens we must fail rather than silently leaving the daemon without an RPC. @@ -597,36 +611,44 @@ export default class Daemon extends Command { liveKuboPids.clear(); }; - asyncExitHook( - async () => { - if (keepKuboUpInterval) clearInterval(keepKuboUpInterval); - if (mainProcessExited) return; // we already exited - console.log( - "\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..." - ); - log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds"); + const shutdownDaemon = async () => { + if (keepKuboUpInterval) clearInterval(keepKuboUpInterval); + if (mainProcessExited) return; // we already exited + console.log( + "\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..." + ); + log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds"); - mainProcessExited = true; + mainProcessExited = true; - // Remove daemon state file so update install knows we're gone - await deleteDaemonState(process.pid).catch(() => {}); + // Remove daemon state file so update install knows we're gone + await deleteDaemonState(process.pid).catch(() => {}); - // Start killing Kubo immediately, in parallel with daemon server destroy. - // This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs. - const kuboKillPromise = killKuboProcess(); + // Start killing Kubo immediately, in parallel with daemon server destroy. + // This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs. + const kuboKillPromise = killKuboProcess(); - if (daemonServer) - try { - await daemonServer.destroy(); - log("Daemon server shut down"); - } catch (e) { - log.error("Error shutting down daemon server", e); - } + if (daemonServer) + try { + await daemonServer.destroy(); + log("Daemon server shut down"); + } catch (e) { + log.error("Error shutting down daemon server", e); + } - await kuboKillPromise; - }, + await kuboKillPromise; + }; + // Run kubo/daemon cleanup on SIGINT/SIGTERM/beforeExit. Capture the unsubscribe handle + // and the shutdown fn so the startup-failure catch (and the force-quit guard below) can + // run cleanup themselves and drop this hook: oclif calls process.exit() as it unwinds a + // thrown startup error, and process.exit() runs only synchronous exit hooks — exit-hook + // would skip this async one (printing its "SYNCHRONOUS TERMINATION NOTICE") and orphan a + // kubo we already spawned. (issue #98) + removeAsyncExitHook = asyncExitHook( + shutdownDaemon, { wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down ); + runDaemonShutdown = shutdownDaemon; // Emergency cleanup: if the process force-exits (e.g. double Ctrl+C), // synchronously SIGKILL every live kubo's process group. This is a no-op if @@ -652,6 +674,11 @@ export default class Daemon extends Command { terminationSignalCount++; if (terminationSignalCount >= 2) { log(`Received ${signal} again during shutdown, force-quitting`); + // Deliberate immediate exit: the first signal's async shutdown is still + // running but the user wants out now. Drop the async exit hook first so the + // imminent process.exit() doesn't trip exit-hook's "SYNCHRONOUS TERMINATION + // NOTICE"; kubo is still SIGKILLed by the emergency "exit" handler. (issue #98) + removeAsyncExitHook?.(); process.exit(signal === "SIGINT" ? 130 : 143); } }); @@ -717,6 +744,22 @@ export default class Daemon extends Command { stdoutWrite(`Full log: ${logFilePath}\n`); stdoutWrite(`Or run: bitsocial logs\n`); + + // oclif's error handler calls process.exit() as it unwinds this throw, which runs only + // synchronous exit hooks — exit-hook would skip the async kubo/daemon cleanup (printing + // "SYNCHRONOUS TERMINATION NOTICE") and orphan a kubo we may have already spawned, and + // leave this daemon's state file behind. Run that cleanup now and drop the now-redundant + // hook so the process exits clean. No-ops if we failed before the hook was registered. + // Cap the wait with the same DAEMON_SHUTDOWN_TIMEOUT_MS the signal path gets (via the + // exit-hook `wait` option) so a hung daemonServer.destroy() can't swallow the original + // startup error forever; the unref'd timer never keeps the loop alive. (issue #98) + if (runDaemonShutdown) + await Promise.race([ + runDaemonShutdown().catch(() => {}), + new Promise((resolve) => setTimeout(resolve, DAEMON_SHUTDOWN_TIMEOUT_MS).unref()) + ]); + removeAsyncExitHook?.(); + throw err; } } diff --git a/test/cli/daemon.test.ts b/test/cli/daemon.test.ts index 1dcf8c8..bbc91aa 100644 --- a/test/cli/daemon.test.ts +++ b/test/cli/daemon.test.ts @@ -21,7 +21,9 @@ import { requestKuboShutdown, waitForWebSocketOpen, waitForKuboReady, - waitForPortFree + waitForPortFree, + withKuboBindRetry, + isAddressInUseError } from "../helpers/daemon-helpers.js"; import { preInitKuboWithEphemeralSwarm } from "../helpers/kubo-helpers.js"; dns.setDefaultResultOrder("ipv4first"); // to be able to resolve localhost @@ -863,6 +865,62 @@ describe("bitsocial daemon kills kubo on its own shutdown (no backup /shutdown c if (kuboApiUrl) await ensureKuboNodeStopped(kuboApiUrl); } }); + + it.skipIf(process.platform === "win32")( + "kills kubo (and prints no exit-hook termination notice) when startup fails after kubo is up", + { timeout: 90000 }, + async () => { + // Reproduces issue #98: when the daemon throws during startup *after* kubo has spawned + // (the real TOCTOU RPC-port race, simulated here via PKC_CLI_TEST_FAIL_AFTER_KUBO_START), + // oclif's error handler calls process.exit(). process.exit() runs only synchronous exit + // hooks, so exit-hook would skip the async kubo cleanup — printing "SYNCHRONOUS TERMINATION + // NOTICE" and orphaning kubo. The daemon must instead tear kubo down itself and exit quietly. + const dataPath = randomDirectory(); + // Retry on a lost kubo bind race (issue #87): the injected failure only fires once kubo is + // up, so if kubo's freshly-allocated ports are stolen in the TOCTOU window the daemon fails + // early instead — a different path. Retrying keeps the assertions below pinned to the + // intended path (the message check would otherwise flake on a bind race). + const { result, endpoints } = await withKuboBindRetry(async (e) => { + await preInitKuboWithEphemeralSwarm(path.join(dataPath, ".bitsocial-cli.ipfs"), new URL(e.kuboRpcUrl), new URL(e.gatewayUrl)); + const res = await runPkcDaemonExpectFailure(["--pkcOptions.dataPath", dataPath, "--pkcRpcUrl", e.rpcWsUrl], { + KUBO_RPC_URL: e.kuboRpcUrl, + IPFS_GATEWAY_URL: e.gatewayUrl, + PKC_CLI_TEST_FAIL_AFTER_KUBO_START: "1" + }); + const combined = `${res.stdout}\n${res.stderr}`; + // The daemon resolves (doesn't reject) on any failure; throw an address-in-use error so + // withKuboBindRetry retries only when we lost the bind race before the injected failure. + if (!combined.includes("Simulated startup failure after kubo start") && isAddressInUseError(combined)) + throw new Error(`address already in use (retry with fresh ports): ${combined}`); + return res; + }); + + try { + // The daemon exited non-zero on the injected startup failure (after kubo was up)... + expect(result.exitCode).not.toBe(0); + const combinedOutput = `${result.stdout}\n${result.stderr}`; + expect(combinedOutput).toContain("Simulated startup failure after kubo start"); + // ...without exit-hook's synchronous-termination warning (proves the async cleanup ran)... + expect(combinedOutput).not.toContain("SYNCHRONOUS TERMINATION NOTICE"); + // ...and kubo was killed by the daemon's own cleanup, not orphaned. + const kuboStopped = await waitForCondition( + async () => { + try { + const res = await fetch(`${endpoints.kuboApiUrl}/bitswap/stat`, { method: "POST" }); + return !res.ok; + } catch { + return true; // connection refused = stopped + } + }, + 20000, + 500 + ); + expect(kuboStopped).toBe(true); + } finally { + await ensureKuboNodeStopped(endpoints.kuboApiUrl); + } + } + ); }); describe("bitsocial daemon DEBUG env var", () => {