Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 66 additions & 23 deletions src/cli/commands/daemon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,13 @@ export default class Daemon extends Command {
const { logFilePath, stdoutWrite } = await this._pipeDebugLogsToLogFile(flags.logPath, PKCLogger as PKCLoggerType);
const log = PKCLogger("bitsocial-cli:daemon");

// Captured once the async exit hook is registered (inside the try). The startup-failure
// catch and the force-quit signal guard use these to run kubo/daemon cleanup and drop the
// hook before the process exits via process.exit() — which would otherwise skip the async
// hook (exit-hook's "SYNCHRONOUS TERMINATION NOTICE") and orphan kubo. (issue #98)
let runDaemonShutdown: (() => Promise<void>) | undefined;
let removeAsyncExitHook: (() => void) | undefined;

try {
// Log debug info after pipe is set up so it goes to the log file, not terminal
const envDebug: string | undefined = process.env["_PKC_DEBUG"] || process.env["DEBUG"];
Expand Down Expand Up @@ -480,6 +487,13 @@ export default class Daemon extends Command {
const createOrConnectRpc = async () => {
if (mainProcessExited) return;
if (startedOwnRpc) return;
// Test hook (issue #98): force a startup failure *after* keepKuboUp() has already
// spawned kubo, reproducing the real TOCTOU port race (createOrConnectRpc throwing
// once kubo is up). Verifies the daemon tears kubo down — and doesn't print
// exit-hook's "SYNCHRONOUS TERMINATION NOTICE" — when oclif process.exit()s on a
// thrown startup error, instead of orphaning kubo.
if (process.env["PKC_CLI_TEST_FAIL_AFTER_KUBO_START"])
throw new Error("Simulated startup failure after kubo start (PKC_CLI_TEST_FAIL_AFTER_KUBO_START)");
// Re-check the port: the early fail-fast at startup is a few ms before this runs,
// so a TOCTOU race could let another process grab the port in between. If that
// happens we must fail rather than silently leaving the daemon without an RPC.
Expand Down Expand Up @@ -597,36 +611,44 @@ export default class Daemon extends Command {
liveKuboPids.clear();
};

asyncExitHook(
async () => {
if (keepKuboUpInterval) clearInterval(keepKuboUpInterval);
if (mainProcessExited) return; // we already exited
console.log(
"\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..."
);
log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds");
const shutdownDaemon = async () => {
if (keepKuboUpInterval) clearInterval(keepKuboUpInterval);
if (mainProcessExited) return; // we already exited
console.log(
"\nShutting down Bitsocial daemon, it may take a few seconds to shut down all communities and the IPFS node..."
);
log("Received signal to exit, shutting down both kubo and pkc rpc. Please wait, it may take a few seconds");

mainProcessExited = true;
mainProcessExited = true;

// Remove daemon state file so update install knows we're gone
await deleteDaemonState(process.pid).catch(() => {});
// Remove daemon state file so update install knows we're gone
await deleteDaemonState(process.pid).catch(() => {});

// Start killing Kubo immediately, in parallel with daemon server destroy.
// This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs.
const kuboKillPromise = killKuboProcess();
// Start killing Kubo immediately, in parallel with daemon server destroy.
// This way Kubo receives SIGINT right away, even if daemonServer.destroy() hangs.
const kuboKillPromise = killKuboProcess();

if (daemonServer)
try {
await daemonServer.destroy();
log("Daemon server shut down");
} catch (e) {
log.error("Error shutting down daemon server", e);
}
if (daemonServer)
try {
await daemonServer.destroy();
log("Daemon server shut down");
} catch (e) {
log.error("Error shutting down daemon server", e);
}

await kuboKillPromise;
},
await kuboKillPromise;
};
// Run kubo/daemon cleanup on SIGINT/SIGTERM/beforeExit. Capture the unsubscribe handle
// and the shutdown fn so the startup-failure catch (and the force-quit guard below) can
// run cleanup themselves and drop this hook: oclif calls process.exit() as it unwinds a
// thrown startup error, and process.exit() runs only synchronous exit hooks — exit-hook
// would skip this async one (printing its "SYNCHRONOUS TERMINATION NOTICE") and orphan a
// kubo we already spawned. (issue #98)
removeAsyncExitHook = asyncExitHook(
shutdownDaemon,
{ wait: DAEMON_SHUTDOWN_TIMEOUT_MS } // could take two minutes to shut down
);
runDaemonShutdown = shutdownDaemon;

// Emergency cleanup: if the process force-exits (e.g. double Ctrl+C),
// synchronously SIGKILL every live kubo's process group. This is a no-op if
Expand All @@ -652,6 +674,11 @@ export default class Daemon extends Command {
terminationSignalCount++;
if (terminationSignalCount >= 2) {
log(`Received ${signal} again during shutdown, force-quitting`);
// Deliberate immediate exit: the first signal's async shutdown is still
// running but the user wants out now. Drop the async exit hook first so the
// imminent process.exit() doesn't trip exit-hook's "SYNCHRONOUS TERMINATION
// NOTICE"; kubo is still SIGKILLed by the emergency "exit" handler. (issue #98)
removeAsyncExitHook?.();
process.exit(signal === "SIGINT" ? 130 : 143);
}
});
Expand Down Expand Up @@ -717,6 +744,22 @@ export default class Daemon extends Command {

stdoutWrite(`Full log: ${logFilePath}\n`);
stdoutWrite(`Or run: bitsocial logs\n`);

// oclif's error handler calls process.exit() as it unwinds this throw, which runs only
// synchronous exit hooks — exit-hook would skip the async kubo/daemon cleanup (printing
// "SYNCHRONOUS TERMINATION NOTICE") and orphan a kubo we may have already spawned, and
// leave this daemon's state file behind. Run that cleanup now and drop the now-redundant
// hook so the process exits clean. No-ops if we failed before the hook was registered.
// Cap the wait with the same DAEMON_SHUTDOWN_TIMEOUT_MS the signal path gets (via the
// exit-hook `wait` option) so a hung daemonServer.destroy() can't swallow the original
// startup error forever; the unref'd timer never keeps the loop alive. (issue #98)
if (runDaemonShutdown)
await Promise.race([
runDaemonShutdown().catch(() => {}),
new Promise<void>((resolve) => setTimeout(resolve, DAEMON_SHUTDOWN_TIMEOUT_MS).unref())
]);
removeAsyncExitHook?.();
Comment thread
coderabbitai[bot] marked this conversation as resolved.

throw err;
}
}
Expand Down
60 changes: 59 additions & 1 deletion test/cli/daemon.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ import {
requestKuboShutdown,
waitForWebSocketOpen,
waitForKuboReady,
waitForPortFree
waitForPortFree,
withKuboBindRetry,
isAddressInUseError
} from "../helpers/daemon-helpers.js";
import { preInitKuboWithEphemeralSwarm } from "../helpers/kubo-helpers.js";
dns.setDefaultResultOrder("ipv4first"); // to be able to resolve localhost
Expand Down Expand Up @@ -863,6 +865,62 @@ describe("bitsocial daemon kills kubo on its own shutdown (no backup /shutdown c
if (kuboApiUrl) await ensureKuboNodeStopped(kuboApiUrl);
}
});

it.skipIf(process.platform === "win32")(
"kills kubo (and prints no exit-hook termination notice) when startup fails after kubo is up",
{ timeout: 90000 },
async () => {
// Reproduces issue #98: when the daemon throws during startup *after* kubo has spawned
// (the real TOCTOU RPC-port race, simulated here via PKC_CLI_TEST_FAIL_AFTER_KUBO_START),
// oclif's error handler calls process.exit(). process.exit() runs only synchronous exit
// hooks, so exit-hook would skip the async kubo cleanup — printing "SYNCHRONOUS TERMINATION
// NOTICE" and orphaning kubo. The daemon must instead tear kubo down itself and exit quietly.
const dataPath = randomDirectory();
// Retry on a lost kubo bind race (issue #87): the injected failure only fires once kubo is
// up, so if kubo's freshly-allocated ports are stolen in the TOCTOU window the daemon fails
// early instead — a different path. Retrying keeps the assertions below pinned to the
// intended path (the message check would otherwise flake on a bind race).
const { result, endpoints } = await withKuboBindRetry(async (e) => {
await preInitKuboWithEphemeralSwarm(path.join(dataPath, ".bitsocial-cli.ipfs"), new URL(e.kuboRpcUrl), new URL(e.gatewayUrl));
const res = await runPkcDaemonExpectFailure(["--pkcOptions.dataPath", dataPath, "--pkcRpcUrl", e.rpcWsUrl], {
KUBO_RPC_URL: e.kuboRpcUrl,
IPFS_GATEWAY_URL: e.gatewayUrl,
PKC_CLI_TEST_FAIL_AFTER_KUBO_START: "1"
});
const combined = `${res.stdout}\n${res.stderr}`;
// The daemon resolves (doesn't reject) on any failure; throw an address-in-use error so
// withKuboBindRetry retries only when we lost the bind race before the injected failure.
if (!combined.includes("Simulated startup failure after kubo start") && isAddressInUseError(combined))
throw new Error(`address already in use (retry with fresh ports): ${combined}`);
return res;
});

try {
// The daemon exited non-zero on the injected startup failure (after kubo was up)...
expect(result.exitCode).not.toBe(0);
const combinedOutput = `${result.stdout}\n${result.stderr}`;
expect(combinedOutput).toContain("Simulated startup failure after kubo start");
// ...without exit-hook's synchronous-termination warning (proves the async cleanup ran)...
expect(combinedOutput).not.toContain("SYNCHRONOUS TERMINATION NOTICE");
// ...and kubo was killed by the daemon's own cleanup, not orphaned.
const kuboStopped = await waitForCondition(
async () => {
try {
const res = await fetch(`${endpoints.kuboApiUrl}/bitswap/stat`, { method: "POST" });
return !res.ok;
} catch {
return true; // connection refused = stopped
}
},
20000,
500
);
expect(kuboStopped).toBe(true);
} finally {
await ensureKuboNodeStopped(endpoints.kuboApiUrl);
}
}
);
});

describe("bitsocial daemon DEBUG env var", () => {
Expand Down
Loading