diff --git a/src/cli/commands/daemon.ts b/src/cli/commands/daemon.ts index 945146e..4fd22ff 100644 --- a/src/cli/commands/daemon.ts +++ b/src/cli/commands/daemon.ts @@ -19,7 +19,7 @@ import { printBanner } from "../ascii-banner.js"; import { loadChallengesIntoPKC, formatChallengeNameVersion } from "../../challenge-packages/challenge-utils.js"; import { migrateDataDirectory } from "../../common-utils/data-migration.js"; import { createBsoResolvers, DEFAULT_PROVIDERS } from "../../common-utils/resolvers.js"; -import { pruneStaleStates, writeDaemonState, deleteDaemonState, DAEMON_SHUTDOWN_TIMEOUT_MS } from "../../common-utils/daemon-state.js"; +import { pruneStaleStates, writeDaemonState, deleteDaemonState, detectSelfSupervisor, DAEMON_SHUTDOWN_TIMEOUT_MS } from "../../common-utils/daemon-state.js"; import { createDaemonFileLogger, type DaemonFileLogger } from "../../common-utils/daemon-file-logger.js"; import fs from "fs"; import fsPromise from "fs/promises"; @@ -327,13 +327,17 @@ export default class Daemon extends Command { // Prune stale daemon state files (dead PIDs from crashed daemons) await pruneStaleStates(); - // Persist this daemon's PID and startup args so `bitsocial update install --restart-daemons` can stop and restart it + // Persist this daemon's PID and startup args so `bitsocial update install --restart-daemons` can stop and restart it. + // Also record the supervisor (e.g. systemd) so the updater restarts via the supervisor instead of spawning a + // detached daemon that would compete with it for the RPC port (issue #82). const daemonArgv = process.argv.slice(process.argv.indexOf("daemon") + 1); + const supervisor = await detectSelfSupervisor(); await writeDaemonState({ pid: process.pid, startedAt: new Date().toISOString(), argv: daemonArgv, - pkcRpcUrl: pkcRpcUrl.toString() + pkcRpcUrl: pkcRpcUrl.toString(), + ...(supervisor ? { supervisor } : {}) }); // Create BSO name resolvers for .bso/.eth domain resolution diff --git a/src/cli/commands/update/install.ts b/src/cli/commands/update/install.ts index da2e60f..4c3a179 100644 --- a/src/cli/commands/update/install.ts +++ b/src/cli/commands/update/install.ts @@ -4,7 +4,20 @@ import tcpPortUsed from "tcp-port-used"; import { fetchLatestVersion, installGlobal } from "../../../update/npm-registry.js"; import { fastInstallGlobal } from "../../../update/fast-update.js"; import { compareVersions } from "../../../update/semver.js"; -import { getAliveDaemonStates, DAEMON_SHUTDOWN_TIMEOUT_MS, type DaemonState } from "../../../common-utils/daemon-state.js"; +import { systemctlRestart } from "../../../update/systemctl.js"; +import { + getAliveDaemonStates, + resolveDaemonSupervisor, + DAEMON_SHUTDOWN_TIMEOUT_MS, + type DaemonSupervisor +} from "../../../common-utils/daemon-state.js"; +import { + planDaemonRestarts, + stopUnmanagedDaemons, + startUnmanagedDaemons, + restartManagedDaemons, + type DaemonLifecycle +} from "../../../update/restart-orchestration.js"; export default class Install extends Command { static override description = "Install a specific version of bitsocial from npm"; @@ -40,48 +53,29 @@ export default class Install extends Command { async run(): Promise { const { args, flags } = await this.parse(Install); - // Check for running daemons via state files + // Discover running daemons and split them into supervisor-managed vs. updater-managed (issue #82). + // Supervised daemons (e.g. systemd) are restarted through their supervisor; spawning a detached + // replacement ourselves would create a process the supervisor doesn't own that competes with it + // for the RPC port and triggers a restart loop. const aliveDaemons = await getAliveDaemonStates(); + const plan = await planDaemonRestarts(aliveDaemons, (d) => resolveDaemonSupervisor(d)); + const lifecycle = this._daemonLifecycle(); if (aliveDaemons.length > 0) { if (!flags["restart-daemons"]) { - this.error( - `${aliveDaemons.length} daemon(s) running. Stop them first, then retry.`, - { exit: 1 } - ); - } - - // Stop all running daemons - for (const d of aliveDaemons) { - this.log(`Stopping daemon (PID ${d.pid})...`); - try { - process.kill(d.pid, "SIGINT"); - } catch (e) { - if ((e as NodeJS.ErrnoException).code === "ESRCH") { - this.log(` PID ${d.pid} already exited.`); - continue; - } - throw e; - } + this.error(`${aliveDaemons.length} daemon(s) running. Stop them first, then retry.`, { exit: 1 }); } - // Wait for each daemon process to fully exit — NOT just for its RPC port to free. - // The daemon releases its RPC port (daemonServer.destroy()) before it finishes killing - // its kubo child, so a port-only wait lets us restart while the old kubo still holds the - // IPFS API port; the new daemon then dies on startup with "IPFS API port already in use" - // (issue #70). The daemon's exit hook kills kubo before the process exits, so waiting for - // the PID to disappear guarantees the kubo port is free before we restart. - for (const d of aliveDaemons) { - this.log(`Waiting for daemon (PID ${d.pid}) to exit...`); - const exited = await this._waitForProcessExit(d.pid, DAEMON_SHUTDOWN_TIMEOUT_MS); - if (!exited) { - this.error( - `Daemon (PID ${d.pid}) did not shut down within ${DAEMON_SHUTDOWN_TIMEOUT_MS / 1000} seconds.`, - { exit: 1 } - ); - } + // Stop only the unsupervised daemons before the binary swap. Supervised daemons keep running + // and are restarted by their supervisor afterwards (see _restartViaSupervisor). + await stopUnmanagedDaemons(plan, lifecycle); + if (plan.unmanaged.length > 0) this.log("All unsupervised daemons stopped."); + for (const { daemon, supervisor } of plan.managed) { + this.log( + `Daemon (PID ${daemon.pid}) is managed by ${supervisor.type} (${supervisor.unit}); ` + + `it will be restarted by its supervisor.` + ); } - this.log("All daemons stopped."); } // Resolve the target version @@ -101,10 +95,9 @@ export default class Install extends Command { // Skip if already on this version (unless --force) if (compareVersions(current, targetVersion) === 0 && !flags.force) { this.log(`Already on v${current}. Use --force to reinstall.`); - if (aliveDaemons.length > 0 && flags["restart-daemons"]) { - // We stopped daemons but don't need to update — restart them - await this._restartDaemons(aliveDaemons); - } + // We stopped the unsupervised daemons but aren't updating — bring them back. Supervised daemons + // were never stopped, so leave them running (no unnecessary service bounce). + if (flags["restart-daemons"]) await startUnmanagedDaemons(plan, lifecycle); return; } @@ -132,10 +125,13 @@ export default class Install extends Command { this.log(`Installed bitsocial v${targetVersion} (was v${current}).`); - // Restart daemons with the new binary + // Restart daemons with the new binary: re-spawn the unsupervised ones we stopped, and ask each + // supervisor to restart its daemon onto the new binary. if (aliveDaemons.length > 0 && flags["restart-daemons"]) { - await this._restartDaemons(aliveDaemons); + await startUnmanagedDaemons(plan, lifecycle); + await restartManagedDaemons(plan, lifecycle); this.log("To see the daemon logs run `bitsocial logs --stdout`"); + this.log("Check community status with: bitsocial community list"); } } @@ -163,36 +159,77 @@ export default class Install extends Command { return false; } - private async _restartDaemons(daemons: DaemonState[]): Promise { - this.log(`Restarting ${daemons.length} daemon(s)...`); - - for (const d of daemons) { - const argStr = d.argv.length > 0 ? d.argv.join(" ") : "(defaults)"; - this.log(` Starting daemon with args: ${argStr}`); + /** Build the side effects that the restart orchestration drives (split out so the routing is testable). */ + private _daemonLifecycle(): DaemonLifecycle { + return { + stopUnmanaged: async (daemon) => { + this.log(`Stopping daemon (PID ${daemon.pid})...`); + try { + process.kill(daemon.pid, "SIGINT"); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === "ESRCH") { + this.log(` PID ${daemon.pid} already exited.`); + return; + } + throw e; + } - const child = spawn("bitsocial", ["daemon", ...d.argv], { - detached: true, - stdio: "ignore" - }); - child.unref(); + // Wait for the process to fully exit — NOT just for its RPC port to free. The daemon + // releases its RPC port (daemonServer.destroy()) before it finishes killing its kubo + // child, so a port-only wait lets us restart while the old kubo still holds the IPFS API + // port; the new daemon then dies on "IPFS API port already in use" (issue #70). The + // daemon's exit hook kills kubo before exiting, so "PID gone" guarantees kubo is free. + this.log(`Waiting for daemon (PID ${daemon.pid}) to exit...`); + const exited = await this._waitForProcessExit(daemon.pid, DAEMON_SHUTDOWN_TIMEOUT_MS); + if (!exited) { + this.error( + `Daemon (PID ${daemon.pid}) did not shut down within ${DAEMON_SHUTDOWN_TIMEOUT_MS / 1000} seconds.`, + { exit: 1 } + ); + } + }, + startUnmanaged: async (daemon) => { + const argStr = daemon.argv.length > 0 ? daemon.argv.join(" ") : "(defaults)"; + this.log(`Restarting daemon with args: ${argStr}`); + + const child = spawn("bitsocial", ["daemon", ...daemon.argv], { + detached: true, + stdio: "ignore" + }); + child.unref(); + + if (!child.pid) { + this.warn(`Failed to spawn daemon for args: ${argStr}`); + return; + } - if (!child.pid) { - this.warn(`Failed to spawn daemon for args: ${argStr}`); - continue; + // Wait briefly for the daemon's RPC port to come up + const port = Number(new URL(daemon.pkcRpcUrl).port); + const started = await tcpPortUsed.waitUntilUsed(port, 500, 30000).then(() => true).catch(() => false); + if (started) { + this.log(` Daemon started (port ${port}).`); + } else { + this.warn(` Daemon may not have started — port ${port} not responding after 30s. Check logs with: bitsocial logs`); + } + }, + restartManaged: async (supervisor) => { + await this._restartViaSupervisor(supervisor); } + }; + } - // Wait briefly for the daemon's RPC port to come up - const url = new URL(d.pkcRpcUrl); - const port = Number(url.port); - const started = await tcpPortUsed.waitUntilUsed(port, 500, 30000).then(() => true).catch(() => false); - if (started) { - this.log(` Daemon started (port ${port}).`); - } else { - this.warn(` Daemon may not have started — port ${port} not responding after 30s. Check logs with: bitsocial logs`); - } + /** Restart a supervised daemon onto the new binary by asking its supervisor (e.g. systemd). */ + private async _restartViaSupervisor(supervisor: DaemonSupervisor): Promise { + this.log(`Restarting ${supervisor.type} unit ${supervisor.unit} so it picks up the new binary...`); + try { + await systemctlRestart(supervisor.unit); + this.log(` ${supervisor.unit} restarted.`); + } catch (err) { + this.error( + `Updated the binary but failed to restart ${supervisor.type} unit ${supervisor.unit}: ${(err as Error).message}. ` + + `Restart it manually, e.g. 'sudo systemctl restart ${supervisor.unit}'.`, + { exit: 1 } + ); } - - this.log("Check community status with: bitsocial community list"); - this.log("Check logs with: bitsocial logs"); } } diff --git a/src/common-utils/daemon-state.ts b/src/common-utils/daemon-state.ts index 18c3aac..1f50e00 100644 --- a/src/common-utils/daemon-state.ts +++ b/src/common-utils/daemon-state.ts @@ -16,6 +16,18 @@ const DAEMON_STATES_DIR = path.join(defaults.PKC_DATA_PATH, ".daemon_states"); */ export const DAEMON_SHUTDOWN_TIMEOUT_MS = 120000; +/** + * How a daemon's lifecycle is managed by an external supervisor. Recorded at startup so that + * `update install` restarts the daemon through its supervisor instead of spawning a detached + * replacement that would compete with the supervisor for the RPC port (issue #82). + */ +export interface DaemonSupervisor { + /** The supervisor managing this daemon. Only systemd is detected today. */ + type: "systemd"; + /** The unit that owns the daemon, e.g. "bitsocial.service". */ + unit: string; +} + export interface DaemonState { pid: number; startedAt: string; @@ -23,6 +35,65 @@ export interface DaemonState { pkcRpcUrl: string; /** OS-reported process start time, used to detect PID reuse. Absent in legacy state files. */ procStartTime?: string; + /** External supervisor managing this daemon, if any. Absent for standalone or legacy daemons. */ + supervisor?: DaemonSupervisor; +} + +/** + * Parse the systemd service unit a process belongs to out of its cgroup contents, or undefined. + * cgroup v2: a single line `0::/system.slice/bitsocial.service` + * cgroup v1: many `id:controller:/system.slice/bitsocial.service` lines (all point at the same unit) + * The unit is the leaf of the cgroup path when it ends in `.service`. A user session has a `.scope` + * leaf (e.g. `…/session-36.scope`) — not a service — so it returns undefined (that daemon is not + * systemd-supervised even if it happens to live under system.slice somewhere up the tree). + */ +export function parseSystemdUnitFromCgroup(content: string): string | undefined { + for (const line of content.split("\n")) { + if (!line.trim()) continue; + // hierarchy-id:controller-list:cgroup-path — the path is the last colon-separated field + const cgroupPath = line.slice(line.lastIndexOf(":") + 1); + const leaf = cgroupPath.slice(cgroupPath.lastIndexOf("/") + 1); + if (leaf.endsWith(".service")) return leaf; + } + return undefined; +} + +/** Read the systemd unit owning `pid` (or the current process when "self") from /proc, or undefined. */ +export async function readSystemdUnit(pid: number | "self"): Promise { + try { + const content = await fs.readFile(`/proc/${pid}/cgroup`, "utf-8"); + return parseSystemdUnitFromCgroup(content); + } catch { + return undefined; // no /proc (non-Linux) or unreadable — treat as unsupervised + } +} + +/** + * Detect whether THIS process was started by systemd, and under which unit. systemd sets + * $INVOCATION_ID for every service it spawns; the unit name comes from this process's own cgroup. + * `env`/`readUnit` are injectable for testing. Returns undefined when not systemd-supervised. + */ +export async function detectSelfSupervisor( + env: NodeJS.ProcessEnv = process.env, + readUnit: (pid: number | "self") => Promise = readSystemdUnit +): Promise { + if (!env.INVOCATION_ID) return undefined; + const unit = await readUnit("self"); + return unit ? { type: "systemd", unit } : undefined; +} + +/** + * Resolve the supervisor for a daemon described by `state`. Prefers the `supervisor` it recorded + * at startup; for legacy daemons that predate that field, falls back to inferring the unit from the + * live process's cgroup. `readUnit` is injectable for testing. + */ +export async function resolveDaemonSupervisor( + state: DaemonState, + readUnit: (pid: number | "self") => Promise = readSystemdUnit +): Promise { + if (state.supervisor) return state.supervisor; + const unit = await readUnit(state.pid); + return unit ? { type: "systemd", unit } : undefined; } function stateFilePath(pid: number): string { diff --git a/src/update/restart-orchestration.ts b/src/update/restart-orchestration.ts new file mode 100644 index 0000000..8d35c23 --- /dev/null +++ b/src/update/restart-orchestration.ts @@ -0,0 +1,76 @@ +import type { DaemonState, DaemonSupervisor } from "../common-utils/daemon-state.js"; + +// Routes `update install`'s daemon restarts by supervisor (issue #82). +// +// Unsupervised daemons are managed directly by the updater: stopped (SIGINT) before the binary swap +// and re-spawned detached afterwards. Supervised daemons (e.g. systemd) are NOT touched by the +// updater — stopping/respawning one ourselves spawns a process the supervisor does not own, which +// then competes with the supervisor for the RPC port and triggers a restart loop. Instead we leave +// the supervised daemon running across the binary swap and ask its supervisor to restart it. +// +// The logic here is split from the side effects (in install.ts) so it can be unit-tested with a fake +// DaemonLifecycle. + +export interface ManagedDaemon { + daemon: DaemonState; + supervisor: DaemonSupervisor; +} + +export interface DaemonPlan { + /** Daemons restarted through their supervisor (left running across the binary swap). */ + managed: ManagedDaemon[]; + /** Daemons the updater stops and re-spawns itself. */ + unmanaged: DaemonState[]; +} + +/** The side effects of restarting daemons; injected so the orchestration is testable. */ +export interface DaemonLifecycle { + /** Take an unsupervised daemon down (SIGINT) and wait for it to fully exit, before the binary swap. */ + stopUnmanaged(daemon: DaemonState): Promise; + /** Re-spawn an unsupervised daemon as a detached process with its original args, after the binary swap. */ + startUnmanaged(daemon: DaemonState): Promise; + /** Ask a supervisor to restart its daemon onto the new binary (e.g. `systemctl restart `). */ + restartManaged(supervisor: DaemonSupervisor): Promise; +} + +/** Partition the alive daemons into supervised vs. updater-managed. */ +export async function planDaemonRestarts( + daemons: DaemonState[], + resolve: (daemon: DaemonState) => Promise +): Promise { + const managed: ManagedDaemon[] = []; + const unmanaged: DaemonState[] = []; + for (const daemon of daemons) { + const supervisor = await resolve(daemon); + if (supervisor) managed.push({ daemon, supervisor }); + else unmanaged.push(daemon); + } + return { managed, unmanaged }; +} + +/** + * Stop the daemons that must come down before the binary swap: only the unsupervised ones. + * Supervised daemons keep running — their supervisor restarts them after the swap. + */ +export async function stopUnmanagedDaemons(plan: DaemonPlan, lifecycle: DaemonLifecycle): Promise { + for (const daemon of plan.unmanaged) await lifecycle.stopUnmanaged(daemon); +} + +/** Re-spawn the unsupervised daemons that were stopped. Safe to call even after a no-op update. */ +export async function startUnmanagedDaemons(plan: DaemonPlan, lifecycle: DaemonLifecycle): Promise { + for (const daemon of plan.unmanaged) await lifecycle.startUnmanaged(daemon); +} + +/** + * Restart supervised daemons onto the new binary, deduplicated per unit. Call only after a real + * install — a no-op update shouldn't bounce a supervised service, since it was never stopped. + */ +export async function restartManagedDaemons(plan: DaemonPlan, lifecycle: DaemonLifecycle): Promise { + const restarted = new Set(); + for (const { supervisor } of plan.managed) { + const key = `${supervisor.type}:${supervisor.unit}`; + if (restarted.has(key)) continue; + restarted.add(key); + await lifecycle.restartManaged(supervisor); + } +} diff --git a/src/update/systemctl.ts b/src/update/systemctl.ts new file mode 100644 index 0000000..d213ba9 --- /dev/null +++ b/src/update/systemctl.ts @@ -0,0 +1,14 @@ +import { execFile } from "child_process"; +import { promisify } from "util"; + +const execFileAsync = promisify(execFile); + +export type Exec = (cmd: string, args: string[]) => Promise; + +/** + * Restart a systemd unit so it picks up a freshly installed binary. Rejects (propagating the + * underlying error) if systemctl is missing or the restart fails. `exec` is injectable for testing. + */ +export async function systemctlRestart(unit: string, exec: Exec = execFileAsync): Promise { + await exec("systemctl", ["restart", unit]); +} diff --git a/test/cli/update-install-systemd-aware.test.ts b/test/cli/update-install-systemd-aware.test.ts new file mode 100644 index 0000000..f476eeb --- /dev/null +++ b/test/cli/update-install-systemd-aware.test.ts @@ -0,0 +1,144 @@ +// `bitsocial update install` must not tear down a daemon that an external supervisor owns (issue #82). +// +// Before the fix, update install stopped EVERY daemon found in the state files (SIGINT + wait) and +// re-spawned it detached — escaping the supervisor and competing with it for the RPC port, which put +// systemd into a restart loop. After the fix, a daemon whose state file records a `supervisor` is left +// running across the update and restarted via that supervisor (`systemctl restart`) instead. +// +// This test pins the safety property end-to-end without a real daemon or npm: a dummy long-lived +// process stands in for the daemon, its state file (seeded in an isolated state dir) marks it +// systemd-managed, and we assert update install leaves it alive. The buggy code would SIGINT it. +// +// Isolation: HOME/XDG_DATA_HOME are overridden for both the state-seeding helper and update install so +// the command sees ONLY this dummy daemon and never touches real daemons on the machine. + +import { spawn, spawnSync } from "child_process"; +import { describe, it, expect, afterEach } from "vitest"; +import { directory as randomDirectory } from "tempy"; +import { readFileSync, existsSync } from "fs"; +import fs from "fs/promises"; +import path from "path"; +import { pathToFileURL } from "url"; + +const CLI_VERSION = JSON.parse(readFileSync(path.join(process.cwd(), "package.json"), "utf-8")).version as string; +const DAEMON_STATE_MODULE = pathToFileURL(path.join(process.cwd(), "dist", "common-utils", "daemon-state.js")).href; + +const isPidAlive = (pid: number): boolean => { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +}; + +const runUpdateInstall = (env: Record): Promise<{ exitCode: number | null; stdout: string; stderr: string }> => + new Promise((resolve, reject) => { + const proc = spawn("node", ["./bin/run", "update", "install", CLI_VERSION], { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, ...env } + }); + let stdout = ""; + let stderr = ""; + proc.stdout.on("data", (d) => (stdout += d.toString())); + proc.stderr.on("data", (d) => (stderr += d.toString())); + proc.on("error", reject); + proc.on("close", (exitCode) => resolve({ exitCode, stdout, stderr })); + }); + +describe("bitsocial update install — supervised daemons (issue #82)", () => { + let dummyDaemon: ReturnType | undefined; + + afterEach(() => { + if (dummyDaemon?.pid && isPidAlive(dummyDaemon.pid)) { + try { + process.kill(dummyDaemon.pid, "SIGKILL"); + } catch { + /* already gone */ + } + } + dummyDaemon = undefined; + }); + + it.skipIf(process.platform === "win32")( + "leaves a systemd-managed daemon running (does not SIGINT it) and restarts it via the supervisor instead", + { timeout: 60000 }, + async () => { + const isolatedHome = randomDirectory(); + const env = { + HOME: isolatedHome, + XDG_DATA_HOME: path.join(isolatedHome, ".local", "share") + }; + + // A `systemctl` shim that records its invocation instead of restarting anything real. + const tmpBin = randomDirectory(); + const systemctlMarker = path.join(tmpBin, "systemctl.calls"); + const systemctlShim = path.join(tmpBin, "systemctl"); + await fs.writeFile(systemctlShim, `#!/bin/sh\necho "$@" >> "${systemctlMarker}"\nexit 0\n`); + await fs.chmod(systemctlShim, 0o755); + + // A `bitsocial` shim recording any detached-daemon spawn. A supervised daemon must NEVER be + // re-spawned this way (that's the bug). It also keeps the buggy code path from launching a + // real daemon during the test. + const bitsocialMarker = path.join(tmpBin, "bitsocial.calls"); + const bitsocialShim = path.join(tmpBin, "bitsocial"); + await fs.writeFile(bitsocialShim, `#!/bin/sh\necho "$@" >> "${bitsocialMarker}"\nexit 0\n`); + await fs.chmod(bitsocialShim, 0o755); + + // A dummy long-lived process standing in for the daemon. With no SIGINT handler, the buggy + // code's SIGINT would kill it — so "still alive afterwards" is the red/green discriminator. + dummyDaemon = spawn("node", ["-e", "setInterval(() => {}, 1 << 30)"], { stdio: "ignore" }); + await new Promise((r) => setTimeout(r, 200)); + const dummyPid = dummyDaemon.pid!; + expect(isPidAlive(dummyPid)).toBe(true); + + // Seed the daemon state file in the ISOLATED state dir, marked systemd-managed. Run via a + // subprocess with the isolated env so the state dir path is derived from that env, and so + // writeDaemonState records the dummy's real process start time (PID-reuse guard). + const seed = path.join(tmpBin, "seed-state.mjs"); + await fs.writeFile( + seed, + [ + `const mod = await import(${JSON.stringify(DAEMON_STATE_MODULE)});`, + `await mod.writeDaemonState(JSON.parse(process.env.SEED_STATE));` + ].join("\n") + ); + const seedResult = spawnSync("node", [seed], { + env: { + ...process.env, + ...env, + SEED_STATE: JSON.stringify({ + pid: dummyPid, + startedAt: new Date().toISOString(), + argv: [], + pkcRpcUrl: "ws://localhost:9138", + supervisor: { type: "systemd", unit: "bitsocial-test-82.service" } + }) + }, + encoding: "utf-8" + }); + expect(seedResult.status, `seed stderr: ${seedResult.stderr}`).toBe(0); + + // Same-version install: skips npm, but runs the full daemon stop/restart routing. + const result = await runUpdateInstall({ + ...env, + PATH: `${tmpBin}:${process.env.PATH}` + }); + + expect(result.exitCode, `update install output:\n${result.stdout}\n${result.stderr}`).toBe(0); + + // The core fix: the supervised daemon was NOT stopped — it's still the same running process. + expect(isPidAlive(dummyPid), `update install output:\n${result.stdout}\n${result.stderr}`).toBe(true); + + // It must be recognized as supervised (so the updater defers to systemd, never spawns a competitor). + expect(result.stdout).toContain("managed by systemd (bitsocial-test-82.service)"); + + // The supervised daemon must never be re-spawned as a detached `bitsocial daemon` (the bug's + // root cause — that process escapes the supervisor and competes for the RPC port). + expect(existsSync(bitsocialMarker), `update install output:\n${result.stdout}\n${result.stderr}`).toBe(false); + + // Same-version is a no-op install, so no service bounce should have been triggered. + expect(existsSync(systemctlMarker)).toBe(false); + } + ); +}); diff --git a/test/common-utils/daemon-supervisor.test.ts b/test/common-utils/daemon-supervisor.test.ts new file mode 100644 index 0000000..f5ed8d1 --- /dev/null +++ b/test/common-utils/daemon-supervisor.test.ts @@ -0,0 +1,113 @@ +import { describe, it, expect, afterEach } from "vitest"; +import { + parseSystemdUnitFromCgroup, + detectSelfSupervisor, + resolveDaemonSupervisor, + writeDaemonState, + readAllDaemonStates, + deleteDaemonState +} from "../../dist/common-utils/daemon-state.js"; +import type { DaemonState, DaemonSupervisor } from "../../dist/common-utils/daemon-state.js"; + +// Supervisor detection/resolution for issue #82. The parser is pure; detect/resolve take an +// injectable cgroup reader so we never depend on the test runner's own cgroup. + +describe("parseSystemdUnitFromCgroup", () => { + it("extracts the unit from a cgroup v2 service line", () => { + expect(parseSystemdUnitFromCgroup("0::/system.slice/bitsocial.service")).toBe("bitsocial.service"); + }); + + it("returns undefined for a user session scope (not a service)", () => { + expect(parseSystemdUnitFromCgroup("0::/user.slice/user-0.slice/session-36.scope")).toBeUndefined(); + }); + + it("parses a cgroup v1 multi-line file (all controllers point at the same unit)", () => { + const v1 = ["12:pids:/system.slice/bitsocial.service", "8:memory:/system.slice/bitsocial.service", "1:name=systemd:/system.slice/bitsocial.service"].join("\n"); + expect(parseSystemdUnitFromCgroup(v1)).toBe("bitsocial.service"); + }); + + it("handles a templated/instanced unit name", () => { + expect(parseSystemdUnitFromCgroup("0::/system.slice/system-foo.slice/foo@1.service")).toBe("foo@1.service"); + }); + + it("ignores blank lines and trailing newlines", () => { + expect(parseSystemdUnitFromCgroup("\n0::/system.slice/bitsocial.service\n\n")).toBe("bitsocial.service"); + }); + + it("returns undefined for the root cgroup", () => { + expect(parseSystemdUnitFromCgroup("0::/")).toBeUndefined(); + }); + + it("returns undefined for empty content", () => { + expect(parseSystemdUnitFromCgroup("")).toBeUndefined(); + }); +}); + +describe("detectSelfSupervisor", () => { + it("returns undefined when INVOCATION_ID is absent (not started by systemd)", async () => { + const sup = await detectSelfSupervisor({}, async () => "bitsocial.service"); + expect(sup).toBeUndefined(); + }); + + it("returns the systemd unit when INVOCATION_ID is set and the cgroup is a service", async () => { + const sup = await detectSelfSupervisor({ INVOCATION_ID: "abc" }, async () => "bitsocial.service"); + expect(sup).toEqual({ type: "systemd", unit: "bitsocial.service" }); + }); + + it("returns undefined when INVOCATION_ID is set but the cgroup is not a service", async () => { + const sup = await detectSelfSupervisor({ INVOCATION_ID: "abc" }, async () => undefined); + expect(sup).toBeUndefined(); + }); +}); + +describe("resolveDaemonSupervisor", () => { + const base: DaemonState = { pid: 4242, startedAt: "t", argv: [], pkcRpcUrl: "ws://localhost:9138" }; + + it("prefers the supervisor recorded in the state file (and never reads the cgroup)", async () => { + const recorded: DaemonSupervisor = { type: "systemd", unit: "recorded.service" }; + const sup = await resolveDaemonSupervisor({ ...base, supervisor: recorded }, async () => { + throw new Error("readUnit must not be called when the state file already records a supervisor"); + }); + expect(sup).toEqual(recorded); + }); + + it("falls back to the process cgroup for a legacy daemon (no supervisor field)", async () => { + const sup = await resolveDaemonSupervisor(base, async (pid) => { + expect(pid).toBe(base.pid); + return "bitsocial.service"; + }); + expect(sup).toEqual({ type: "systemd", unit: "bitsocial.service" }); + }); + + it("returns undefined for a legacy daemon whose cgroup is not a service", async () => { + const sup = await resolveDaemonSupervisor(base, async () => undefined); + expect(sup).toBeUndefined(); + }); +}); + +describe("writeDaemonState round-trips the supervisor field", () => { + const createdPids: number[] = []; + afterEach(async () => { + for (const pid of createdPids) await deleteDaemonState(pid); + createdPids.length = 0; + }); + + it("persists and reads back the supervisor", async () => { + const pid = 9999700; + createdPids.push(pid); + const supervisor: DaemonSupervisor = { type: "systemd", unit: "bitsocial.service" }; + await writeDaemonState({ pid, startedAt: new Date().toISOString(), argv: [], pkcRpcUrl: `ws://localhost:${pid}`, supervisor }); + + const found = (await readAllDaemonStates()).find((s) => s.pid === pid); + expect(found?.supervisor).toEqual(supervisor); + }); + + it("omits the supervisor for an unsupervised daemon", async () => { + const pid = 9999701; + createdPids.push(pid); + await writeDaemonState({ pid, startedAt: new Date().toISOString(), argv: [], pkcRpcUrl: `ws://localhost:${pid}` }); + + const found = (await readAllDaemonStates()).find((s) => s.pid === pid); + expect(found?.supervisor).toBeUndefined(); + }); +}); diff --git a/test/update/restart-orchestration.test.ts b/test/update/restart-orchestration.test.ts new file mode 100644 index 0000000..3c2d903 --- /dev/null +++ b/test/update/restart-orchestration.test.ts @@ -0,0 +1,109 @@ +import { describe, it, expect } from "vitest"; +import { + planDaemonRestarts, + stopUnmanagedDaemons, + startUnmanagedDaemons, + restartManagedDaemons, + type DaemonLifecycle +} from "../../dist/update/restart-orchestration.js"; +import type { DaemonState, DaemonSupervisor } from "../../dist/common-utils/daemon-state.js"; + +// Restart routing for issue #82: supervised daemons go through their supervisor, unsupervised ones +// are stopped + detached-respawned by the updater. Driven by a recording fake lifecycle. + +const mkDaemon = (pid: number, supervisor?: DaemonSupervisor): DaemonState => ({ + pid, + startedAt: "t", + argv: [], + pkcRpcUrl: `ws://localhost:${pid}`, + ...(supervisor ? { supervisor } : {}) +}); + +const systemd = (unit: string): DaemonSupervisor => ({ type: "systemd", unit }); + +function recordingLifecycle() { + const calls: string[] = []; + const lifecycle: DaemonLifecycle = { + stopUnmanaged: async (d) => void calls.push(`stop:${d.pid}`), + startUnmanaged: async (d) => void calls.push(`start:${d.pid}`), + restartManaged: async (s) => void calls.push(`restart:${s.unit}`) + }; + return { lifecycle, calls }; +} + +// Resolver used by planDaemonRestarts in these tests: trust the daemon's own supervisor field. +const resolveByField = async (d: DaemonState) => d.supervisor; + +describe("planDaemonRestarts", () => { + it("partitions daemons into supervised vs. unsupervised", async () => { + const managedDaemon = mkDaemon(1, systemd("bitsocial.service")); + const plainDaemon = mkDaemon(2); + const plan = await planDaemonRestarts([managedDaemon, plainDaemon], resolveByField); + + expect(plan.managed).toEqual([{ daemon: managedDaemon, supervisor: systemd("bitsocial.service") }]); + expect(plan.unmanaged).toEqual([plainDaemon]); + }); + + it("uses the (possibly async) resolver, not just the field", async () => { + const legacy = mkDaemon(3); // no supervisor field — resolver infers one (e.g. from cgroup) + const plan = await planDaemonRestarts([legacy], async () => systemd("inferred.service")); + expect(plan.managed.map((m) => m.supervisor.unit)).toEqual(["inferred.service"]); + expect(plan.unmanaged).toEqual([]); + }); +}); + +describe("stopUnmanagedDaemons", () => { + it("stops only unsupervised daemons, in order, and never touches supervised ones", async () => { + const plan = await planDaemonRestarts([mkDaemon(1, systemd("a.service")), mkDaemon(2), mkDaemon(3)], resolveByField); + const { lifecycle, calls } = recordingLifecycle(); + await stopUnmanagedDaemons(plan, lifecycle); + expect(calls).toEqual(["stop:2", "stop:3"]); + }); +}); + +describe("startUnmanagedDaemons", () => { + it("re-spawns only unsupervised daemons", async () => { + const plan = await planDaemonRestarts([mkDaemon(1, systemd("a.service")), mkDaemon(2)], resolveByField); + const { lifecycle, calls } = recordingLifecycle(); + await startUnmanagedDaemons(plan, lifecycle); + expect(calls).toEqual(["start:2"]); + }); +}); + +describe("restartManagedDaemons", () => { + it("restarts each supervised unit and never spawns/stops an unsupervised daemon", async () => { + const plan = await planDaemonRestarts([mkDaemon(1, systemd("a.service")), mkDaemon(2)], resolveByField); + const { lifecycle, calls } = recordingLifecycle(); + await restartManagedDaemons(plan, lifecycle); + expect(calls).toEqual(["restart:a.service"]); + }); + + it("deduplicates by unit so a shared unit is restarted once", async () => { + const plan = await planDaemonRestarts( + [mkDaemon(1, systemd("shared.service")), mkDaemon(2, systemd("shared.service")), mkDaemon(3, systemd("other.service"))], + resolveByField + ); + const { lifecycle, calls } = recordingLifecycle(); + await restartManagedDaemons(plan, lifecycle); + expect(calls).toEqual(["restart:shared.service", "restart:other.service"]); + }); +}); + +describe("full update sequence (mixed daemons)", () => { + it("stops/respawns the unsupervised daemon but only restarts the supervised one via its supervisor", async () => { + const managed = mkDaemon(10, systemd("bitsocial.service")); + const plain = mkDaemon(20); + const plan = await planDaemonRestarts([managed, plain], resolveByField); + const { lifecycle, calls } = recordingLifecycle(); + + // Mirrors install.ts: stop unsupervised → (binary swap) → respawn unsupervised + restart supervised. + await stopUnmanagedDaemons(plan, lifecycle); + await startUnmanagedDaemons(plan, lifecycle); + await restartManagedDaemons(plan, lifecycle); + + expect(calls).toEqual(["stop:20", "start:20", "restart:bitsocial.service"]); + // The supervised daemon (pid 10) is never SIGINT'd or detached-spawned — the bug's root cause. + expect(calls).not.toContain("stop:10"); + expect(calls).not.toContain("start:10"); + }); +}); diff --git a/test/update/systemctl.test.ts b/test/update/systemctl.test.ts new file mode 100644 index 0000000..55389c7 --- /dev/null +++ b/test/update/systemctl.test.ts @@ -0,0 +1,23 @@ +import { describe, it, expect } from "vitest"; +import { systemctlRestart } from "../../dist/update/systemctl.js"; + +// The actual side effect `update install` uses to restart a supervised daemon (issue #82). + +describe("systemctlRestart", () => { + it("invokes `systemctl restart `", async () => { + const calls: Array<[string, string[]]> = []; + await systemctlRestart("bitsocial.service", async (cmd, args) => { + calls.push([cmd, args]); + return undefined; + }); + expect(calls).toEqual([["systemctl", ["restart", "bitsocial.service"]]]); + }); + + it("propagates the failure when systemctl fails (so the caller can surface it)", async () => { + await expect( + systemctlRestart("bitsocial.service", async () => { + throw new Error("Failed to restart: Access denied"); + }) + ).rejects.toThrow(/Access denied/); + }); +});