diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 66559b881..eae0f5cf4 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -1055,6 +1055,96 @@ Refs: After 'snapshot', use @e1, @e2... as selectors: console.error(`[browse] Connect failed: ${err.message}`); process.exit(1); } + + // ─── Outer Supervisor (v1.44+, opt-in) ────────────────────────── + // + // Default: fire-and-forget (CLI exits, server runs detached). This is + // the contract every existing call site relies on, including Claude + // Code's Bash tool which expects `$B connect` to return promptly. + // + // Opt-in via `--supervise` flag or BROWSE_SUPERVISE=1 env: the CLI + // stays attached, polls the spawned server's PID every 30s, and + // respawns it through the same headed-mode startServer path on + // unexpected exit. Crash-loop guard: 5 respawns inside 5 min → + // give up and exit 1 with a clear error. SIGINT / SIGTERM cleanly + // tear down the supervised server before exit. + // + // Out of scope for v1.44 minimum: routing the Chromium-disconnect + // exit-code-1 path back through this supervisor. The terminal-agent + // watchdog (T5) already covers the highest-frequency restart case; + // Chromium-crash-respawn is documented as a follow-up so the + // supervisor stays a tight, testable primitive. + const superviseRequested = commandArgs.includes('--supervise') + || process.env.BROWSE_SUPERVISE === '1'; + if (!superviseRequested) { + process.exit(0); + } + console.log('[browse] Supervisor mode: monitoring server. Ctrl-C to stop.'); + let supervisorExiting = false; + const teardownAndExit = (signal: string) => { + if (supervisorExiting) return; + supervisorExiting = true; + console.log(`\n[browse] ${signal} received — stopping server.`); + const state = readState(); + if (state?.pid && isProcessAlive(state.pid)) { + safeKill(state.pid, 'SIGTERM'); + } + process.exit(0); + }; + process.on('SIGINT', () => teardownAndExit('SIGINT')); + process.on('SIGTERM', () => teardownAndExit('SIGTERM')); + + const SUPERVISOR_TICK_MS = parseInt( + process.env.GSTACK_SUPERVISOR_TICK_MS || '30000', + 10, + ); + const SUPERVISOR_GUARD_WINDOW_MS = 5 * 60_000; + const SUPERVISOR_GUARD_MAX = 5; + const SUPERVISOR_BACKOFF_MS = (process.env.GSTACK_SUPERVISOR_BACKOFF || '1000,2000,4000,8000,30000') + .split(',').map(s => parseInt(s.trim(), 10)).filter(n => Number.isFinite(n)); + const respawns: number[] = []; + + while (!supervisorExiting) { + await new Promise(resolve => setTimeout(resolve, SUPERVISOR_TICK_MS)); + if (supervisorExiting) break; + const state = readState(); + if (state?.pid && isProcessAlive(state.pid)) continue; + // Server died. Prune rolling window and check guard. + const now = Date.now(); + while (respawns.length && now - respawns[0] > SUPERVISOR_GUARD_WINDOW_MS) { + respawns.shift(); + } + if (respawns.length >= SUPERVISOR_GUARD_MAX) { + console.error( + `[browse] Supervisor: ${SUPERVISOR_GUARD_MAX} crashes in ${SUPERVISOR_GUARD_WINDOW_MS / 1000}s — giving up.`, + ); + process.exit(1); + } + const attempt = respawns.length; + respawns.push(now); + const backoff = SUPERVISOR_BACKOFF_MS[Math.min(attempt, SUPERVISOR_BACKOFF_MS.length - 1)] ?? 30_000; + console.warn(`[browse] Supervisor: server PID gone — respawning in ${backoff}ms (attempt ${attempt + 1}/${SUPERVISOR_GUARD_MAX})...`); + await new Promise(resolve => setTimeout(resolve, backoff)); + if (supervisorExiting) break; + try { + const respawned = await startServer(serverEnv); + console.log(`[browse] Supervisor: server respawned (PID ${respawned.pid}, port ${respawned.port}).`); + // Re-spawn the terminal-agent too; same env wiring as the initial connect. + try { + spawnTerminalAgent({ + stateFile: config.stateFile, + serverPort: respawned.port, + cwd: config.projectDir, + }); + } catch (err: any) { + console.warn(`[browse] Supervisor: terminal-agent respawn failed: ${err?.message || err}`); + } + } catch (err: any) { + console.error(`[browse] Supervisor: server respawn failed: ${err?.message || err}`); + // Let the next tick try again — the crash-loop guard already + // bounded the retries via the rolling window. + } + } process.exit(0); } diff --git a/browse/test/cli-supervisor.test.ts b/browse/test/cli-supervisor.test.ts new file mode 100644 index 000000000..d9cec7b89 --- /dev/null +++ b/browse/test/cli-supervisor.test.ts @@ -0,0 +1,81 @@ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +// v1.44 outer supervisor — static-grep invariants. +// +// Pre-v1.44 `$B connect` was fire-and-forget: spawn server detached, CLI +// exits, server runs unsupervised. If the server crashed, the user had to +// re-run `$B connect`. The opt-in supervisor (--supervise or +// BROWSE_SUPERVISE=1) keeps the CLI attached and respawns the server on +// unexpected exit, with the same crash-loop guard shape as the v1.44 +// terminal-agent watchdog. +// +// Live respawn tests belong in the e2e tier (real Bun.spawn cycles take +// 3-8s each). These tripwires defend the load-bearing invariants: +// opt-in by default, signal handlers wired, crash-loop guard, env knobs. + +const CLI_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'cli.ts'); + +describe('CLI outer supervisor (v1.44+)', () => { + test('1. supervisor is opt-in via --supervise flag or BROWSE_SUPERVISE env', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + expect(src).toContain("commandArgs.includes('--supervise')"); + expect(src).toContain("process.env.BROWSE_SUPERVISE === '1'"); + // Default path MUST still exit 0 promptly. The legacy contract is + // that every caller of `$B connect` (Claude Code Bash tool, scripts, + // CI) gets a prompt return. + expect(src).toMatch(/if \(!superviseRequested\) \{\s*process\.exit\(0\);\s*\}/); + }); + + test('2. SIGINT and SIGTERM trigger clean teardown', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + // Both signals must hit the teardown path or the user's Ctrl-C leaves + // an orphaned server (worse than no supervisor). + expect(src).toMatch(/process\.on\('SIGINT'.*teardownAndExit/); + expect(src).toMatch(/process\.on\('SIGTERM'.*teardownAndExit/); + // Teardown must signal the supervised server before exiting itself. + expect(src).toContain("safeKill(state.pid, 'SIGTERM')"); + }); + + test('3. crash-loop guard with 5-in-5min rolling window', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + expect(src).toContain('SUPERVISOR_GUARD_WINDOW_MS = 5 * 60_000'); + expect(src).toContain('SUPERVISOR_GUARD_MAX = 5'); + // Window pruning: a long-lived daemon with sporadic crashes must NOT + // hit the guard (otherwise we punish the user for the supervisor doing + // its job). + expect(src).toMatch(/respawns\.shift\(\)/); + }); + + test('4. exponential backoff schedule, env-overridable', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + expect(src).toContain('GSTACK_SUPERVISOR_BACKOFF'); + // Default schedule must include short waits at first (rapid recovery + // from transient crashes) and cap at a sensible long wait. + expect(src).toContain('1000,2000,4000,8000,30000'); + }); + + test('5. tick interval is env-overridable for tests', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + expect(src).toContain('GSTACK_SUPERVISOR_TICK_MS'); + }); + + test('6. respawned server gets a fresh terminal-agent too', () => { + const src = fs.readFileSync(CLI_TS, 'utf-8'); + // After server respawn, the terminal-agent state is stale (old PID + // record points to a dead agent that exited with its parent). The + // supervisor must re-call spawnTerminalAgent or the PTY path stays + // broken even though the server is back up. + const block = sliceBetween(src, 'Supervisor mode:', '// ─── Headed Disconnect'); + expect(block).toContain('spawnTerminalAgent({'); + }); +}); + +function sliceBetween(source: string, start: string, end: string): string { + const i = source.indexOf(start); + if (i === -1) throw new Error(`marker not found: ${start}`); + const j = source.indexOf(end, i + start.length); + if (j === -1) throw new Error(`end marker not found: ${end}`); + return source.slice(i, j); +}