mirror of https://github.com/garrytan/gstack.git
feat(cli): opt-in outer supervisor — respawn browse server on crash
Pre-v1.44 `$B connect` was fire-and-forget: spawn server detached, CLI
exits, server runs unsupervised. If the server crashed (OOM, uncaught
exception, signal kill from a runaway debugger), the user had to notice,
re-run `$B connect`, and resume work. The v1.44 terminal-agent watchdog
recovers from one layer of failure; this commit closes the outer loop.
Opt-in via `--supervise` flag or `BROWSE_SUPERVISE=1` env. Default
behavior is unchanged — every existing caller (Claude Code's Bash tool,
scripts, CI) still gets a prompt return. When the flag is set:
* CLI stays attached, polls server PID every 30s via readState() +
isProcessAlive (same identity primitive as the terminal-agent watchdog).
* On unexpected exit: respawn via the same headed-mode startServer path
used initially, then re-spawn the terminal-agent so the PTY recovers
too (otherwise sidebar Restart is the only path back).
* Crash-loop guard: 5 respawns in a rolling 5-min window → exit 1 with
a clear error. Window pruning means a long-lived daemon with sporadic
crashes does NOT trip the guard (otherwise we punish the user for the
supervisor doing its job).
* Backoff: 1s, 2s, 4s, 8s, 30s capped. Env-overridable via
GSTACK_SUPERVISOR_BACKOFF for tests.
* SIGINT / SIGTERM: clean teardown — signals the supervised server
before exiting itself. Without this, Ctrl-C leaves an orphaned server.
Out of scope (deferred follow-up): routing the Chromium-disconnect
exit-code-1 path back through this supervisor. The terminal-agent
watchdog already covers the highest-frequency restart case; Chromium
crash recovery joins the queue as its own commit.
Test (browse/test/cli-supervisor.test.ts):
* 6 static-grep tripwires: opt-in default, signal wiring, crash-loop
guard with window pruning, backoff schedule env knob, tick interval
env knob, terminal-agent re-spawn after server respawn.
* Live respawn tests belong in the e2e tier (real spawn cycles take
3-8s each; spamming these in the free tier would balloon CI time).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f42d7bac6d
commit
5d648e4568
|
|
@ -1055,6 +1055,96 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
|||
console.error(`[browse] Connect failed: ${err.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// ─── Outer Supervisor (v1.44+, opt-in) ──────────────────────────
|
||||
//
|
||||
// Default: fire-and-forget (CLI exits, server runs detached). This is
|
||||
// the contract every existing call site relies on, including Claude
|
||||
// Code's Bash tool which expects `$B connect` to return promptly.
|
||||
//
|
||||
// Opt-in via `--supervise` flag or BROWSE_SUPERVISE=1 env: the CLI
|
||||
// stays attached, polls the spawned server's PID every 30s, and
|
||||
// respawns it through the same headed-mode startServer path on
|
||||
// unexpected exit. Crash-loop guard: 5 respawns inside 5 min →
|
||||
// give up and exit 1 with a clear error. SIGINT / SIGTERM cleanly
|
||||
// tear down the supervised server before exit.
|
||||
//
|
||||
// Out of scope for v1.44 minimum: routing the Chromium-disconnect
|
||||
// exit-code-1 path back through this supervisor. The terminal-agent
|
||||
// watchdog (T5) already covers the highest-frequency restart case;
|
||||
// Chromium-crash-respawn is documented as a follow-up so the
|
||||
// supervisor stays a tight, testable primitive.
|
||||
const superviseRequested = commandArgs.includes('--supervise')
|
||||
|| process.env.BROWSE_SUPERVISE === '1';
|
||||
if (!superviseRequested) {
|
||||
process.exit(0);
|
||||
}
|
||||
console.log('[browse] Supervisor mode: monitoring server. Ctrl-C to stop.');
|
||||
let supervisorExiting = false;
|
||||
const teardownAndExit = (signal: string) => {
|
||||
if (supervisorExiting) return;
|
||||
supervisorExiting = true;
|
||||
console.log(`\n[browse] ${signal} received — stopping server.`);
|
||||
const state = readState();
|
||||
if (state?.pid && isProcessAlive(state.pid)) {
|
||||
safeKill(state.pid, 'SIGTERM');
|
||||
}
|
||||
process.exit(0);
|
||||
};
|
||||
process.on('SIGINT', () => teardownAndExit('SIGINT'));
|
||||
process.on('SIGTERM', () => teardownAndExit('SIGTERM'));
|
||||
|
||||
const SUPERVISOR_TICK_MS = parseInt(
|
||||
process.env.GSTACK_SUPERVISOR_TICK_MS || '30000',
|
||||
10,
|
||||
);
|
||||
const SUPERVISOR_GUARD_WINDOW_MS = 5 * 60_000;
|
||||
const SUPERVISOR_GUARD_MAX = 5;
|
||||
const SUPERVISOR_BACKOFF_MS = (process.env.GSTACK_SUPERVISOR_BACKOFF || '1000,2000,4000,8000,30000')
|
||||
.split(',').map(s => parseInt(s.trim(), 10)).filter(n => Number.isFinite(n));
|
||||
const respawns: number[] = [];
|
||||
|
||||
while (!supervisorExiting) {
|
||||
await new Promise(resolve => setTimeout(resolve, SUPERVISOR_TICK_MS));
|
||||
if (supervisorExiting) break;
|
||||
const state = readState();
|
||||
if (state?.pid && isProcessAlive(state.pid)) continue;
|
||||
// Server died. Prune rolling window and check guard.
|
||||
const now = Date.now();
|
||||
while (respawns.length && now - respawns[0] > SUPERVISOR_GUARD_WINDOW_MS) {
|
||||
respawns.shift();
|
||||
}
|
||||
if (respawns.length >= SUPERVISOR_GUARD_MAX) {
|
||||
console.error(
|
||||
`[browse] Supervisor: ${SUPERVISOR_GUARD_MAX} crashes in ${SUPERVISOR_GUARD_WINDOW_MS / 1000}s — giving up.`,
|
||||
);
|
||||
process.exit(1);
|
||||
}
|
||||
const attempt = respawns.length;
|
||||
respawns.push(now);
|
||||
const backoff = SUPERVISOR_BACKOFF_MS[Math.min(attempt, SUPERVISOR_BACKOFF_MS.length - 1)] ?? 30_000;
|
||||
console.warn(`[browse] Supervisor: server PID gone — respawning in ${backoff}ms (attempt ${attempt + 1}/${SUPERVISOR_GUARD_MAX})...`);
|
||||
await new Promise(resolve => setTimeout(resolve, backoff));
|
||||
if (supervisorExiting) break;
|
||||
try {
|
||||
const respawned = await startServer(serverEnv);
|
||||
console.log(`[browse] Supervisor: server respawned (PID ${respawned.pid}, port ${respawned.port}).`);
|
||||
// Re-spawn the terminal-agent too; same env wiring as the initial connect.
|
||||
try {
|
||||
spawnTerminalAgent({
|
||||
stateFile: config.stateFile,
|
||||
serverPort: respawned.port,
|
||||
cwd: config.projectDir,
|
||||
});
|
||||
} catch (err: any) {
|
||||
console.warn(`[browse] Supervisor: terminal-agent respawn failed: ${err?.message || err}`);
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.error(`[browse] Supervisor: server respawn failed: ${err?.message || err}`);
|
||||
// Let the next tick try again — the crash-loop guard already
|
||||
// bounded the retries via the rolling window.
|
||||
}
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,81 @@
|
|||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// v1.44 outer supervisor — static-grep invariants.
|
||||
//
|
||||
// Pre-v1.44 `$B connect` was fire-and-forget: spawn server detached, CLI
|
||||
// exits, server runs unsupervised. If the server crashed, the user had to
|
||||
// re-run `$B connect`. The opt-in supervisor (--supervise or
|
||||
// BROWSE_SUPERVISE=1) keeps the CLI attached and respawns the server on
|
||||
// unexpected exit, with the same crash-loop guard shape as the v1.44
|
||||
// terminal-agent watchdog.
|
||||
//
|
||||
// Live respawn tests belong in the e2e tier (real Bun.spawn cycles take
|
||||
// 3-8s each). These tripwires defend the load-bearing invariants:
|
||||
// opt-in by default, signal handlers wired, crash-loop guard, env knobs.
|
||||
|
||||
const CLI_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'cli.ts');
|
||||
|
||||
describe('CLI outer supervisor (v1.44+)', () => {
|
||||
test('1. supervisor is opt-in via --supervise flag or BROWSE_SUPERVISE env', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
expect(src).toContain("commandArgs.includes('--supervise')");
|
||||
expect(src).toContain("process.env.BROWSE_SUPERVISE === '1'");
|
||||
// Default path MUST still exit 0 promptly. The legacy contract is
|
||||
// that every caller of `$B connect` (Claude Code Bash tool, scripts,
|
||||
// CI) gets a prompt return.
|
||||
expect(src).toMatch(/if \(!superviseRequested\) \{\s*process\.exit\(0\);\s*\}/);
|
||||
});
|
||||
|
||||
test('2. SIGINT and SIGTERM trigger clean teardown', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
// Both signals must hit the teardown path or the user's Ctrl-C leaves
|
||||
// an orphaned server (worse than no supervisor).
|
||||
expect(src).toMatch(/process\.on\('SIGINT'.*teardownAndExit/);
|
||||
expect(src).toMatch(/process\.on\('SIGTERM'.*teardownAndExit/);
|
||||
// Teardown must signal the supervised server before exiting itself.
|
||||
expect(src).toContain("safeKill(state.pid, 'SIGTERM')");
|
||||
});
|
||||
|
||||
test('3. crash-loop guard with 5-in-5min rolling window', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
expect(src).toContain('SUPERVISOR_GUARD_WINDOW_MS = 5 * 60_000');
|
||||
expect(src).toContain('SUPERVISOR_GUARD_MAX = 5');
|
||||
// Window pruning: a long-lived daemon with sporadic crashes must NOT
|
||||
// hit the guard (otherwise we punish the user for the supervisor doing
|
||||
// its job).
|
||||
expect(src).toMatch(/respawns\.shift\(\)/);
|
||||
});
|
||||
|
||||
test('4. exponential backoff schedule, env-overridable', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
expect(src).toContain('GSTACK_SUPERVISOR_BACKOFF');
|
||||
// Default schedule must include short waits at first (rapid recovery
|
||||
// from transient crashes) and cap at a sensible long wait.
|
||||
expect(src).toContain('1000,2000,4000,8000,30000');
|
||||
});
|
||||
|
||||
test('5. tick interval is env-overridable for tests', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
expect(src).toContain('GSTACK_SUPERVISOR_TICK_MS');
|
||||
});
|
||||
|
||||
test('6. respawned server gets a fresh terminal-agent too', () => {
|
||||
const src = fs.readFileSync(CLI_TS, 'utf-8');
|
||||
// After server respawn, the terminal-agent state is stale (old PID
|
||||
// record points to a dead agent that exited with its parent). The
|
||||
// supervisor must re-call spawnTerminalAgent or the PTY path stays
|
||||
// broken even though the server is back up.
|
||||
const block = sliceBetween(src, 'Supervisor mode:', '// ─── Headed Disconnect');
|
||||
expect(block).toContain('spawnTerminalAgent({');
|
||||
});
|
||||
});
|
||||
|
||||
function sliceBetween(source: string, start: string, end: string): string {
|
||||
const i = source.indexOf(start);
|
||||
if (i === -1) throw new Error(`marker not found: ${start}`);
|
||||
const j = source.indexOf(end, i + start.length);
|
||||
if (j === -1) throw new Error(`end marker not found: ${end}`);
|
||||
return source.slice(i, j);
|
||||
}
|
||||
Loading…
Reference in New Issue