mirror of https://github.com/garrytan/gstack.git
feat(browse): terminal-agent watchdog with PID liveness + crash-loop guard
terminal-agent could die independently of the server — SIGKILL from the OS
OOM killer, an uncaught exception under PTY churn, an external `pkill` from
a sibling debugging session. Pre-v1.44 the sidebar would observe the broken
connection and stay broken until the user reloaded the sidebar. Now a 60s
ticker checks the recorded agent PID and respawns via the shared
spawnTerminalAgent helper when dead.
Identity-based liveness (T4 from the eng review):
* Uses readAgentRecord + isProcessAlive (signal 0 probe), not a name match.
* Slow-but-alive agents intentionally fall through — respawning around a
living agent would create split-brain (two agents writing the port
file, tokens diverging between them, mystery upgrade 401s).
* Pairs with the v1.44 generation counter in /internal/* loopback calls:
if a stale agent does come back to life mid-cycle, its X-Browse-Gen
no longer matches and the parent's calls 409 cleanly.
Crash-loop guard:
* 3 respawn attempts inside a rolling 60s window → stop trying. A daemon
up for a week with one crash a day shouldn't trip the guard.
* On trip: one-line error to console (`respawn guard tripped`) and the
watchdog goes dormant. Manual restart via the sidebar Restart button
is the explicit signal to re-arm (added in Commit 2 of the larger PR).
Shared spawn path (refactor):
* New spawnTerminalAgent(opts) in terminal-agent-control.ts handles:
prior-PID cleanup → spawn → record stash. Both the CLI cold-start path
in cli.ts and the new server.ts watchdog route through it. Removes the
copy-paste between them; future env wiring lands in one place.
Gated on cfg.ownsTerminalAgent — embedders that pre-launch their own PTY
server (gbrowser phoenix overlay) still own the full lifecycle.
GSTACK_AGENT_WATCHDOG_TICK_MS env knob compresses the 60s tick for e2e
tests without 60s waits per assertion.
Tests:
* browse/test/terminal-agent-watchdog.test.ts — 7 static-grep tripwires
for the load-bearing invariants (ownsTerminalAgent gate, PID-based
liveness, crash-loop guard with window pruning, shutdown cleanup,
CLI cold-start uses the same helper, env knob exists).
* Live process-kill tests belong in the e2e tier; cheaper invariants
here catch refactor regressions in ~1ms each.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ad669b238a
commit
f42d7bac6d
|
|
@ -16,7 +16,7 @@ import { writeSecureFile, mkdirSecure } from './file-permissions';
|
|||
import { resolveConfig, ensureStateDir, readVersionHash } from './config';
|
||||
import { parseProxyConfig, computeConfigHash, ProxyConfigError } from './proxy-config';
|
||||
import { redactProxyUrl } from './proxy-redact';
|
||||
import { readAgentRecord, killAgentByRecord, clearAgentRecord } from './terminal-agent-control';
|
||||
import { spawnTerminalAgent } from './terminal-agent-control';
|
||||
|
||||
const config = resolveConfig();
|
||||
const IS_WINDOWS = process.platform === 'win32';
|
||||
|
|
@ -1034,38 +1034,18 @@ Refs: After 'snapshot', use @e1, @e2... as selectors:
|
|||
// claude -p subprocesses to multiplex.
|
||||
|
||||
// Auto-start terminal agent (non-compiled bun process). Owns the PTY
|
||||
// WebSocket for the sidebar Terminal pane.
|
||||
let termAgentScript = path.resolve(__dirname, 'terminal-agent.ts');
|
||||
if (!fs.existsSync(termAgentScript)) {
|
||||
termAgentScript = path.resolve(path.dirname(process.execPath), '..', 'src', 'terminal-agent.ts');
|
||||
}
|
||||
// WebSocket for the sidebar Terminal pane. Routes through the shared
|
||||
// spawnTerminalAgent helper so the CLI cold-start path and the
|
||||
// server.ts watchdog respawn path share one implementation. The
|
||||
// helper handles prior-PID cleanup, script lookup, and env wiring.
|
||||
try {
|
||||
if (fs.existsSync(termAgentScript)) {
|
||||
// Kill any stale terminal-agent from a prior run so its port file
|
||||
// can't trick the server into routing /pty-session at a dead
|
||||
// listener. Identity-based (v1.44+) — only kills the PID recorded
|
||||
// in `<stateDir>/terminal-agent-pid`. Pre-v1.44 used
|
||||
// `pkill -f terminal-agent\.ts` which matched sibling gstack
|
||||
// sessions; see terminal-agent-control.ts header for rationale.
|
||||
{
|
||||
const stateDir = path.dirname(config.stateFile);
|
||||
const prior = readAgentRecord(stateDir);
|
||||
if (prior) {
|
||||
killAgentByRecord(prior, 'SIGTERM');
|
||||
clearAgentRecord(stateDir);
|
||||
}
|
||||
}
|
||||
const termProc = Bun.spawn(['bun', 'run', termAgentScript], {
|
||||
cwd: config.projectDir,
|
||||
env: {
|
||||
...process.env,
|
||||
BROWSE_STATE_FILE: config.stateFile,
|
||||
BROWSE_SERVER_PORT: String(newState.port),
|
||||
},
|
||||
stdio: ['ignore', 'ignore', 'ignore'],
|
||||
});
|
||||
termProc.unref();
|
||||
console.log(`[browse] Terminal agent started (PID: ${termProc.pid})`);
|
||||
const newPid = spawnTerminalAgent({
|
||||
stateFile: config.stateFile,
|
||||
serverPort: newState.port,
|
||||
cwd: config.projectDir,
|
||||
});
|
||||
if (newPid) {
|
||||
console.log(`[browse] Terminal agent started (PID: ${newPid})`);
|
||||
}
|
||||
} catch (err: any) {
|
||||
// Non-fatal: chat still works without the terminal agent.
|
||||
|
|
|
|||
|
|
@ -43,7 +43,8 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory
|
|||
// Bun.spawn used instead of child_process.spawn (compiled bun binaries
|
||||
// fail posix_spawn on all executables including /bin/bash)
|
||||
import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling';
|
||||
import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath } from './terminal-agent-control';
|
||||
import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath, spawnTerminalAgent } from './terminal-agent-control';
|
||||
import { isProcessAlive } from './error-handling';
|
||||
import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize';
|
||||
import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge';
|
||||
import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config';
|
||||
|
|
@ -1306,6 +1307,84 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle {
|
|||
// premise even under malformed cfg.
|
||||
const ownsTerminalAgent = cfg.ownsTerminalAgent === false ? false : true;
|
||||
|
||||
// ─── Terminal-Agent Watchdog (v1.44+) ─────────────────────────────
|
||||
//
|
||||
// The terminal-agent process can die independently of the server: SIGKILL
|
||||
// from the OS OOM killer, an uncaught exception under load, an external
|
||||
// `pkill` from a sibling debugging session. Pre-v1.44 the sidebar would
|
||||
// see the broken connection and stay broken until the user reloaded.
|
||||
// Now: 60s ticker checks the recorded agent PID, respawns via the shared
|
||||
// spawnTerminalAgent helper if dead.
|
||||
//
|
||||
// Identity-based — uses readAgentRecord + isProcessAlive, NOT a process
|
||||
// name probe. Critical: prevents respawning around a slow-but-alive agent
|
||||
// (which would create split-brain — two agents writing the port file,
|
||||
// tokens diverging between them, mystery PTY upgrade failures).
|
||||
//
|
||||
// Crash-loop guard: 3 respawn attempts inside 60s → stop trying and emit
|
||||
// a one-line error. Manual `forceRestart` from the sidebar clears the
|
||||
// history (the user is the explicit signal to retry).
|
||||
//
|
||||
// Only active when ownsTerminalAgent === true. Embedders that pre-launch
|
||||
// their own PTY server (gbrowser phoenix overlay) must not be auto-respawned
|
||||
// by us — their lifecycle is their concern.
|
||||
let agentWatchdogInterval: ReturnType<typeof setInterval> | null = null;
|
||||
const respawnHistory: number[] = [];
|
||||
const AGENT_WATCHDOG_TICK_MS = parseInt(
|
||||
process.env.GSTACK_AGENT_WATCHDOG_TICK_MS || '60000',
|
||||
10,
|
||||
);
|
||||
const RESPAWN_GUARD_WINDOW_MS = 60_000;
|
||||
const RESPAWN_GUARD_MAX = 3;
|
||||
let agentRespawnGuardTripped = false;
|
||||
|
||||
if (ownsTerminalAgent) {
|
||||
agentWatchdogInterval = setInterval(() => {
|
||||
if (isShuttingDown) return;
|
||||
if (agentRespawnGuardTripped) return;
|
||||
const stateDir = path.dirname(cfg.config.stateFile);
|
||||
const record = readAgentRecord(stateDir);
|
||||
// If the record exists and the PID is alive, the agent is healthy
|
||||
// (or at least still answering signal 0). Slow-but-alive agents
|
||||
// intentionally fall through here — split-brain is worse than
|
||||
// unresponsiveness, and slow recovery is handled by the user via
|
||||
// restart.
|
||||
if (record && isProcessAlive(record.pid)) return;
|
||||
// Either no record (never spawned, or cleaned up after crash) or
|
||||
// PID is dead. Try to respawn.
|
||||
const now = Date.now();
|
||||
while (respawnHistory.length && now - respawnHistory[0] > RESPAWN_GUARD_WINDOW_MS) {
|
||||
respawnHistory.shift();
|
||||
}
|
||||
if (respawnHistory.length >= RESPAWN_GUARD_MAX) {
|
||||
agentRespawnGuardTripped = true;
|
||||
console.error(
|
||||
`[browse] terminal-agent respawn guard tripped (${RESPAWN_GUARD_MAX} crashes in ${RESPAWN_GUARD_WINDOW_MS / 1000}s) — manual restart required`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
respawnHistory.push(now);
|
||||
try {
|
||||
const pid = spawnTerminalAgent({
|
||||
stateFile: cfg.config.stateFile,
|
||||
serverPort: cfg.browsePort,
|
||||
cwd: cfg.config.projectDir,
|
||||
});
|
||||
if (pid) {
|
||||
console.log(`[browse] terminal-agent respawned by watchdog (PID: ${pid})`);
|
||||
} else {
|
||||
console.warn('[browse] terminal-agent respawn skipped — script not found on disk');
|
||||
}
|
||||
} catch (err: any) {
|
||||
console.warn('[browse] terminal-agent respawn failed:', err?.message || err);
|
||||
}
|
||||
}, AGENT_WATCHDOG_TICK_MS);
|
||||
// Detach the watchdog timer from Node's event-loop ref count so a
|
||||
// healthy idle process can still exit cleanly if everything else is
|
||||
// also unref'd. Bun's setInterval returns a Timer with unref().
|
||||
(agentWatchdogInterval as any)?.unref?.();
|
||||
}
|
||||
|
||||
// Factory-scoped validateAuth. Closes over cfg.authToken so every internal
|
||||
// auth check sees the same token the routes receive. Module-level
|
||||
// validateAuth was deleted in v1.35.0.0.
|
||||
|
|
@ -1345,6 +1424,7 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle {
|
|||
if (cfgBrowserManager.isWatching()) cfgBrowserManager.stopWatch();
|
||||
clearInterval(flushInterval);
|
||||
clearInterval(idleCheckInterval);
|
||||
if (agentWatchdogInterval) clearInterval(agentWatchdogInterval);
|
||||
await flushBuffers();
|
||||
|
||||
await cfgBrowserManager.close();
|
||||
|
|
|
|||
|
|
@ -19,6 +19,69 @@ import * as path from 'path';
|
|||
import { safeUnlink, safeKill, isProcessAlive } from './error-handling';
|
||||
import { writeSecureFile, mkdirSecure } from './file-permissions';
|
||||
|
||||
/**
|
||||
* Locate the terminal-agent script on disk. In dev (cli.ts running via
|
||||
* `bun run`), it lives next to this file in browse/src. In a compiled
|
||||
* binary, Bun's --compile bakes the source into the executable and
|
||||
* exposes it relative to process.execPath. Either path must work or
|
||||
* the agent can't be spawned at all.
|
||||
*/
|
||||
export function resolveTerminalAgentScript(searchHints: { metaDir?: string; execPath?: string } = {}): string | null {
|
||||
const meta = searchHints.metaDir || __dirname;
|
||||
const exec = searchHints.execPath || process.execPath;
|
||||
const candidates = [
|
||||
path.resolve(meta, 'terminal-agent.ts'),
|
||||
path.resolve(path.dirname(exec), '..', 'src', 'terminal-agent.ts'),
|
||||
];
|
||||
for (const c of candidates) {
|
||||
if (fs.existsSync(c)) return c;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Spawn a fresh terminal-agent as a detached child. Handles the standard
|
||||
* three steps: kill any prior agent recorded at `<stateDir>/terminal-agent-pid`,
|
||||
* clear the stale record, then `Bun.spawn(['bun', 'run', script], ...)` with
|
||||
* env wiring. Returns the PID of the new agent on success, null when the
|
||||
* agent script can't be located.
|
||||
*
|
||||
* Used by both the CLI cold-start path (cli.ts) and the v1.44 watchdog in
|
||||
* server.ts. Centralizing here removes a copy-paste between them and means
|
||||
* future spawn-env additions (e.g. BROWSE_OWNER_PID for the generation
|
||||
* counter rollout) land in one place.
|
||||
*/
|
||||
export function spawnTerminalAgent(opts: {
|
||||
stateFile: string;
|
||||
serverPort: number;
|
||||
cwd?: string;
|
||||
/** Optional extra env vars to add to the agent's process env. */
|
||||
extraEnv?: Record<string, string>;
|
||||
/** Override script lookup for tests. */
|
||||
scriptPath?: string;
|
||||
}): number | null {
|
||||
const stateDir = path.dirname(opts.stateFile);
|
||||
const prior = readAgentRecord(stateDir);
|
||||
if (prior) {
|
||||
killAgentByRecord(prior, 'SIGTERM');
|
||||
clearAgentRecord(stateDir);
|
||||
}
|
||||
const script = opts.scriptPath || resolveTerminalAgentScript();
|
||||
if (!script || !fs.existsSync(script)) return null;
|
||||
const proc = (Bun as any).spawn(['bun', 'run', script], {
|
||||
cwd: opts.cwd || process.cwd(),
|
||||
env: {
|
||||
...process.env,
|
||||
BROWSE_STATE_FILE: opts.stateFile,
|
||||
BROWSE_SERVER_PORT: String(opts.serverPort),
|
||||
...(opts.extraEnv || {}),
|
||||
},
|
||||
stdio: ['ignore', 'ignore', 'ignore'],
|
||||
});
|
||||
proc.unref?.();
|
||||
return proc.pid ?? null;
|
||||
}
|
||||
|
||||
export interface AgentRecord {
|
||||
pid: number;
|
||||
/** Random per-boot identifier. Loopback /internal/* sees X-Browse-Gen: <gen>. */
|
||||
|
|
|
|||
|
|
@ -0,0 +1,91 @@
|
|||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// v1.44 terminal-agent watchdog — static-grep invariants.
|
||||
//
|
||||
// The watchdog respawns terminal-agent when its PID dies. Live process-tree
|
||||
// tests would require spawning, killing, and observing across two real Bun
|
||||
// processes — slow and flaky in the free tier. These tripwires defend the
|
||||
// load-bearing properties: identity-based liveness check (not name match),
|
||||
// crash-loop guard, gated on ownsTerminalAgent, and cleared on shutdown.
|
||||
|
||||
const SERVER_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'server.ts');
|
||||
const CONTROL_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent-control.ts');
|
||||
|
||||
describe('terminal-agent watchdog (v1.44+)', () => {
|
||||
test('1. spawnTerminalAgent helper exists with PID return type', () => {
|
||||
const src = fs.readFileSync(CONTROL_TS, 'utf-8');
|
||||
expect(src).toMatch(/export function spawnTerminalAgent\(/);
|
||||
// Must clean up prior PID before spawning (no zombies).
|
||||
expect(src).toContain('readAgentRecord(stateDir)');
|
||||
expect(src).toContain('killAgentByRecord(prior');
|
||||
expect(src).toContain('clearAgentRecord(stateDir)');
|
||||
});
|
||||
|
||||
test('2. watchdog is gated on ownsTerminalAgent', () => {
|
||||
const src = fs.readFileSync(SERVER_TS, 'utf-8');
|
||||
// Match the comment + the guard. The guard MUST be a positive check;
|
||||
// an inverted check would respawn for embedders and trample their PTY.
|
||||
const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth');
|
||||
expect(block).toMatch(/if \(ownsTerminalAgent\)/);
|
||||
expect(block).toContain('agentWatchdogInterval = setInterval');
|
||||
});
|
||||
|
||||
test('3. watchdog uses PID liveness, not process name probe', () => {
|
||||
const src = fs.readFileSync(SERVER_TS, 'utf-8');
|
||||
const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth');
|
||||
// The whole point of the v1.44 watchdog over v1.43- pkill teardown:
|
||||
// identity-based liveness. Slow-but-alive agents must NOT trigger
|
||||
// respawn (split-brain defense).
|
||||
expect(block).toContain('readAgentRecord(stateDir)');
|
||||
expect(block).toContain('isProcessAlive(record.pid)');
|
||||
// Negative: no executable name-based process lookup. Allow the strings
|
||||
// to appear in prose comments (the watchdog doc explains what it
|
||||
// replaces), reject only actual invocations.
|
||||
expect(block).not.toMatch(/spawnSync\s*\(\s*['"]pkill/);
|
||||
expect(block).not.toMatch(/Bun\.spawn\s*\(\s*\[\s*['"]pgrep/);
|
||||
});
|
||||
|
||||
test('4. crash-loop guard with rolling window', () => {
|
||||
const src = fs.readFileSync(SERVER_TS, 'utf-8');
|
||||
const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth');
|
||||
expect(block).toContain('RESPAWN_GUARD_WINDOW_MS = 60_000');
|
||||
expect(block).toContain('RESPAWN_GUARD_MAX = 3');
|
||||
expect(block).toContain('respawnHistory');
|
||||
expect(block).toContain('agentRespawnGuardTripped');
|
||||
// Window pruning: old entries must be evicted before counting toward
|
||||
// the limit. Otherwise a daemon up for a week with one crash a day
|
||||
// would eventually trip the guard.
|
||||
expect(block).toMatch(/respawnHistory\.shift\(\)/);
|
||||
});
|
||||
|
||||
test('5. watchdog interval is cleared on shutdown', () => {
|
||||
const src = fs.readFileSync(SERVER_TS, 'utf-8');
|
||||
expect(src).toContain('if (agentWatchdogInterval) clearInterval(agentWatchdogInterval)');
|
||||
});
|
||||
|
||||
test('6. tick interval is env-overridable for tests', () => {
|
||||
const src = fs.readFileSync(SERVER_TS, 'utf-8');
|
||||
expect(src).toContain('GSTACK_AGENT_WATCHDOG_TICK_MS');
|
||||
});
|
||||
|
||||
test('7. CLI cold-start path uses the same spawnTerminalAgent helper', () => {
|
||||
const cli = fs.readFileSync(
|
||||
path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'cli.ts'),
|
||||
'utf-8',
|
||||
);
|
||||
// Otherwise the CLI and watchdog could drift on spawn env/cwd, and
|
||||
// teardown invariants tested against one would silently miss the other.
|
||||
expect(cli).toContain('spawnTerminalAgent({');
|
||||
expect(cli).toContain("from './terminal-agent-control'");
|
||||
});
|
||||
});
|
||||
|
||||
function sliceBetween(source: string, start: string, end: string): string {
|
||||
const i = source.indexOf(start);
|
||||
if (i === -1) throw new Error(`marker not found: ${start}`);
|
||||
const j = source.indexOf(end, i + start.length);
|
||||
if (j === -1) throw new Error(`end marker not found: ${end}`);
|
||||
return source.slice(i, j);
|
||||
}
|
||||
Loading…
Reference in New Issue