From f42d7bac6d885e896443841fa4ebcbe4c7321743 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 23 May 2026 23:11:54 -0700 Subject: [PATCH] feat(browse): terminal-agent watchdog with PID liveness + crash-loop guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit terminal-agent could die independently of the server — SIGKILL from the OS OOM killer, an uncaught exception under PTY churn, an external `pkill` from a sibling debugging session. Pre-v1.44 the sidebar would observe the broken connection and stay broken until the user reloaded the sidebar. Now a 60s ticker checks the recorded agent PID and respawns via the shared spawnTerminalAgent helper when dead. Identity-based liveness (T4 from the eng review): * Uses readAgentRecord + isProcessAlive (signal 0 probe), not a name match. * Slow-but-alive agents intentionally fall through — respawning around a living agent would create split-brain (two agents writing the port file, tokens diverging between them, mystery upgrade 401s). * Pairs with the v1.44 generation counter in /internal/* loopback calls: if a stale agent does come back to life mid-cycle, its X-Browse-Gen no longer matches and the parent's calls 409 cleanly. Crash-loop guard: * 3 respawn attempts inside a rolling 60s window → stop trying. A daemon up for a week with one crash a day shouldn't trip the guard. * On trip: one-line error to console (`respawn guard tripped`) and the watchdog goes dormant. Manual restart via the sidebar Restart button is the explicit signal to re-arm (added in Commit 2 of the larger PR). Shared spawn path (refactor): * New spawnTerminalAgent(opts) in terminal-agent-control.ts handles: prior-PID cleanup → spawn → record stash. Both the CLI cold-start path in cli.ts and the new server.ts watchdog route through it. Removes the copy-paste between them; future env wiring lands in one place. Gated on cfg.ownsTerminalAgent — embedders that pre-launch their own PTY server (gbrowser phoenix overlay) still own the full lifecycle. GSTACK_AGENT_WATCHDOG_TICK_MS env knob compresses the 60s tick for e2e tests without 60s waits per assertion. Tests: * browse/test/terminal-agent-watchdog.test.ts — 7 static-grep tripwires for the load-bearing invariants (ownsTerminalAgent gate, PID-based liveness, crash-loop guard with window pruning, shutdown cleanup, CLI cold-start uses the same helper, env knob exists). * Live process-kill tests belong in the e2e tier; cheaper invariants here catch refactor regressions in ~1ms each. Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/src/cli.ts | 44 +++------- browse/src/server.ts | 82 ++++++++++++++++++- browse/src/terminal-agent-control.ts | 63 ++++++++++++++ browse/test/terminal-agent-watchdog.test.ts | 91 +++++++++++++++++++++ 4 files changed, 247 insertions(+), 33 deletions(-) create mode 100644 browse/test/terminal-agent-watchdog.test.ts diff --git a/browse/src/cli.ts b/browse/src/cli.ts index a09947dbe..66559b881 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -16,7 +16,7 @@ import { writeSecureFile, mkdirSecure } from './file-permissions'; import { resolveConfig, ensureStateDir, readVersionHash } from './config'; import { parseProxyConfig, computeConfigHash, ProxyConfigError } from './proxy-config'; import { redactProxyUrl } from './proxy-redact'; -import { readAgentRecord, killAgentByRecord, clearAgentRecord } from './terminal-agent-control'; +import { spawnTerminalAgent } from './terminal-agent-control'; const config = resolveConfig(); const IS_WINDOWS = process.platform === 'win32'; @@ -1034,38 +1034,18 @@ Refs: After 'snapshot', use @e1, @e2... as selectors: // claude -p subprocesses to multiplex. // Auto-start terminal agent (non-compiled bun process). Owns the PTY - // WebSocket for the sidebar Terminal pane. - let termAgentScript = path.resolve(__dirname, 'terminal-agent.ts'); - if (!fs.existsSync(termAgentScript)) { - termAgentScript = path.resolve(path.dirname(process.execPath), '..', 'src', 'terminal-agent.ts'); - } + // WebSocket for the sidebar Terminal pane. Routes through the shared + // spawnTerminalAgent helper so the CLI cold-start path and the + // server.ts watchdog respawn path share one implementation. The + // helper handles prior-PID cleanup, script lookup, and env wiring. try { - if (fs.existsSync(termAgentScript)) { - // Kill any stale terminal-agent from a prior run so its port file - // can't trick the server into routing /pty-session at a dead - // listener. Identity-based (v1.44+) — only kills the PID recorded - // in `/terminal-agent-pid`. Pre-v1.44 used - // `pkill -f terminal-agent\.ts` which matched sibling gstack - // sessions; see terminal-agent-control.ts header for rationale. - { - const stateDir = path.dirname(config.stateFile); - const prior = readAgentRecord(stateDir); - if (prior) { - killAgentByRecord(prior, 'SIGTERM'); - clearAgentRecord(stateDir); - } - } - const termProc = Bun.spawn(['bun', 'run', termAgentScript], { - cwd: config.projectDir, - env: { - ...process.env, - BROWSE_STATE_FILE: config.stateFile, - BROWSE_SERVER_PORT: String(newState.port), - }, - stdio: ['ignore', 'ignore', 'ignore'], - }); - termProc.unref(); - console.log(`[browse] Terminal agent started (PID: ${termProc.pid})`); + const newPid = spawnTerminalAgent({ + stateFile: config.stateFile, + serverPort: newState.port, + cwd: config.projectDir, + }); + if (newPid) { + console.log(`[browse] Terminal agent started (PID: ${newPid})`); } } catch (err: any) { // Non-fatal: chat still works without the terminal agent. diff --git a/browse/src/server.ts b/browse/src/server.ts index a013fc1dd..9511f67b1 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -43,7 +43,8 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory // Bun.spawn used instead of child_process.spawn (compiled bun binaries // fail posix_spawn on all executables including /bin/bash) import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling'; -import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath } from './terminal-agent-control'; +import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath, spawnTerminalAgent } from './terminal-agent-control'; +import { isProcessAlive } from './error-handling'; import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize'; import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge'; import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config'; @@ -1306,6 +1307,84 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle { // premise even under malformed cfg. const ownsTerminalAgent = cfg.ownsTerminalAgent === false ? false : true; + // ─── Terminal-Agent Watchdog (v1.44+) ───────────────────────────── + // + // The terminal-agent process can die independently of the server: SIGKILL + // from the OS OOM killer, an uncaught exception under load, an external + // `pkill` from a sibling debugging session. Pre-v1.44 the sidebar would + // see the broken connection and stay broken until the user reloaded. + // Now: 60s ticker checks the recorded agent PID, respawns via the shared + // spawnTerminalAgent helper if dead. + // + // Identity-based — uses readAgentRecord + isProcessAlive, NOT a process + // name probe. Critical: prevents respawning around a slow-but-alive agent + // (which would create split-brain — two agents writing the port file, + // tokens diverging between them, mystery PTY upgrade failures). + // + // Crash-loop guard: 3 respawn attempts inside 60s → stop trying and emit + // a one-line error. Manual `forceRestart` from the sidebar clears the + // history (the user is the explicit signal to retry). + // + // Only active when ownsTerminalAgent === true. Embedders that pre-launch + // their own PTY server (gbrowser phoenix overlay) must not be auto-respawned + // by us — their lifecycle is their concern. + let agentWatchdogInterval: ReturnType | null = null; + const respawnHistory: number[] = []; + const AGENT_WATCHDOG_TICK_MS = parseInt( + process.env.GSTACK_AGENT_WATCHDOG_TICK_MS || '60000', + 10, + ); + const RESPAWN_GUARD_WINDOW_MS = 60_000; + const RESPAWN_GUARD_MAX = 3; + let agentRespawnGuardTripped = false; + + if (ownsTerminalAgent) { + agentWatchdogInterval = setInterval(() => { + if (isShuttingDown) return; + if (agentRespawnGuardTripped) return; + const stateDir = path.dirname(cfg.config.stateFile); + const record = readAgentRecord(stateDir); + // If the record exists and the PID is alive, the agent is healthy + // (or at least still answering signal 0). Slow-but-alive agents + // intentionally fall through here — split-brain is worse than + // unresponsiveness, and slow recovery is handled by the user via + // restart. + if (record && isProcessAlive(record.pid)) return; + // Either no record (never spawned, or cleaned up after crash) or + // PID is dead. Try to respawn. + const now = Date.now(); + while (respawnHistory.length && now - respawnHistory[0] > RESPAWN_GUARD_WINDOW_MS) { + respawnHistory.shift(); + } + if (respawnHistory.length >= RESPAWN_GUARD_MAX) { + agentRespawnGuardTripped = true; + console.error( + `[browse] terminal-agent respawn guard tripped (${RESPAWN_GUARD_MAX} crashes in ${RESPAWN_GUARD_WINDOW_MS / 1000}s) — manual restart required`, + ); + return; + } + respawnHistory.push(now); + try { + const pid = spawnTerminalAgent({ + stateFile: cfg.config.stateFile, + serverPort: cfg.browsePort, + cwd: cfg.config.projectDir, + }); + if (pid) { + console.log(`[browse] terminal-agent respawned by watchdog (PID: ${pid})`); + } else { + console.warn('[browse] terminal-agent respawn skipped — script not found on disk'); + } + } catch (err: any) { + console.warn('[browse] terminal-agent respawn failed:', err?.message || err); + } + }, AGENT_WATCHDOG_TICK_MS); + // Detach the watchdog timer from Node's event-loop ref count so a + // healthy idle process can still exit cleanly if everything else is + // also unref'd. Bun's setInterval returns a Timer with unref(). + (agentWatchdogInterval as any)?.unref?.(); + } + // Factory-scoped validateAuth. Closes over cfg.authToken so every internal // auth check sees the same token the routes receive. Module-level // validateAuth was deleted in v1.35.0.0. @@ -1345,6 +1424,7 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle { if (cfgBrowserManager.isWatching()) cfgBrowserManager.stopWatch(); clearInterval(flushInterval); clearInterval(idleCheckInterval); + if (agentWatchdogInterval) clearInterval(agentWatchdogInterval); await flushBuffers(); await cfgBrowserManager.close(); diff --git a/browse/src/terminal-agent-control.ts b/browse/src/terminal-agent-control.ts index 1e2fb6eb4..094ba668f 100644 --- a/browse/src/terminal-agent-control.ts +++ b/browse/src/terminal-agent-control.ts @@ -19,6 +19,69 @@ import * as path from 'path'; import { safeUnlink, safeKill, isProcessAlive } from './error-handling'; import { writeSecureFile, mkdirSecure } from './file-permissions'; +/** + * Locate the terminal-agent script on disk. In dev (cli.ts running via + * `bun run`), it lives next to this file in browse/src. In a compiled + * binary, Bun's --compile bakes the source into the executable and + * exposes it relative to process.execPath. Either path must work or + * the agent can't be spawned at all. + */ +export function resolveTerminalAgentScript(searchHints: { metaDir?: string; execPath?: string } = {}): string | null { + const meta = searchHints.metaDir || __dirname; + const exec = searchHints.execPath || process.execPath; + const candidates = [ + path.resolve(meta, 'terminal-agent.ts'), + path.resolve(path.dirname(exec), '..', 'src', 'terminal-agent.ts'), + ]; + for (const c of candidates) { + if (fs.existsSync(c)) return c; + } + return null; +} + +/** + * Spawn a fresh terminal-agent as a detached child. Handles the standard + * three steps: kill any prior agent recorded at `/terminal-agent-pid`, + * clear the stale record, then `Bun.spawn(['bun', 'run', script], ...)` with + * env wiring. Returns the PID of the new agent on success, null when the + * agent script can't be located. + * + * Used by both the CLI cold-start path (cli.ts) and the v1.44 watchdog in + * server.ts. Centralizing here removes a copy-paste between them and means + * future spawn-env additions (e.g. BROWSE_OWNER_PID for the generation + * counter rollout) land in one place. + */ +export function spawnTerminalAgent(opts: { + stateFile: string; + serverPort: number; + cwd?: string; + /** Optional extra env vars to add to the agent's process env. */ + extraEnv?: Record; + /** Override script lookup for tests. */ + scriptPath?: string; +}): number | null { + const stateDir = path.dirname(opts.stateFile); + const prior = readAgentRecord(stateDir); + if (prior) { + killAgentByRecord(prior, 'SIGTERM'); + clearAgentRecord(stateDir); + } + const script = opts.scriptPath || resolveTerminalAgentScript(); + if (!script || !fs.existsSync(script)) return null; + const proc = (Bun as any).spawn(['bun', 'run', script], { + cwd: opts.cwd || process.cwd(), + env: { + ...process.env, + BROWSE_STATE_FILE: opts.stateFile, + BROWSE_SERVER_PORT: String(opts.serverPort), + ...(opts.extraEnv || {}), + }, + stdio: ['ignore', 'ignore', 'ignore'], + }); + proc.unref?.(); + return proc.pid ?? null; +} + export interface AgentRecord { pid: number; /** Random per-boot identifier. Loopback /internal/* sees X-Browse-Gen: . */ diff --git a/browse/test/terminal-agent-watchdog.test.ts b/browse/test/terminal-agent-watchdog.test.ts new file mode 100644 index 000000000..f012dc406 --- /dev/null +++ b/browse/test/terminal-agent-watchdog.test.ts @@ -0,0 +1,91 @@ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +// v1.44 terminal-agent watchdog — static-grep invariants. +// +// The watchdog respawns terminal-agent when its PID dies. Live process-tree +// tests would require spawning, killing, and observing across two real Bun +// processes — slow and flaky in the free tier. These tripwires defend the +// load-bearing properties: identity-based liveness check (not name match), +// crash-loop guard, gated on ownsTerminalAgent, and cleared on shutdown. + +const SERVER_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'server.ts'); +const CONTROL_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent-control.ts'); + +describe('terminal-agent watchdog (v1.44+)', () => { + test('1. spawnTerminalAgent helper exists with PID return type', () => { + const src = fs.readFileSync(CONTROL_TS, 'utf-8'); + expect(src).toMatch(/export function spawnTerminalAgent\(/); + // Must clean up prior PID before spawning (no zombies). + expect(src).toContain('readAgentRecord(stateDir)'); + expect(src).toContain('killAgentByRecord(prior'); + expect(src).toContain('clearAgentRecord(stateDir)'); + }); + + test('2. watchdog is gated on ownsTerminalAgent', () => { + const src = fs.readFileSync(SERVER_TS, 'utf-8'); + // Match the comment + the guard. The guard MUST be a positive check; + // an inverted check would respawn for embedders and trample their PTY. + const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth'); + expect(block).toMatch(/if \(ownsTerminalAgent\)/); + expect(block).toContain('agentWatchdogInterval = setInterval'); + }); + + test('3. watchdog uses PID liveness, not process name probe', () => { + const src = fs.readFileSync(SERVER_TS, 'utf-8'); + const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth'); + // The whole point of the v1.44 watchdog over v1.43- pkill teardown: + // identity-based liveness. Slow-but-alive agents must NOT trigger + // respawn (split-brain defense). + expect(block).toContain('readAgentRecord(stateDir)'); + expect(block).toContain('isProcessAlive(record.pid)'); + // Negative: no executable name-based process lookup. Allow the strings + // to appear in prose comments (the watchdog doc explains what it + // replaces), reject only actual invocations. + expect(block).not.toMatch(/spawnSync\s*\(\s*['"]pkill/); + expect(block).not.toMatch(/Bun\.spawn\s*\(\s*\[\s*['"]pgrep/); + }); + + test('4. crash-loop guard with rolling window', () => { + const src = fs.readFileSync(SERVER_TS, 'utf-8'); + const block = sliceBetween(src, '─── Terminal-Agent Watchdog', 'Factory-scoped validateAuth'); + expect(block).toContain('RESPAWN_GUARD_WINDOW_MS = 60_000'); + expect(block).toContain('RESPAWN_GUARD_MAX = 3'); + expect(block).toContain('respawnHistory'); + expect(block).toContain('agentRespawnGuardTripped'); + // Window pruning: old entries must be evicted before counting toward + // the limit. Otherwise a daemon up for a week with one crash a day + // would eventually trip the guard. + expect(block).toMatch(/respawnHistory\.shift\(\)/); + }); + + test('5. watchdog interval is cleared on shutdown', () => { + const src = fs.readFileSync(SERVER_TS, 'utf-8'); + expect(src).toContain('if (agentWatchdogInterval) clearInterval(agentWatchdogInterval)'); + }); + + test('6. tick interval is env-overridable for tests', () => { + const src = fs.readFileSync(SERVER_TS, 'utf-8'); + expect(src).toContain('GSTACK_AGENT_WATCHDOG_TICK_MS'); + }); + + test('7. CLI cold-start path uses the same spawnTerminalAgent helper', () => { + const cli = fs.readFileSync( + path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'cli.ts'), + 'utf-8', + ); + // Otherwise the CLI and watchdog could drift on spawn env/cwd, and + // teardown invariants tested against one would silently miss the other. + expect(cli).toContain('spawnTerminalAgent({'); + expect(cli).toContain("from './terminal-agent-control'"); + }); +}); + +function sliceBetween(source: string, start: string, end: string): string { + const i = source.indexOf(start); + if (i === -1) throw new Error(`marker not found: ${start}`); + const j = source.indexOf(end, i + start.length); + if (j === -1) throw new Error(`end marker not found: ${end}`); + return source.slice(i, j); +}