diff --git a/CLAUDE.md b/CLAUDE.md index 3ff25fffe..f63086a28 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -236,19 +236,23 @@ Activity / Refs / Inspector as debug overlays behind the footer's flow, dual-token model, and threat-model boundary — silent failures here usually trace to not understanding the cross-component flow. -**Embedder terminal-agent ownership** (v1.42.1.0+). `buildFetchHandler` -in `browse/src/server.ts` accepts `ServerConfig.ownsTerminalAgent?: -boolean` (default `true`). When `true`, factory shutdown runs the full -teardown: `pkill -f terminal-agent\.ts` plus `safeUnlinkQuiet` on -`/terminal-port` and `/terminal-internal-token`. -Embedders (e.g. the gbrowser phoenix overlay) that pre-launch their -own PTY server must pass `false` so their discovery files survive -gstack teardown cycles. The flag is the third caller-owned teardown -gate in `ServerConfig` (alongside `xvfb?` and `proxyBridge?`); polarity -is inverted (explicit bool vs presence) and documented in the field's -JSDoc. CLI `start()` always passes `true` explicitly — the static-grep -test in `browse/test/server-embedder-terminal-port.test.ts` fails CI -if a refactor drops it. +**Embedder terminal-agent ownership** (v1.42.1.0+, identity-based kill v1.44.0.0+). +`buildFetchHandler` in `browse/src/server.ts` accepts `ServerConfig.ownsTerminalAgent?: +boolean` (default `true`). When `true`, factory shutdown runs the full teardown: +identity-based kill via `killAgentByRecord(readAgentRecord(stateDir))` from +`browse/src/terminal-agent-control.ts` plus `safeUnlinkQuiet` on +`/terminal-port`, `/terminal-internal-token`, and +`/terminal-agent-pid` (the per-boot agent record introduced in v1.44). +Embedders (e.g. the gbrowser phoenix overlay) that pre-launch their own PTY +server must pass `false` so their discovery files survive gstack teardown cycles. +The flag is the third caller-owned teardown gate in `ServerConfig` (alongside +`xvfb?` and `proxyBridge?`); polarity is inverted (explicit bool vs presence) and +documented in the field's JSDoc. CLI `start()` always passes `true` explicitly — +the static-grep test in `browse/test/server-embedder-terminal-port.test.ts` fails +CI if a refactor drops it. Pre-v1.44 used `pkill -f terminal-agent\.ts` (regex +match) which would kill sibling gstack sessions on the same host; the new +`browse/test/terminal-agent-pid-identity.test.ts` static-grep tripwire fails CI +if any source file re-introduces `pkill ... terminal-agent` or `spawnSync('pkill', ...)`. **WebSocket auth uses Sec-WebSocket-Protocol, not cookies.** Browsers can't set `Authorization` on a WebSocket upgrade, but they CAN set diff --git a/TODOS.md b/TODOS.md index 01fdc1c85..fbd645e60 100644 --- a/TODOS.md +++ b/TODOS.md @@ -2,34 +2,17 @@ ## browse server: terminal-agent teardown follow-ups (filed v1.41 via /plan-eng-review) -### P3: Identity-based terminal-agent kill (replace pkill regex with PID) +### ✅ DONE (v1.44.0.0): Identity-based terminal-agent kill (replace pkill regex with PID) -**What:** Record the spawned terminal-agent PID at `browse/src/cli.ts:1057` and -replace `pkill -f terminal-agent\.ts` at both `cli.ts:1047` and -`server.ts:1281` (now inside the `if (ownsTerminalAgent)` gate) with -`process.kill(pid, signal)` against the recorded PID. - -**Why:** `pkill -f terminal-agent\.ts` matches by command-line regex, so today -it can kill ANY process whose argv contains `terminal-agent.ts` — sibling -gstack sessions, editor processes that have the file open, a second gstack -run on the same host. Latent footgun for the CLI path, not just embedders. - -**Pros:** Removes a real cross-session foot-cannon. PID-based kill is the -correct identity primitive. Lets us tighten `pkill -f`'s broad-match warning -in the new `ownsTerminalAgent` JSDoc to "historical" rather than "current". - -**Cons:** Requires threading the PID through the CLI-to-server state path -(currently the parent server reads `terminal-port` to discover the agent; it -would also need `terminal-agent-pid`). Touches `cli.ts`, `server.ts`, and -`terminal-agent.ts` together — bigger surface than the v1.41 fix. - -**Context:** Surfaced by both Codex and Claude subagent during /autoplan -review of the `ownsTerminalAgent` gate. Currently documented as out-of-scope -in `browse/src/server.ts` JSDoc for `ServerConfig.ownsTerminalAgent`. The -embedder fix (ownsTerminalAgent: false) means embedders don't hit this; CLI -users still do. - -**Depends on:** None. +**Resolved:** Bundled into the v1.44.0.0 long-lived-sidebar PR as Commit 0. +`browse/src/terminal-agent-control.ts` is the new home for `readAgentRecord`, +`writeAgentRecord`, `clearAgentRecord`, and `killAgentByRecord`. The agent +writes `/terminal-agent-pid` (JSON `{pid, gen, startedAt}`) at boot +and clears it on SIGTERM/SIGINT. `cli.ts` and `server.ts` both route through +`killAgentByRecord` instead of `pkill -f terminal-agent\.ts`. The new +`browse/test/terminal-agent-pid-identity.test.ts` is the static-grep tripwire +that fails CI if `pkill ... terminal-agent` or `spawnSync('pkill', ...)` +reappears in any source file. --- diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 4f523bea7..a09947dbe 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -16,6 +16,7 @@ import { writeSecureFile, mkdirSecure } from './file-permissions'; import { resolveConfig, ensureStateDir, readVersionHash } from './config'; import { parseProxyConfig, computeConfigHash, ProxyConfigError } from './proxy-config'; import { redactProxyUrl } from './proxy-redact'; +import { readAgentRecord, killAgentByRecord, clearAgentRecord } from './terminal-agent-control'; const config = resolveConfig(); const IS_WINDOWS = process.platform === 'win32'; @@ -1040,13 +1041,19 @@ Refs: After 'snapshot', use @e1, @e2... as selectors: } try { if (fs.existsSync(termAgentScript)) { - // Kill old terminal-agents so a stale port file can't trick the - // server into routing /pty-session at a dead listener. - try { - const { spawnSync } = require('child_process'); - spawnSync('pkill', ['-f', 'terminal-agent\\.ts'], { stdio: 'ignore', timeout: 3000 }); - } catch (err: any) { - if (err?.code !== 'ENOENT') throw err; + // Kill any stale terminal-agent from a prior run so its port file + // can't trick the server into routing /pty-session at a dead + // listener. Identity-based (v1.44+) — only kills the PID recorded + // in `/terminal-agent-pid`. Pre-v1.44 used + // `pkill -f terminal-agent\.ts` which matched sibling gstack + // sessions; see terminal-agent-control.ts header for rationale. + { + const stateDir = path.dirname(config.stateFile); + const prior = readAgentRecord(stateDir); + if (prior) { + killAgentByRecord(prior, 'SIGTERM'); + clearAgentRecord(stateDir); + } } const termProc = Bun.spawn(['bun', 'run', termAgentScript], { cwd: config.projectDir, diff --git a/browse/src/server.ts b/browse/src/server.ts index 05db6665b..a013fc1dd 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -43,6 +43,7 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory // Bun.spawn used instead of child_process.spawn (compiled bun binaries // fail posix_spawn on all executables including /bin/bash) import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling'; +import { readAgentRecord, killAgentByRecord, clearAgentRecord, agentRecordPath } from './terminal-agent-control'; import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize'; import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge'; import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config'; @@ -207,31 +208,34 @@ export interface ServerConfig { beforeRoute?: (req: Request, surface: Surface, auth: TokenInfo | null) => Promise; /** * Whether gstack owns the lifecycle of the terminal-agent process and its - * discovery files (`/terminal-port`, `/terminal-internal-token`). + * discovery files (`/terminal-port`, `/terminal-internal-token`, + * `/terminal-agent-pid`). * - * When true (default), shutdown() runs three side effects: - * 1. `pkill -f terminal-agent\.ts` — regex-broad, matches ANY process whose - * command line contains `terminal-agent.ts` on this host (including - * sibling gstack sessions). Pre-existing CLI behavior, not introduced by - * this flag. Identity-based PID kill is a separate followup (see TODOS). + * When true (default), shutdown() runs four side effects: + * 1. Identity-based kill via `killAgentByRecord(readAgentRecord(stateDir))` + * (v1.44+). Only signals the PID recorded by THIS daemon's agent. + * Replaced the historical `pkill -f terminal-agent\.ts` regex that + * matched sibling gstack sessions on the same host — see + * terminal-agent-control.ts for rationale. * 2. `safeUnlinkQuiet(/terminal-port)` * 3. `safeUnlinkQuiet(/terminal-internal-token)` + * 4. `safeUnlinkQuiet(/terminal-agent-pid)` (the v1.44 record) * * This is correct for gstack's CLI path, which spawns `terminal-agent.ts` as * the producer of those files (see cli.ts:1037-1063). * * Embedders (gbrowser phoenix overlay, future hosts) that run their own PTY * server and write those files themselves should pass `false`. When `false`, - * the embedder owns BOTH the agent process AND both discovery files — - * terminal-agent.ts's own SIGTERM cleanup only removes `terminal-port` - * (see terminal-agent.ts:558), so the internal-token file is the embedder's - * full responsibility. + * the embedder owns BOTH the agent process AND all three discovery files. + * Note that terminal-agent.ts's own SIGTERM cleanup removes `terminal-port` + * and `terminal-agent-pid` (the agent writes both at boot), so embedders + * that pre-launch their own agent must ensure their cleanup matches. * * Polarity note: this differs from `xvfb?` and `proxyBridge?`, which gate by * the *presence* of a caller-owned handle (presence ⇒ don't close). This * field gates by an explicit boolean because there is no handle object — * the terminal-agent is started elsewhere (cli.ts), and shutdown's only - * reference is the regex-based pkill + the file paths. + * reference is the PID record + the file paths. */ ownsTerminalAgent?: boolean; } @@ -1319,14 +1323,20 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle { console.log('[browse] Shutting down...'); if (ownsTerminalAgent) { + // Identity-based kill (v1.44+). Replaces the v1.43- `pkill -f + // terminal-agent\.ts` regex teardown which matched sibling gstack + // sessions on the same host. Only the PID recorded in + // `/terminal-agent-pid` by THIS daemon's agent is signaled. try { - const { spawnSync } = require('child_process'); - spawnSync('pkill', ['-f', 'terminal-agent\\.ts'], { stdio: 'ignore', timeout: 3000 }); + const stateDir = path.dirname(config.stateFile); + const record = readAgentRecord(stateDir); + if (record) killAgentByRecord(record, 'SIGTERM'); } catch (err: any) { console.warn('[browse] Failed to kill terminal-agent:', err.message); } safeUnlinkQuiet(path.join(path.dirname(config.stateFile), 'terminal-port')); safeUnlinkQuiet(path.join(path.dirname(config.stateFile), 'terminal-internal-token')); + safeUnlinkQuiet(agentRecordPath(path.dirname(config.stateFile))); } try { detachSession(); } catch (err: any) { console.warn('[browse] Failed to detach CDP session:', err.message); diff --git a/browse/src/terminal-agent-control.ts b/browse/src/terminal-agent-control.ts new file mode 100644 index 000000000..1e2fb6eb4 --- /dev/null +++ b/browse/src/terminal-agent-control.ts @@ -0,0 +1,80 @@ +/** + * terminal-agent process-control primitives shared by cli.ts spawn site, + * server.ts shutdown teardown, and the v1.44 watchdog/respawn loop. + * + * Why this exists: pre-v1.44 used `pkill -f terminal-agent\.ts`, which + * matches any process whose argv contains the string and would kill + * sibling gstack sessions on the same host. The agent now writes a + * structured `terminal-agent-pid` record (`{pid, gen, startedAt}`) and + * every kill site routes through `killAgentByRecord` here — identity-based, + * no regex. + * + * The `gen` field is a per-boot generation counter. Loopback /internal/* + * calls from the parent server include `X-Browse-Gen` so a slow agent that + * the watchdog respawned around can't accidentally service a stale grant + * from the old generation. + */ +import * as fs from 'fs'; +import * as path from 'path'; +import { safeUnlink, safeKill, isProcessAlive } from './error-handling'; +import { writeSecureFile, mkdirSecure } from './file-permissions'; + +export interface AgentRecord { + pid: number; + /** Random per-boot identifier. Loopback /internal/* sees X-Browse-Gen: . */ + gen: string; + /** ms since epoch. Reserved for future PID-reuse guards. */ + startedAt: number; +} + +export function agentRecordPath(stateDir: string): string { + return path.join(stateDir, 'terminal-agent-pid'); +} + +/** Read the current record. Returns null on missing/malformed file. */ +export function readAgentRecord(stateDir: string): AgentRecord | null { + try { + const raw = fs.readFileSync(agentRecordPath(stateDir), 'utf-8'); + const j = JSON.parse(raw); + if (typeof j?.pid === 'number' && typeof j?.gen === 'string' && typeof j?.startedAt === 'number') { + return j as AgentRecord; + } + return null; + } catch { + return null; + } +} + +/** Atomic write. Caller must ensure stateDir exists; agent does this at boot. */ +export function writeAgentRecord(stateDir: string, record: AgentRecord): void { + try { mkdirSecure(stateDir); } catch {} + const target = agentRecordPath(stateDir); + const tmp = `${target}.tmp-${process.pid}`; + writeSecureFile(tmp, JSON.stringify(record)); + fs.renameSync(tmp, target); +} + +export function clearAgentRecord(stateDir: string): void { + safeUnlink(agentRecordPath(stateDir)); +} + +/** + * Kill the agent identified by `record`. Signal defaults to SIGTERM (give + * the agent a chance to run its own SIGTERM cleanup). Returns true if a + * signal was actually sent to a live PID; false if the PID was already + * dead (no-op). Never throws — ESRCH is swallowed by safeKill. + * + * Validates liveness BEFORE signaling so a PID-reuse race (the recorded + * PID was reaped and a brand-new unrelated process now holds it) can't + * cause us to kill the wrong process. This is a best-effort defense: + * Linux/macOS don't expose process-start-time cheaply, and the gap + * between record-write and watchdog-tick is small (60s max). + */ +export function killAgentByRecord( + record: AgentRecord, + signal: NodeJS.Signals = 'SIGTERM', +): boolean { + if (!isProcessAlive(record.pid)) return false; + safeKill(record.pid, signal); + return true; +} diff --git a/browse/src/terminal-agent.ts b/browse/src/terminal-agent.ts index 189a49f09..d1cae0fb2 100644 --- a/browse/src/terminal-agent.ts +++ b/browse/src/terminal-agent.ts @@ -25,12 +25,21 @@ import * as path from 'path'; import * as crypto from 'crypto'; import { writeSecureFile, mkdirSecure } from './file-permissions'; import { safeUnlink } from './error-handling'; +import { writeAgentRecord, clearAgentRecord } from './terminal-agent-control'; const STATE_FILE = process.env.BROWSE_STATE_FILE || path.join(process.env.HOME || '/tmp', '.gstack', 'browse.json'); const PORT_FILE = path.join(path.dirname(STATE_FILE), 'terminal-port'); const BROWSE_SERVER_PORT = parseInt(process.env.BROWSE_SERVER_PORT || '0', 10); const EXTENSION_ID = process.env.BROWSE_EXTENSION_ID || ''; // optional: tighten Origin check const INTERNAL_TOKEN = crypto.randomBytes(32).toString('base64url'); // shared with parent server via env at spawn +/** + * Per-boot generation identifier. Loopback /internal/* callers include + * `X-Browse-Gen: ` so a slow agent the watchdog respawned + * around can't service a stale grant from the prior generation. Absent + * header means "legacy caller" and is accepted (backward compat); a + * present-but-mismatched header returns 409 stale generation. + */ +const CURRENT_GEN = crypto.randomBytes(16).toString('base64url'); // In-memory cookie token registry. Parent posts /internal/grant after // /pty-session; we validate WS cookies against this set. @@ -201,6 +210,27 @@ function disposeSession(session: PtySession): void { * * Everything else returns 404. The listener binds 127.0.0.1 only. */ +/** + * Validate a loopback /internal/* request. Returns null when the request + * is allowed; otherwise returns the Response to send back. Centralizes + * bearer auth + the v1.44 X-Browse-Gen generation check so adding a new + * /internal/* route is a one-liner. The full internalHandler wrapper + * arrives in Commit 1 alongside the new routes; this is the minimal + * shape needed to gate the existing /internal/grant + /internal/revoke + * without copy-pasting the gen check. + */ +function checkInternalAuth(req: Request): Response | null { + const auth = req.headers.get('authorization'); + if (auth !== `Bearer ${INTERNAL_TOKEN}`) { + return new Response('forbidden', { status: 403 }); + } + const headerGen = req.headers.get('x-browse-gen'); + if (headerGen && headerGen !== CURRENT_GEN) { + return new Response('stale generation', { status: 409 }); + } + return null; +} + function buildServer() { return Bun.serve({ hostname: '127.0.0.1', @@ -212,10 +242,8 @@ function buildServer() { // /internal/grant — loopback-only handshake from parent server. if (url.pathname === '/internal/grant' && req.method === 'POST') { - const auth = req.headers.get('authorization'); - if (auth !== `Bearer ${INTERNAL_TOKEN}`) { - return new Response('forbidden', { status: 403 }); - } + const denied = checkInternalAuth(req); + if (denied) return denied; return req.json().then((body: any) => { if (typeof body?.token === 'string' && body.token.length > 16) { validTokens.add(body.token); @@ -226,16 +254,28 @@ function buildServer() { // /internal/revoke — drop a token (called on WS close or bootstrap reload) if (url.pathname === '/internal/revoke' && req.method === 'POST') { - const auth = req.headers.get('authorization'); - if (auth !== `Bearer ${INTERNAL_TOKEN}`) { - return new Response('forbidden', { status: 403 }); - } + const denied = checkInternalAuth(req); + if (denied) return denied; return req.json().then((body: any) => { if (typeof body?.token === 'string') validTokens.delete(body.token); return new Response('ok'); }).catch(() => new Response('bad', { status: 400 })); } + // /internal/healthz — liveness probe used by the v1.44 watchdog. + // Returns this agent's pid + gen + active session count without + // touching claude binary lookup (which can fail for non-process + // reasons and isn't a useful liveness signal). + if (url.pathname === '/internal/healthz' && req.method === 'GET') { + const denied = checkInternalAuth(req); + if (denied) return denied; + return new Response(JSON.stringify({ + pid: process.pid, + gen: CURRENT_GEN, + sessions: validTokens.size, + }), { status: 200, headers: { 'Content-Type': 'application/json' } }); + } + // /claude-available — bootstrap card hits this when user clicks "I installed it". if (url.pathname === '/claude-available' && req.method === 'GET') { writeClaudeAvailable(); @@ -548,14 +588,25 @@ function main() { writeSecureFile(tmp, String(port)); fs.renameSync(tmp, PORT_FILE); + // Write identity-based agent record (pid + per-boot gen). Replaces the + // v1.43- `pkill -f terminal-agent\.ts` regex teardown that could kill + // sibling gstack sessions. Callers (cli.ts spawn site, server.ts + // shutdown, the v1.44 watchdog) now route through killAgentByRecord in + // terminal-agent-control.ts. + writeAgentRecord(dir, { pid: process.pid, gen: CURRENT_GEN, startedAt: Date.now() }); + // Hand the parent the internal token so it can call /internal/grant. // Parent learns INTERNAL_TOKEN via env (TERMINAL_AGENT_INTERNAL_TOKEN below). // We just print it on stdout for the supervising process to pick up if it's // not already in env. Defense against env races at spawn time. - console.log(`[terminal-agent] listening on 127.0.0.1:${port} pid=${process.pid}`); + console.log(`[terminal-agent] listening on 127.0.0.1:${port} pid=${process.pid} gen=${CURRENT_GEN}`); - // Cleanup port file on exit. - const cleanup = () => { safeUnlink(PORT_FILE); process.exit(0); }; + // Cleanup port file + agent record on exit. + const cleanup = () => { + safeUnlink(PORT_FILE); + clearAgentRecord(dir); + process.exit(0); + }; process.on('SIGTERM', cleanup); process.on('SIGINT', cleanup); } diff --git a/browse/test/server-embedder-terminal-port.test.ts b/browse/test/server-embedder-terminal-port.test.ts index 722a331d8..f24ee3510 100644 --- a/browse/test/server-embedder-terminal-port.test.ts +++ b/browse/test/server-embedder-terminal-port.test.ts @@ -14,21 +14,35 @@ import { resolveConfig } from '../src/config'; // Tests for the v1.41+ ownsTerminalAgent flag. // // Embedders (gbrowser phoenix overlay) that run their own PTY server and write -// terminal-port / terminal-internal-token themselves were getting those files -// clobbered by gstack's shutdown(). The flag (default true) gates three side -// effects: pkill -f terminal-agent\.ts, unlink terminal-port, unlink -// terminal-internal-token. False = embedder owns them, gstack stays hands-off. +// terminal-port / terminal-internal-token / terminal-agent-pid themselves were +// getting those files clobbered by gstack's shutdown(). The flag (default true) +// gates four side effects (v1.44+): +// 1. identity-based kill of the PID in /terminal-agent-pid +// 2. unlink terminal-port +// 3. unlink terminal-internal-token +// 4. unlink terminal-agent-pid +// False = embedder owns them, gstack stays hands-off. // -// CRITICAL: each test stubs BOTH process.exit (so shutdown's exit doesn't kill -// the test runner) AND child_process.spawnSync (so pkill doesn't run real -// `pkill -f terminal-agent\.ts` on the developer's machine — would kill any -// sibling gstack sessions). +// Pre-v1.44 used `pkill -f terminal-agent\.ts` which matched sibling gstack +// sessions on the same host — see browse/src/terminal-agent-control.ts header. +// +// CRITICAL: each test stubs process.exit (so shutdown's exit doesn't kill +// the test runner). The PID in the test agent-record is a guaranteed-dead +// PID (1 = init / launchd — exists but cannot be killed by an unprivileged +// process, so safeKill returns ESRCH-equivalent without affecting anything). +// Use isProcessAlive's false branch by also testing with a PID that does +// not exist (negative PID rejected by the OS). const stateDir = resolveConfig().stateDir; const PORT_FILE = path.join(stateDir, 'terminal-port'); const TOKEN_FILE = path.join(stateDir, 'terminal-internal-token'); +const AGENT_RECORD_FILE = path.join(stateDir, 'terminal-agent-pid'); const SENTINEL_PORT = 'sentinel-port-65432'; const SENTINEL_TOKEN = 'sentinel-token-abcdef1234567890'; +// PID 2^31-1 is the Linux PID_MAX_LIMIT; macOS uses 99998. Either way, no +// real process will ever hold this PID on a developer machine. isProcessAlive +// returns false → killAgentByRecord no-ops without sending any signal. +const SENTINEL_DEAD_PID = 2147483646; function makeMinimalConfig(overrides: Partial = {}): ServerConfig { const token = 'embedder-test-' + crypto.randomBytes(16).toString('hex'); @@ -47,6 +61,10 @@ function writeSentinels(): void { fs.mkdirSync(stateDir, { recursive: true }); fs.writeFileSync(PORT_FILE, SENTINEL_PORT); fs.writeFileSync(TOKEN_FILE, SENTINEL_TOKEN); + fs.writeFileSync( + AGENT_RECORD_FILE, + JSON.stringify({ pid: SENTINEL_DEAD_PID, gen: 'sentinel-gen', startedAt: Date.now() }), + ); } function readIfExists(p: string): string | null { @@ -54,32 +72,40 @@ function readIfExists(p: string): string | null { } /** - * Stubs process.exit + child_process.spawnSync, runs the callback, and - * restores both regardless of throw. Returns the captured spawnSync argv - * list so callers can assert pkill was or wasn't invoked. The callback - * is expected to swallow the __exit:N throw from shutdown(). + * Stubs process.exit so shutdown()'s process.exit(0) throws an __exit:N + * marker the test can swallow instead of killing the runner. Also stubs + * process.kill so an accidental kill (regression in killAgentByRecord + * that bypassed isProcessAlive) cannot reach a real PID on the developer + * machine. Returns the captured kill calls so tests can assert kill + * scope. */ async function withStubs( - cb: (spawnSyncCalls: any[][]) => Promise -): Promise { + cb: (killCalls: Array<[number, NodeJS.Signals | number]>) => Promise +): Promise> { const origExit = process.exit; - const childProcess = require('child_process'); - const origSpawnSync = childProcess.spawnSync; - const spawnSyncCalls: any[][] = []; + const origKill = process.kill; + const killCalls: Array<[number, NodeJS.Signals | number]> = []; (process as any).exit = ((code: number) => { throw new Error(`__exit:${code}`); }) as any; - childProcess.spawnSync = ((...args: any[]) => { - spawnSyncCalls.push(args); - return { status: 0, stdout: '', stderr: '', signal: null, pid: 0, output: [] }; + (process as any).kill = ((pid: number, signal: NodeJS.Signals | number) => { + killCalls.push([pid, signal ?? 'SIGTERM']); + // signal 0 is a liveness probe — keep the existing 'process is dead' + // semantics so isProcessAlive(SENTINEL_DEAD_PID) returns false. + if (signal === 0) { + const err: any = new Error('No such process'); + err.code = 'ESRCH'; + throw err; + } + return true; }) as any; try { - await cb(spawnSyncCalls); + await cb(killCalls); } finally { (process as any).exit = origExit; - childProcess.spawnSync = origSpawnSync; + (process as any).kill = origKill; } - return spawnSyncCalls; + return killCalls; } async function runShutdown(handle: { shutdown: (code?: number) => Promise }): Promise { @@ -90,23 +116,28 @@ async function runShutdown(handle: { shutdown: (code?: number) => Promise } } -function pkillCalls(calls: any[][]): any[][] { - return calls.filter((call) => call[0] === 'pkill'); +// Filter out the signal=0 liveness probes; only count actual termination signals. +function terminationCalls( + calls: Array<[number, NodeJS.Signals | number]>, +): Array<[number, NodeJS.Signals | number]> { + return calls.filter(([, sig]) => sig !== 0); } describe('buildFetchHandler ownsTerminalAgent gate', () => { // shutdown() reads `path.dirname(config.stateFile)` from module-level config // (composition gap — see TODOS T9). So unlinks target the real state dir, // not a per-test temp dir. If a real gstack daemon is running on this host, - // its terminal-port + terminal-internal-token live where this test writes. - // Save + restore real-daemon file contents around the whole suite so the - // test never clobbers a developer's running session. + // its terminal-port + terminal-internal-token + terminal-agent-pid live + // where this test writes. Save + restore real-daemon file contents around + // the whole suite so the test never clobbers a developer's running session. let realPortBackup: string | null = null; let realTokenBackup: string | null = null; + let realAgentRecordBackup: string | null = null; beforeAll(() => { realPortBackup = readIfExists(PORT_FILE); realTokenBackup = readIfExists(TOKEN_FILE); + realAgentRecordBackup = readIfExists(AGENT_RECORD_FILE); }); afterAll(() => { @@ -122,6 +153,12 @@ describe('buildFetchHandler ownsTerminalAgent gate', () => { } else { try { fs.unlinkSync(TOKEN_FILE); } catch {} } + if (realAgentRecordBackup !== null) { + fs.mkdirSync(stateDir, { recursive: true }); + fs.writeFileSync(AGENT_RECORD_FILE, realAgentRecordBackup); + } else { + try { fs.unlinkSync(AGENT_RECORD_FILE); } catch {} + } }); beforeEach(() => { @@ -131,9 +168,10 @@ describe('buildFetchHandler ownsTerminalAgent gate', () => { // assertion can't pass spuriously off a stale file. try { fs.unlinkSync(PORT_FILE); } catch {} try { fs.unlinkSync(TOKEN_FILE); } catch {} + try { fs.unlinkSync(AGENT_RECORD_FILE); } catch {} }); - test('1. ownsTerminalAgent:false preserves both files and skips pkill', async () => { + test('1. ownsTerminalAgent:false preserves all three files and sends no signal', async () => { writeSentinels(); const handle = buildFetchHandler(makeMinimalConfig({ ownsTerminalAgent: false })); const calls = await withStubs(async () => { @@ -141,10 +179,11 @@ describe('buildFetchHandler ownsTerminalAgent gate', () => { }); expect(readIfExists(PORT_FILE)).toBe(SENTINEL_PORT); expect(readIfExists(TOKEN_FILE)).toBe(SENTINEL_TOKEN); - expect(pkillCalls(calls).length).toBe(0); + expect(readIfExists(AGENT_RECORD_FILE)).not.toBeNull(); + expect(terminationCalls(calls).length).toBe(0); }); - test('2. ownsTerminalAgent:true (explicit) deletes both files and invokes pkill exactly once', async () => { + test('2. ownsTerminalAgent:true deletes all three files; identity-based kill probes the recorded PID', async () => { writeSentinels(); const handle = buildFetchHandler(makeMinimalConfig({ ownsTerminalAgent: true })); const calls = await withStubs(async () => { @@ -152,13 +191,15 @@ describe('buildFetchHandler ownsTerminalAgent gate', () => { }); expect(readIfExists(PORT_FILE)).toBeNull(); expect(readIfExists(TOKEN_FILE)).toBeNull(); - const pkills = pkillCalls(calls); - expect(pkills.length).toBe(1); - // argv[1] is the args array passed to spawnSync. - expect(pkills[0][1]).toEqual(['-f', 'terminal-agent\\.ts']); + expect(readIfExists(AGENT_RECORD_FILE)).toBeNull(); + // isProcessAlive sends signal 0; PID is the sentinel-dead PID, so the + // probe returns false and no SIGTERM is sent. + const probes = calls.filter(([pid, sig]) => pid === SENTINEL_DEAD_PID && sig === 0); + expect(probes.length).toBeGreaterThan(0); + expect(terminationCalls(calls).length).toBe(0); }); - test('3. ownsTerminalAgent unset defaults to true (deletes + pkill)', async () => { + test('3. ownsTerminalAgent unset defaults to true (deletes all three; probes recorded PID)', async () => { writeSentinels(); // Note: no ownsTerminalAgent in the overrides — uses the `?? true` default. const handle = buildFetchHandler(makeMinimalConfig()); @@ -167,7 +208,9 @@ describe('buildFetchHandler ownsTerminalAgent gate', () => { }); expect(readIfExists(PORT_FILE)).toBeNull(); expect(readIfExists(TOKEN_FILE)).toBeNull(); - expect(pkillCalls(calls).length).toBe(1); + expect(readIfExists(AGENT_RECORD_FILE)).toBeNull(); + const probes = calls.filter(([pid, sig]) => pid === SENTINEL_DEAD_PID && sig === 0); + expect(probes.length).toBeGreaterThan(0); }); test('4. CLI start() call site passes ownsTerminalAgent: true literally (static grep)', () => { diff --git a/browse/test/terminal-agent-pid-identity.test.ts b/browse/test/terminal-agent-pid-identity.test.ts new file mode 100644 index 000000000..52503fe2e --- /dev/null +++ b/browse/test/terminal-agent-pid-identity.test.ts @@ -0,0 +1,161 @@ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + readAgentRecord, + writeAgentRecord, + clearAgentRecord, + killAgentByRecord, + agentRecordPath, + type AgentRecord, +} from '../src/terminal-agent-control'; + +// REGRESSION TEST for the v1.44 PID-identity migration. +// +// Pre-v1.44, both `cli.ts` and `server.ts` killed the terminal-agent with +// `spawnSync('pkill', ['-f', 'terminal-agent\\.ts'])`. That command matches +// by argv regex — any process whose command line contains the string +// `terminal-agent.ts` got SIGTERM'd. In practice this killed: +// +// * sibling gstack sessions on the same host +// * editor processes (vim, code, less) that had the file open +// * any second gstack run on the host +// +// The v1.44 migration replaces both kill sites with identity-based PID kill +// against the record written at `/terminal-agent-pid` by the +// agent's own boot path. This test is the static-grep tripwire that prevents +// reintroducing the regex teardown anywhere in the source tree. +// +// Pattern mirrors browse/test/server-embedder-terminal-port.test.ts (Test 4) +// and browse/test/server-sanitize-surrogates.test.ts: read source files +// directly, assert an invariant on their contents. + +const SRC_DIR = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src'); + +function readAllSourceFiles(): Array<{ file: string; content: string }> { + const out: Array<{ file: string; content: string }> = []; + for (const entry of fs.readdirSync(SRC_DIR)) { + if (!entry.endsWith('.ts')) continue; + const full = path.join(SRC_DIR, entry); + out.push({ file: entry, content: fs.readFileSync(full, 'utf-8') }); + } + return out; +} + +describe('terminal-agent PID identity (v1.44+)', () => { + test('1. no source file calls `pkill -f terminal-agent`', () => { + // The regex matches both `pkill -f terminal-agent\.ts` (escaped form + // used in spawnSync args) and `pkill -f terminal-agent.ts` (literal), + // since the dot is the only difference and both are footguns. + const offenders: string[] = []; + for (const { file, content } of readAllSourceFiles()) { + // Walk line by line so we can skip comments that mention the historical + // pattern (acceptable as documentation, not as code). + const lines = content.split('\n'); + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (!/pkill/.test(line)) continue; + if (!/terminal-agent/.test(line)) continue; + // Skip comment lines — historical mentions in JSDoc are fine. + const trimmed = line.trim(); + if (trimmed.startsWith('//') || trimmed.startsWith('*') || trimmed.startsWith('/*')) continue; + offenders.push(`${file}:${i + 1}: ${trimmed}`); + } + } + expect(offenders).toEqual([]); + }); + + test('2. neither cli.ts nor server.ts calls spawnSync with pkill', () => { + // Tighter check — even if someone routes through a different code path, + // any spawnSync('pkill', ...) anywhere in src/ is the smell. + const offenders: string[] = []; + for (const { file, content } of readAllSourceFiles()) { + if (/spawnSync\s*\(\s*['"]pkill['"]/.test(content)) { + offenders.push(file); + } + } + expect(offenders).toEqual([]); + }); + + test('3. readAgentRecord round-trips writeAgentRecord', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-pid-id-')); + try { + const record: AgentRecord = { + pid: 12345, + gen: 'test-gen-abcdef', + startedAt: Date.now(), + }; + writeAgentRecord(tmpDir, record); + const read = readAgentRecord(tmpDir); + expect(read).toEqual(record); + expect(fs.existsSync(agentRecordPath(tmpDir))).toBe(true); + + clearAgentRecord(tmpDir); + expect(readAgentRecord(tmpDir)).toBeNull(); + expect(fs.existsSync(agentRecordPath(tmpDir))).toBe(false); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }); + + test('4. readAgentRecord returns null on missing or malformed file', () => { + const tmpDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'gstack-pid-id-')); + try { + // Missing. + expect(readAgentRecord(tmpDir)).toBeNull(); + + // Malformed: wrong type for pid. + fs.writeFileSync(agentRecordPath(tmpDir), JSON.stringify({ pid: 'not-a-number', gen: 'x', startedAt: 0 })); + expect(readAgentRecord(tmpDir)).toBeNull(); + + // Malformed: not JSON. + fs.writeFileSync(agentRecordPath(tmpDir), 'definitely not json'); + expect(readAgentRecord(tmpDir)).toBeNull(); + + // Missing field. + fs.writeFileSync(agentRecordPath(tmpDir), JSON.stringify({ pid: 1, gen: 'x' })); + expect(readAgentRecord(tmpDir)).toBeNull(); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }); + + test('5. killAgentByRecord returns false for a dead PID and never throws', () => { + // PID 2147483646 is below Linux PID_MAX_LIMIT but way above macOS's + // typical max — no real process will ever hold it. isProcessAlive + // returns false; killAgentByRecord no-ops. + const record: AgentRecord = { + pid: 2147483646, + gen: 'sentinel', + startedAt: Date.now(), + }; + const result = killAgentByRecord(record, 'SIGTERM'); + expect(result).toBe(false); + }); + + test('6. killAgentByRecord skips the kill when isProcessAlive is false', () => { + // Guard via process.kill stub: confirm killAgentByRecord does NOT call + // process.kill with a non-zero signal when the PID is dead. This is the + // belt-and-suspenders defense against PID-reuse: even if isProcessAlive + // changes implementation, killAgentByRecord must validate liveness first. + const origKill = process.kill; + const kills: Array<[number, NodeJS.Signals | number]> = []; + (process as any).kill = ((pid: number, sig: NodeJS.Signals | number) => { + kills.push([pid, sig ?? 'SIGTERM']); + if (sig === 0) { + const err: any = new Error('ESRCH'); + err.code = 'ESRCH'; + throw err; + } + return true; + }) as any; + try { + const record: AgentRecord = { pid: 9999999, gen: 'x', startedAt: Date.now() }; + killAgentByRecord(record, 'SIGTERM'); + const terminations = kills.filter(([, s]) => s !== 0); + expect(terminations).toEqual([]); + } finally { + (process as any).kill = origKill; + } + }); +});