From d8751e91dfdfad27e4765d12959fa807e29b2264 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 23 May 2026 23:09:23 -0700 Subject: [PATCH] feat(terminal-agent): 25s WS keepalive ping/pong + client keepalive frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PTY connections were dying silently after NAT idle timeouts (30-60s on most home routers, even shorter on some carrier-grade NAT) and Chrome MV3 panel suspension. Neither side noticed until the user's next keystroke produced no output. Both sides now drive a 25s keepalive cycle. Server side (browse/src/terminal-agent.ts): * New ws.open handler constructs the PtySession eagerly and starts a setInterval that sends `{type:"ping",ts:Date.now()}` every 25s. Interval handle stored on session.pingInterval so close() can clear it. * PtySession.pingInterval field added; cleared in ws.close before disposeSession runs. Prevents timer leak across reconnects. * Message handler accepts `{type:"ping"|"pong"|"keepalive"}` silently — keepalive frames are a liveness signal at the TCP layer, no state to update. Existing resize/tabSwitch/tabState handling unchanged. * GSTACK_PTY_KEEPALIVE_INTERVAL_MS env knob (default 25000) lets the upcoming e2e tests compress idle assertions without 30s waits. Client side (extension/sidepanel-terminal.js): * Belt-and-suspenders: client also runs a 25s setInterval that sends `{type:"keepalive"}`. Defends against Chrome pausing our timers if the server-side ping ever gets dropped (rare but possible in MV3). * Ping reply: on `{type:"ping",ts}` from the server, immediately send `{type:"pong",ts}`. Lets the agent observe round-trip latency for free and confirms the channel is bidirectional. * Interval cleared in three teardown paths: ws.close handler, teardown(), forceRestart(). Three paths exist because the sidebar can exit the LIVE state through any of them; all three must clean up or we leak timers across reconnects. Test (browse/test/terminal-agent-keepalive.test.ts): * Static-grep tripwires for the 7-point protocol contract: agent has a configurable interval, open() starts the ping, close() clears it, message handler accepts keepalive vocabulary, client sends keepalive + replies pong, and all three client teardown paths clear the timer. * Wire-level tests (actually observe a ping after 25s) belong in the e2e tier — adding them here would either flake on slow CI or require a real Bun.serve listener per test which we don't want to pay for in the free tier. Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/src/terminal-agent.ts | 69 +++++++++++++++- browse/test/terminal-agent-keepalive.test.ts | 86 ++++++++++++++++++++ extension/sidepanel-terminal.js | 41 +++++++++- 3 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 browse/test/terminal-agent-keepalive.test.ts diff --git a/browse/src/terminal-agent.ts b/browse/src/terminal-agent.ts index 1d54a1a63..9e128d924 100644 --- a/browse/src/terminal-agent.ts +++ b/browse/src/terminal-agent.ts @@ -61,8 +61,29 @@ interface PtySession { rows: number; cookie: string; spawned: boolean; + /** + * 25s server-side WS keepalive interval (v1.44+). Set in the WS `open` + * handler, cleared in `close`. We send `{type:"ping",ts}` text frames so + * NAT boxes, proxies, and Chrome's MV3 panel-suspend heuristics see the + * connection as active; the client either replies with `{type:"pong"}` + * or fires its own 25s `{type:"keepalive"}` cycle. Either path keeps + * the underlying TCP from being silently dropped. + */ + pingInterval: ReturnType | null; } +/** + * WS keepalive interval. 25s is comfortably under the lowest common NAT + * idle timeout (typically 30-60s) and shorter than Chromium's WebSocket + * dead-peer threshold. Test-overridable via env so the v1.44 e2e tests + * can compress idle-window assertions to <1s without waiting half a + * minute per assertion. + */ +const KEEPALIVE_INTERVAL_MS = parseInt( + process.env.GSTACK_PTY_KEEPALIVE_INTERVAL_MS || '25000', + 10, +); + const sessions = new WeakMap(); // ws -> session /** Find claude on PATH. */ @@ -388,22 +409,52 @@ function buildServer() { }, websocket: { + /** + * Start the 25s server-side keepalive ping. Stored on the session so + * close() can clear it. We construct the PtySession here (not lazily + * in message()) so the ping interval has somewhere to live before any + * data flows. Lazy claude spawn still happens on first binary frame. + */ + open(ws) { + const session: PtySession = { + proc: null, + cols: 80, + rows: 24, + cookie: (ws.data as any)?.cookie || '', + spawned: false, + pingInterval: null, + }; + session.pingInterval = setInterval(() => { + try { + ws.send(JSON.stringify({ type: 'ping', ts: Date.now() })); + } catch { + // ws likely closed mid-tick; close handler clears the interval. + } + }, KEEPALIVE_INTERVAL_MS); + sessions.set(ws, session); + }, + message(ws, raw) { let session = sessions.get(ws); if (!session) { + // Fallback for any path where open() didn't fire (shouldn't happen + // in Bun.serve but keeps the spawn path safe). No keepalive on + // this branch — open() is the supported entry point. session = { proc: null, cols: 80, rows: 24, cookie: (ws.data as any)?.cookie || '', spawned: false, + pingInterval: null, }; sessions.set(ws, session); } - // Text frames are control messages: {type: "resize", cols, rows} or - // {type: "tabSwitch", tabId, url, title}. Binary frames are raw input - // bytes destined for the PTY stdin. + // Text frames are control messages: {type: "resize", cols, rows}, + // {type: "tabSwitch", tabId, url, title}, {type: "tabState", ...}, + // or v1.44 keepalive frames: {type: "pong", ts}, {type: "keepalive"}. + // Binary frames are raw input bytes destined for the PTY stdin. if (typeof raw === 'string') { let msg: any; try { msg = JSON.parse(raw); } catch { return; } @@ -423,6 +474,14 @@ function buildServer() { handleTabState(msg); return; } + if (msg?.type === 'pong' || msg?.type === 'keepalive' || msg?.type === 'ping') { + // Keepalive frames — accepted and silently dropped. The mere + // fact that the WS carried this frame is the liveness signal; + // there's no application-level state to update at this layer. + // `ping` is acknowledged here too in case the client (or a + // future agent peer) mirrors our server-side ping shape. + return; + } // Unknown text frame — ignore. return; } @@ -480,6 +539,10 @@ function buildServer() { close(ws) { const session = sessions.get(ws); if (session) { + if (session.pingInterval) { + clearInterval(session.pingInterval); + session.pingInterval = null; + } disposeSession(session); if (session.cookie) { // Drop the cookie so it can't be replayed against a new PTY. diff --git a/browse/test/terminal-agent-keepalive.test.ts b/browse/test/terminal-agent-keepalive.test.ts new file mode 100644 index 000000000..785147f2a --- /dev/null +++ b/browse/test/terminal-agent-keepalive.test.ts @@ -0,0 +1,86 @@ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +// v1.44 WS keepalive — static-grep invariants for the protocol contract. +// +// terminal-agent.ts and sidepanel-terminal.js cooperate on a 25s ping/pong + +// keepalive cycle so long-idle PTY connections survive NAT idle timeouts and +// Chromium's MV3 panel suspension heuristics. The wiring is invisible to +// integration tests (you'd have to wait 25s to observe a ping) but trivially +// regressed by a refactor. These tests fail CI if either side stops sending +// or stops accepting the protocol frames. + +const AGENT_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent.ts'); +const CLIENT_JS = path.resolve(new URL(import.meta.url).pathname, '..', '..', '..', 'extension', 'sidepanel-terminal.js'); + +describe('terminal-agent WS keepalive (v1.44+)', () => { + test('1. agent has a KEEPALIVE_INTERVAL_MS env knob, default 25000', () => { + const src = fs.readFileSync(AGENT_TS, 'utf-8'); + expect(src).toContain('GSTACK_PTY_KEEPALIVE_INTERVAL_MS'); + expect(src).toMatch(/KEEPALIVE_INTERVAL_MS\s*=\s*parseInt\(/); + // Default constant present so the env knob has a fallback. + expect(src).toContain("'25000'"); + }); + + test('2. WS open handler starts a ping interval on the session', () => { + const src = fs.readFileSync(AGENT_TS, 'utf-8'); + // The open(ws) handler in the websocket: { ... } block must call + // setInterval to drive the ping cadence and store the handle. + const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState'); + expect(wsBlock).toMatch(/open\s*\(\s*ws\s*\)/); + expect(wsBlock).toContain('setInterval'); + expect(wsBlock).toContain("type: 'ping'"); + expect(wsBlock).toContain('pingInterval'); + }); + + test('3. WS close handler clears the ping interval', () => { + const src = fs.readFileSync(AGENT_TS, 'utf-8'); + const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState'); + // close(ws) MUST clearInterval the pingInterval — otherwise we leak + // timers across reconnects and the ping handler captures a dead ws ref. + expect(wsBlock).toMatch(/close\s*\(\s*ws\s*\)/); + expect(wsBlock).toContain('clearInterval(session.pingInterval)'); + }); + + test('4. message handler accepts pong / keepalive frames silently', () => { + const src = fs.readFileSync(AGENT_TS, 'utf-8'); + // The text-frame router must recognize the keepalive vocabulary — + // if a future refactor strips this branch, unknown-text-frame + // suppression would still drop them but we lose intent. + expect(src).toMatch(/msg\?\.type === 'pong'/); + expect(src).toMatch(/msg\?\.type === 'keepalive'/); + }); + + test('5. client sends keepalive every 25s on ws.open', () => { + const src = fs.readFileSync(CLIENT_JS, 'utf-8'); + expect(src).toContain('keepaliveInterval'); + expect(src).toMatch(/setInterval\(/); + expect(src).toContain("type: 'keepalive'"); + expect(src).toContain('KEEPALIVE_INTERVAL_MS = 25000'); + }); + + test('6. client replies pong to server ping', () => { + const src = fs.readFileSync(CLIENT_JS, 'utf-8'); + // The ws.message handler must short-circuit on msg.type === 'ping' + // and reply with {type: 'pong', ts: msg.ts}. + expect(src).toMatch(/msg\.type === 'ping'/); + expect(src).toMatch(/type: 'pong'/); + }); + + test('7. client clears keepalive in close + teardown + forceRestart', () => { + const src = fs.readFileSync(CLIENT_JS, 'utf-8'); + // Three teardown paths exist; all three must drop the interval to + // avoid leaking timers across reconnect attempts. + const occurrences = (src.match(/clearInterval\(keepaliveInterval\)/g) || []).length; + expect(occurrences).toBeGreaterThanOrEqual(3); + }); +}); + +function sliceBetween(source: string, start: string, end: string): string { + const i = source.indexOf(start); + if (i === -1) throw new Error(`marker not found: ${start}`); + const j = source.indexOf(end, i + start.length); + if (j === -1) throw new Error(`end marker not found: ${end}`); + return source.slice(i, j); +} diff --git a/extension/sidepanel-terminal.js b/extension/sidepanel-terminal.js index 4ac0065d0..5e5835d5e 100644 --- a/extension/sidepanel-terminal.js +++ b/extension/sidepanel-terminal.js @@ -48,6 +48,16 @@ let term = null; let fitAddon = null; let ws = null; + /** + * 25s client-side WS keepalive interval (v1.44+). Belt-and-suspenders with + * the server-side ping in terminal-agent.ts: server pings cover most + * idle-NAT cases, client keepalive frames also defend against Chromium's + * MV3-adjacent panel suspension heuristics that can pause our timers. + * Started on ws.open, cleared on ws.close. The agent silently accepts + * `{type:"keepalive"}` text frames. + */ + let keepaliveInterval = null; + const KEEPALIVE_INTERVAL_MS = 25000; function show(el) { el.style.display = ''; } function hide(el) { el.style.display = 'none'; } @@ -371,16 +381,33 @@ } catch {} // Send a single byte to nudge the agent to spawn claude (lazy-spawn trigger). try { ws.send(new TextEncoder().encode('\n')); } catch {} + // v1.44 client-side keepalive. Server pings every 25s; we ALSO send + // keepalive frames at the same cadence so a paused timer on either + // side still has the other to lean on. Both are silently dropped + // by the agent's message handler. + if (keepaliveInterval) clearInterval(keepaliveInterval); + keepaliveInterval = setInterval(() => { + if (!ws || ws.readyState !== WebSocket.OPEN) return; + try { ws.send(JSON.stringify({ type: 'keepalive' })); } catch {} + }, KEEPALIVE_INTERVAL_MS); }); ws.addEventListener('message', (ev) => { if (typeof ev.data === 'string') { - // Agent control message (rare). Treat as JSON; error frames carry code. + // Agent control message. Treat as JSON; error frames carry code, + // ping frames trigger an immediate pong reply. try { const msg = JSON.parse(ev.data); if (msg.type === 'error' && msg.code === 'CLAUDE_NOT_FOUND') { setState(STATE.NO_CLAUDE); try { ws.close(); } catch {} + return; + } + if (msg.type === 'ping') { + // Mirror the server's timestamp back. Cheap liveness ACK that + // lets the agent observe round-trip latency for free. + try { ws.send(JSON.stringify({ type: 'pong', ts: msg.ts })); } catch {} + return; } } catch {} return; @@ -392,6 +419,10 @@ ws.addEventListener('close', () => { ws = null; + if (keepaliveInterval) { + clearInterval(keepaliveInterval); + keepaliveInterval = null; + } if (state !== STATE.NO_CLAUDE) setState(STATE.ENDED); }); @@ -401,6 +432,10 @@ } function teardown() { + if (keepaliveInterval) { + clearInterval(keepaliveInterval); + keepaliveInterval = null; + } try { ws && ws.close(); } catch {} ws = null; if (term) { @@ -418,6 +453,10 @@ * IDLE, kick off auto-connect. Safe to call from any state. */ function forceRestart() { + if (keepaliveInterval) { + clearInterval(keepaliveInterval); + keepaliveInterval = null; + } try { ws && ws.close(); } catch {} ws = null; if (term) {