feat(terminal-agent): 25s WS keepalive ping/pong + client keepalive frames

PTY connections were dying silently after NAT idle timeouts (30-60s on most
home routers, even shorter on some carrier-grade NAT) and Chrome MV3 panel
suspension. Neither side noticed until the user's next keystroke produced
no output. Both sides now drive a 25s keepalive cycle.

Server side (browse/src/terminal-agent.ts):
  * New ws.open handler constructs the PtySession eagerly and starts a
    setInterval that sends `{type:"ping",ts:Date.now()}` every 25s.
    Interval handle stored on session.pingInterval so close() can clear it.
  * PtySession.pingInterval field added; cleared in ws.close before
    disposeSession runs. Prevents timer leak across reconnects.
  * Message handler accepts `{type:"ping"|"pong"|"keepalive"}` silently —
    keepalive frames are a liveness signal at the TCP layer, no state to
    update. Existing resize/tabSwitch/tabState handling unchanged.
  * GSTACK_PTY_KEEPALIVE_INTERVAL_MS env knob (default 25000) lets the
    upcoming e2e tests compress idle assertions without 30s waits.

Client side (extension/sidepanel-terminal.js):
  * Belt-and-suspenders: client also runs a 25s setInterval that sends
    `{type:"keepalive"}`. Defends against Chrome pausing our timers if
    the server-side ping ever gets dropped (rare but possible in MV3).
  * Ping reply: on `{type:"ping",ts}` from the server, immediately send
    `{type:"pong",ts}`. Lets the agent observe round-trip latency for
    free and confirms the channel is bidirectional.
  * Interval cleared in three teardown paths: ws.close handler,
    teardown(), forceRestart(). Three paths exist because the sidebar
    can exit the LIVE state through any of them; all three must clean up
    or we leak timers across reconnects.

Test (browse/test/terminal-agent-keepalive.test.ts):
  * Static-grep tripwires for the 7-point protocol contract: agent has
    a configurable interval, open() starts the ping, close() clears it,
    message handler accepts keepalive vocabulary, client sends keepalive
    + replies pong, and all three client teardown paths clear the timer.
  * Wire-level tests (actually observe a ping after 25s) belong in the
    e2e tier — adding them here would either flake on slow CI or require
    a real Bun.serve listener per test which we don't want to pay for
    in the free tier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan 2026-05-23 23:09:23 -07:00
parent 4462f0caa4
commit d8751e91df
No known key found for this signature in database
GPG Key ID: C1F69E85C74EFE1D
3 changed files with 192 additions and 4 deletions

View File

@ -61,8 +61,29 @@ interface PtySession {
rows: number;
cookie: string;
spawned: boolean;
/**
* 25s server-side WS keepalive interval (v1.44+). Set in the WS `open`
* handler, cleared in `close`. We send `{type:"ping",ts}` text frames so
* NAT boxes, proxies, and Chrome's MV3 panel-suspend heuristics see the
* connection as active; the client either replies with `{type:"pong"}`
* or fires its own 25s `{type:"keepalive"}` cycle. Either path keeps
* the underlying TCP from being silently dropped.
*/
pingInterval: ReturnType<typeof setInterval> | null;
}
/**
* WS keepalive interval. 25s is comfortably under the lowest common NAT
* idle timeout (typically 30-60s) and shorter than Chromium's WebSocket
* dead-peer threshold. Test-overridable via env so the v1.44 e2e tests
* can compress idle-window assertions to <1s without waiting half a
* minute per assertion.
*/
const KEEPALIVE_INTERVAL_MS = parseInt(
process.env.GSTACK_PTY_KEEPALIVE_INTERVAL_MS || '25000',
10,
);
const sessions = new WeakMap<any, PtySession>(); // ws -> session
/** Find claude on PATH. */
@ -388,22 +409,52 @@ function buildServer() {
},
websocket: {
/**
* Start the 25s server-side keepalive ping. Stored on the session so
* close() can clear it. We construct the PtySession here (not lazily
* in message()) so the ping interval has somewhere to live before any
* data flows. Lazy claude spawn still happens on first binary frame.
*/
open(ws) {
const session: PtySession = {
proc: null,
cols: 80,
rows: 24,
cookie: (ws.data as any)?.cookie || '',
spawned: false,
pingInterval: null,
};
session.pingInterval = setInterval(() => {
try {
ws.send(JSON.stringify({ type: 'ping', ts: Date.now() }));
} catch {
// ws likely closed mid-tick; close handler clears the interval.
}
}, KEEPALIVE_INTERVAL_MS);
sessions.set(ws, session);
},
message(ws, raw) {
let session = sessions.get(ws);
if (!session) {
// Fallback for any path where open() didn't fire (shouldn't happen
// in Bun.serve but keeps the spawn path safe). No keepalive on
// this branch — open() is the supported entry point.
session = {
proc: null,
cols: 80,
rows: 24,
cookie: (ws.data as any)?.cookie || '',
spawned: false,
pingInterval: null,
};
sessions.set(ws, session);
}
// Text frames are control messages: {type: "resize", cols, rows} or
// {type: "tabSwitch", tabId, url, title}. Binary frames are raw input
// bytes destined for the PTY stdin.
// Text frames are control messages: {type: "resize", cols, rows},
// {type: "tabSwitch", tabId, url, title}, {type: "tabState", ...},
// or v1.44 keepalive frames: {type: "pong", ts}, {type: "keepalive"}.
// Binary frames are raw input bytes destined for the PTY stdin.
if (typeof raw === 'string') {
let msg: any;
try { msg = JSON.parse(raw); } catch { return; }
@ -423,6 +474,14 @@ function buildServer() {
handleTabState(msg);
return;
}
if (msg?.type === 'pong' || msg?.type === 'keepalive' || msg?.type === 'ping') {
// Keepalive frames — accepted and silently dropped. The mere
// fact that the WS carried this frame is the liveness signal;
// there's no application-level state to update at this layer.
// `ping` is acknowledged here too in case the client (or a
// future agent peer) mirrors our server-side ping shape.
return;
}
// Unknown text frame — ignore.
return;
}
@ -480,6 +539,10 @@ function buildServer() {
close(ws) {
const session = sessions.get(ws);
if (session) {
if (session.pingInterval) {
clearInterval(session.pingInterval);
session.pingInterval = null;
}
disposeSession(session);
if (session.cookie) {
// Drop the cookie so it can't be replayed against a new PTY.

View File

@ -0,0 +1,86 @@
import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
// v1.44 WS keepalive — static-grep invariants for the protocol contract.
//
// terminal-agent.ts and sidepanel-terminal.js cooperate on a 25s ping/pong +
// keepalive cycle so long-idle PTY connections survive NAT idle timeouts and
// Chromium's MV3 panel suspension heuristics. The wiring is invisible to
// integration tests (you'd have to wait 25s to observe a ping) but trivially
// regressed by a refactor. These tests fail CI if either side stops sending
// or stops accepting the protocol frames.
const AGENT_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent.ts');
const CLIENT_JS = path.resolve(new URL(import.meta.url).pathname, '..', '..', '..', 'extension', 'sidepanel-terminal.js');
describe('terminal-agent WS keepalive (v1.44+)', () => {
test('1. agent has a KEEPALIVE_INTERVAL_MS env knob, default 25000', () => {
const src = fs.readFileSync(AGENT_TS, 'utf-8');
expect(src).toContain('GSTACK_PTY_KEEPALIVE_INTERVAL_MS');
expect(src).toMatch(/KEEPALIVE_INTERVAL_MS\s*=\s*parseInt\(/);
// Default constant present so the env knob has a fallback.
expect(src).toContain("'25000'");
});
test('2. WS open handler starts a ping interval on the session', () => {
const src = fs.readFileSync(AGENT_TS, 'utf-8');
// The open(ws) handler in the websocket: { ... } block must call
// setInterval to drive the ping cadence and store the handle.
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
expect(wsBlock).toMatch(/open\s*\(\s*ws\s*\)/);
expect(wsBlock).toContain('setInterval');
expect(wsBlock).toContain("type: 'ping'");
expect(wsBlock).toContain('pingInterval');
});
test('3. WS close handler clears the ping interval', () => {
const src = fs.readFileSync(AGENT_TS, 'utf-8');
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
// close(ws) MUST clearInterval the pingInterval — otherwise we leak
// timers across reconnects and the ping handler captures a dead ws ref.
expect(wsBlock).toMatch(/close\s*\(\s*ws\s*\)/);
expect(wsBlock).toContain('clearInterval(session.pingInterval)');
});
test('4. message handler accepts pong / keepalive frames silently', () => {
const src = fs.readFileSync(AGENT_TS, 'utf-8');
// The text-frame router must recognize the keepalive vocabulary —
// if a future refactor strips this branch, unknown-text-frame
// suppression would still drop them but we lose intent.
expect(src).toMatch(/msg\?\.type === 'pong'/);
expect(src).toMatch(/msg\?\.type === 'keepalive'/);
});
test('5. client sends keepalive every 25s on ws.open', () => {
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
expect(src).toContain('keepaliveInterval');
expect(src).toMatch(/setInterval\(/);
expect(src).toContain("type: 'keepalive'");
expect(src).toContain('KEEPALIVE_INTERVAL_MS = 25000');
});
test('6. client replies pong to server ping', () => {
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
// The ws.message handler must short-circuit on msg.type === 'ping'
// and reply with {type: 'pong', ts: msg.ts}.
expect(src).toMatch(/msg\.type === 'ping'/);
expect(src).toMatch(/type: 'pong'/);
});
test('7. client clears keepalive in close + teardown + forceRestart', () => {
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
// Three teardown paths exist; all three must drop the interval to
// avoid leaking timers across reconnect attempts.
const occurrences = (src.match(/clearInterval\(keepaliveInterval\)/g) || []).length;
expect(occurrences).toBeGreaterThanOrEqual(3);
});
});
function sliceBetween(source: string, start: string, end: string): string {
const i = source.indexOf(start);
if (i === -1) throw new Error(`marker not found: ${start}`);
const j = source.indexOf(end, i + start.length);
if (j === -1) throw new Error(`end marker not found: ${end}`);
return source.slice(i, j);
}

View File

@ -48,6 +48,16 @@
let term = null;
let fitAddon = null;
let ws = null;
/**
* 25s client-side WS keepalive interval (v1.44+). Belt-and-suspenders with
* the server-side ping in terminal-agent.ts: server pings cover most
* idle-NAT cases, client keepalive frames also defend against Chromium's
* MV3-adjacent panel suspension heuristics that can pause our timers.
* Started on ws.open, cleared on ws.close. The agent silently accepts
* `{type:"keepalive"}` text frames.
*/
let keepaliveInterval = null;
const KEEPALIVE_INTERVAL_MS = 25000;
function show(el) { el.style.display = ''; }
function hide(el) { el.style.display = 'none'; }
@ -371,16 +381,33 @@
} catch {}
// Send a single byte to nudge the agent to spawn claude (lazy-spawn trigger).
try { ws.send(new TextEncoder().encode('\n')); } catch {}
// v1.44 client-side keepalive. Server pings every 25s; we ALSO send
// keepalive frames at the same cadence so a paused timer on either
// side still has the other to lean on. Both are silently dropped
// by the agent's message handler.
if (keepaliveInterval) clearInterval(keepaliveInterval);
keepaliveInterval = setInterval(() => {
if (!ws || ws.readyState !== WebSocket.OPEN) return;
try { ws.send(JSON.stringify({ type: 'keepalive' })); } catch {}
}, KEEPALIVE_INTERVAL_MS);
});
ws.addEventListener('message', (ev) => {
if (typeof ev.data === 'string') {
// Agent control message (rare). Treat as JSON; error frames carry code.
// Agent control message. Treat as JSON; error frames carry code,
// ping frames trigger an immediate pong reply.
try {
const msg = JSON.parse(ev.data);
if (msg.type === 'error' && msg.code === 'CLAUDE_NOT_FOUND') {
setState(STATE.NO_CLAUDE);
try { ws.close(); } catch {}
return;
}
if (msg.type === 'ping') {
// Mirror the server's timestamp back. Cheap liveness ACK that
// lets the agent observe round-trip latency for free.
try { ws.send(JSON.stringify({ type: 'pong', ts: msg.ts })); } catch {}
return;
}
} catch {}
return;
@ -392,6 +419,10 @@
ws.addEventListener('close', () => {
ws = null;
if (keepaliveInterval) {
clearInterval(keepaliveInterval);
keepaliveInterval = null;
}
if (state !== STATE.NO_CLAUDE) setState(STATE.ENDED);
});
@ -401,6 +432,10 @@
}
function teardown() {
if (keepaliveInterval) {
clearInterval(keepaliveInterval);
keepaliveInterval = null;
}
try { ws && ws.close(); } catch {}
ws = null;
if (term) {
@ -418,6 +453,10 @@
* IDLE, kick off auto-connect. Safe to call from any state.
*/
function forceRestart() {
if (keepaliveInterval) {
clearInterval(keepaliveInterval);
keepaliveInterval = null;
}
try { ws && ws.close(); } catch {}
ws = null;
if (term) {