mirror of https://github.com/garrytan/gstack.git
feat(terminal-agent): 25s WS keepalive ping/pong + client keepalive frames
PTY connections were dying silently after NAT idle timeouts (30-60s on most
home routers, even shorter on some carrier-grade NAT) and Chrome MV3 panel
suspension. Neither side noticed until the user's next keystroke produced
no output. Both sides now drive a 25s keepalive cycle.
Server side (browse/src/terminal-agent.ts):
* New ws.open handler constructs the PtySession eagerly and starts a
setInterval that sends `{type:"ping",ts:Date.now()}` every 25s.
Interval handle stored on session.pingInterval so close() can clear it.
* PtySession.pingInterval field added; cleared in ws.close before
disposeSession runs. Prevents timer leak across reconnects.
* Message handler accepts `{type:"ping"|"pong"|"keepalive"}` silently —
keepalive frames are a liveness signal at the TCP layer, no state to
update. Existing resize/tabSwitch/tabState handling unchanged.
* GSTACK_PTY_KEEPALIVE_INTERVAL_MS env knob (default 25000) lets the
upcoming e2e tests compress idle assertions without 30s waits.
Client side (extension/sidepanel-terminal.js):
* Belt-and-suspenders: client also runs a 25s setInterval that sends
`{type:"keepalive"}`. Defends against Chrome pausing our timers if
the server-side ping ever gets dropped (rare but possible in MV3).
* Ping reply: on `{type:"ping",ts}` from the server, immediately send
`{type:"pong",ts}`. Lets the agent observe round-trip latency for
free and confirms the channel is bidirectional.
* Interval cleared in three teardown paths: ws.close handler,
teardown(), forceRestart(). Three paths exist because the sidebar
can exit the LIVE state through any of them; all three must clean up
or we leak timers across reconnects.
Test (browse/test/terminal-agent-keepalive.test.ts):
* Static-grep tripwires for the 7-point protocol contract: agent has
a configurable interval, open() starts the ping, close() clears it,
message handler accepts keepalive vocabulary, client sends keepalive
+ replies pong, and all three client teardown paths clear the timer.
* Wire-level tests (actually observe a ping after 25s) belong in the
e2e tier — adding them here would either flake on slow CI or require
a real Bun.serve listener per test which we don't want to pay for
in the free tier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4462f0caa4
commit
d8751e91df
|
|
@ -61,8 +61,29 @@ interface PtySession {
|
||||||
rows: number;
|
rows: number;
|
||||||
cookie: string;
|
cookie: string;
|
||||||
spawned: boolean;
|
spawned: boolean;
|
||||||
|
/**
|
||||||
|
* 25s server-side WS keepalive interval (v1.44+). Set in the WS `open`
|
||||||
|
* handler, cleared in `close`. We send `{type:"ping",ts}` text frames so
|
||||||
|
* NAT boxes, proxies, and Chrome's MV3 panel-suspend heuristics see the
|
||||||
|
* connection as active; the client either replies with `{type:"pong"}`
|
||||||
|
* or fires its own 25s `{type:"keepalive"}` cycle. Either path keeps
|
||||||
|
* the underlying TCP from being silently dropped.
|
||||||
|
*/
|
||||||
|
pingInterval: ReturnType<typeof setInterval> | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* WS keepalive interval. 25s is comfortably under the lowest common NAT
|
||||||
|
* idle timeout (typically 30-60s) and shorter than Chromium's WebSocket
|
||||||
|
* dead-peer threshold. Test-overridable via env so the v1.44 e2e tests
|
||||||
|
* can compress idle-window assertions to <1s without waiting half a
|
||||||
|
* minute per assertion.
|
||||||
|
*/
|
||||||
|
const KEEPALIVE_INTERVAL_MS = parseInt(
|
||||||
|
process.env.GSTACK_PTY_KEEPALIVE_INTERVAL_MS || '25000',
|
||||||
|
10,
|
||||||
|
);
|
||||||
|
|
||||||
const sessions = new WeakMap<any, PtySession>(); // ws -> session
|
const sessions = new WeakMap<any, PtySession>(); // ws -> session
|
||||||
|
|
||||||
/** Find claude on PATH. */
|
/** Find claude on PATH. */
|
||||||
|
|
@ -388,22 +409,52 @@ function buildServer() {
|
||||||
},
|
},
|
||||||
|
|
||||||
websocket: {
|
websocket: {
|
||||||
|
/**
|
||||||
|
* Start the 25s server-side keepalive ping. Stored on the session so
|
||||||
|
* close() can clear it. We construct the PtySession here (not lazily
|
||||||
|
* in message()) so the ping interval has somewhere to live before any
|
||||||
|
* data flows. Lazy claude spawn still happens on first binary frame.
|
||||||
|
*/
|
||||||
|
open(ws) {
|
||||||
|
const session: PtySession = {
|
||||||
|
proc: null,
|
||||||
|
cols: 80,
|
||||||
|
rows: 24,
|
||||||
|
cookie: (ws.data as any)?.cookie || '',
|
||||||
|
spawned: false,
|
||||||
|
pingInterval: null,
|
||||||
|
};
|
||||||
|
session.pingInterval = setInterval(() => {
|
||||||
|
try {
|
||||||
|
ws.send(JSON.stringify({ type: 'ping', ts: Date.now() }));
|
||||||
|
} catch {
|
||||||
|
// ws likely closed mid-tick; close handler clears the interval.
|
||||||
|
}
|
||||||
|
}, KEEPALIVE_INTERVAL_MS);
|
||||||
|
sessions.set(ws, session);
|
||||||
|
},
|
||||||
|
|
||||||
message(ws, raw) {
|
message(ws, raw) {
|
||||||
let session = sessions.get(ws);
|
let session = sessions.get(ws);
|
||||||
if (!session) {
|
if (!session) {
|
||||||
|
// Fallback for any path where open() didn't fire (shouldn't happen
|
||||||
|
// in Bun.serve but keeps the spawn path safe). No keepalive on
|
||||||
|
// this branch — open() is the supported entry point.
|
||||||
session = {
|
session = {
|
||||||
proc: null,
|
proc: null,
|
||||||
cols: 80,
|
cols: 80,
|
||||||
rows: 24,
|
rows: 24,
|
||||||
cookie: (ws.data as any)?.cookie || '',
|
cookie: (ws.data as any)?.cookie || '',
|
||||||
spawned: false,
|
spawned: false,
|
||||||
|
pingInterval: null,
|
||||||
};
|
};
|
||||||
sessions.set(ws, session);
|
sessions.set(ws, session);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Text frames are control messages: {type: "resize", cols, rows} or
|
// Text frames are control messages: {type: "resize", cols, rows},
|
||||||
// {type: "tabSwitch", tabId, url, title}. Binary frames are raw input
|
// {type: "tabSwitch", tabId, url, title}, {type: "tabState", ...},
|
||||||
// bytes destined for the PTY stdin.
|
// or v1.44 keepalive frames: {type: "pong", ts}, {type: "keepalive"}.
|
||||||
|
// Binary frames are raw input bytes destined for the PTY stdin.
|
||||||
if (typeof raw === 'string') {
|
if (typeof raw === 'string') {
|
||||||
let msg: any;
|
let msg: any;
|
||||||
try { msg = JSON.parse(raw); } catch { return; }
|
try { msg = JSON.parse(raw); } catch { return; }
|
||||||
|
|
@ -423,6 +474,14 @@ function buildServer() {
|
||||||
handleTabState(msg);
|
handleTabState(msg);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (msg?.type === 'pong' || msg?.type === 'keepalive' || msg?.type === 'ping') {
|
||||||
|
// Keepalive frames — accepted and silently dropped. The mere
|
||||||
|
// fact that the WS carried this frame is the liveness signal;
|
||||||
|
// there's no application-level state to update at this layer.
|
||||||
|
// `ping` is acknowledged here too in case the client (or a
|
||||||
|
// future agent peer) mirrors our server-side ping shape.
|
||||||
|
return;
|
||||||
|
}
|
||||||
// Unknown text frame — ignore.
|
// Unknown text frame — ignore.
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -480,6 +539,10 @@ function buildServer() {
|
||||||
close(ws) {
|
close(ws) {
|
||||||
const session = sessions.get(ws);
|
const session = sessions.get(ws);
|
||||||
if (session) {
|
if (session) {
|
||||||
|
if (session.pingInterval) {
|
||||||
|
clearInterval(session.pingInterval);
|
||||||
|
session.pingInterval = null;
|
||||||
|
}
|
||||||
disposeSession(session);
|
disposeSession(session);
|
||||||
if (session.cookie) {
|
if (session.cookie) {
|
||||||
// Drop the cookie so it can't be replayed against a new PTY.
|
// Drop the cookie so it can't be replayed against a new PTY.
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,86 @@
|
||||||
|
import { describe, test, expect } from 'bun:test';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
|
||||||
|
// v1.44 WS keepalive — static-grep invariants for the protocol contract.
|
||||||
|
//
|
||||||
|
// terminal-agent.ts and sidepanel-terminal.js cooperate on a 25s ping/pong +
|
||||||
|
// keepalive cycle so long-idle PTY connections survive NAT idle timeouts and
|
||||||
|
// Chromium's MV3 panel suspension heuristics. The wiring is invisible to
|
||||||
|
// integration tests (you'd have to wait 25s to observe a ping) but trivially
|
||||||
|
// regressed by a refactor. These tests fail CI if either side stops sending
|
||||||
|
// or stops accepting the protocol frames.
|
||||||
|
|
||||||
|
const AGENT_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent.ts');
|
||||||
|
const CLIENT_JS = path.resolve(new URL(import.meta.url).pathname, '..', '..', '..', 'extension', 'sidepanel-terminal.js');
|
||||||
|
|
||||||
|
describe('terminal-agent WS keepalive (v1.44+)', () => {
|
||||||
|
test('1. agent has a KEEPALIVE_INTERVAL_MS env knob, default 25000', () => {
|
||||||
|
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||||
|
expect(src).toContain('GSTACK_PTY_KEEPALIVE_INTERVAL_MS');
|
||||||
|
expect(src).toMatch(/KEEPALIVE_INTERVAL_MS\s*=\s*parseInt\(/);
|
||||||
|
// Default constant present so the env knob has a fallback.
|
||||||
|
expect(src).toContain("'25000'");
|
||||||
|
});
|
||||||
|
|
||||||
|
test('2. WS open handler starts a ping interval on the session', () => {
|
||||||
|
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||||
|
// The open(ws) handler in the websocket: { ... } block must call
|
||||||
|
// setInterval to drive the ping cadence and store the handle.
|
||||||
|
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
|
||||||
|
expect(wsBlock).toMatch(/open\s*\(\s*ws\s*\)/);
|
||||||
|
expect(wsBlock).toContain('setInterval');
|
||||||
|
expect(wsBlock).toContain("type: 'ping'");
|
||||||
|
expect(wsBlock).toContain('pingInterval');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('3. WS close handler clears the ping interval', () => {
|
||||||
|
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||||
|
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
|
||||||
|
// close(ws) MUST clearInterval the pingInterval — otherwise we leak
|
||||||
|
// timers across reconnects and the ping handler captures a dead ws ref.
|
||||||
|
expect(wsBlock).toMatch(/close\s*\(\s*ws\s*\)/);
|
||||||
|
expect(wsBlock).toContain('clearInterval(session.pingInterval)');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('4. message handler accepts pong / keepalive frames silently', () => {
|
||||||
|
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||||
|
// The text-frame router must recognize the keepalive vocabulary —
|
||||||
|
// if a future refactor strips this branch, unknown-text-frame
|
||||||
|
// suppression would still drop them but we lose intent.
|
||||||
|
expect(src).toMatch(/msg\?\.type === 'pong'/);
|
||||||
|
expect(src).toMatch(/msg\?\.type === 'keepalive'/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('5. client sends keepalive every 25s on ws.open', () => {
|
||||||
|
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||||
|
expect(src).toContain('keepaliveInterval');
|
||||||
|
expect(src).toMatch(/setInterval\(/);
|
||||||
|
expect(src).toContain("type: 'keepalive'");
|
||||||
|
expect(src).toContain('KEEPALIVE_INTERVAL_MS = 25000');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('6. client replies pong to server ping', () => {
|
||||||
|
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||||
|
// The ws.message handler must short-circuit on msg.type === 'ping'
|
||||||
|
// and reply with {type: 'pong', ts: msg.ts}.
|
||||||
|
expect(src).toMatch(/msg\.type === 'ping'/);
|
||||||
|
expect(src).toMatch(/type: 'pong'/);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('7. client clears keepalive in close + teardown + forceRestart', () => {
|
||||||
|
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||||
|
// Three teardown paths exist; all three must drop the interval to
|
||||||
|
// avoid leaking timers across reconnect attempts.
|
||||||
|
const occurrences = (src.match(/clearInterval\(keepaliveInterval\)/g) || []).length;
|
||||||
|
expect(occurrences).toBeGreaterThanOrEqual(3);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
function sliceBetween(source: string, start: string, end: string): string {
|
||||||
|
const i = source.indexOf(start);
|
||||||
|
if (i === -1) throw new Error(`marker not found: ${start}`);
|
||||||
|
const j = source.indexOf(end, i + start.length);
|
||||||
|
if (j === -1) throw new Error(`end marker not found: ${end}`);
|
||||||
|
return source.slice(i, j);
|
||||||
|
}
|
||||||
|
|
@ -48,6 +48,16 @@
|
||||||
let term = null;
|
let term = null;
|
||||||
let fitAddon = null;
|
let fitAddon = null;
|
||||||
let ws = null;
|
let ws = null;
|
||||||
|
/**
|
||||||
|
* 25s client-side WS keepalive interval (v1.44+). Belt-and-suspenders with
|
||||||
|
* the server-side ping in terminal-agent.ts: server pings cover most
|
||||||
|
* idle-NAT cases, client keepalive frames also defend against Chromium's
|
||||||
|
* MV3-adjacent panel suspension heuristics that can pause our timers.
|
||||||
|
* Started on ws.open, cleared on ws.close. The agent silently accepts
|
||||||
|
* `{type:"keepalive"}` text frames.
|
||||||
|
*/
|
||||||
|
let keepaliveInterval = null;
|
||||||
|
const KEEPALIVE_INTERVAL_MS = 25000;
|
||||||
|
|
||||||
function show(el) { el.style.display = ''; }
|
function show(el) { el.style.display = ''; }
|
||||||
function hide(el) { el.style.display = 'none'; }
|
function hide(el) { el.style.display = 'none'; }
|
||||||
|
|
@ -371,16 +381,33 @@
|
||||||
} catch {}
|
} catch {}
|
||||||
// Send a single byte to nudge the agent to spawn claude (lazy-spawn trigger).
|
// Send a single byte to nudge the agent to spawn claude (lazy-spawn trigger).
|
||||||
try { ws.send(new TextEncoder().encode('\n')); } catch {}
|
try { ws.send(new TextEncoder().encode('\n')); } catch {}
|
||||||
|
// v1.44 client-side keepalive. Server pings every 25s; we ALSO send
|
||||||
|
// keepalive frames at the same cadence so a paused timer on either
|
||||||
|
// side still has the other to lean on. Both are silently dropped
|
||||||
|
// by the agent's message handler.
|
||||||
|
if (keepaliveInterval) clearInterval(keepaliveInterval);
|
||||||
|
keepaliveInterval = setInterval(() => {
|
||||||
|
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
||||||
|
try { ws.send(JSON.stringify({ type: 'keepalive' })); } catch {}
|
||||||
|
}, KEEPALIVE_INTERVAL_MS);
|
||||||
});
|
});
|
||||||
|
|
||||||
ws.addEventListener('message', (ev) => {
|
ws.addEventListener('message', (ev) => {
|
||||||
if (typeof ev.data === 'string') {
|
if (typeof ev.data === 'string') {
|
||||||
// Agent control message (rare). Treat as JSON; error frames carry code.
|
// Agent control message. Treat as JSON; error frames carry code,
|
||||||
|
// ping frames trigger an immediate pong reply.
|
||||||
try {
|
try {
|
||||||
const msg = JSON.parse(ev.data);
|
const msg = JSON.parse(ev.data);
|
||||||
if (msg.type === 'error' && msg.code === 'CLAUDE_NOT_FOUND') {
|
if (msg.type === 'error' && msg.code === 'CLAUDE_NOT_FOUND') {
|
||||||
setState(STATE.NO_CLAUDE);
|
setState(STATE.NO_CLAUDE);
|
||||||
try { ws.close(); } catch {}
|
try { ws.close(); } catch {}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (msg.type === 'ping') {
|
||||||
|
// Mirror the server's timestamp back. Cheap liveness ACK that
|
||||||
|
// lets the agent observe round-trip latency for free.
|
||||||
|
try { ws.send(JSON.stringify({ type: 'pong', ts: msg.ts })); } catch {}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
} catch {}
|
} catch {}
|
||||||
return;
|
return;
|
||||||
|
|
@ -392,6 +419,10 @@
|
||||||
|
|
||||||
ws.addEventListener('close', () => {
|
ws.addEventListener('close', () => {
|
||||||
ws = null;
|
ws = null;
|
||||||
|
if (keepaliveInterval) {
|
||||||
|
clearInterval(keepaliveInterval);
|
||||||
|
keepaliveInterval = null;
|
||||||
|
}
|
||||||
if (state !== STATE.NO_CLAUDE) setState(STATE.ENDED);
|
if (state !== STATE.NO_CLAUDE) setState(STATE.ENDED);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -401,6 +432,10 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
function teardown() {
|
function teardown() {
|
||||||
|
if (keepaliveInterval) {
|
||||||
|
clearInterval(keepaliveInterval);
|
||||||
|
keepaliveInterval = null;
|
||||||
|
}
|
||||||
try { ws && ws.close(); } catch {}
|
try { ws && ws.close(); } catch {}
|
||||||
ws = null;
|
ws = null;
|
||||||
if (term) {
|
if (term) {
|
||||||
|
|
@ -418,6 +453,10 @@
|
||||||
* IDLE, kick off auto-connect. Safe to call from any state.
|
* IDLE, kick off auto-connect. Safe to call from any state.
|
||||||
*/
|
*/
|
||||||
function forceRestart() {
|
function forceRestart() {
|
||||||
|
if (keepaliveInterval) {
|
||||||
|
clearInterval(keepaliveInterval);
|
||||||
|
keepaliveInterval = null;
|
||||||
|
}
|
||||||
try { ws && ws.close(); } catch {}
|
try { ws && ws.close(); } catch {}
|
||||||
ws = null;
|
ws = null;
|
||||||
if (term) {
|
if (term) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue