mirror of https://github.com/garrytan/gstack.git
feat(terminal-agent): 25s WS keepalive ping/pong + client keepalive frames
PTY connections were dying silently after NAT idle timeouts (30-60s on most
home routers, even shorter on some carrier-grade NAT) and Chrome MV3 panel
suspension. Neither side noticed until the user's next keystroke produced
no output. Both sides now drive a 25s keepalive cycle.
Server side (browse/src/terminal-agent.ts):
* New ws.open handler constructs the PtySession eagerly and starts a
setInterval that sends `{type:"ping",ts:Date.now()}` every 25s.
Interval handle stored on session.pingInterval so close() can clear it.
* PtySession.pingInterval field added; cleared in ws.close before
disposeSession runs. Prevents timer leak across reconnects.
* Message handler accepts `{type:"ping"|"pong"|"keepalive"}` silently —
keepalive frames are a liveness signal at the TCP layer, no state to
update. Existing resize/tabSwitch/tabState handling unchanged.
* GSTACK_PTY_KEEPALIVE_INTERVAL_MS env knob (default 25000) lets the
upcoming e2e tests compress idle assertions without 30s waits.
Client side (extension/sidepanel-terminal.js):
* Belt-and-suspenders: client also runs a 25s setInterval that sends
`{type:"keepalive"}`. Defends against Chrome pausing our timers if
the server-side ping ever gets dropped (rare but possible in MV3).
* Ping reply: on `{type:"ping",ts}` from the server, immediately send
`{type:"pong",ts}`. Lets the agent observe round-trip latency for
free and confirms the channel is bidirectional.
* Interval cleared in three teardown paths: ws.close handler,
teardown(), forceRestart(). Three paths exist because the sidebar
can exit the LIVE state through any of them; all three must clean up
or we leak timers across reconnects.
Test (browse/test/terminal-agent-keepalive.test.ts):
* Static-grep tripwires for the 7-point protocol contract: agent has
a configurable interval, open() starts the ping, close() clears it,
message handler accepts keepalive vocabulary, client sends keepalive
+ replies pong, and all three client teardown paths clear the timer.
* Wire-level tests (actually observe a ping after 25s) belong in the
e2e tier — adding them here would either flake on slow CI or require
a real Bun.serve listener per test which we don't want to pay for
in the free tier.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4462f0caa4
commit
d8751e91df
|
|
@ -61,8 +61,29 @@ interface PtySession {
|
|||
rows: number;
|
||||
cookie: string;
|
||||
spawned: boolean;
|
||||
/**
|
||||
* 25s server-side WS keepalive interval (v1.44+). Set in the WS `open`
|
||||
* handler, cleared in `close`. We send `{type:"ping",ts}` text frames so
|
||||
* NAT boxes, proxies, and Chrome's MV3 panel-suspend heuristics see the
|
||||
* connection as active; the client either replies with `{type:"pong"}`
|
||||
* or fires its own 25s `{type:"keepalive"}` cycle. Either path keeps
|
||||
* the underlying TCP from being silently dropped.
|
||||
*/
|
||||
pingInterval: ReturnType<typeof setInterval> | null;
|
||||
}
|
||||
|
||||
/**
|
||||
* WS keepalive interval. 25s is comfortably under the lowest common NAT
|
||||
* idle timeout (typically 30-60s) and shorter than Chromium's WebSocket
|
||||
* dead-peer threshold. Test-overridable via env so the v1.44 e2e tests
|
||||
* can compress idle-window assertions to <1s without waiting half a
|
||||
* minute per assertion.
|
||||
*/
|
||||
const KEEPALIVE_INTERVAL_MS = parseInt(
|
||||
process.env.GSTACK_PTY_KEEPALIVE_INTERVAL_MS || '25000',
|
||||
10,
|
||||
);
|
||||
|
||||
const sessions = new WeakMap<any, PtySession>(); // ws -> session
|
||||
|
||||
/** Find claude on PATH. */
|
||||
|
|
@ -388,22 +409,52 @@ function buildServer() {
|
|||
},
|
||||
|
||||
websocket: {
|
||||
/**
|
||||
* Start the 25s server-side keepalive ping. Stored on the session so
|
||||
* close() can clear it. We construct the PtySession here (not lazily
|
||||
* in message()) so the ping interval has somewhere to live before any
|
||||
* data flows. Lazy claude spawn still happens on first binary frame.
|
||||
*/
|
||||
open(ws) {
|
||||
const session: PtySession = {
|
||||
proc: null,
|
||||
cols: 80,
|
||||
rows: 24,
|
||||
cookie: (ws.data as any)?.cookie || '',
|
||||
spawned: false,
|
||||
pingInterval: null,
|
||||
};
|
||||
session.pingInterval = setInterval(() => {
|
||||
try {
|
||||
ws.send(JSON.stringify({ type: 'ping', ts: Date.now() }));
|
||||
} catch {
|
||||
// ws likely closed mid-tick; close handler clears the interval.
|
||||
}
|
||||
}, KEEPALIVE_INTERVAL_MS);
|
||||
sessions.set(ws, session);
|
||||
},
|
||||
|
||||
message(ws, raw) {
|
||||
let session = sessions.get(ws);
|
||||
if (!session) {
|
||||
// Fallback for any path where open() didn't fire (shouldn't happen
|
||||
// in Bun.serve but keeps the spawn path safe). No keepalive on
|
||||
// this branch — open() is the supported entry point.
|
||||
session = {
|
||||
proc: null,
|
||||
cols: 80,
|
||||
rows: 24,
|
||||
cookie: (ws.data as any)?.cookie || '',
|
||||
spawned: false,
|
||||
pingInterval: null,
|
||||
};
|
||||
sessions.set(ws, session);
|
||||
}
|
||||
|
||||
// Text frames are control messages: {type: "resize", cols, rows} or
|
||||
// {type: "tabSwitch", tabId, url, title}. Binary frames are raw input
|
||||
// bytes destined for the PTY stdin.
|
||||
// Text frames are control messages: {type: "resize", cols, rows},
|
||||
// {type: "tabSwitch", tabId, url, title}, {type: "tabState", ...},
|
||||
// or v1.44 keepalive frames: {type: "pong", ts}, {type: "keepalive"}.
|
||||
// Binary frames are raw input bytes destined for the PTY stdin.
|
||||
if (typeof raw === 'string') {
|
||||
let msg: any;
|
||||
try { msg = JSON.parse(raw); } catch { return; }
|
||||
|
|
@ -423,6 +474,14 @@ function buildServer() {
|
|||
handleTabState(msg);
|
||||
return;
|
||||
}
|
||||
if (msg?.type === 'pong' || msg?.type === 'keepalive' || msg?.type === 'ping') {
|
||||
// Keepalive frames — accepted and silently dropped. The mere
|
||||
// fact that the WS carried this frame is the liveness signal;
|
||||
// there's no application-level state to update at this layer.
|
||||
// `ping` is acknowledged here too in case the client (or a
|
||||
// future agent peer) mirrors our server-side ping shape.
|
||||
return;
|
||||
}
|
||||
// Unknown text frame — ignore.
|
||||
return;
|
||||
}
|
||||
|
|
@ -480,6 +539,10 @@ function buildServer() {
|
|||
close(ws) {
|
||||
const session = sessions.get(ws);
|
||||
if (session) {
|
||||
if (session.pingInterval) {
|
||||
clearInterval(session.pingInterval);
|
||||
session.pingInterval = null;
|
||||
}
|
||||
disposeSession(session);
|
||||
if (session.cookie) {
|
||||
// Drop the cookie so it can't be replayed against a new PTY.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,86 @@
|
|||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// v1.44 WS keepalive — static-grep invariants for the protocol contract.
|
||||
//
|
||||
// terminal-agent.ts and sidepanel-terminal.js cooperate on a 25s ping/pong +
|
||||
// keepalive cycle so long-idle PTY connections survive NAT idle timeouts and
|
||||
// Chromium's MV3 panel suspension heuristics. The wiring is invisible to
|
||||
// integration tests (you'd have to wait 25s to observe a ping) but trivially
|
||||
// regressed by a refactor. These tests fail CI if either side stops sending
|
||||
// or stops accepting the protocol frames.
|
||||
|
||||
const AGENT_TS = path.resolve(new URL(import.meta.url).pathname, '..', '..', 'src', 'terminal-agent.ts');
|
||||
const CLIENT_JS = path.resolve(new URL(import.meta.url).pathname, '..', '..', '..', 'extension', 'sidepanel-terminal.js');
|
||||
|
||||
describe('terminal-agent WS keepalive (v1.44+)', () => {
|
||||
test('1. agent has a KEEPALIVE_INTERVAL_MS env knob, default 25000', () => {
|
||||
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||
expect(src).toContain('GSTACK_PTY_KEEPALIVE_INTERVAL_MS');
|
||||
expect(src).toMatch(/KEEPALIVE_INTERVAL_MS\s*=\s*parseInt\(/);
|
||||
// Default constant present so the env knob has a fallback.
|
||||
expect(src).toContain("'25000'");
|
||||
});
|
||||
|
||||
test('2. WS open handler starts a ping interval on the session', () => {
|
||||
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||
// The open(ws) handler in the websocket: { ... } block must call
|
||||
// setInterval to drive the ping cadence and store the handle.
|
||||
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
|
||||
expect(wsBlock).toMatch(/open\s*\(\s*ws\s*\)/);
|
||||
expect(wsBlock).toContain('setInterval');
|
||||
expect(wsBlock).toContain("type: 'ping'");
|
||||
expect(wsBlock).toContain('pingInterval');
|
||||
});
|
||||
|
||||
test('3. WS close handler clears the ping interval', () => {
|
||||
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||
const wsBlock = sliceBetween(src, 'websocket: {', 'function handleTabState');
|
||||
// close(ws) MUST clearInterval the pingInterval — otherwise we leak
|
||||
// timers across reconnects and the ping handler captures a dead ws ref.
|
||||
expect(wsBlock).toMatch(/close\s*\(\s*ws\s*\)/);
|
||||
expect(wsBlock).toContain('clearInterval(session.pingInterval)');
|
||||
});
|
||||
|
||||
test('4. message handler accepts pong / keepalive frames silently', () => {
|
||||
const src = fs.readFileSync(AGENT_TS, 'utf-8');
|
||||
// The text-frame router must recognize the keepalive vocabulary —
|
||||
// if a future refactor strips this branch, unknown-text-frame
|
||||
// suppression would still drop them but we lose intent.
|
||||
expect(src).toMatch(/msg\?\.type === 'pong'/);
|
||||
expect(src).toMatch(/msg\?\.type === 'keepalive'/);
|
||||
});
|
||||
|
||||
test('5. client sends keepalive every 25s on ws.open', () => {
|
||||
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||
expect(src).toContain('keepaliveInterval');
|
||||
expect(src).toMatch(/setInterval\(/);
|
||||
expect(src).toContain("type: 'keepalive'");
|
||||
expect(src).toContain('KEEPALIVE_INTERVAL_MS = 25000');
|
||||
});
|
||||
|
||||
test('6. client replies pong to server ping', () => {
|
||||
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||
// The ws.message handler must short-circuit on msg.type === 'ping'
|
||||
// and reply with {type: 'pong', ts: msg.ts}.
|
||||
expect(src).toMatch(/msg\.type === 'ping'/);
|
||||
expect(src).toMatch(/type: 'pong'/);
|
||||
});
|
||||
|
||||
test('7. client clears keepalive in close + teardown + forceRestart', () => {
|
||||
const src = fs.readFileSync(CLIENT_JS, 'utf-8');
|
||||
// Three teardown paths exist; all three must drop the interval to
|
||||
// avoid leaking timers across reconnect attempts.
|
||||
const occurrences = (src.match(/clearInterval\(keepaliveInterval\)/g) || []).length;
|
||||
expect(occurrences).toBeGreaterThanOrEqual(3);
|
||||
});
|
||||
});
|
||||
|
||||
function sliceBetween(source: string, start: string, end: string): string {
|
||||
const i = source.indexOf(start);
|
||||
if (i === -1) throw new Error(`marker not found: ${start}`);
|
||||
const j = source.indexOf(end, i + start.length);
|
||||
if (j === -1) throw new Error(`end marker not found: ${end}`);
|
||||
return source.slice(i, j);
|
||||
}
|
||||
|
|
@ -48,6 +48,16 @@
|
|||
let term = null;
|
||||
let fitAddon = null;
|
||||
let ws = null;
|
||||
/**
|
||||
* 25s client-side WS keepalive interval (v1.44+). Belt-and-suspenders with
|
||||
* the server-side ping in terminal-agent.ts: server pings cover most
|
||||
* idle-NAT cases, client keepalive frames also defend against Chromium's
|
||||
* MV3-adjacent panel suspension heuristics that can pause our timers.
|
||||
* Started on ws.open, cleared on ws.close. The agent silently accepts
|
||||
* `{type:"keepalive"}` text frames.
|
||||
*/
|
||||
let keepaliveInterval = null;
|
||||
const KEEPALIVE_INTERVAL_MS = 25000;
|
||||
|
||||
function show(el) { el.style.display = ''; }
|
||||
function hide(el) { el.style.display = 'none'; }
|
||||
|
|
@ -371,16 +381,33 @@
|
|||
} catch {}
|
||||
// Send a single byte to nudge the agent to spawn claude (lazy-spawn trigger).
|
||||
try { ws.send(new TextEncoder().encode('\n')); } catch {}
|
||||
// v1.44 client-side keepalive. Server pings every 25s; we ALSO send
|
||||
// keepalive frames at the same cadence so a paused timer on either
|
||||
// side still has the other to lean on. Both are silently dropped
|
||||
// by the agent's message handler.
|
||||
if (keepaliveInterval) clearInterval(keepaliveInterval);
|
||||
keepaliveInterval = setInterval(() => {
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
||||
try { ws.send(JSON.stringify({ type: 'keepalive' })); } catch {}
|
||||
}, KEEPALIVE_INTERVAL_MS);
|
||||
});
|
||||
|
||||
ws.addEventListener('message', (ev) => {
|
||||
if (typeof ev.data === 'string') {
|
||||
// Agent control message (rare). Treat as JSON; error frames carry code.
|
||||
// Agent control message. Treat as JSON; error frames carry code,
|
||||
// ping frames trigger an immediate pong reply.
|
||||
try {
|
||||
const msg = JSON.parse(ev.data);
|
||||
if (msg.type === 'error' && msg.code === 'CLAUDE_NOT_FOUND') {
|
||||
setState(STATE.NO_CLAUDE);
|
||||
try { ws.close(); } catch {}
|
||||
return;
|
||||
}
|
||||
if (msg.type === 'ping') {
|
||||
// Mirror the server's timestamp back. Cheap liveness ACK that
|
||||
// lets the agent observe round-trip latency for free.
|
||||
try { ws.send(JSON.stringify({ type: 'pong', ts: msg.ts })); } catch {}
|
||||
return;
|
||||
}
|
||||
} catch {}
|
||||
return;
|
||||
|
|
@ -392,6 +419,10 @@
|
|||
|
||||
ws.addEventListener('close', () => {
|
||||
ws = null;
|
||||
if (keepaliveInterval) {
|
||||
clearInterval(keepaliveInterval);
|
||||
keepaliveInterval = null;
|
||||
}
|
||||
if (state !== STATE.NO_CLAUDE) setState(STATE.ENDED);
|
||||
});
|
||||
|
||||
|
|
@ -401,6 +432,10 @@
|
|||
}
|
||||
|
||||
function teardown() {
|
||||
if (keepaliveInterval) {
|
||||
clearInterval(keepaliveInterval);
|
||||
keepaliveInterval = null;
|
||||
}
|
||||
try { ws && ws.close(); } catch {}
|
||||
ws = null;
|
||||
if (term) {
|
||||
|
|
@ -418,6 +453,10 @@
|
|||
* IDLE, kick off auto-connect. Safe to call from any state.
|
||||
*/
|
||||
function forceRestart() {
|
||||
if (keepaliveInterval) {
|
||||
clearInterval(keepaliveInterval);
|
||||
keepaliveInterval = null;
|
||||
}
|
||||
try { ws && ws.close(); } catch {}
|
||||
ws = null;
|
||||
if (term) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue