mirror of https://github.com/garrytan/gstack.git
feat(browser-skills): bundled hackernews-frontpage reference skill
Smallest interesting browser-skill: scrapes HN front page, returns
30 stories as JSON. No auth, stable HTML, fully fixture-tested.
Files:
SKILL.md frontmatter + prose
script.ts exports parseStoriesFromHtml(html)
main: goto + html + parse + JSON.stringify
_lib/browse-client.ts vendored copy of the SDK
fixtures/hn-2026-04-26.html captured front page (5 stories)
script.test.ts 13 assertions against the fixture
The parser is a pure function over HTML so script.test.ts runs
without a daemon (just imports parseStoriesFromHtml and asserts).
This exercises every Phase 1 component end-to-end:
- browse-client SDK (script imports browse from ./_lib/)
- 3-tier lookup (hackernews-frontpage lives in the bundled tier)
- scoped tokens (read+write is enough for goto + html)
- spawn lifecycle ($B skill run hackernews-frontpage)
- file-fixture testing ($B skill test hackernews-frontpage)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e6a27993fe
commit
8a60d99c74
|
|
@ -0,0 +1,52 @@
|
||||||
|
---
|
||||||
|
name: hackernews-frontpage
|
||||||
|
description: Scrape the Hacker News front page (titles, points, comment counts).
|
||||||
|
host: news.ycombinator.com
|
||||||
|
trusted: true
|
||||||
|
source: human
|
||||||
|
version: 1.0.0
|
||||||
|
args: []
|
||||||
|
triggers:
|
||||||
|
- scrape hacker news frontpage
|
||||||
|
- scrape hn frontpage
|
||||||
|
- get hn top stories
|
||||||
|
- latest hacker news stories
|
||||||
|
---
|
||||||
|
|
||||||
|
# Hacker News front-page scraper
|
||||||
|
|
||||||
|
Scrapes the Hacker News (`news.ycombinator.com`) front page and returns the
|
||||||
|
top 30 stories as JSON. Each story has its rank, item id, title, link URL, point count,
|
||||||
|
and comment count.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
$ $B skill run hackernews-frontpage
|
||||||
|
{
|
||||||
|
"stories": [
|
||||||
|
{ "rank": 1, "id": "...", "title": "...", "url": "...", "points": 412, "comments": 87 },
|
||||||
|
...
|
||||||
|
],
|
||||||
|
"count": 30
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
1. Navigates to `https://news.ycombinator.com` via the daemon.
|
||||||
|
2. Reads the page HTML.
|
||||||
|
3. Parses each story row (HN's stable `tr.athing` structure) into a typed
|
||||||
|
`Story` record.
|
||||||
|
4. Emits a single JSON document on stdout.
|
||||||
|
|
||||||
|
## Why this is the reference skill
|
||||||
|
|
||||||
|
`hackernews-frontpage` is the smallest interesting browser-skill: no auth,
|
||||||
|
stable HTML, deterministic output, file-fixture-friendly. Every Phase 1
|
||||||
|
component (SDK, scoped tokens, three-tier lookup, spawn lifecycle) is
|
||||||
|
exercised by `$B skill run hackernews-frontpage` and the bundled
|
||||||
|
`script.test.ts`.
|
||||||
|
|
||||||
|
When the HN HTML changes and our selectors break, the test fails against a freshly
|
||||||
|
captured fixture before users notice. That's the point.
|
||||||
|
|
@ -0,0 +1,257 @@
|
||||||
|
/**
|
||||||
|
* browse-client — canonical SDK that browser-skill scripts import to drive the
|
||||||
|
* gstack daemon over loopback HTTP.
|
||||||
|
*
|
||||||
|
* Distribution model:
|
||||||
|
* This file is the canonical source. Each browser-skill ships a sibling
|
||||||
|
* copy at `<skill>/_lib/browse-client.ts` (Phase 2's generator copies it
|
||||||
|
* alongside every generated skill; Phase 1's bundled `hackernews-frontpage`
|
||||||
|
* reference skill ships a hand-copied version). The skill imports the
|
||||||
|
* sibling via relative path: `import { browse } from './_lib/browse-client'`.
|
||||||
|
*
|
||||||
|
* Why per-skill copies and not a single global SDK: each skill is fully
|
||||||
|
* portable (copy the directory anywhere, it runs), version drift is
|
||||||
|
* impossible (the SDK is frozen at the version the skill was authored
|
||||||
|
* against), no npm publish workflow, no fixed-path tilde imports.
|
||||||
|
*
|
||||||
|
* Auth resolution:
|
||||||
|
* 1. GSTACK_PORT + GSTACK_SKILL_TOKEN env vars (set by `$B skill run` when
|
||||||
|
* spawning the script). The token is a per-spawn scoped capability bound
|
||||||
|
* to read+write commands; it expires when the spawn ends.
|
||||||
|
* 2. State file fallback: read `BROWSE_STATE_FILE` env or `<git-root>/.gstack/browse.json`
|
||||||
|
* and use the `port` + `token` (the daemon root token). This path exists
|
||||||
|
* for developers running a skill directly via `bun run script.ts` outside
|
||||||
|
* the harness — your own authority, not an agent's.
|
||||||
|
*
|
||||||
|
* Trust:
|
||||||
|
* The SDK exposes only the daemon's existing HTTP surface (POST /command).
|
||||||
|
* No new capabilities. The token's scopes (read+write for spawned skills,
|
||||||
|
* full root for standalone debug) determine what actually executes.
|
||||||
|
*
|
||||||
|
* Zero side effects on import. Safe to import from tests or plain scripts.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as cp from 'child_process';
|
||||||
|
|
||||||
|
export interface BrowseClientOptions {
  /** Override port. Default: GSTACK_PORT env or state file. */
  port?: number;
  /** Override token. Default: GSTACK_SKILL_TOKEN env, then state file root token. */
  token?: string;
  /** Tab id to target (every command can scope to a tab). Default: BROWSE_TAB env or undefined (active tab). */
  tabId?: number;
  /** Per-request timeout in milliseconds. Default: 30_000. */
  timeoutMs?: number;
  /** Override state-file path. Default: BROWSE_STATE_FILE env or <git-root>/.gstack/browse.json. */
  stateFile?: string;
}

interface ResolvedAuth {
  port: number;
  token: string;
  /**
   * Where the credentials came from. Explicit `port` + `token` options are
   * also reported as 'env' (kept for backward compatibility).
   */
  source: 'env' | 'state-file';
}

/** Highest valid TCP port number. */
const MAX_TCP_PORT = 65535;

/** Parse a port from an env string; undefined unless it is an integer in 1..65535. */
function parseEnvPort(raw: string): number | undefined {
  const port = parseInt(raw, 10);
  return Number.isInteger(port) && port > 0 && port <= MAX_TCP_PORT ? port : undefined;
}

/** True when a parsed state file carries a numeric `port` and a string `token`. */
function isStateFileAuth(value: unknown): value is { port: number; token: string } {
  if (typeof value !== 'object' || value === null) return false;
  const record = value as Record<string, unknown>;
  return typeof record.port === 'number' && typeof record.token === 'string';
}

/**
 * Resolve the daemon port + token.
 *
 * Resolution order:
 *   1. `opts.port` and `opts.token` when both are given.
 *   2. GSTACK_PORT + GSTACK_SKILL_TOKEN env vars (individual `opts` fields
 *      still win over the env values). An env port outside 1..65535 is
 *      ignored and resolution falls through to the state file.
 *   3. The state file: `opts.stateFile`, then BROWSE_STATE_FILE, then
 *      `<git-root>/.gstack/browse.json`.
 *
 * @throws Error when no source yields credentials. If a state file exists but
 *   could not be used (unreadable, invalid JSON, wrong shape), the message
 *   names the file and the reason instead of hiding it.
 */
export function resolveBrowseAuth(opts: BrowseClientOptions = {}): ResolvedAuth {
  if (opts.port !== undefined && opts.token !== undefined) {
    return { port: opts.port, token: opts.token, source: 'env' };
  }

  // 1. Env vars (set by $B skill run when spawning).
  const envPort = process.env.GSTACK_PORT;
  const envToken = process.env.GSTACK_SKILL_TOKEN;
  if (envPort && envToken) {
    const port = opts.port ?? parseEnvPort(envPort);
    if (port !== undefined && !Number.isNaN(port)) {
      return { port, token: opts.token ?? envToken, source: 'env' };
    }
  }

  // 2. State file fallback (developer running `bun run script.ts` directly).
  const stateFile = opts.stateFile ?? process.env.BROWSE_STATE_FILE ?? defaultStateFile();
  let stateFileProblem: string | undefined;
  if (stateFile && fs.existsSync(stateFile)) {
    try {
      const data: unknown = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
      if (isStateFileAuth(data)) {
        return {
          port: opts.port ?? data.port,
          token: opts.token ?? data.token,
          source: 'state-file',
        };
      }
      stateFileProblem = 'it does not contain a numeric "port" and a string "token"';
    } catch (err: unknown) {
      // Remember why the file was unusable so the final error can say so.
      stateFileProblem = err instanceof Error ? err.message : String(err);
    }
  }

  const detail =
    stateFileProblem === undefined
      ? ''
      : ` State file ${stateFile} was found but ignored: ${stateFileProblem}.`;
  throw new Error(
    'browse-client: cannot find daemon port + token. Either spawn via `$B skill run` ' +
    '(sets GSTACK_PORT + GSTACK_SKILL_TOKEN) or run from a project with a live daemon ' +
    '(.gstack/browse.json must exist).' +
    detail
  );
}

/**
 * Default state-file location: `<git-root>/.gstack/browse.json`, or the same
 * path under the current working directory when `git rev-parse` fails, times
 * out (2s) or cannot be spawned.
 */
function defaultStateFile(): string {
  let base = process.cwd();
  try {
    const proc = cp.spawnSync('git', ['rev-parse', '--show-toplevel'], { encoding: 'utf-8', timeout: 2000 });
    const root = proc.status === 0 ? proc.stdout.trim() : '';
    if (root) base = root;
  } catch {
    // git unavailable: keep the cwd-based default.
  }
  return path.join(base, '.gstack', 'browse.json');
}
|
||||||
|
|
||||||
|
/**
 * Error raised by the browse client for transport failures and non-2xx
 * daemon responses.
 *
 * `status` and `body` carry the HTTP status code and raw response text when
 * the failure came from a daemon response; both are undefined otherwise.
 */
export class BrowseClientError extends Error {
  public readonly status?: number;
  public readonly body?: string;

  constructor(message: string, status?: number, body?: string) {
    super(message);
    this.status = status;
    this.body = body;
    this.name = 'BrowseClientError';
  }
}
|
||||||
|
|
||||||
|
/**
 * Thin client over the daemon's POST /command endpoint.
 *
 * Convenience methods cover the common cases (goto, click, text, snapshot,
 * etc.). For anything not exposed as a method, use `command(cmd, args)`.
 *
 * Every method resolves to the raw response text and rejects with a
 * `BrowseClientError` on timeout, transport failure, or a non-2xx response.
 */
export class BrowseClient {
  readonly port: number;
  readonly token: string;
  readonly tabId?: number;
  readonly timeoutMs: number;

  /**
   * @param opts Overrides for port, token, tab id, timeout and state file.
   * @throws Error (from `resolveBrowseAuth`) when no port + token can be found.
   */
  constructor(opts: BrowseClientOptions = {}) {
    const auth = resolveBrowseAuth(opts);
    this.port = auth.port;
    this.token = auth.token;
    this.tabId = opts.tabId ?? (process.env.BROWSE_TAB ? parseInt(process.env.BROWSE_TAB, 10) : undefined);
    this.timeoutMs = opts.timeoutMs ?? 30_000;
  }

  // ─── Low-level dispatch ─────────────────────────────────────────

  /**
   * Send an arbitrary command; returns raw response text. Throws on non-2xx.
   *
   * @param cmd  Daemon command name.
   * @param args Positional string arguments for the command.
   * @throws BrowseClientError on timeout, connection failure, or non-2xx
   *   status (in which case `status` and `body` are populated).
   */
  async command(cmd: string, args: string[] = []): Promise<string> {
    const body = JSON.stringify({
      command: cmd,
      args,
      // A non-numeric BROWSE_TAB parses to NaN; omit tabId so the daemon uses the active tab.
      ...(this.tabId !== undefined && !isNaN(this.tabId) ? { tabId: this.tabId } : {}),
    });

    let resp: Response;
    let text: string;
    try {
      resp = await fetch(`http://127.0.0.1:${this.port}/command`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.token}`,
        },
        body,
        signal: AbortSignal.timeout(this.timeoutMs),
      });
      // Read the body inside the try: the timeout signal can also fire while
      // the body is streaming, and that must surface as a BrowseClientError too.
      text = await resp.text();
    } catch (err: unknown) {
      throw this.toClientError(cmd, err);
    }

    if (!resp.ok) {
      let message = `browse-client: command "${cmd}" failed with status ${resp.status}`;
      try {
        const parsed: unknown = JSON.parse(text);
        const detail =
          typeof parsed === 'object' && parsed !== null
            ? (parsed as { error?: unknown }).error
            : undefined;
        // Non-string error payloads are serialized rather than rendered as "[object Object]".
        if (detail) message += `: ${typeof detail === 'string' ? detail : JSON.stringify(detail)}`;
      } catch {
        if (text) message += `: ${text.slice(0, 200)}`;
      }
      throw new BrowseClientError(message, resp.status, text);
    }
    return text;
  }

  /** Map a fetch/body-read failure onto a BrowseClientError with a targeted message. */
  private toClientError(cmd: string, err: unknown): BrowseClientError {
    const name = BrowseClient.stringField(err, 'name');
    if (name === 'TimeoutError' || name === 'AbortError') {
      return new BrowseClientError(`browse-client: command "${cmd}" timed out after ${this.timeoutMs}ms`);
    }

    // Node's fetch rejects with `TypeError: fetch failed` and puts the socket
    // error (code 'ECONNREFUSED') on `cause`; Bun reports code 'ConnectionRefused'
    // on the error itself. Check both locations and both spellings.
    const cause = typeof err === 'object' && err !== null ? (err as { cause?: unknown }).cause : undefined;
    const code = BrowseClient.stringField(err, 'code') ?? BrowseClient.stringField(cause, 'code');
    if (code === 'ECONNREFUSED' || code === 'ConnectionRefused') {
      return new BrowseClientError(`browse-client: daemon not running on port ${this.port}`);
    }

    return new BrowseClientError(`browse-client: ${BrowseClient.stringField(err, 'message') ?? String(err)}`);
  }

  /** Read `value[key]` when `value` is an object and the field is a string. */
  private static stringField(value: unknown, key: string): string | undefined {
    if (typeof value !== 'object' || value === null) return undefined;
    const field = (value as Record<string, unknown>)[key];
    return typeof field === 'string' ? field : undefined;
  }

  // ─── Navigation ─────────────────────────────────────────────────

  async goto(url: string): Promise<string> { return this.command('goto', [url]); }
  async wait(arg: string): Promise<string> { return this.command('wait', [arg]); }

  // ─── Reading ────────────────────────────────────────────────────

  /** Sends `text`; the selector is passed only when one is given. */
  async text(selector?: string): Promise<string> {
    return this.command('text', selector ? [selector] : []);
  }
  /** Sends `html`; the selector is passed only when one is given. */
  async html(selector?: string): Promise<string> {
    return this.command('html', selector ? [selector] : []);
  }
  async links(): Promise<string> { return this.command('links'); }
  async forms(): Promise<string> { return this.command('forms'); }
  async accessibility(): Promise<string> { return this.command('accessibility'); }
  async attrs(selector: string): Promise<string> { return this.command('attrs', [selector]); }
  async media(...flags: string[]): Promise<string> { return this.command('media', flags); }
  async data(...flags: string[]): Promise<string> { return this.command('data', flags); }

  // ─── Interaction ────────────────────────────────────────────────

  async click(selector: string): Promise<string> { return this.command('click', [selector]); }
  async fill(selector: string, value: string): Promise<string> { return this.command('fill', [selector, value]); }
  async select(selector: string, value: string): Promise<string> { return this.command('select', [selector, value]); }
  async hover(selector: string): Promise<string> { return this.command('hover', [selector]); }
  async type(text: string): Promise<string> { return this.command('type', [text]); }
  async press(key: string): Promise<string> { return this.command('press', [key]); }
  /** Sends `scroll`; the selector is passed only when one is given. */
  async scroll(selector?: string): Promise<string> {
    return this.command('scroll', selector ? [selector] : []);
  }

  // ─── Snapshot + screenshot ──────────────────────────────────────

  /** Snapshot returns the ARIA tree. Pass flags like '-i' (interactive only), '-c' (compact). */
  async snapshot(...flags: string[]): Promise<string> { return this.command('snapshot', flags); }
  async screenshot(...args: string[]): Promise<string> { return this.command('screenshot', args); }
}
|
||||||
|
|
||||||
|
/**
 * Default singleton. The underlying BrowseClient (and therefore auth
 * resolution) is created on the first method call, so a script can import
 * `browse` and immediately `await browse.goto(...)` without constructing
 * anything, and importing this module has no side effects.
 */
class LazyBrowseClient {
  private client: BrowseClient | null = null;

  /** Create the shared BrowseClient on first use; reuse it afterwards. */
  private resolve(): BrowseClient {
    this.client ??= new BrowseClient();
    return this.client;
  }

  // Mirror of the BrowseClient surface: every method forwards to the shared,
  // lazily created instance.

  command(cmd: string, args: string[] = []): Promise<string> {
    return this.resolve().command(cmd, args);
  }

  goto(url: string): Promise<string> {
    return this.resolve().goto(url);
  }

  wait(arg: string): Promise<string> {
    return this.resolve().wait(arg);
  }

  text(selector?: string): Promise<string> {
    return this.resolve().text(selector);
  }

  html(selector?: string): Promise<string> {
    return this.resolve().html(selector);
  }

  links(): Promise<string> {
    return this.resolve().links();
  }

  forms(): Promise<string> {
    return this.resolve().forms();
  }

  accessibility(): Promise<string> {
    return this.resolve().accessibility();
  }

  attrs(selector: string): Promise<string> {
    return this.resolve().attrs(selector);
  }

  media(...flags: string[]): Promise<string> {
    return this.resolve().media(...flags);
  }

  data(...flags: string[]): Promise<string> {
    return this.resolve().data(...flags);
  }

  click(selector: string): Promise<string> {
    return this.resolve().click(selector);
  }

  fill(selector: string, value: string): Promise<string> {
    return this.resolve().fill(selector, value);
  }

  select(selector: string, value: string): Promise<string> {
    return this.resolve().select(selector, value);
  }

  hover(selector: string): Promise<string> {
    return this.resolve().hover(selector);
  }

  type(text: string): Promise<string> {
    return this.resolve().type(text);
  }

  press(key: string): Promise<string> {
    return this.resolve().press(key);
  }

  scroll(selector?: string): Promise<string> {
    return this.resolve().scroll(selector);
  }

  snapshot(...flags: string[]): Promise<string> {
    return this.resolve().snapshot(...flags);
  }

  screenshot(...args: string[]): Promise<string> {
    return this.resolve().screenshot(...args);
  }
}

export const browse = new LazyBrowseClient();
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
<!DOCTYPE html><html lang="en" op="news"><head><meta charset="utf-8"><title>Hacker News</title></head>
|
||||||
|
<body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
|
||||||
|
<tr><td>
|
||||||
|
<table border="0" cellpadding="0" cellspacing="0" class="itemlist">
|
||||||
|
<tr class="athing submission" id="40000001">
|
||||||
|
<td align="right" valign="top" class="title"><span class="rank">1.</span></td>
|
||||||
|
<td valign="top" class="votelinks"><center><a id="up_40000001" href="vote?id=40000001"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||||
|
<td class="title"><span class="titleline"><a href="https://example.com/blog-post-1" rel="noreferrer">Show HN: A toy compiler in 200 lines</a> <span class="sitebit comhead"> (<a href="from?site=example.com"><span class="sitestr">example.com</span></a>)</span></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||||
|
<span class="score" id="score_40000001">412 points</span> by <a href="user?id=alice" class="hnuser">alice</a> <span class="age" title="2026-04-26T08:15:00"><a href="item?id=40000001">3 hours ago</a></span> <span id="unv_40000001"></span> | <a href="hide?id=40000001&goto=news">hide</a> | <a href="item?id=40000001">87 comments</a> </span></td></tr>
|
||||||
|
<tr class="spacer" style="height:5px"></tr>
|
||||||
|
|
||||||
|
<tr class="athing submission" id="40000002">
|
||||||
|
<td align="right" valign="top" class="title"><span class="rank">2.</span></td>
|
||||||
|
<td valign="top" class="votelinks"><center><a id="up_40000002" href="vote?id=40000002"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||||
|
<td class="title"><span class="titleline"><a href="https://example.org/database-internals" rel="noreferrer">Database internals: writing an LSM tree</a> <span class="sitebit comhead"> (<a href="from?site=example.org"><span class="sitestr">example.org</span></a>)</span></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||||
|
<span class="score" id="score_40000002">298 points</span> by <a href="user?id=bob" class="hnuser">bob</a> <span class="age" title="2026-04-26T07:42:00"><a href="item?id=40000002">4 hours ago</a></span> <span id="unv_40000002"></span> | <a href="hide?id=40000002&goto=news">hide</a> | <a href="item?id=40000002">152 comments</a> </span></td></tr>
|
||||||
|
<tr class="spacer" style="height:5px"></tr>
|
||||||
|
|
||||||
|
<tr class="athing submission" id="40000003">
|
||||||
|
<td align="right" valign="top" class="title"><span class="rank">3.</span></td>
|
||||||
|
<td valign="top" class="votelinks"><center><a id="up_40000003" href="vote?id=40000003"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||||
|
<td class="title"><span class="titleline"><a href="https://example.com/yc-w26-startup">Acme (YC W26) is hiring senior engineers (remote)</a> <span class="sitebit comhead"> (<a href="from?site=example.com"><span class="sitestr">example.com</span></a>)</span></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||||
|
<span class="age" title="2026-04-26T06:00:00"><a href="item?id=40000003">5 hours ago</a></span> </span></td></tr>
|
||||||
|
<tr class="spacer" style="height:5px"></tr>
|
||||||
|
|
||||||
|
<tr class="athing submission" id="40000004">
|
||||||
|
<td align="right" valign="top" class="title"><span class="rank">4.</span></td>
|
||||||
|
<td valign="top" class="votelinks"><center><a id="up_40000004" href="vote?id=40000004"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||||
|
<td class="title"><span class="titleline"><a href="https://example.net/ask-hn" rel="noreferrer">Ask HN: What's your most underrated tool?</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||||
|
<span class="score" id="score_40000004">156 points</span> by <a href="user?id=carol" class="hnuser">carol</a> <span class="age" title="2026-04-26T05:30:00"><a href="item?id=40000004">6 hours ago</a></span> <span id="unv_40000004"></span> | <a href="hide?id=40000004&goto=news">hide</a> | <a href="item?id=40000004">discuss</a> </span></td></tr>
|
||||||
|
<tr class="spacer" style="height:5px"></tr>
|
||||||
|
|
||||||
|
<tr class="athing submission" id="40000005">
|
||||||
|
<td align="right" valign="top" class="title"><span class="rank">5.</span></td>
|
||||||
|
<td valign="top" class="votelinks"><center><a id="up_40000005" href="vote?id=40000005"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||||
|
<td class="title"><span class="titleline"><a href="https://example.io/quantum&chess">Why quantum & chess engines disagree</a> <span class="sitebit comhead"> (<a href="from?site=example.io"><span class="sitestr">example.io</span></a>)</span></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||||
|
<span class="score" id="score_40000005">73 points</span> by <a href="user?id=dave" class="hnuser">dave</a> <span class="age" title="2026-04-26T04:00:00"><a href="item?id=40000005">7 hours ago</a></span> <span id="unv_40000005"></span> | <a href="hide?id=40000005&goto=news">hide</a> | <a href="item?id=40000005">12 comments</a> </span></td></tr>
|
||||||
|
<tr class="spacer" style="height:5px"></tr>
|
||||||
|
|
||||||
|
</table>
|
||||||
|
</td></tr>
|
||||||
|
</table></center></body></html>
|
||||||
|
|
@ -0,0 +1,105 @@
|
||||||
|
/**
|
||||||
|
* hackernews-frontpage script tests — exercise parseStoriesFromHtml against
|
||||||
|
* the bundled HN fixture. No daemon, no network: the parser is a pure function
|
||||||
|
* over HTML, so we test it directly.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect } from 'bun:test';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import { parseStoriesFromHtml } from './script';
|
||||||
|
|
||||||
|
// Captured HN front page (5 stories), read once when the test module loads.
const FIXTURE = fs.readFileSync(path.join(__dirname, 'fixtures', 'hn-2026-04-26.html'), 'utf-8');

/** Parse the bundled fixture afresh so each test works on its own result. */
const parseFixture = () => parseStoriesFromHtml(FIXTURE);

describe('parseStoriesFromHtml against bundled HN fixture', () => {
  it('returns 5 stories (matching the fixture)', () => {
    expect(parseFixture()).toHaveLength(5);
  });

  it('assigns 1-based ranks in document order', () => {
    const ranks = parseFixture().map(story => story.rank);
    expect(ranks).toEqual([1, 2, 3, 4, 5]);
  });

  it('extracts ids matching the tr.athing[id] attribute', () => {
    const ids = parseFixture().map(story => story.id);
    expect(ids).toEqual(['40000001', '40000002', '40000003', '40000004', '40000005']);
  });

  it('extracts titles and decodes HTML entities', () => {
    const parsed = parseFixture();
    const expectedTitles: Array<[number, string]> = [
      [0, 'Show HN: A toy compiler in 200 lines'],
      [1, 'Database internals: writing an LSM tree'],
      [3, "Ask HN: What's your most underrated tool?"],
      [4, 'Why quantum & chess engines disagree'],
    ];
    for (const [index, title] of expectedTitles) {
      expect(parsed[index].title).toBe(title);
    }
  });

  it('extracts URLs and decodes ampersands', () => {
    const parsed = parseFixture();
    const expectedUrls: Array<[number, string]> = [
      [0, 'https://example.com/blog-post-1'],
      [1, 'https://example.org/database-internals'],
      [4, 'https://example.io/quantum&chess'],
    ];
    for (const [index, url] of expectedUrls) {
      expect(parsed[index].url).toBe(url);
    }
  });

  it('parses point counts as numbers', () => {
    const parsed = parseFixture();
    const expectedPoints: Array<[number, number]> = [[0, 412], [1, 298], [3, 156], [4, 73]];
    for (const [index, points] of expectedPoints) {
      expect(parsed[index].points).toBe(points);
    }
  });

  it('parses comment counts as numbers', () => {
    const parsed = parseFixture();
    const expectedComments: Array<[number, number]> = [[0, 87], [1, 152], [4, 12]];
    for (const [index, comments] of expectedComments) {
      expect(parsed[index].comments).toBe(comments);
    }
  });

  it('treats "discuss" links as 0 comments', () => {
    const askHn = parseFixture()[3];
    expect(askHn.comments).toBe(0);
  });

  it('returns null points + null comments for job postings', () => {
    // The third row of the fixture is the YC hiring post.
    const jobPost = parseFixture()[2];
    expect(jobPost.title).toContain('YC W26');
    expect(jobPost.points).toBeNull();
    expect(jobPost.comments).toBeNull();
  });

  it('returns [] for empty HTML', () => {
    const parsed = parseStoriesFromHtml('');
    expect(parsed).toEqual([]);
  });

  it('returns [] for HTML with no story rows', () => {
    const parsed = parseStoriesFromHtml('<html><body><p>nothing here</p></body></html>');
    expect(parsed).toEqual([]);
  });

  it('does not fabricate stories from arbitrary tr.athing rows missing titleline', () => {
    const rowWithoutTitle = '<tr class="athing" id="999"><td>nothing</td></tr>';
    expect(parseStoriesFromHtml(rowWithoutTitle)).toEqual([]);
  });
});

describe('output shape', () => {
  it('every story has all required keys', () => {
    for (const story of parseFixture()) {
      expect(typeof story.rank).toBe('number');
      expect(typeof story.id).toBe('string');
      expect(typeof story.title).toBe('string');
      expect(typeof story.url).toBe('string');
      // Job rows carry null instead of a number for points/comments.
      const pointsOk = story.points === null || typeof story.points === 'number';
      const commentsOk = story.comments === null || typeof story.comments === 'number';
      expect(pointsOk).toBe(true);
      expect(commentsOk).toBe(true);
    }
  });
});
|
||||||
|
|
@ -0,0 +1,132 @@
|
||||||
|
/**
|
||||||
|
* hackernews-frontpage — scrape the HN front page and emit JSON.
|
||||||
|
*
|
||||||
|
* Output protocol:
|
||||||
|
* stdout = a single JSON document on success: { stories: Story[], count }
|
||||||
|
* stderr = anything we want logged (currently nothing)
|
||||||
|
* exit 0 on success, nonzero on parse / network failure.
|
||||||
|
*
|
||||||
|
* The parser logic (`parseStoriesFromHtml`) is exported so script.test.ts can
|
||||||
|
* exercise it against bundled HTML fixtures without spinning up the daemon.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { browse } from './_lib/browse-client';
|
||||||
|
|
||||||
|
export interface Story {
  /**
   * 1-based rank: the position of the story's `tr.athing` row in document
   * order. A row skipped for lacking a title link still consumes its rank.
   */
  rank: number;
  /** HN item id (the integer in `tr.athing[id]`), kept as a string. */
  id: string;
  /** Plain-text title: inline markup removed, HTML entities decoded. */
  title: string;
  /**
   * URL the title links to. Site-relative links (e.g. `item?id=…` on
   * self-posts) are resolved against the front-page URL.
   */
  url: string;
  /** null when the row has no score (job postings). */
  points: number | null;
  /** null when the row has no comments link (job postings). */
  comments: number | null;
}

export interface Output {
  stories: Story[];
  count: number;
}

const FRONT_PAGE_URL = 'https://news.ycombinator.com/';

// Non-global patterns, hoisted so they are compiled once. They carry no
// `lastIndex` state, so sharing them between calls is safe.
const TITLE_LINK = /<span\s+class="titleline"[^>]*>\s*<a\s+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/;
const SPACER_ROW = /<tr\b[^>]*\bclass="spacer\b/;
const NEXT_STORY_ROW = /<tr\b[^>]*\bclass="athing\b/;
const SCORE = /<span\s+class="score"[^>]*>(\d+)\s*points?<\/span>/;
// HN separates the number from "comments" with a non-breaking space, which
// serialized HTML carries as `&nbsp;` (or a literal U+00A0, matched by \s).
const COMMENT_COUNT = /<a\s+href="item\?id=\d+"[^>]*>(\d+)(?:&nbsp;|\s)*comments?<\/a>/;
const DISCUSS_LINK = /discuss<\/a>/;
const ABSOLUTE_URL = /^[a-z][a-z0-9+.-]*:/i;

/**
 * Parse HN front-page HTML into Story[].
 *
 * HN's structure is stable: each story is a pair of rows.
 *   <tr class="athing submission" id="<itemid>">
 *     <td class="title"><span class="rank">N.</span></td>
 *     <td class="votelinks">...</td>
 *     <td class="title"><span class="titleline"><a href="<url>">title</a> ...</span></td>
 *   </tr>
 *   <tr><td colspan="2"></td><td class="subtext"><span class="subline">
 *     <span class="score" id="score_<itemid>">N points</span>
 *     ... <a href="item?id=<itemid>">N comments</a>
 *   </span></td></tr>
 *
 * Job postings ("Foo (YC X25) is hiring...") omit the score and comments —
 * those fields come back as null. A "discuss" link counts as 0 comments.
 *
 * @param html Serialized front-page HTML (entities still encoded).
 * @returns One Story per `tr.athing` row that has a title link, in document
 *   order; `[]` when no such row exists.
 */
export function parseStoriesFromHtml(html: string): Story[] {
  const stories: Story[] = [];

  // Match each `tr.athing` row, capturing the id attribute and the row body.
  // Created per call because a global regex keeps `lastIndex` state.
  const rowRegex = /<tr\s+[^>]*\bclass="athing[^"]*"[^>]*\bid="(\d+)"[^>]*>([\s\S]*?)<\/tr>/g;

  let match: RegExpExecArray | null;
  let rank = 0;
  while ((match = rowRegex.exec(html)) !== null) {
    rank++;
    const id = match[1];
    const rowBody = match[2];

    const titleMatch = rowBody.match(TITLE_LINK);
    if (!titleMatch) continue;

    const url = resolveStoryUrl(decodeHtmlEntities(titleMatch[1]));
    // Strip markup BEFORE decoding: decoding first would turn an escaped
    // "&lt;template&gt;" into a tag that stripTags then deletes from the title.
    const title = decodeHtmlEntities(stripTags(titleMatch[2])).trim();

    const subtext = subtextAfter(html, match.index + match[0].length);
    const { points, comments } = parseSubtext(subtext);

    stories.push({ rank, id, title, url, points, comments });
  }

  return stories;
}

/**
 * Return the HTML between a story row and the next spacer/story row.
 * Bounding the slice matters: without it the score of story N+1 would leak
 * into story N whenever story N is a job posting with no score of its own.
 */
function subtextAfter(html: string, start: number): string {
  const tail = html.slice(start);
  const candidates = [tail.search(SPACER_ROW), tail.search(NEXT_STORY_ROW)].filter(i => i >= 0);
  const boundary = candidates.length > 0 ? Math.min(...candidates) : tail.length;
  return tail.slice(0, boundary);
}

/** Extract points and comment count from a subtext slice; null when absent. */
function parseSubtext(subtext: string): Pick<Story, 'points' | 'comments'> {
  const scoreMatch = subtext.match(SCORE);
  const points = scoreMatch ? parseInt(scoreMatch[1], 10) : null;

  let comments: number | null = null;
  const commentsMatch = subtext.match(COMMENT_COUNT);
  if (commentsMatch) {
    comments = parseInt(commentsMatch[1], 10);
  } else if (DISCUSS_LINK.test(subtext)) {
    comments = 0;
  }
  return { points, comments };
}

/** Leave absolute URLs untouched; resolve relative ones against the front page. */
function resolveStoryUrl(href: string): string {
  if (ABSOLUTE_URL.test(href)) return href;
  try {
    return new URL(href, FRONT_PAGE_URL).href;
  } catch {
    // Unparseable href: return it as written rather than dropping the story.
    return href;
  }
}

/** Remove anything that looks like an HTML tag. */
function stripTags(s: string): string {
  return s.replace(/<[^>]*>/g, '');
}

// A Map (not an object literal) so names such as "constructor" cannot resolve
// to inherited properties.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['lt', '<'],
  ['gt', '>'],
  ['nbsp', ' '],
]);

/**
 * Decode the named entities above plus decimal/hex numeric references
 * (`&#39;`, `&#x27;`). Done in a single pass so text is never decoded twice:
 * `&amp;lt;` becomes the literal text `&lt;`, not `<`. Unknown or out-of-range
 * references are left as written.
 */
function decodeHtmlEntities(s: string): string {
  return s.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (whole: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    }
    return NAMED_ENTITIES.get(body) ?? whole;
  });
}
|
||||||
|
|
||||||
|
// ─── Main entry (only when run as a script, not when imported by tests) ─
|
||||||
|
|
||||||
|
if (import.meta.main) {
  await main();
}

/**
 * Script entry point: navigate to the HN front page through the daemon, read
 * the page HTML, parse it, and write one JSON document ({ stories, count })
 * followed by a newline to stdout. Errors from the browse client propagate
 * to the caller.
 */
async function main(): Promise<void> {
  await browse.goto(FRONT_PAGE_URL);
  const stories = parseStoriesFromHtml(await browse.html());
  const payload: Output = { stories, count: stories.length };
  process.stdout.write(`${JSON.stringify(payload)}\n`);
}
|
||||||
Loading…
Reference in New Issue