From 67e87fe421e35e86a52a8884d6eefefe245b7f4d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 11 Jun 2026 23:59:40 -0700 Subject: [PATCH] =?UTF-8?q?feat(make-pdf):=20diagram=20pre-pass=20?= =?UTF-8?q?=E2=80=94=20mermaid/excalidraw=20fences=20render=20as=20vector?= =?UTF-8?q?=20SVG;=20local=20images=20inline=20as=20data=20URIs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ```mermaid / ```excalidraw fences extract to placeholder tokens, render in one diagram-render bundle tab per run (reset contract: bundle page reloads after any render error), and substitute back as accessible <figure>
blocks with the raw source preserved in a comment. Render failures produce a loud red diagnostic block, never silent raw code. render=false keeps a fence as code; title="..." becomes the aria-label and caption. Local images now actually render: page.setContent loads at about:blank (tab-session.ts:194), so relative paths silently 404'd before. The pre-pass resolves them against the markdown's directory, inlines as data URIs, probes intrinsic dimensions from the bytes (pure-TS PNG/JPEG/GIF/WebP/SVG sniffing), and downscales rasters wider than 2x the content box at 300dpi. Remote URLs warn (offline posture, --allow-network exempts); missing files get a visible placeholder; --strict hard-fails both for CI pipelines. Co-Authored-By: Claude Fable 5 --- make-pdf/src/browseClient.ts | 11 + make-pdf/src/cli.ts | 7 + make-pdf/src/diagram-prepass.ts | 610 ++++++++++++++++++++++++++++++++ make-pdf/src/image-size.ts | 106 ++++++ make-pdf/src/orchestrator.ts | 73 +++- make-pdf/src/print-css.ts | 12 + make-pdf/src/types.ts | 4 + 7 files changed, 820 insertions(+), 3 deletions(-) create mode 100644 make-pdf/src/diagram-prepass.ts create mode 100644 make-pdf/src/image-size.ts diff --git a/make-pdf/src/browseClient.ts b/make-pdf/src/browseClient.ts index 63cec7755..9c70c716c 100644 --- a/make-pdf/src/browseClient.ts +++ b/make-pdf/src/browseClient.ts @@ -268,6 +268,17 @@ export function loadHtml(opts: LoadHtmlOptions): void { } } +/** + * Load an HTML file (already under browse's safe dirs, e.g. /tmp) into a tab + * by path. Cheaper than loadHtml for large pages — no JSON payload round-trip; + * browse reads the file directly (diagram-render bundle is ~9MB). 
+ */ +export function loadHtmlFile(opts: { file: string; tabId: number; waitUntil?: "load" | "domcontentloaded" | "networkidle" }): void { + const args = ["load-html", opts.file, "--tab-id", String(opts.tabId)]; + if (opts.waitUntil) args.push("--wait-until", opts.waitUntil); + runBrowse(args); +} + /** * Evaluate a JS expression in a tab. Returns the serialized result as string. */ diff --git a/make-pdf/src/cli.ts b/make-pdf/src/cli.ts index 62a3b948e..f8e1fe52e 100644 --- a/make-pdf/src/cli.ts +++ b/make-pdf/src/cli.ts @@ -86,6 +86,12 @@ function printUsage(): void { lines.push(" --quiet Suppress progress on stderr."); lines.push(" --verbose Per-stage timings on stderr."); lines.push(""); + lines.push("Diagrams & images:"); + lines.push(" ```mermaid / ```excalidraw fences render as vector diagrams."); + lines.push(" Add render=false to a fence info string to keep it as a code block."); + lines.push(" Local images are inlined; oversized rasters downscale to print resolution."); + lines.push(" --strict Missing/remote images fail the run (CI mode)."); + lines.push(""); lines.push("Network:"); lines.push(" --allow-network Load external images (off by default)."); lines.push(""); @@ -136,6 +142,7 @@ function generateOptionsFromFlags(parsed: ParsedArgs): GenerateOptions { quiet: f.quiet === true, verbose: f.verbose === true, allowNetwork: f["allow-network"] === true, + strict: f.strict === true, title: typeof f.title === "string" ? f.title : undefined, author: typeof f.author === "string" ? f.author : undefined, date: typeof f.date === "string" ? f.date : undefined, diff --git a/make-pdf/src/diagram-prepass.ts b/make-pdf/src/diagram-prepass.ts new file mode 100644 index 000000000..f0e2a2c4b --- /dev/null +++ b/make-pdf/src/diagram-prepass.ts @@ -0,0 +1,610 @@ +/** + * Diagram + image pre-pass. Runs between "read markdown" and render() in the + * orchestrator, and owns everything that needs the diagram-render bundle. 
+ * + * markdown ─▶ extractDiagramFences() ──▶ render() (marked+sanitize+smarty) + * │ fences → placeholder tokens │ + * │ ▼ + * └─▶ renderFenceSlots() ───────────▶ substituteSlots(html, slots) + * one browse render tab/run │ + * error ⇒ diagnostic block + page reload ▼ + * inlineLocalImages(html) + * data URIs, probe dims from bytes, + * downscale >2x content box @300dpi, + * remote warn / missing placeholder / + * --strict hard-fail + * + * Placeholders survive marked, the sanitizer, and smartypants because they are + * plain hyphenated lowercase tokens with no quotes or HTML. Slot HTML is run + * through the same sanitizer as user content before substitution (the bundle + * renders with securityLevel strict — the sanitizer is the second layer). + * + * Reset contract (eng-review D6.2): each fence renders with a fresh + * mermaid.render id; after ANY render error the bundle page is reloaded before + * the next fence so a poisoned global can't corrupt diagram N+1. + */ + +import * as fs from "node:fs"; +import * as os from "node:os"; +import * as path from "node:path"; +import * as crypto from "node:crypto"; + +import * as browseClient from "./browseClient"; +import { sanitizeUntrustedHtml } from "./render"; +import { imageDims } from "./image-size"; + +// ─── Types ──────────────────────────────────────────────────────────── + +export interface DiagramFence { + /** "mermaid" | "excalidraw" */ + lang: string; + /** Fence body (the diagram source). */ + source: string; + /** Optional title="..." from the fence info string (a11y label, D6.4). */ + title?: string; + /** render=false → leave as a plain code block (escape hatch, D6.3). */ + render: boolean; + /** Placeholder token substituted into the markdown. */ + token: string; + /** 1-based ordinal among rendered fences (unique ids, aria fallback). 
*/ + ordinal: number; +} + +export interface FenceExtraction { + markdown: string; + fences: DiagramFence[]; +} + +export interface PrepassWarnings { + warn: (msg: string) => void; +} + +export interface PrepassImageOptions { + /** Directory of the source markdown — relative image paths resolve here. */ + inputDir: string; + /** Hard-fail on missing/remote images instead of warn (D6.1). */ + strict: boolean; + /** Remote images are left untouched when network is explicitly allowed. */ + allowNetwork: boolean; + /** Physical content-box width in inches (page width minus margins). */ + contentWidthIn: number; + warn: (msg: string) => void; + /** Lazily provides a ready bundle tab (only opened when needed). */ + getTab: () => RenderTab | null; +} + +/** Print-resolution policy (eng-review D4): downscale rasters wider than + * 2 × contentWidth × 300dpi down to contentWidth × 300dpi. */ +const PRINT_DPI = 300; +const DOWNSCALE_FACTOR = 2; + +export class StrictModeError extends Error { + constructor(msg: string) { + super(msg); + this.name = "StrictModeError"; + } +} + +// ─── Fence extraction (pure) ────────────────────────────────────────── + +const DIAGRAM_LANGS = new Set(["mermaid", "excalidraw"]); + +/** + * Extract top-level ```mermaid / ```excalidraw fences, replacing each with a + * unique placeholder token paragraph. Backtick and tilde fences, any length + * >= 3; closers must be at least as long as the opener (CommonMark). Fences + * with `render=false` in the info string are left untouched. 
+ */ +export function extractDiagramFences(markdown: string): FenceExtraction { + // Split on LF or CRLF. With a bare "\n" split, CRLF input leaves a trailing "\r" + // on every line; the info-string capture in matchFenceLine cannot match it, so + // fence openers were never recognized and diagrams silently stayed raw code. + // Output is re-joined with "\n", i.e. CRLF documents are normalized to LF. + const lines = markdown.split(/\r?\n/); + const out: string[] = []; + const fences: DiagramFence[] = []; + const runId = crypto.randomBytes(4).toString("hex"); + + let i = 0; + let openFence: { char: string; len: number; info: string; body: string[] } | null = null; + let ordinal = 0; + + while (i < lines.length) { + const line = lines[i]; + + if (openFence) { + const close = matchFenceLine(line); + if (close && close.char === openFence.char && close.len >= openFence.len && close.info === "") { + const info = parseInfoString(openFence.info); + if (DIAGRAM_LANGS.has(info.lang) && info.render) { + ordinal++; + const token = `gstack-diagram-slot-${runId}-${ordinal}`; + fences.push({ + lang: info.lang, + source: openFence.body.join("\n"), + title: info.title, + render: true, + token, + ordinal, + }); + out.push("", token, ""); + } else { + // Not a diagram fence (or render=false): replay verbatim, but strip + // the render=false flag so it never leaks into highlighted output. + const infoOut = info.render ? openFence.info : info.lang; + out.push(`${openFence.char.repeat(openFence.len)}${infoOut}`); + out.push(...openFence.body); + out.push(line); + } + openFence = null; + i++; + continue; + } + openFence.body.push(line); + i++; + continue; + } + + const open = matchFenceLine(line); + if (open && open.info !== "") { + openFence = { char: open.char, len: open.len, info: open.info, body: [] }; + i++; + continue; + } + if (open) { + // Anonymous fence (plain code block) — copy through to its closer so a + // ```mermaid example INSIDE a plain fence is never extracted.
+ out.push(line); + i++; + while (i < lines.length) { + const l = lines[i]; + const close = matchFenceLine(l); + out.push(l); + i++; + if (close && close.char === open.char && close.len >= open.len && close.info === "") break; + } + continue; + } + + out.push(line); + i++; + } + + // Unclosed fence at EOF: replay verbatim (CommonMark treats it as code to EOF). + if (openFence) { + out.push(`${openFence.char.repeat(openFence.len)}${openFence.info}`); + out.push(...openFence.body); + } + + return { markdown: out.join("\n"), fences }; +} + +function matchFenceLine(line: string): { char: string; len: number; info: string } | null { + const m = line.match(/^ {0,3}(`{3,}|~{3,})\s*(.*)$/); + if (!m) return null; + return { char: m[1][0], len: m[1].length, info: m[2].trim() }; +} + +/** Parse a fence info string: `mermaid`, `mermaid render=false`, `mermaid title="Auth flow"`. */ +export function parseInfoString(info: string): { lang: string; render: boolean; title?: string } { + const lang = (info.match(/^\S+/)?.[0] ?? "").toLowerCase(); + const render = !/\brender\s*=\s*false\b/i.test(info); + const title = info.match(/\btitle\s*=\s*"([^"]*)"/i)?.[1] + ?? info.match(/\btitle\s*=\s*'([^']*)'/i)?.[1]; + return { lang, render, title }; +} + +// ─── Slot substitution (pure) ───────────────────────────────────────── + +/** + * Replace placeholder tokens in rendered HTML with their final slot HTML. + * marked wraps the bare token line in

; replace the wrapper too so + * the figure isn't nested inside a paragraph. + * + * Slot HTML is inserted literally. It embeds the raw diagram source and the + * rendered SVG text, so it is handed to replace() through a replacer function: + * a plain replacement string would expand `$&`, `$'`, `` $` `` and `$$` found in + * that content and splice parts of the document into the figure. + */ +export function substituteSlots(html: string, slots: Map<string, string>): string { + let s = html; + for (const [token, slotHtml] of slots) { + // Tokens are normally plain hyphenated lowercase strings; escape anyway so an + // arbitrary map key can never be read as regex syntax. + const tokenRe = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + const wrapped = new RegExp(`<p>\\s*${tokenRe}\\s*</p>`, "g"); + if (wrapped.test(s)) { + s = s.replace(new RegExp(`<p>\\s*${tokenRe}\\s*</p>`, "g"), () => slotHtml); + } else { + s = s.split(token).join(slotHtml); + } + } + return s; +} + +/** + * Visible diagnostic block for a failed fence render — never silent raw code + * (eng-review: explicit error blocks). Sanitizer-safe: all dynamic content is + * HTML-escaped. + */ +export function buildDiagnosticBlock(fence: DiagramFence, errorMessage: string): string { + const excerpt = fence.source.split("\n").slice(0, 8).join("\n"); + const truncated = fence.source.split("\n").length > 8 ? "\n…" : ""; + return [ + ``, + ].join("\n"); +} + +/** Wrap a rendered SVG in an accessible figure (D6.4). */ +export function buildDiagramFigure(fence: DiagramFence, svg: string): string { + const label = diagramLabel(fence); + const cleanSvg = sanitizeUntrustedHtml(svg); + const captioned = fence.title + ? `\n
${escapeHtml(fence.title)}
` + : ""; + return [ + ``, + ].join("\n"); +} + +function diagramLabel(fence: DiagramFence): string { + return fence.title ?? `diagram ${fence.ordinal}`; +} + +// ─── Render tab (bundle page lifecycle) ─────────────────────────────── + +const PAYLOAD_TMP_DIR = process.platform === "win32" ? os.tmpdir() : "/tmp"; +const READY_TIMEOUT_MS = 20_000; + +export class RenderTab { + private constructor( + public readonly tabId: number, + private readonly stagedBundlePath: string, + ) {} + + /** + * Open a tab and load the diagram-render bundle. The bundle HTML is staged + * under /tmp (content-addressed, reused across runs — load-html only reads + * inside its safe dirs) and loaded by PATH, not --from-file: a 9MB JSON + * round-trip per run would be pure waste. + * + * Throws when the bundle cannot be resolved or staged, or when the page does + * not become ready; in the latter case the tab opened here is closed first. + */ + static open(): RenderTab { + const bundleSrc = resolveBundlePath(); + const html = fs.readFileSync(bundleSrc); + const sha = crypto.createHash("sha256").update(html).digest("hex").slice(0, 16); + const staged = path.join(PAYLOAD_TMP_DIR, `gstack-diagram-render-${sha}.html`); + if (!fs.existsSync(staged)) { + // Concurrent-safe: write to a unique temp name, then atomic rename. + const tmp = `${staged}.${process.pid}.${crypto.randomBytes(4).toString("hex")}`; + fs.writeFileSync(tmp, html); + try { + fs.renameSync(tmp, staged); + } catch (err) { + // Usually another process won the race and its copy is identical. Clean up + // our temp file without masking the rename error, and only carry on when + // a staged bundle really exists. + fs.rmSync(tmp, { force: true }); + if (!fs.existsSync(staged)) throw err; + } + } + const tabId = browseClient.newtab(); + const tab = new RenderTab(tabId, staged); + try { + tab.loadBundle(); + } catch (err) { + // Do not leak the browse tab: the caller never receives `tab`, so nothing + // else can close it once open() throws. + tab.close(); + throw err; + } + return tab; + } + + /** (Re)load the bundle page — also the reset path after a render error.
*/ + loadBundle(): void { + browseClient.loadHtmlFile({ file: this.stagedBundlePath, tabId: this.tabId }); + const ready = browseClient.waitForExpression({ + expression: "document.getElementById('status') !== null && document.getElementById('status').textContent === 'ready'", + tabId: this.tabId, + timeoutMs: READY_TIMEOUT_MS, + }); + if (!ready) { + throw new Error( + "diagram-render bundle did not become ready in the browse tab " + + `(${READY_TIMEOUT_MS}ms). Check \`browse js "window.__errors"\` on tab ${this.tabId}.`, + ); + } + } + + /** + * Call one of the bundle's async window functions with JSON-safe string + * args. Errors come back as a recognizable ERR: prefix so a render failure + * is data, not a thrown browse exit. + */ + call(fn: string, ...args: Array): string { + const argList = args.map((a) => JSON.stringify(a)).join(","); + const expression = + `window.${fn}(${argList})` + + `.then(r => "OK:" + r)` + + `.catch(e => "ERR:" + String((e && e.message) || e))`; + const result = this.js(expression); + if (result.startsWith("OK:")) return result.slice(3); + if (result.startsWith("ERR:")) throw new RenderCallError(result.slice(4)); + throw new RenderCallError(`unexpected bundle result: ${result.slice(0, 200)}`); + } + + private js(expression: string): string { + // Large payloads (scene JSON, SVG text, data URIs) blow past argv limits — + // browseClient.js shells out with the expression as an argv element, so + // stage anything big through a tmp file the page can fetch? No: file URLs + // are unreachable from the page. Instead, chunk through a window buffer. + if (expression.length <= 100_000) { + return browseClient.js({ expression, tabId: this.tabId }); + } + return this.jsViaBuffer(expression); + } + + /** + * argv-safe path for big expressions: ship the expression into the page in + * 64KB chunks (window.__exprBuf), then eval it there. Used for multi-MB + * data URIs (photo downscaling) where a single argv would exceed OS limits. 
+ */ + private jsViaBuffer(expression: string): string { + browseClient.js({ expression: "window.__exprBuf = ''", tabId: this.tabId }); + const CHUNK = 64_000; + for (let i = 0; i < expression.length; i += CHUNK) { + const chunk = expression.slice(i, i + CHUNK); + browseClient.js({ + expression: `window.__exprBuf += ${JSON.stringify(chunk)}, window.__exprBuf.length`, + tabId: this.tabId, + }); + } + // Eval the buffer as a single expression so the resulting promise is the + // statement value browse awaits. The buffer resets at the next call. + return browseClient.js({ + expression: `(0, eval)(window.__exprBuf)`, + tabId: this.tabId, + }); + } + + close(): void { + try { + browseClient.closetab(this.tabId); + } catch { + // best-effort: orchestrator finally path + } + } +} + +export class RenderCallError extends Error { + constructor(msg: string) { + super(msg); + this.name = "RenderCallError"; + } +} + +/** Resolve dist/diagram-render.html: env override → repo-relative (dev) → global install. */ +export function resolveBundlePath(env: NodeJS.ProcessEnv = process.env): string { + const candidates = [ + env.GSTACK_DIAGRAM_BUNDLE, + // dev: make-pdf/src/* → repo root lib/. (In a compiled binary this is the + // virtual /$bunfs/root and simply never exists — harmless.) + path.resolve(import.meta.dir, "../../lib/diagram-render/dist/diagram-render.html"), + // compiled binary at /make-pdf/dist/pdf → /lib/… — same shape + // in the repo and in the ~/.claude/skills/gstack global install. argv[0] + // is the literal string "bun" in compiled binaries; execPath is real. + path.resolve(path.dirname(process.execPath), "../../lib/diagram-render/dist/diagram-render.html"), + path.join(os.homedir(), ".claude/skills/gstack/lib/diagram-render/dist/diagram-render.html"), + ].filter((p): p is string => !!p); + for (const p of candidates) { + if (fs.existsSync(p)) return p; + } + throw new Error( + "diagram-render bundle not found. 
Tried:\n" + + candidates.map((c) => ` - ${c}`).join("\n") + + "\nRun `bun run build:diagram-render` (repo) or re-run ./setup (install).", + ); +} + +// ─── Fence rendering ────────────────────────────────────────────────── + +/** + * Render every extracted fence to its slot HTML. One bundle tab serves all + * fences; a failed fence yields a diagnostic block and a bundle reload + * (reset contract) before the next fence renders. + */ +export function renderFenceSlots( + fences: DiagramFence[], + tab: RenderTab, + warn: (msg: string) => void, +): Map { + const slots = new Map(); + for (const fence of fences) { + try { + let svg: string; + if (fence.lang === "mermaid") { + svg = tab.call("__renderMermaid", `mermaid-fence-${fence.ordinal}`, fence.source); + } else { + JSON.parse(fence.source); // fail fast with a JSON diagnostic, not a bundle stack + svg = tab.call("__excalidrawToSvg", fence.source); + } + slots.set(fence.token, buildDiagramFigure(fence, svg)); + } catch (err: any) { + const msg = err?.message ?? String(err); + warn(`diagram ${fence.ordinal} (${fence.lang}) failed to render: ${firstLine(msg)}`); + slots.set(fence.token, buildDiagnosticBlock(fence, msg)); + // Reset contract: a poisoned page must not corrupt the next fence. + try { + tab.loadBundle(); + } catch (reloadErr: any) { + warn(`bundle reload after render error failed: ${firstLine(reloadErr?.message ?? String(reloadErr))}`); + } + } + } + return slots; +} + +// ─── Image inlining (eng-review D1 + D4 + D6.1) ─────────────────────── + +const IMG_TAG_RE = /]*>/gi; +const SRC_RE = /\bsrc\s*=\s*("([^"]*)"|'([^']*)')/i; + +/** + * Inline every local as a data URI, probe intrinsic dimensions from the + * bytes, and annotate the tag with data-gstack-px-width/-height for the width + * policy. Oversized rasters are downscaled to print resolution via the bundle + * tab. Missing files become visible placeholders (or throw under --strict); + * remote URLs warn (offline posture) unless --allow-network. 
+ */ +export function inlineLocalImages(html: string, opts: PrepassImageOptions): string { + const maxPx = Math.round(opts.contentWidthIn * PRINT_DPI * DOWNSCALE_FACTOR); + const targetPx = Math.round(opts.contentWidthIn * PRINT_DPI); + + // Percent-decoding is best-effort: a literal "%" in a file name ("100%.png") is + // not a valid escape sequence and makes decodeURIComponent throw URIError, + // which would abort the whole run. Fall back to the raw text instead. + const decode = (s: string): string => { + try { + return decodeURIComponent(s); + } catch { + return s; + } + }; + // True only when the path is a regular file. existsSync alone also accepts + // directories (an empty src resolves to inputDir itself), and readFileSync + // then throws EISDIR instead of yielding the missing-image placeholder. + const isRegularFile = (p: string): boolean => { + try { + return fs.statSync(p).isFile(); + } catch { + return false; + } + }; + + return html.replace(IMG_TAG_RE, (tag) => { + const srcMatch = tag.match(SRC_RE); + if (!srcMatch) return tag; + const src = srcMatch[2] ?? srcMatch[3] ?? ""; + + if (src.startsWith("data:")) return annotateFromDataUri(tag, src); + + if (/^[a-z][a-z0-9+.-]*:/i.test(src)) { + // Absolute URL with a scheme (http, https, file, …) + if (opts.allowNetwork && /^https?:/i.test(src)) return tag; + if (/^https?:/i.test(src)) { + const msg = `remote image not fetched (offline posture): ${src}`; + if (opts.strict) throw new StrictModeError(msg + " — re-run without --strict or pass --allow-network"); + opts.warn(msg); + return tag; + } + // file:// and friends fall through to the local path branch + if (!src.startsWith("file:")) return tag; + } + + const filePath = src.startsWith("file:") + ? decode(new URL(src).pathname) + : path.resolve(opts.inputDir, decode(src)); + + if (!isRegularFile(filePath)) { + const msg = `image not found: ${src} (resolved to ${filePath})`; + if (opts.strict) throw new StrictModeError(msg); + opts.warn(msg); + return buildMissingImagePlaceholder(src); + } + + let buf = fs.readFileSync(filePath); + let dims = imageDims(buf); + let mime = dims?.mime ?? mimeFromExtension(filePath); + + // Print-resolution normalization (D4): rasters only — SVG scales free.
+ if (dims && mime !== "image/svg+xml" && dims.width > maxPx) { + const tab = opts.getTab(); + if (tab) { + try { + const dataUri = `data:${mime};base64,${buf.toString("base64")}`; + const scaled = tab.call("__downscaleRaster", dataUri, targetPx, mime); + const scaledB64 = scaled.replace(/^data:[^,]*,/, ""); + opts.warn( + `downscaled ${path.basename(filePath)} ${dims.width}px → ${targetPx}px ` + + `(print is ${PRINT_DPI}dpi; original exceeds ${maxPx}px content-box ceiling)`, + ); + buf = Buffer.from(scaledB64, "base64"); + mime = scaled.slice(5, scaled.indexOf(";")); + dims = { ...dims, height: Math.round((dims.height * targetPx) / dims.width), width: targetPx }; + } catch (err: any) { + opts.warn(`downscale failed for ${src}, inlining at full size: ${firstLine(err?.message ?? String(err))}`); + } + } + } + + const dataUri = `data:${mime};base64,${buf.toString("base64")}`; + let newTag = tag.replace(SRC_RE, `src="${dataUri}"`); + if (dims) { + newTag = newTag.replace( + /^` + + `[missing image: ${escapeHtml(src)}]` + ); +} + +function mimeFromExtension(p: string): string { + switch (path.extname(p).toLowerCase()) { + case ".png": return "image/png"; + case ".jpg": + case ".jpeg": return "image/jpeg"; + case ".gif": return "image/gif"; + case ".webp": return "image/webp"; + case ".svg": return "image/svg+xml"; + default: return "application/octet-stream"; + } +} + +// ─── Content-box math ───────────────────────────────────────────────── + +const PAGE_WIDTHS_IN: Record = { + letter: 8.5, + a4: 8.27, + legal: 8.5, + tabloid: 11, +}; + +/** Parse a CSS dimension ("1in" | "72pt" | "25mm" | "2.54cm") to inches. */ +export function dimToInches(dim: string | undefined, fallbackIn: number): number { + if (!dim) return fallbackIn; + const m = dim.trim().match(/^([0-9.]+)\s*(in|pt|cm|mm|px)?$/i); + if (!m) return fallbackIn; + const v = parseFloat(m[1]); + switch ((m[2] ?? 
"in").toLowerCase()) { + case "in": return v; + case "pt": return v / 72; + case "cm": return v / 2.54; + case "mm": return v / 25.4; + case "px": return v / 96; + default: return fallbackIn; + } +} + +export function contentWidthInches(opts: { + pageSize?: string; + margins?: string; + marginLeft?: string; + marginRight?: string; +}): number { + const pageW = PAGE_WIDTHS_IN[opts.pageSize ?? "letter"] ?? 8.5; + const left = dimToInches(opts.marginLeft ?? opts.margins, 1); + const right = dimToInches(opts.marginRight ?? opts.margins, 1); + return Math.max(1, pageW - left - right); +} + +// ─── tiny helpers ───────────────────────────────────────────────────── + +function escapeHtml(s: string): string { + return s + .replace(/&/g, "&") + .replace(//g, ">") + .replace(/"/g, """) + .replace(/'/g, "'"); +} + +function escapeAttr(s: string): string { + return escapeHtml(s); +} + +/** Comments may not contain `--`; encode it so the raw source survives. */ +function escapeHtmlComment(s: string): string { + return s.replace(/--/g, "-‐"); +} + +function firstLine(s: string): string { + return s.split("\n")[0].slice(0, 200); +} diff --git a/make-pdf/src/image-size.ts b/make-pdf/src/image-size.ts new file mode 100644 index 000000000..96b4f4fa5 --- /dev/null +++ b/make-pdf/src/image-size.ts @@ -0,0 +1,106 @@ +/** + * Intrinsic image dimensions from raw bytes. Pure, no DOM, no deps. + * + * The diagram pre-pass probes every local image it inlines (eng-review D1: + * "dimensions are probed from the bytes") so the width policy and landscape + * detector never need a browser round-trip. Formats: PNG, JPEG, GIF, WebP + * (VP8/VP8L/VP8X), and SVG (attribute/viewBox best-effort). + * + * Returns null when the format is unrecognized or the header is truncated — + * callers treat unknown dimensions as "no policy applied", never an error. 
+ */ + +export interface ImageDims { + width: number; + height: number; + mime: string; +} + +export function imageDims(buf: Buffer): ImageDims | null { + if (buf.length < 12) return null; + return pngDims(buf) ?? jpegDims(buf) ?? gifDims(buf) ?? webpDims(buf) ?? svgDims(buf); +} + +function pngDims(b: Buffer): ImageDims | null { + // 8-byte signature, then IHDR chunk: length(4) "IHDR"(4) width(4) height(4) + if (b.length < 24) return null; + if (b.readUInt32BE(0) !== 0x89504e47 || b.readUInt32BE(4) !== 0x0d0a1a0a) return null; + if (b.toString("ascii", 12, 16) !== "IHDR") return null; + return { width: b.readUInt32BE(16), height: b.readUInt32BE(20), mime: "image/png" }; +} + +function jpegDims(b: Buffer): ImageDims | null { + if (b[0] !== 0xff || b[1] !== 0xd8) return null; + let i = 2; + while (i + 9 < b.length) { + if (b[i] !== 0xff) { i++; continue; } + const marker = b[i + 1]; + // Standalone markers without length payload + if (marker === 0xd8 || (marker >= 0xd0 && marker <= 0xd9)) { i += 2; continue; } + const len = b.readUInt16BE(i + 2); + if (len < 2) return null; + // SOF0-SOF15 except DHT(C4)/JPGA(C8)/DAC(CC) carry dimensions + if (marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc) { + if (i + 9 >= b.length) return null; + return { height: b.readUInt16BE(i + 5), width: b.readUInt16BE(i + 7), mime: "image/jpeg" }; + } + i += 2 + len; + } + return null; +} + +function gifDims(b: Buffer): ImageDims | null { + const sig = b.toString("ascii", 0, 6); + if (sig !== "GIF87a" && sig !== "GIF89a") return null; + return { width: b.readUInt16LE(6), height: b.readUInt16LE(8), mime: "image/gif" }; +} + +function webpDims(b: Buffer): ImageDims | null { + if (b.toString("ascii", 0, 4) !== "RIFF" || b.toString("ascii", 8, 12) !== "WEBP") return null; + const fmt = b.toString("ascii", 12, 16); + if (fmt === "VP8X" && b.length >= 30) { + // 24-bit little-endian width-1 / height-1 at offsets 24 / 27 + const w = 1 + (b[24] | (b[25] << 
8) | (b[26] << 16)); + const h = 1 + (b[27] | (b[28] << 8) | (b[29] << 16)); + return { width: w, height: h, mime: "image/webp" }; + } + if (fmt === "VP8 " && b.length >= 30) { + // Lossy: dimensions at offset 26, 14 bits each, little-endian + return { + width: b.readUInt16LE(26) & 0x3fff, + height: b.readUInt16LE(28) & 0x3fff, + mime: "image/webp", + }; + } + if (fmt === "VP8L" && b.length >= 25) { + if (b[20] !== 0x2f) return null; + const bits = b.readUInt32LE(21); + return { + width: (bits & 0x3fff) + 1, + height: ((bits >> 14) & 0x3fff) + 1, + mime: "image/webp", + }; + } + return null; +} + +/** + * SVG: parse width/height attributes (px or unitless) off the root element, + * falling back to viewBox. CSS-unit widths (em, %, pt) are ignored — the + * width policy treats them as "no intrinsic size". Only the first 4096 bytes + * are inspected; returns null when no root <svg> tag is found there. + */ +function svgDims(b: Buffer): ImageDims | null { + const head = b.toString("utf8", 0, Math.min(b.length, 4096)); + const tag = head.match(/<svg\b[^>]*>/i)?.[0]; + if (!tag) return null; + const attr = (name: string): number | null => { + // The name must follow whitespace: a bare \b also matches inside hyphenated + // attributes, so stroke-width="2" or data-width="5" was read as width. + const m = tag.match(new RegExp(`(?:^|\\s)${name}\\s*=\\s*["']\\s*([0-9.]+)(px)?\\s*["']`, "i")); + return m ? 
parseFloat(m[1]) : null; + }; + const w = attr("width"); + const h = attr("height"); + if (w && h) return { width: w, height: h, mime: "image/svg+xml" }; + const vb = tag.match(/\bviewBox\s*=\s*["']\s*[-0-9.]+[\s,]+[-0-9.]+[\s,]+([0-9.]+)[\s,]+([0-9.]+)\s*["']/i); + if (vb) return { width: parseFloat(vb[1]), height: parseFloat(vb[2]), mime: "image/svg+xml" }; + return null; +} diff --git a/make-pdf/src/orchestrator.ts b/make-pdf/src/orchestrator.ts index cf8dffae6..be25941cc 100644 --- a/make-pdf/src/orchestrator.ts +++ b/make-pdf/src/orchestrator.ts @@ -24,6 +24,14 @@ import { render } from "./render"; import type { GenerateOptions, PreviewOptions } from "./types"; import { ExitCode } from "./types"; import * as browseClient from "./browseClient"; +import { + RenderTab, + contentWidthInches, + extractDiagramFences, + inlineLocalImages, + renderFenceSlots, + substituteSlots, +} from "./diagram-prepass"; class ProgressReporter { private readonly quiet: boolean; @@ -80,10 +88,14 @@ export async function generate(opts: GenerateOptions): Promise { const markdown = fs.readFileSync(input, "utf8"); progress.end("Reading markdown"); + // Stage 1.5: diagram pre-pass — extract ```mermaid/```excalidraw fences and + // swap in placeholder tokens. Rendering happens after the tab opens below. + const extraction = extractDiagramFences(markdown); + // Stage 2: render HTML progress.begin("Rendering HTML"); const rendered = render({ - markdown, + markdown: extraction.markdown, title: opts.title, author: opts.author, date: opts.date, @@ -99,11 +111,66 @@ export async function generate(opts: GenerateOptions): Promise { }); progress.end("Rendering HTML", `${rendered.meta.wordCount} words`); + // Stage 2.5: render diagram fences in a dedicated bundle tab, substitute + // slots, then inline + probe + (if oversized) downscale local images. + // The bundle tab is lazy: image-only documents open it only when a raster + // actually needs print-resolution downscaling (eng-review D4). 
+ const warn = (msg: string) => { + if (!opts.quiet) process.stderr.write(`\r\x1b[K[make-pdf] warning: ${msg}\n`); + }; + let renderTab: RenderTab | null = null; + const getRenderTab = (): RenderTab | null => { + if (renderTab) return renderTab; + try { + renderTab = RenderTab.open(); + } catch (err: any) { + warn(`diagram-render tab unavailable: ${String(err?.message ?? err).split("\n")[0]}`); + return null; + } + return renderTab; + }; + + let finalHtml = rendered.html; + try { + if (extraction.fences.length > 0) { + progress.begin(`Rendering ${extraction.fences.length} diagram(s)`); + const tab = getRenderTab(); + if (tab) { + const slots = renderFenceSlots(extraction.fences, tab, warn); + finalHtml = substituteSlots(finalHtml, slots); + } else { + // No bundle/tab: visible diagnostic beats silent raw tokens. + const slots = new Map( + extraction.fences.map((f) => [ + f.token, + ``, + ]), + ); + finalHtml = substituteSlots(finalHtml, slots); + } + progress.end(`Rendering ${extraction.fences.length} diagram(s)`); + } + + progress.begin("Inlining images"); + finalHtml = inlineLocalImages(finalHtml, { + inputDir: path.dirname(input), + strict: opts.strict === true, + allowNetwork: opts.allowNetwork === true, + contentWidthIn: contentWidthInches(opts), + warn, + getTab: getRenderTab, + }); + progress.end("Inlining images"); + } finally { + renderTab?.close(); + } + // Stage 3: write HTML to a tmp file browse can read // (We don't actually write it; we pass inline via --from-file JSON.) // But for preview mode and debugging, we still write to tmp. const htmlTmp = tmpFile("html"); - fs.writeFileSync(htmlTmp, rendered.html, "utf8"); + fs.writeFileSync(htmlTmp, finalHtml, "utf8"); // Stage 4: spin up a dedicated tab, load HTML, (wait for Paged.js if TOC), // then emit PDF. Always close the tab. 
@@ -114,7 +181,7 @@ export async function generate(opts: GenerateOptions): Promise { try { progress.begin("Loading HTML into Chromium"); browseClient.loadHtml({ - html: rendered.html, + html: finalHtml, waitUntil: "domcontentloaded", tabId, }); diff --git a/make-pdf/src/print-css.ts b/make-pdf/src/print-css.ts index 2366f42b9..db6f9c925 100644 --- a/make-pdf/src/print-css.ts +++ b/make-pdf/src/print-css.ts @@ -324,6 +324,18 @@ function figureRules(): string { `figure { margin: 12pt 0; }`, `figure img { display: block; max-width: 100%; height: auto; }`, `figcaption { font-size: 9pt; color: #666; margin-top: 6pt; font-style: italic; }`, + // Diagram figures (diagram-prepass): rendered mermaid/excalidraw SVG. + // SVGs scale to the content box and never split across pages. + `figure.diagram { break-inside: avoid; text-align: center; }`, + `figure.diagram > svg { max-width: 100%; height: auto; }`, + `figure.diagram .diagram-caption { text-align: center; }`, + // Diagnostic block for a fence that failed to render — loud, boxed, + // unmistakably an error (never silent raw code). + `figure.diagram-error { border: 1.5pt solid #b00020; padding: 8pt 10pt; text-align: left; }`, + `figure.diagram-error .diagram-error-title { font-weight: 700; color: #b00020; font-style: normal; margin: 0 0 6pt; }`, + `figure.diagram-error .diagram-error-detail { font-size: 8.5pt; white-space: pre-wrap; margin: 0; }`, + // Missing local image placeholder (non-strict mode). + `.image-missing { display: inline-block; border: 1pt dashed #b00020; color: #b00020; padding: 4pt 8pt; font-size: 9pt; }`, ].join("\n"); } diff --git a/make-pdf/src/types.ts b/make-pdf/src/types.ts index 6d4e67108..aa6706548 100644 --- a/make-pdf/src/types.ts +++ b/make-pdf/src/types.ts @@ -44,6 +44,10 @@ export interface GenerateOptions { // Network allowNetwork?: boolean; // default: false + // Strict mode (eng-review D6.1): missing/remote images hard-fail instead of + // warn + placeholder. 
For CI docs pipelines that need determinism. + strict?: boolean; // default: false + // Metadata title?: string; author?: string;