mirror of https://github.com/garrytan/gstack.git
313 lines
11 KiB
TypeScript
313 lines
11 KiB
TypeScript
/**
|
|
* pdftotext wrapper — the tool behind the copy-paste CI gate.
|
|
*
|
|
* Codex round 2 surfaced two real problems we address here:
|
|
*
|
|
* #18: pdftotext (Poppler) vs pdftotext (Xpdf) vs pdftotext-next vary on
|
|
* whitespace, line wrap, Unicode normalization, form feeds, and
|
|
* extraction order. Cross-platform exact diffing is a non-starter.
|
|
* We normalize aggressively and diff the normalized form.
|
|
*
|
|
* #19: the regex /(?:\b\w\s){4,}/ only catches one failure shape (letters
|
|
* spaced out). It misses word-order corruption, missing whitespace
|
|
* between paragraphs, and homoglyph substitution. We add a word-token
|
|
* diff and a paragraph-boundary assertion on top.
|
|
*
|
|
* Resolution order for the pdftotext binary (v1.24-aligned):
|
|
* 1. $GSTACK_PDFTOTEXT_BIN env override (preferred, matches v1.24 GSTACK_*_BIN pattern)
|
|
* 2. $PDFTOTEXT_BIN env override (back-compat alias)
|
|
* 3. PATH lookup via Bun.which('pdftotext') — handles Windows PATHEXT natively
|
|
* 4. standard POSIX paths (Homebrew + distro) — no Windows candidates because
|
|
* Poppler scatters across Scoop / Chocolatey / oschwartz10612-poppler-windows
|
|
* and guessing causes false positives. Set GSTACK_PDFTOTEXT_BIN explicitly.
|
|
* 5. throws a friendly "install poppler" error
|
|
*
|
|
* The wrapper is *optional at runtime*: production renders don't need it.
|
|
* Only the CI gate and unit tests invoke pdftotext.
|
|
*/
|
|
|
|
import { execFileSync } from "node:child_process";
|
|
import * as fs from "node:fs";
|
|
import * as os from "node:os";
|
|
import * as path from "node:path";
|
|
|
|
export class PdftotextUnavailableError extends Error {
|
|
constructor(message: string) {
|
|
super(message);
|
|
this.name = "PdftotextUnavailableError";
|
|
}
|
|
}
|
|
|
|
export interface PdftotextInfo {
|
|
bin: string;
|
|
version: string; // "pdftotext version 24.02.0" or similar
|
|
flavor: "poppler" | "xpdf" | "unknown";
|
|
}
|
|
|
|
/**
|
|
* Probe a base path for executability, honoring Windows extension suffixes.
|
|
* Matches browseClient.ts:findExecutable — duplicated rather than shared
|
|
* because the two modules already duplicate isExecutable for compile-isolation.
|
|
*/
|
|
export function findExecutable(base: string): string | null {
|
|
if (isExecutable(base)) return base;
|
|
if (process.platform === "win32") {
|
|
for (const ext of [".exe", ".cmd", ".bat"]) {
|
|
const withExt = base + ext;
|
|
if (isExecutable(withExt)) return withExt;
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
function resolveOverride(value: string | undefined, env: NodeJS.ProcessEnv): string | null {
|
|
if (!value?.trim()) return null;
|
|
const trimmed = value.trim().replace(/^"(.*)"$/, '$1');
|
|
if (path.isAbsolute(trimmed)) return findExecutable(trimmed);
|
|
const PATH = env.PATH ?? env.Path ?? '';
|
|
return Bun.which(trimmed, { PATH }) ?? null;
|
|
}
|
|
|
|
/**
|
|
* Locate pdftotext. Throws PdftotextUnavailableError if none is found.
|
|
*/
|
|
export function resolvePdftotext(env: NodeJS.ProcessEnv = process.env): PdftotextInfo {
|
|
// 1 + 2: env overrides (GSTACK_PDFTOTEXT_BIN preferred, PDFTOTEXT_BIN back-compat).
|
|
const overrideRaw = env.GSTACK_PDFTOTEXT_BIN ?? env.PDFTOTEXT_BIN;
|
|
const override = resolveOverride(overrideRaw, env);
|
|
if (override) return describeBinary(override);
|
|
|
|
// 3: PATH lookup via Bun.which — handles Windows PATHEXT natively.
|
|
const PATH = env.PATH ?? env.Path ?? '';
|
|
const onPath = Bun.which('pdftotext', { PATH });
|
|
if (onPath) return describeBinary(onPath);
|
|
|
|
// 4: POSIX-only standard locations. No Windows candidates — Poppler installs
|
|
// scatter across Scoop/Chocolatey/portable zips and guessing causes false
|
|
// positives. Windows users set GSTACK_PDFTOTEXT_BIN explicitly.
|
|
const posixCandidates = [
|
|
"/opt/homebrew/bin/pdftotext", // Apple Silicon Homebrew
|
|
"/usr/local/bin/pdftotext", // Intel Mac or Linuxbrew
|
|
"/usr/bin/pdftotext", // distro package
|
|
];
|
|
for (const candidate of posixCandidates) {
|
|
if (isExecutable(candidate)) return describeBinary(candidate);
|
|
}
|
|
|
|
throw new PdftotextUnavailableError([
|
|
"pdftotext not found.",
|
|
"",
|
|
"make-pdf needs pdftotext to run the copy-paste CI gate.",
|
|
"(Runtime rendering does NOT need it. This only affects tests.)",
|
|
"",
|
|
"To install:",
|
|
" macOS: brew install poppler",
|
|
" Ubuntu: sudo apt-get install poppler-utils",
|
|
" Fedora: sudo dnf install poppler-utils",
|
|
" Windows: scoop install poppler (or download from",
|
|
" https://github.com/oschwartz10612/poppler-windows)",
|
|
"",
|
|
"Or set GSTACK_PDFTOTEXT_BIN to an explicit path:",
|
|
process.platform === "win32"
|
|
? ' setx GSTACK_PDFTOTEXT_BIN "C:\\path\\to\\pdftotext.exe"'
|
|
: " export GSTACK_PDFTOTEXT_BIN=/path/to/pdftotext",
|
|
].join("\n"));
|
|
}
|
|
|
|
/**
|
|
* Locate a poppler companion tool (pdffonts, pdfimages, pdftoppm) used by the
|
|
* emoji render gate. Mirrors resolvePdftotext's resolution order:
|
|
* 1. $GSTACK_<TOOL>_BIN env override (e.g. GSTACK_PDFFONTS_BIN)
|
|
* 2. PATH via Bun.which
|
|
* 3. standard POSIX locations (Homebrew + distro)
|
|
*
|
|
* Returns null (does NOT throw) when the tool is missing — the emoji gate skips
|
|
* cleanly rather than failing on a box without full poppler-utils.
|
|
*/
|
|
export function resolvePopplerTool(
|
|
tool: "pdffonts" | "pdfimages" | "pdftoppm",
|
|
env: NodeJS.ProcessEnv = process.env,
|
|
): string | null {
|
|
const override = resolveOverride(env[`GSTACK_${tool.toUpperCase()}_BIN`], env);
|
|
if (override) return override;
|
|
|
|
const PATH = env.PATH ?? env.Path ?? "";
|
|
const onPath = Bun.which(tool, { PATH });
|
|
if (onPath) return onPath;
|
|
|
|
for (const dir of ["/opt/homebrew/bin", "/usr/local/bin", "/usr/bin"]) {
|
|
const candidate = findExecutable(path.join(dir, tool));
|
|
if (candidate) return candidate;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
function isExecutable(p: string): boolean {
|
|
try {
|
|
fs.accessSync(p, fs.constants.X_OK);
|
|
return true;
|
|
} catch {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
function describeBinary(bin: string): PdftotextInfo {
|
|
let version = "unknown";
|
|
let flavor: PdftotextInfo["flavor"] = "unknown";
|
|
try {
|
|
// pdftotext -v writes to stderr and exits 0 on poppler, 99 on some xpdf builds.
|
|
const result = execFileSync(bin, ["-v"], {
|
|
encoding: "utf8",
|
|
stdio: ["ignore", "pipe", "pipe"],
|
|
});
|
|
version = (result || "").trim().split("\n")[0] || "unknown";
|
|
} catch (err: any) {
|
|
// Many pdftotext builds exit non-zero on -v but still write to stderr.
|
|
const stderr = err?.stderr?.toString?.() ?? "";
|
|
version = stderr.trim().split("\n")[0] || "unknown";
|
|
}
|
|
const v = version.toLowerCase();
|
|
if (v.includes("poppler")) flavor = "poppler";
|
|
else if (v.includes("xpdf")) flavor = "xpdf";
|
|
return { bin, version, flavor };
|
|
}
|
|
|
|
/**
|
|
* Run pdftotext on a PDF and return the extracted text.
|
|
*
|
|
* Uses `-layout` by default because that's what downstream normalization
|
|
* expects. Callers that need raw text can pass layout=false.
|
|
*/
|
|
export function pdftotext(pdfPath: string, opts?: { layout?: boolean }): string {
|
|
const info = resolvePdftotext();
|
|
const layout = opts?.layout ?? true;
|
|
const args: string[] = [];
|
|
if (layout) args.push("-layout");
|
|
args.push(pdfPath, "-"); // "-" = stdout
|
|
try {
|
|
return execFileSync(info.bin, args, {
|
|
encoding: "utf8",
|
|
maxBuffer: 32 * 1024 * 1024,
|
|
});
|
|
} catch (err: any) {
|
|
throw new Error(`pdftotext failed on ${pdfPath}: ${err.message}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Normalize extracted text for cross-platform, cross-flavor diffing.
|
|
*
|
|
* What we strip / normalize:
|
|
* - Unicode: NFC canonical composition (macOS emits NFD; Linux emits NFC;
|
|
* this dodges the fundamental encoding diff).
|
|
* - CR and CRLF → LF (Windows Xpdf emits CRLF).
|
|
* - Form feeds (\f) → double newline (Poppler emits \f at page breaks).
|
|
* - Trailing spaces on every line.
|
|
* - Runs of 3+ blank lines → 2 blank lines.
|
|
* - Leading/trailing whitespace on the whole string.
|
|
* - Non-breaking space (U+00A0) → regular space.
|
|
* - Zero-width space (U+200B) and zero-width non-joiner (U+200C) → empty.
|
|
* - Soft hyphen (U+00AD) → empty (pdftotext -layout sometimes emits these
|
|
* for hyphens: auto breaks).
|
|
*/
|
|
export function normalize(raw: string): string {
|
|
let s = raw;
|
|
s = s.normalize("NFC");
|
|
s = s.replace(/\r\n/g, "\n");
|
|
s = s.replace(/\r/g, "\n");
|
|
s = s.replace(/\f/g, "\n\n");
|
|
s = s.replace(/\u00a0/g, " ");
|
|
s = s.replace(/[\u200b\u200c\u00ad]/g, "");
|
|
s = s.replace(/[ \t]+$/gm, "");
|
|
s = s.replace(/\n{3,}/g, "\n\n");
|
|
s = s.trim();
|
|
return s;
|
|
}
|
|
|
|
/**
|
|
* The canonical copy-paste gate used in the E2E tests.
|
|
*
|
|
* Returns { ok: true } when all three assertions pass; returns
|
|
* { ok: false, reasons: [...] } with one or more failure reasons otherwise.
|
|
*/
|
|
export interface GateResult {
|
|
ok: boolean;
|
|
reasons: string[];
|
|
extracted: string;
|
|
}
|
|
|
|
export function copyPasteGate(pdfPath: string, expected: string): GateResult {
|
|
const extracted = normalize(pdftotext(pdfPath, { layout: true }));
|
|
const expectedNorm = normalize(expected);
|
|
const reasons: string[] = [];
|
|
|
|
// Assertion 1: every expected paragraph appears as a whole line or
|
|
// contiguous block in the extracted text.
|
|
const expectedParagraphs = splitParagraphs(expectedNorm);
|
|
for (const paragraph of expectedParagraphs) {
|
|
const compact = collapseWhitespace(paragraph);
|
|
const extractedCompact = collapseWhitespace(extracted);
|
|
if (!extractedCompact.includes(compact)) {
|
|
reasons.push(
|
|
`expected paragraph not found in extracted text: ${truncate(paragraph, 80)}`,
|
|
);
|
|
}
|
|
}
|
|
|
|
// Assertion 2: no "S a i l i n g"-style single-char runs.
|
|
// Count groups of 4+ consecutive letter-then-space tokens. False positive
|
|
// risk on things like "A B C D" (initials) — mitigate by requiring the
|
|
// letters spell a known-word substring of the expected text.
|
|
const fragRegex = /((?:\b\w\s){4,})/g;
|
|
let fragMatch: RegExpExecArray | null;
|
|
while ((fragMatch = fragRegex.exec(extracted)) !== null) {
|
|
const letters = fragMatch[1].replace(/\s/g, "");
|
|
// Only flag if the reassembled letters appear in the expected text.
|
|
if (expectedNorm.toLowerCase().includes(letters.toLowerCase()) && letters.length >= 4) {
|
|
reasons.push(
|
|
`per-glyph emission detected (the "S ai li ng" bug): "${fragMatch[1].trim()}" reassembles to "${letters}"`,
|
|
);
|
|
}
|
|
}
|
|
|
|
// Assertion 3: paragraph boundaries preserved. Count double-newlines
|
|
// in both; they should differ by no more than ±2 (header/footer noise).
|
|
const expectedBreaks = (expectedNorm.match(/\n\n/g) || []).length;
|
|
const extractedBreaks = (extracted.match(/\n\n/g) || []).length;
|
|
if (Math.abs(expectedBreaks - extractedBreaks) > 4) {
|
|
reasons.push(
|
|
`paragraph boundary count drift: expected ~${expectedBreaks}, got ${extractedBreaks}`,
|
|
);
|
|
}
|
|
|
|
return { ok: reasons.length === 0, reasons, extracted };
|
|
}
|
|
|
|
function splitParagraphs(s: string): string[] {
|
|
return s.split(/\n\n+/).map(p => p.trim()).filter(p => p.length > 0);
|
|
}
|
|
|
|
function collapseWhitespace(s: string): string {
|
|
return s.replace(/\s+/g, " ").trim();
|
|
}
|
|
|
|
function truncate(s: string, n: number): string {
|
|
return s.length > n ? s.slice(0, n) + "..." : s;
|
|
}
|
|
|
|
/**
|
|
* Emit diagnostic info to stderr — useful for CI failure debugging.
|
|
* Call this once before running any gate in a CI log.
|
|
*/
|
|
export function logDiagnostics(): void {
|
|
try {
|
|
const info = resolvePdftotext();
|
|
process.stderr.write(
|
|
`[pdftotext] bin=${info.bin} flavor=${info.flavor} version="${info.version}" ` +
|
|
`os=${os.platform()}-${os.arch()} node=${process.version}\n`,
|
|
);
|
|
} catch (err: any) {
|
|
process.stderr.write(`[pdftotext] unavailable: ${err.message}\n`);
|
|
}
|
|
}
|