From dd84bdb7d9302aff10cd1c58e3641af45c8d3cd1 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 18 May 2026 21:28:56 -0700 Subject: [PATCH] fix(browse): guard full-page screenshots against Anthropic vision API >2000px brick (#1214) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Full-page screenshots of tall pages routinely exceeded 2000px on the longest dimension, silently bricking the agent's session: the resulting base64 reached the Anthropic vision API which rejected the oversized image, leaving the agent burning turns on a useless blob with no stderr trace from the browse side. Adds browse/src/screenshot-size-guard.ts as a shared helper: - guardScreenshotBuffer(buf) → downscales in-memory if max(w,h) > 2000 - guardScreenshotPath(path) → file-mode variant that rewrites in place - Aspect ratio preserved via sharp's resize fit:inside - Stderr diagnostic on any downscale so callers can see when it fired - Lazy sharp import so non-screenshot paths pay no startup cost Wires the guard into all three full-page callsites codex review flagged: - browse/src/snapshot.ts: annotated + heatmap fullPage captures - browse/src/meta-commands.ts: screenshot command (path + base64 fullPage modes) plus the responsive 3-viewport sweep - browse/src/write-commands.ts: prettyscreenshot fullPage path Covers six unit cases (pass-through, downscale, aspect ratio, exactly-2000px edge, file-mode rewrite) plus a static invariant test that fails the build if any of the three callsites stops importing the guard. Closes #1214. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/src/meta-commands.ts | 7 ++ browse/src/screenshot-size-guard.ts | 106 +++++++++++++++++++ browse/src/snapshot.ts | 3 + browse/src/write-commands.ts | 5 + browse/test/screenshot-size-guard.test.ts | 118 ++++++++++++++++++++++ 5 files changed, 239 insertions(+) create mode 100644 browse/src/screenshot-size-guard.ts create mode 100644 browse/test/screenshot-size-guard.test.ts diff --git a/browse/src/meta-commands.ts b/browse/src/meta-commands.ts index c505d4cf4..f71018006 100644 --- a/browse/src/meta-commands.ts +++ b/browse/src/meta-commands.ts @@ -11,6 +11,7 @@ import { handleSkillCommand } from './browser-skill-commands'; import { validateNavigationUrl } from './url-validation'; import { checkScope, type TokenInfo } from './token-registry'; import { validateOutputPath, validateReadPath, SAFE_DIRECTORIES, escapeRegExp } from './path-security'; +import { guardScreenshotBuffer, guardScreenshotPath } from './screenshot-size-guard'; // Re-export for backward compatibility (tests import from meta-commands) export { validateOutputPath, escapeRegExp } from './path-security'; import * as Diff from 'diff'; @@ -497,6 +498,10 @@ export async function handleMetaCommand( buffer = await page.screenshot({ clip: clipRect }); } else { buffer = await page.screenshot({ fullPage: !viewportOnly }); + // Guard the most common API-bricking case (fullPage). Element / + // clip captures usually stay within the cap; we still guard the + // path-mode below for fullPage writes. + ({ buffer } = await guardScreenshotBuffer(buffer)); } if (buffer.length > 10 * 1024 * 1024) { throw new Error('Screenshot too large for --base64 (>10MB). Use disk path instead.'); @@ -517,6 +522,7 @@ export async function handleMetaCommand( } await page.screenshot({ path: outputPath, fullPage: !viewportOnly }); + if (!viewportOnly) await guardScreenshotPath(outputPath); return `Screenshot saved${viewportOnly ? 
' (viewport)' : ''}: ${outputPath}`; } @@ -567,6 +573,7 @@ export async function handleMetaCommand( const screenshotPath = `${prefix}-${vp.name}.png`; validateOutputPath(screenshotPath); await page.screenshot({ path: screenshotPath, fullPage: true }); + await guardScreenshotPath(screenshotPath); results.push(`${vp.name} (${vp.width}x${vp.height}): ${screenshotPath}`); } diff --git a/browse/src/screenshot-size-guard.ts b/browse/src/screenshot-size-guard.ts new file mode 100644 index 000000000..392864e00 --- /dev/null +++ b/browse/src/screenshot-size-guard.ts @@ -0,0 +1,106 @@ +/** + * Screenshot size guard — keep full-page screenshots ≤ 2000px max-dim. + * + * The Anthropic vision API rejects images whose longest dimension exceeds + * 2000 image-pixels (post deviceScaleFactor). Full-page screenshots of long + * pages routinely exceed that, silently bricking the session: the agent + * burns turns on a base64 blob that errors model-side with no useful + * stderr surfacing on the browse side. + * + * This module centralizes the "after page.screenshot, check dimensions and + * downscale if too big" path so every full-page caller in browse/src can + * share the same enforcement. The cap is image-pixels, not CSS pixels, + * matching the Anthropic API's own threshold. + * + * Used by: snapshot.ts (annotated, heatmap), meta-commands.ts (screenshot), + * write-commands.ts (prettyscreenshot). See test/screenshot-size-guard.test.ts. + * + * Closes #1214. + */ + +import { writeFileSync, readFileSync } from "fs"; + +const MAX_DIMENSION_PX = 2000; + +export interface SizeGuardResult { + /** True if the input image exceeded MAX_DIMENSION_PX and was downscaled. */ + resized: boolean; + /** Final width and height (pixels) of the image as written/returned. */ + width: number; + height: number; + /** Original dimensions before any downscale. 
*/ + originalWidth: number; + originalHeight: number; +} + +/** + * Inspect an image buffer and downscale if its longest side exceeds the + * 2000px Anthropic vision API cap. Preserves aspect ratio. Encodes back + * to PNG. Returns the resulting buffer plus a diagnostic shape. + * + * Imports sharp lazily so the module load cost only hits screenshot paths + * (sharp's native binding is non-trivial to initialize). + */ +export async function guardScreenshotBuffer(input: Buffer): Promise<{ buffer: Buffer; result: SizeGuardResult }> { + const sharpModule = await import("sharp"); + const sharp = sharpModule.default ?? sharpModule; + const image = sharp(input); + const metadata = await image.metadata(); + const width = metadata.width ?? 0; + const height = metadata.height ?? 0; + + const longest = Math.max(width, height); + if (longest <= MAX_DIMENSION_PX) { + return { + buffer: input, + result: { + resized: false, + width, + height, + originalWidth: width, + originalHeight: height, + }, + }; + } + + const scale = MAX_DIMENSION_PX / longest; + const newWidth = Math.round(width * scale); + const newHeight = Math.round(height * scale); + + const resized = await image + .resize(newWidth, newHeight, { fit: "inside" }) + .png() + .toBuffer(); + + process.stderr.write( + `[screenshot-size-guard] image ${width}x${height} exceeded ${MAX_DIMENSION_PX}px max-dim; ` + + `downscaled to ${newWidth}x${newHeight} to fit Anthropic vision API\n`, + ); + + return { + buffer: resized, + result: { + resized: true, + width: newWidth, + height: newHeight, + originalWidth: width, + originalHeight: height, + }, + }; +} + +/** + * File-mode variant: read the image at the given path, downscale if + * needed, and write the result back to the same path. Returns the + * diagnostic shape. Use this after `await page.screenshot({ path, ... })`. 
+ */ +export async function guardScreenshotPath(filePath: string): Promise<SizeGuardResult> { + const input = readFileSync(filePath); + const { buffer, result } = await guardScreenshotBuffer(input); + if (result.resized) { + writeFileSync(filePath, buffer); + } + return result; +} + +export const SCREENSHOT_MAX_DIMENSION_PX = MAX_DIMENSION_PX; diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts index 0ed80f0c7..ce3a1a466 100644 --- a/browse/src/snapshot.ts +++ b/browse/src/snapshot.ts @@ -23,6 +23,7 @@ import * as Diff from 'diff'; import { TEMP_DIR, isPathWithin } from './platform'; import { escapeEnvelopeSentinels } from './content-security'; import { stripLoneSurrogates } from './sanitize'; +import { guardScreenshotPath } from './screenshot-size-guard'; // Roles considered "interactive" for the -i flag const INTERACTIVE_ROLES = new Set([ @@ -418,6 +419,7 @@ export async function handleSnapshot( }, boxes); await page.screenshot({ path: screenshotPath, fullPage: true }); + await guardScreenshotPath(screenshotPath); // Always remove overlays await page.evaluate(() => { @@ -538,6 +540,7 @@ export async function handleSnapshot( }, boxes); await page.screenshot({ path: heatmapPath, fullPage: true }); + await guardScreenshotPath(heatmapPath); // Remove heatmap overlays await page.evaluate(() => { diff --git a/browse/src/write-commands.ts b/browse/src/write-commands.ts index 61c84d839..daebd18a0 100644 --- a/browse/src/write-commands.ts +++ b/browse/src/write-commands.ts @@ -11,6 +11,7 @@ import { findInstalledBrowsers, importCookies, importCookiesViaCdp, hasV20Cookie import { generatePickerCode } from './cookie-picker-routes'; import { validateNavigationUrl } from './url-validation'; import { validateOutputPath, validateReadPath } from './path-security'; +import { guardScreenshotPath } from './screenshot-size-guard'; import * as fs from 'fs'; import * as path from 'path'; import type { SetContentWaitUntil } from './tab-session'; @@ -1123,6 +1124,10 @@ export async function 
handleWriteCommand( // Take screenshot await page.screenshot({ path: outputPath, fullPage: !scrollTo }); + // Guard against Anthropic vision API >2000px brick (#1214). Only + // applies to fullPage captures; scrollTo viewport-bound shots are + // already capped by the viewport size. + if (!scrollTo) await guardScreenshotPath(outputPath); // Restore viewport if (viewportWidth && originalViewport) { diff --git a/browse/test/screenshot-size-guard.test.ts b/browse/test/screenshot-size-guard.test.ts new file mode 100644 index 000000000..c2a831735 --- /dev/null +++ b/browse/test/screenshot-size-guard.test.ts @@ -0,0 +1,118 @@ +/** + * Unit tests for the screenshot size guard (#1214). + * + * Verifies that images exceeding 2000px on the longest dimension get + * downscaled to fit the Anthropic vision API cap, while images already + * inside the cap pass through untouched. + * + * Integration with the three callsites (snapshot.ts, meta-commands.ts, + * write-commands.ts) is exercised by the existing browse E2E suite — we + * don't need to spin up Chromium just to verify the helper. The static + * invariant test below pins that all three callsites import the guard. 
+ */ + +import { afterEach, beforeEach, describe, expect, test } from 'bun:test'; +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'fs'; +import { tmpdir } from 'os'; +import { join } from 'path'; +import sharp from 'sharp'; +import { + SCREENSHOT_MAX_DIMENSION_PX, + guardScreenshotBuffer, + guardScreenshotPath, +} from '../src/screenshot-size-guard'; + +let tmp: string; + +beforeEach(() => { + tmp = mkdtempSync(join(tmpdir(), 'screenshot-guard-')); +}); + +afterEach(() => { + rmSync(tmp, { recursive: true, force: true }); +}); + +async function makePng(width: number, height: number): Promise<Buffer> { + return sharp({ + create: { width, height, channels: 3, background: { r: 200, g: 50, b: 50 } }, + }) + .png() + .toBuffer(); +} + +describe('guardScreenshotBuffer', () => { + test('passes through images already within the cap', async () => { + const input = await makePng(1500, 1800); + const { buffer, result } = await guardScreenshotBuffer(input); + expect(result.resized).toBe(false); + expect(result.width).toBe(1500); + expect(result.height).toBe(1800); + expect(buffer).toBe(input); // identity — no re-encode + }); + + test('downscales a 5000px-tall image to fit the cap', async () => { + const input = await makePng(1200, 5000); + const { buffer, result } = await guardScreenshotBuffer(input); + expect(result.resized).toBe(true); + expect(result.originalHeight).toBe(5000); + expect(Math.max(result.width, result.height)).toBeLessThanOrEqual( + SCREENSHOT_MAX_DIMENSION_PX, + ); + // Aspect ratio preserved. + expect(result.height / result.width).toBeCloseTo(5000 / 1200, 1); + // Buffer is a different (smaller) PNG. 
+ expect(buffer.length).toBeLessThan(input.length); + }); + + test('downscales a 6000px-wide image', async () => { + const input = await makePng(6000, 1200); + const { buffer, result } = await guardScreenshotBuffer(input); + expect(result.resized).toBe(true); + expect(result.originalWidth).toBe(6000); + expect(Math.max(result.width, result.height)).toBeLessThanOrEqual( + SCREENSHOT_MAX_DIMENSION_PX, + ); + expect(buffer.length).toBeGreaterThan(0); + }); + + test('treats exactly-2000px images as in-bounds (no resize)', async () => { + const input = await makePng(2000, 1000); + const { result } = await guardScreenshotBuffer(input); + expect(result.resized).toBe(false); + }); +}); + +describe('guardScreenshotPath', () => { + test('rewrites the file in place when downscale is needed', async () => { + const filePath = join(tmp, 'tall.png'); + writeFileSync(filePath, await makePng(1200, 5000)); + const result = await guardScreenshotPath(filePath); + expect(result.resized).toBe(true); + const written = readFileSync(filePath); + const meta = await sharp(written).metadata(); + expect(Math.max(meta.width ?? 0, meta.height ?? 
0)).toBeLessThanOrEqual( + SCREENSHOT_MAX_DIMENSION_PX, + ); + }); + + test('leaves the file untouched when already within cap', async () => { + const filePath = join(tmp, 'short.png'); + const original = await makePng(800, 600); + writeFileSync(filePath, original); + const result = await guardScreenshotPath(filePath); + expect(result.resized).toBe(false); + const written = readFileSync(filePath); + expect(written.equals(original)).toBe(true); + }); +}); + +describe('static invariant: all three full-page callsites import the guard', () => { + test('snapshot.ts, meta-commands.ts, and write-commands.ts wire the size guard', () => { + const browseSrc = join(import.meta.dir, '..', 'src'); + const paths = ['snapshot.ts', 'meta-commands.ts', 'write-commands.ts']; + for (const rel of paths) { + const content = readFileSync(join(browseSrc, rel), 'utf-8'); + expect(content).toContain('screenshot-size-guard'); + } + }); +});