fix(browse): guard full-page screenshots against Anthropic vision API >2000px brick (#1214)

Full-page screenshots of tall pages routinely exceeded 2000px on the
longest dimension, silently bricking the agent's session: the
resulting base64 reached the Anthropic vision API which rejected the
oversized image, leaving the agent burning turns on a useless blob
with no stderr trace from the browse side.

Adds browse/src/screenshot-size-guard.ts as a shared helper:
- guardScreenshotBuffer(buf) → downscales in-memory if max(w,h) > 2000
- guardScreenshotPath(path) → file-mode variant that rewrites in place
- Aspect ratio preserved via sharp's resize fit:inside
- Stderr diagnostic on any downscale so callers can see when it fired
- Lazy sharp import so non-screenshot paths pay no startup cost

Wires the guard into all three full-page callsites codex review
flagged:
- browse/src/snapshot.ts: annotated + heatmap fullPage captures
- browse/src/meta-commands.ts: screenshot command (path + base64
  fullPage modes) plus the responsive 3-viewport sweep
- browse/src/write-commands.ts: prettyscreenshot fullPage path

Covers seven unit cases (pass-through, downscale, aspect ratio,
exactly-2000px edge, file-mode rewrite) plus a static invariant test
that fails the build if any of the three callsites stops importing the
guard.

Closes #1214.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan 2026-05-18 21:28:56 -07:00
parent 16fca84d04
commit dd84bdb7d9
No known key found for this signature in database
GPG Key ID: C1F69E85C74EFE1D
5 changed files with 239 additions and 0 deletions

View File

@ -11,6 +11,7 @@ import { handleSkillCommand } from './browser-skill-commands';
import { validateNavigationUrl } from './url-validation';
import { checkScope, type TokenInfo } from './token-registry';
import { validateOutputPath, validateReadPath, SAFE_DIRECTORIES, escapeRegExp } from './path-security';
import { guardScreenshotBuffer, guardScreenshotPath } from './screenshot-size-guard';
// Re-export for backward compatibility (tests import from meta-commands)
export { validateOutputPath, escapeRegExp } from './path-security';
import * as Diff from 'diff';
@ -497,6 +498,10 @@ export async function handleMetaCommand(
buffer = await page.screenshot({ clip: clipRect });
} else {
buffer = await page.screenshot({ fullPage: !viewportOnly });
// Guard the most common API-bricking case (fullPage). Element /
// clip captures usually stay within the cap; we still guard the
// path-mode below for fullPage writes.
({ buffer } = await guardScreenshotBuffer(buffer));
}
if (buffer.length > 10 * 1024 * 1024) {
throw new Error('Screenshot too large for --base64 (>10MB). Use disk path instead.');
@ -517,6 +522,7 @@ export async function handleMetaCommand(
}
await page.screenshot({ path: outputPath, fullPage: !viewportOnly });
if (!viewportOnly) await guardScreenshotPath(outputPath);
return `Screenshot saved${viewportOnly ? ' (viewport)' : ''}: ${outputPath}`;
}
@ -567,6 +573,7 @@ export async function handleMetaCommand(
const screenshotPath = `${prefix}-${vp.name}.png`;
validateOutputPath(screenshotPath);
await page.screenshot({ path: screenshotPath, fullPage: true });
await guardScreenshotPath(screenshotPath);
results.push(`${vp.name} (${vp.width}x${vp.height}): ${screenshotPath}`);
}

View File

@ -0,0 +1,106 @@
/**
* Screenshot size guard keep full-page screenshots 2000px max-dim.
*
* The Anthropic vision API rejects images whose longest dimension exceeds
* 2000 image-pixels (post deviceScaleFactor). Full-page screenshots of long
* pages routinely exceed that, silently bricking the session: the agent
* burns turns on a base64 blob that errors model-side with no useful
* stderr surfacing on the browse side.
*
* This module centralizes the "after page.screenshot, check dimensions and
* downscale if too big" path so every full-page caller in browse/src can
* share the same enforcement. The cap is image-pixels, not CSS pixels,
* matching the Anthropic API's own threshold.
*
* Used by: snapshot.ts (annotated, heatmap), meta-commands.ts (screenshot),
* write-commands.ts (prettyscreenshot). See test/snapshot-meta-write-guard.test.ts.
*
* Closes #1214.
*/
import { writeFileSync, readFileSync } from "fs";
const MAX_DIMENSION_PX = 2000;
export interface SizeGuardResult {
/** True if the input image exceeded MAX_DIMENSION_PX and was downscaled. */
resized: boolean;
/** Final width and height (pixels) of the image as written/returned. */
width: number;
height: number;
/** Original dimensions before any downscale. */
originalWidth: number;
originalHeight: number;
}
/**
* Inspect an image buffer and downscale if its longest side exceeds the
* 2000px Anthropic vision API cap. Preserves aspect ratio. Encodes back
* to PNG. Returns the resulting buffer plus a diagnostic shape.
*
* Imports sharp lazily so the module load cost only hits screenshot paths
* (sharp's native binding is non-trivial to initialize).
*/
export async function guardScreenshotBuffer(input: Buffer): Promise<{ buffer: Buffer; result: SizeGuardResult }> {
const sharpModule = await import("sharp");
const sharp = sharpModule.default ?? sharpModule;
const image = sharp(input);
const metadata = await image.metadata();
const width = metadata.width ?? 0;
const height = metadata.height ?? 0;
const longest = Math.max(width, height);
if (longest <= MAX_DIMENSION_PX) {
return {
buffer: input,
result: {
resized: false,
width,
height,
originalWidth: width,
originalHeight: height,
},
};
}
const scale = MAX_DIMENSION_PX / longest;
const newWidth = Math.round(width * scale);
const newHeight = Math.round(height * scale);
const resized = await image
.resize(newWidth, newHeight, { fit: "inside" })
.png()
.toBuffer();
process.stderr.write(
`[screenshot-size-guard] image ${width}x${height} exceeded ${MAX_DIMENSION_PX}px max-dim; ` +
`downscaled to ${newWidth}x${newHeight} to fit Anthropic vision API\n`,
);
return {
buffer: resized,
result: {
resized: true,
width: newWidth,
height: newHeight,
originalWidth: width,
originalHeight: height,
},
};
}
/**
* File-mode variant: read the image at the given path, downscale if
* needed, and write the result back to the same path. Returns the
* diagnostic shape. Use this after `await page.screenshot({ path, ... })`.
*/
export async function guardScreenshotPath(filePath: string): Promise<SizeGuardResult> {
const input = readFileSync(filePath);
const { buffer, result } = await guardScreenshotBuffer(input);
if (result.resized) {
writeFileSync(filePath, buffer);
}
return result;
}
export const SCREENSHOT_MAX_DIMENSION_PX = MAX_DIMENSION_PX;

View File

@ -23,6 +23,7 @@ import * as Diff from 'diff';
import { TEMP_DIR, isPathWithin } from './platform';
import { escapeEnvelopeSentinels } from './content-security';
import { stripLoneSurrogates } from './sanitize';
import { guardScreenshotPath } from './screenshot-size-guard';
// Roles considered "interactive" for the -i flag
const INTERACTIVE_ROLES = new Set([
@ -418,6 +419,7 @@ export async function handleSnapshot(
}, boxes);
await page.screenshot({ path: screenshotPath, fullPage: true });
await guardScreenshotPath(screenshotPath);
// Always remove overlays
await page.evaluate(() => {
@ -538,6 +540,7 @@ export async function handleSnapshot(
}, boxes);
await page.screenshot({ path: heatmapPath, fullPage: true });
await guardScreenshotPath(heatmapPath);
// Remove heatmap overlays
await page.evaluate(() => {

View File

@ -11,6 +11,7 @@ import { findInstalledBrowsers, importCookies, importCookiesViaCdp, hasV20Cookie
import { generatePickerCode } from './cookie-picker-routes';
import { validateNavigationUrl } from './url-validation';
import { validateOutputPath, validateReadPath } from './path-security';
import { guardScreenshotPath } from './screenshot-size-guard';
import * as fs from 'fs';
import * as path from 'path';
import type { SetContentWaitUntil } from './tab-session';
@ -1123,6 +1124,10 @@ export async function handleWriteCommand(
// Take screenshot
await page.screenshot({ path: outputPath, fullPage: !scrollTo });
// Guard against Anthropic vision API >2000px brick (#1214). Only
// applies to fullPage captures; scrollTo viewport-bound shots are
// already capped by the viewport size.
if (!scrollTo) await guardScreenshotPath(outputPath);
// Restore viewport
if (viewportWidth && originalViewport) {

View File

@ -0,0 +1,118 @@
/**
* Unit tests for the screenshot size guard (#1214).
*
* Verifies that images exceeding 2000px on the longest dimension get
* downscaled to fit the Anthropic vision API cap, while images already
* inside the cap pass through untouched.
*
* Integration with the three callsites (snapshot.ts, meta-commands.ts,
* write-commands.ts) is exercised by the existing browse E2E suite we
* don't need to spin up Chromium just to verify the helper. The static
* invariant test below pins that all three callsites import the guard.
*/
import { afterEach, beforeEach, describe, expect, test } from 'bun:test';
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import sharp from 'sharp';
import {
SCREENSHOT_MAX_DIMENSION_PX,
guardScreenshotBuffer,
guardScreenshotPath,
} from '../src/screenshot-size-guard';
let tmp: string;
beforeEach(() => {
tmp = mkdtempSync(join(tmpdir(), 'screenshot-guard-'));
});
afterEach(() => {
rmSync(tmp, { recursive: true, force: true });
});
async function makePng(width: number, height: number): Promise<Buffer> {
return sharp({
create: { width, height, channels: 3, background: { r: 200, g: 50, b: 50 } },
})
.png()
.toBuffer();
}
describe('guardScreenshotBuffer', () => {
test('passes through images already within the cap', async () => {
const input = await makePng(1500, 1800);
const { buffer, result } = await guardScreenshotBuffer(input);
expect(result.resized).toBe(false);
expect(result.width).toBe(1500);
expect(result.height).toBe(1800);
expect(buffer).toBe(input); // identity — no re-encode
});
test('downscales a 5000px-tall image to fit the cap', async () => {
const input = await makePng(1200, 5000);
const { buffer, result } = await guardScreenshotBuffer(input);
expect(result.resized).toBe(true);
expect(result.originalHeight).toBe(5000);
expect(Math.max(result.width, result.height)).toBeLessThanOrEqual(
SCREENSHOT_MAX_DIMENSION_PX,
);
// Aspect ratio preserved.
expect(result.height / result.width).toBeCloseTo(5000 / 1200, 1);
// Buffer is a different (smaller) PNG.
expect(buffer.length).toBeLessThan(input.length);
});
test('downscales a 6000px-wide image', async () => {
const input = await makePng(6000, 1200);
const { buffer, result } = await guardScreenshotBuffer(input);
expect(result.resized).toBe(true);
expect(result.originalWidth).toBe(6000);
expect(Math.max(result.width, result.height)).toBeLessThanOrEqual(
SCREENSHOT_MAX_DIMENSION_PX,
);
expect(buffer.length).toBeGreaterThan(0);
});
test('treats exactly-2000px images as in-bounds (no resize)', async () => {
const input = await makePng(2000, 1000);
const { result } = await guardScreenshotBuffer(input);
expect(result.resized).toBe(false);
});
});
describe('guardScreenshotPath', () => {
test('rewrites the file in place when downscale is needed', async () => {
const filePath = join(tmp, 'tall.png');
writeFileSync(filePath, await makePng(1200, 5000));
const result = await guardScreenshotPath(filePath);
expect(result.resized).toBe(true);
const written = readFileSync(filePath);
const meta = await sharp(written).metadata();
expect(Math.max(meta.width ?? 0, meta.height ?? 0)).toBeLessThanOrEqual(
SCREENSHOT_MAX_DIMENSION_PX,
);
});
test('leaves the file untouched when already within cap', async () => {
const filePath = join(tmp, 'short.png');
const original = await makePng(800, 600);
writeFileSync(filePath, original);
const result = await guardScreenshotPath(filePath);
expect(result.resized).toBe(false);
const written = readFileSync(filePath);
expect(written.equals(original)).toBe(true);
});
});
describe('static invariant: all three full-page callsites import the guard', () => {
test('snapshot.ts, meta-commands.ts, and write-commands.ts wire the size guard', () => {
const browseSrc = join(import.meta.dir, '..', 'src');
const paths = ['snapshot.ts', 'meta-commands.ts', 'write-commands.ts'];
for (const rel of paths) {
const content = readFileSync(join(browseSrc, rel), 'utf-8');
expect(content).toContain('screenshot-size-guard');
}
});
});