mirror of https://github.com/garrytan/gstack.git
feat: add media and data commands for page content extraction
media command: discovers all img/video/audio/background-image elements on the page. Returns JSON with URLs, dimensions, srcset, loading state, HLS/DASH detection. Supports --images/--videos/--audio filters and optional CSS selector scoping. data command: extracts structured data embedded in pages (JSON-LD, Open Graph, Twitter Cards, meta tags). One command returns product prices, article metadata, social share info without DOM scraping. Both are READ scope with untrusted content wrapping. Shared media-extract.ts helper for reuse by the upcoming scrape command. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a6e0277fc4
commit
cc63edb006
|
|
@ -16,6 +16,7 @@ export const READ_COMMANDS = new Set([
|
||||||
'console', 'network', 'cookies', 'storage', 'perf',
|
'console', 'network', 'cookies', 'storage', 'perf',
|
||||||
'dialog', 'is',
|
'dialog', 'is',
|
||||||
'inspect',
|
'inspect',
|
||||||
|
'media', 'data',
|
||||||
]);
|
]);
|
||||||
|
|
||||||
export const WRITE_COMMANDS = new Set([
|
export const WRITE_COMMANDS = new Set([
|
||||||
|
|
@ -46,6 +47,7 @@ export const ALL_COMMANDS = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...MET
|
||||||
export const PAGE_CONTENT_COMMANDS = new Set([
|
export const PAGE_CONTENT_COMMANDS = new Set([
|
||||||
'text', 'html', 'links', 'forms', 'accessibility', 'attrs',
|
'text', 'html', 'links', 'forms', 'accessibility', 'attrs',
|
||||||
'console', 'dialog',
|
'console', 'dialog',
|
||||||
|
'media', 'data',
|
||||||
]);
|
]);
|
||||||
|
|
||||||
/** Wrap output from untrusted-content commands with trust boundary markers */
|
/** Wrap output from untrusted-content commands with trust boundary markers */
|
||||||
|
|
@ -70,6 +72,8 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
|
||||||
'links': { category: 'Reading', description: 'All links as "text → href"' },
|
'links': { category: 'Reading', description: 'All links as "text → href"' },
|
||||||
'forms': { category: 'Reading', description: 'Form fields as JSON' },
|
'forms': { category: 'Reading', description: 'Form fields as JSON' },
|
||||||
'accessibility': { category: 'Reading', description: 'Full ARIA tree' },
|
'accessibility': { category: 'Reading', description: 'Full ARIA tree' },
|
||||||
|
'media': { category: 'Reading', description: 'All media elements (images, videos, audio) with URLs, dimensions, types', usage: 'media [--images|--videos|--audio] [selector]' },
|
||||||
|
'data': { category: 'Reading', description: 'Structured data: JSON-LD, Open Graph, Twitter Cards, meta tags', usage: 'data [--jsonld|--og|--meta|--twitter]' },
|
||||||
// Inspection
|
// Inspection
|
||||||
'js': { category: 'Inspection', description: 'Run JavaScript expression and return result as string', usage: 'js <expr>' },
|
'js': { category: 'Inspection', description: 'Run JavaScript expression and return result as string', usage: 'js <expr>' },
|
||||||
'eval': { category: 'Inspection', description: 'Run JavaScript from file and return result as string (path must be under /tmp or cwd)', usage: 'eval <file>' },
|
'eval': { category: 'Inspection', description: 'Run JavaScript from file and return result as string (path must be under /tmp or cwd)', usage: 'eval <file>' },
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,177 @@
|
||||||
|
/**
|
||||||
|
* Media extraction helper — shared between `media` (read) and `scrape` (write) commands.
|
||||||
|
*
|
||||||
|
* Runs page.evaluate() to discover all media elements on the page:
|
||||||
|
* - <img> with src, srcset, currentSrc, alt, dimensions, loading, data-src
|
||||||
|
* - <video> with currentSrc, poster, duration, <source> children, HLS/DASH detection
|
||||||
|
* - <audio> with src, duration, type
|
||||||
|
 * - CSS background-image (first url() per element, data: URIs skipped, capped at 500 matches)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { Page, Frame } from 'playwright';
|
||||||
|
|
||||||
|
/** One `<img>` element discovered by `extractMedia`. */
export interface ImageInfo {
  /** Position of the element among the `<img>` elements found in scope. */
  index: number;
  /** The element's `src` property, or `''` when unset. */
  src: string;
  /** The element's `srcset` property, or `''` when unset. */
  srcset: string;
  /** The element's `currentSrc` property, or `''` when unset. */
  currentSrc: string;
  /** Alternative text, or `''` when unset. */
  alt: string;
  /** The element's `width` property. */
  width: number;
  /** The element's `height` property. */
  height: number;
  /** The element's `naturalWidth` property. */
  naturalWidth: number;
  /** The element's `naturalHeight` property. */
  naturalHeight: number;
  /** The element's `loading` property, or `''` when unset. */
  loading: string;
  /** First non-empty of `data-src`, `data-lazy-src`, `data-original`; otherwise `''`. */
  dataSrc: string;
  /** True when the bounding rect has a non-zero size and its bottom/right edges are positive. */
  visible: boolean;
}
|
||||||
|
|
||||||
|
/** One `<source>` child of a `<video>` element. */
export interface VideoSource {
  /** The source's `src` property, or `''` when unset. */
  src: string;
  /** The source's `type` property (MIME type), or `''` when unset. */
  type: string;
}
|
||||||
|
|
||||||
|
/** One `<video>` element discovered by `extractMedia`. */
export interface VideoInfo {
  /** Position of the element among the `<video>` elements found in scope. */
  index: number;
  /** The element's `src` property, or `''` when unset. */
  src: string;
  /** The element's `currentSrc` property, or `''` when unset. */
  currentSrc: string;
  /** The element's `poster` property, or `''` when unset. */
  poster: string;
  /** `videoWidth` when non-zero, otherwise the element's `width` property. */
  width: number;
  /** `videoHeight` when non-zero, otherwise the element's `height` property. */
  height: number;
  /** Duration reported by the element, or `0` when it is not a finite number. */
  duration: number;
  /** `type` of the first `<source>` child, or `''` when there is none. */
  type: string;
  /** All `<source>` children in document order. */
  sources: VideoSource[];
  /** True when the video looks like an HLS stream (`.m3u8` URL or mpegURL MIME type). */
  isHLS: boolean;
  /** True when the video looks like a DASH stream (`.mpd` URL or a MIME type containing `dash`). */
  isDASH: boolean;
}
|
||||||
|
|
||||||
|
/** One `<audio>` element discovered by `extractMedia`. */
export interface AudioInfo {
  /** Position of the element among the `<audio>` elements found in scope. */
  index: number;
  /** The element's `src`, falling back to its first `<source>` child's `src`, else `''`. */
  src: string;
  /** The element's `currentSrc` property, or `''` when unset. */
  currentSrc: string;
  /** Duration reported by the element, or `0` when it is not a finite number. */
  duration: number;
  /** `type` of the first `<source>` child, or `''` when there is none. */
  type: string;
}
|
||||||
|
|
||||||
|
/** One CSS `background-image` URL discovered by `extractMedia`. */
export interface BackgroundImageInfo {
  /** Position among the background images collected so far (0-based). */
  index: number;
  /** URL captured from the computed `background-image` value. */
  url: string;
  /** Descriptive `tag#id.class1.class2` string built from the element; not guaranteed unique. */
  selector: string;
  /** Lower-cased tag name of the element carrying the background. */
  element: string;
}
|
||||||
|
|
||||||
|
/** Aggregate result returned by `extractMedia`. */
export interface MediaResult {
  /** `<img>` elements found in scope (empty when filtered out). */
  images: ImageInfo[];
  /** `<video>` elements found in scope (empty when filtered out). */
  videos: VideoInfo[];
  /** `<audio>` elements found in scope (empty when filtered out). */
  audio: AudioInfo[];
  /** CSS background images found in scope (empty when filtered out). */
  backgroundImages: BackgroundImageInfo[];
  /** Sum of the lengths of the four lists above. */
  total: number;
}
|
||||||
|
|
||||||
|
/**
 * Extract all media elements from the page or a scoped subtree.
 *
 * Issues a single `evaluate()` call on the target and collects `<img>`,
 * `<video>` and `<audio>` elements plus CSS background images below the
 * scope root.
 *
 * @param target - Page or frame to inspect.
 * @param options - Optional settings. `selector` is a CSS selector for the
 *   scope root; when it is omitted, empty, or matches nothing, the whole
 *   document is scanned. `filter` restricts the result to one media kind;
 *   `'images'` also includes CSS background images.
 * @returns The four media lists plus `total`, the sum of their lengths.
 */
export async function extractMedia(
  target: Page | Frame,
  options?: { selector?: string; filter?: 'images' | 'videos' | 'audio' },
): Promise<MediaResult> {
  const result = await target.evaluate(({ scopeSelector, filter }) => {
    // This callback is handed to evaluate(), so it only uses its own argument
    // and browser globals -- nothing from module scope.
    const root = scopeSelector
      ? document.querySelector(scopeSelector) || document
      : document;

    const images: any[] = [];
    const videos: any[] = [];
    const audio: any[] = [];
    const backgroundImages: any[] = [];

    // Images
    if (!filter || filter === 'images') {
      root.querySelectorAll('img').forEach((img, i) => {
        const rect = img.getBoundingClientRect();
        images.push({
          index: i,
          src: img.src || '',
          srcset: img.srcset || '',
          currentSrc: img.currentSrc || '',
          alt: img.alt || '',
          width: img.width,
          height: img.height,
          naturalWidth: img.naturalWidth,
          naturalHeight: img.naturalHeight,
          loading: img.loading || '',
          // Common lazy-loading attributes, in order of preference.
          dataSrc:
            img.getAttribute('data-src') ||
            img.getAttribute('data-lazy-src') ||
            img.getAttribute('data-original') ||
            '',
          visible: rect.width > 0 && rect.height > 0 && rect.bottom > 0 && rect.right > 0,
        });
      });
    }

    // Videos
    if (!filter || filter === 'videos') {
      root.querySelectorAll('video').forEach((vid, i) => {
        const sources = Array.from(vid.querySelectorAll('source')).map(s => ({
          src: s.src || '',
          type: s.type || '',
        }));

        // Streaming manifests can be referenced by the <video> element itself
        // (src / currentSrc) as well as by <source> children, so check them all.
        // Comparison is case-insensitive: both "application/x-mpegURL" and
        // "application/vnd.apple.mpegurl" are in use for HLS.
        const urls = [vid.src, vid.currentSrc, ...sources.map(s => s.src)].map(u =>
          (u || '').toLowerCase(),
        );
        const mimeTypes = sources.map(s => s.type.toLowerCase());
        const isHLS =
          mimeTypes.some(t => t.includes('mpegurl')) || urls.some(u => u.includes('.m3u8'));
        const isDASH =
          mimeTypes.some(t => t.includes('dash')) || urls.some(u => u.includes('.mpd'));

        videos.push({
          index: i,
          src: vid.src || '',
          currentSrc: vid.currentSrc || '',
          poster: vid.poster || '',
          // Prefer the intrinsic size; fall back to the element's width/height.
          width: vid.videoWidth || vid.width,
          height: vid.videoHeight || vid.height,
          // duration is NaN before metadata loads and Infinity for live streams.
          duration: isFinite(vid.duration) ? vid.duration : 0,
          type: sources[0]?.type || '',
          sources,
          isHLS,
          isDASH,
        });
      });
    }

    // Audio
    if (!filter || filter === 'audio') {
      root.querySelectorAll('audio').forEach((aud, i) => {
        const source = aud.querySelector('source');
        audio.push({
          index: i,
          src: aud.src || source?.src || '',
          currentSrc: aud.currentSrc || '',
          duration: isFinite(aud.duration) ? aud.duration : 0,
          type: source?.type || '',
        });
      });
    }

    // Background images: first url() of each element's computed
    // background-image, stopping once 500 have been collected.
    if (!filter || filter === 'images') {
      const bgUrlPattern = /url\(["']?([^"')]+)["']?\)/;
      const allElements = root.querySelectorAll('*');
      let bgCount = 0;
      for (let i = 0; i < allElements.length && bgCount < 500; i++) {
        const el = allElements[i];
        const bg = getComputedStyle(el).backgroundImage;
        if (!bg || bg === 'none') continue;

        const url = bg.match(bgUrlPattern)?.[1];
        // Inline data: URIs are skipped.
        if (!url || url.startsWith('data:')) continue;

        const tag = el.tagName.toLowerCase();
        // className is not a string on SVG elements; drop empty tokens so a
        // whitespace-only class attribute does not produce a dangling ".".
        const classNames =
          typeof el.className === 'string' ? el.className.split(/\s+/).filter(Boolean) : [];

        backgroundImages.push({
          index: bgCount,
          url,
          selector: tag + (el.id ? `#${el.id}` : '') + classNames.map(c => `.${c}`).join(''),
          element: tag,
        });
        bgCount++;
      }
    }

    return { images, videos, audio, backgroundImages };
  }, { scopeSelector: options?.selector || null, filter: options?.filter || null });

  return {
    ...result,
    total:
      result.images.length +
      result.videos.length +
      result.audio.length +
      result.backgroundImages.length,
  };
}
|
||||||
|
|
@ -383,6 +383,76 @@ export async function handleReadCommand(
|
||||||
return formatInspectorResult(result, { includeUA });
|
return formatInspectorResult(result, { includeUA });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'media': {
|
||||||
|
const { extractMedia } = await import('./media-extract');
|
||||||
|
const target = bm.getActiveFrameOrPage();
|
||||||
|
const filter = args.includes('--images') ? 'images' as const
|
||||||
|
: args.includes('--videos') ? 'videos' as const
|
||||||
|
: args.includes('--audio') ? 'audio' as const
|
||||||
|
: undefined;
|
||||||
|
const selectorArg = args.find(a => !a.startsWith('--'));
|
||||||
|
const result = await extractMedia(target, { selector: selectorArg, filter });
|
||||||
|
return JSON.stringify(result, null, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'data': {
|
||||||
|
const target = bm.getActiveFrameOrPage();
|
||||||
|
const wantJsonLd = args.includes('--jsonld') || args.length === 0;
|
||||||
|
const wantOg = args.includes('--og') || args.length === 0;
|
||||||
|
const wantTwitter = args.includes('--twitter') || args.length === 0;
|
||||||
|
const wantMeta = args.includes('--meta') || args.length === 0;
|
||||||
|
|
||||||
|
const result = await target.evaluate(({ wantJsonLd, wantOg, wantTwitter, wantMeta }) => {
|
||||||
|
const data: Record<string, any> = {};
|
||||||
|
|
||||||
|
if (wantJsonLd) {
|
||||||
|
const scripts = document.querySelectorAll('script[type="application/ld+json"]');
|
||||||
|
const jsonLd: any[] = [];
|
||||||
|
scripts.forEach(s => {
|
||||||
|
try { jsonLd.push(JSON.parse(s.textContent || '')); } catch {}
|
||||||
|
});
|
||||||
|
data.jsonLd = jsonLd;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wantOg) {
|
||||||
|
const og: Record<string, string> = {};
|
||||||
|
document.querySelectorAll('meta[property^="og:"]').forEach(m => {
|
||||||
|
const prop = m.getAttribute('property')?.replace('og:', '') || '';
|
||||||
|
og[prop] = m.getAttribute('content') || '';
|
||||||
|
});
|
||||||
|
data.openGraph = og;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wantTwitter) {
|
||||||
|
const tw: Record<string, string> = {};
|
||||||
|
document.querySelectorAll('meta[name^="twitter:"]').forEach(m => {
|
||||||
|
const name = m.getAttribute('name')?.replace('twitter:', '') || '';
|
||||||
|
tw[name] = m.getAttribute('content') || '';
|
||||||
|
});
|
||||||
|
data.twitterCards = tw;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wantMeta) {
|
||||||
|
const meta: Record<string, string> = {};
|
||||||
|
const canonical = document.querySelector('link[rel="canonical"]');
|
||||||
|
if (canonical) meta.canonical = canonical.getAttribute('href') || '';
|
||||||
|
const desc = document.querySelector('meta[name="description"]');
|
||||||
|
if (desc) meta.description = desc.getAttribute('content') || '';
|
||||||
|
const keywords = document.querySelector('meta[name="keywords"]');
|
||||||
|
if (keywords) meta.keywords = keywords.getAttribute('content') || '';
|
||||||
|
const author = document.querySelector('meta[name="author"]');
|
||||||
|
if (author) meta.author = author.getAttribute('content') || '';
|
||||||
|
const title = document.querySelector('title');
|
||||||
|
if (title) meta.title = title.textContent || '';
|
||||||
|
data.meta = meta;
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}, { wantJsonLd, wantOg, wantTwitter, wantMeta });
|
||||||
|
|
||||||
|
return JSON.stringify(result, null, 2);
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw new Error(`Unknown read command: ${command}`);
|
throw new Error(`Unknown read command: ${command}`);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@ export const SCOPE_READ = new Set([
|
||||||
'snapshot', 'text', 'html', 'links', 'forms', 'accessibility',
|
'snapshot', 'text', 'html', 'links', 'forms', 'accessibility',
|
||||||
'console', 'network', 'perf', 'dialog', 'is', 'inspect',
|
'console', 'network', 'perf', 'dialog', 'is', 'inspect',
|
||||||
'url', 'tabs', 'status', 'screenshot', 'pdf', 'css', 'attrs',
|
'url', 'tabs', 'status', 'screenshot', 'pdf', 'css', 'attrs',
|
||||||
|
'media', 'data',
|
||||||
]);
|
]);
|
||||||
|
|
||||||
/** Commands that modify page state or navigate */
|
/** Commands that modify page state or navigate */
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue