gstack/test/gbrain-sync-voyage-code-3-i...

329 lines
12 KiB
TypeScript

/**
* Real integration: gbrain PGLite + voyage-code-3 end-to-end.
*
* Inits a sandboxed PGLite engine with voyage-code-3 embeddings, registers a
* tiny code fixture as a source, syncs it (which triggers Voyage embedding
* generation), and queries it back. The whole point is to catch the failure
* modes that hit us in real life:
*
* - dimension mismatch between the configured embedding column and the
* model's actual output dim (the 1280-vs-1536 trap that gbrain doctor
* surfaces but `gbrain init` silently sets up)
* - voyage-code-3 unavailable via gbrain's openai-compat adapter
* - sync completes but embedding generation silently fails (0 chunks)
*
* We intentionally do NOT call `gbrain query` here — it produces correct
* output but doesn't exit cleanly on a fresh PGLite (~2 min hang after
* results print). The smoking-gun assertion for "embeddings worked" is the
* "N pages embedded" line from sync output: if that's >= 1, voyage-code-3
* returned 1024-dim vectors and gbrain persisted them. Symbol-aware
* functionality is covered separately by the code-def test.
*
* Skips when:
* - `gbrain` is not on PATH (dev machine without it installed)
* - VOYAGE_API_KEY is unset (the test makes real Voyage API calls)
*
* Cost: ~$0.001 per run. The fixture is 3 tiny files, ~500 tokens total.
* Not gated on EVALS=1 because it's not an LLM eval — it's a deterministic
* integration test of the embedding pipeline. Always runs when the env
* supports it.
*
* Runtime: ~30-60s (gbrain init schema migrations + sync + Voyage round-trip).
* Long enough that `bun test` runs it serially with a per-test 120s timeout.
*/
import { describe, test, expect } from "bun:test";
import {
mkdtempSync,
mkdirSync,
writeFileSync,
rmSync,
existsSync,
} from "fs";
import { tmpdir } from "os";
import { join } from "path";
import { spawnSync } from "child_process";
const gbrainPath = spawnSync("which", ["gbrain"], { encoding: "utf-8" }).stdout.trim();
const gbrainAvailable = gbrainPath.length > 0;
const voyageKey = process.env.VOYAGE_API_KEY?.trim() ?? "";
const voyageKeyPresent = voyageKey.length > 0;
const shouldRun = gbrainAvailable && voyageKeyPresent;
const skipReason = !gbrainAvailable
? "gbrain not on PATH"
: !voyageKeyPresent
? "VOYAGE_API_KEY not set (real Voyage API calls required)"
: "";
if (!shouldRun) {
console.log(`[gbrain-sync-voyage-code-3-integration] SKIP: ${skipReason}`);
}
interface SandboxEnv {
root: string;
gbrainHome: string;
fixtureDir: string;
cleanup: () => void;
}
function makeSandbox(): SandboxEnv {
const root = mkdtempSync(join(tmpdir(), "gbrain-voyage-int-"));
// GBRAIN_HOME points at the PARENT of .gbrain (per gbrain's configDir());
// setting GBRAIN_HOME=/x means gbrain looks at /x/.gbrain/.
const gbrainHome = root;
const fixtureDir = join(root, "fixture-repo");
mkdirSync(fixtureDir, { recursive: true });
// Tiny realistic fixture: three files exercising different file types so
// gbrain's code stage has something to extract symbols + embeddings from.
writeFileSync(
join(fixtureDir, "math.ts"),
`export function fibonacci(n: number): number {
if (n <= 1) return n;
return fibonacci(n - 1) + fibonacci(n - 2);
}
export function isPrime(n: number): boolean {
if (n < 2) return false;
for (let i = 2; i * i <= n; i++) {
if (n % i === 0) return false;
}
return true;
}
`,
);
writeFileSync(
join(fixtureDir, "queue.ts"),
`export class JobQueue<T> {
private items: T[] = [];
enqueue(item: T): void { this.items.push(item); }
dequeue(): T | undefined { return this.items.shift(); }
size(): number { return this.items.length; }
}
`,
);
writeFileSync(
join(fixtureDir, "README.md"),
`# Fixture repo
Sample code for testing the voyage-code-3 embedding pipeline.
The math module exposes fibonacci and primality helpers.
The queue module is a simple FIFO job queue.
`,
);
// Make it a git repo because gbrain's code-sync strategy expects one.
const gitInit = spawnSync("git", ["init", "-q"], { cwd: fixtureDir, encoding: "utf-8" });
if (gitInit.status !== 0) {
throw new Error(`git init failed: ${gitInit.stderr}`);
}
spawnSync("git", ["config", "user.email", "test@example.invalid"], { cwd: fixtureDir });
spawnSync("git", ["config", "user.name", "test"], { cwd: fixtureDir });
spawnSync("git", ["add", "."], { cwd: fixtureDir });
spawnSync("git", ["commit", "-q", "-m", "fixture"], { cwd: fixtureDir });
return {
root,
gbrainHome,
fixtureDir,
cleanup: () => rmSync(root, { recursive: true, force: true }),
};
}
function gbrainEnv(s: SandboxEnv): NodeJS.ProcessEnv {
return {
...process.env,
GBRAIN_HOME: s.gbrainHome,
VOYAGE_API_KEY: voyageKey,
};
}
function runGbrain(s: SandboxEnv, args: string[], opts: { timeout?: number } = {}) {
// cwd MUST be the sandbox root, not the test's parent CWD. If gbrain runs
// from inside the gstack worktree, it picks up the worktree's
// `.gbrain-source` pin and tries to sync that source too — which won't
// exist in the sandbox PGLite, and the resulting "not found" exits 1.
return spawnSync("gbrain", args, {
encoding: "utf-8",
env: gbrainEnv(s),
cwd: s.root,
timeout: opts.timeout ?? 120_000,
});
}
describe.skipIf(!shouldRun)(
"gbrain PGLite + voyage-code-3 end-to-end (real Voyage API)",
() => {
test(
"init with voyage-code-3 produces a 1024-dim-aligned PGLite config",
() => {
const s = makeSandbox();
try {
const init = runGbrain(s, [
"init",
"--pglite",
"--json",
"--embedding-model",
"voyage:voyage-code-3",
"--embedding-dimensions",
"1024",
]);
expect(init.status).toBe(0);
// init prints JSON status line at the end; just sniff for success.
const out = (init.stdout || "") + (init.stderr || "");
expect(out).toContain('"status":"success"');
expect(out).toContain('"engine":"pglite"');
// doctor must agree the column width matches the live probe dim.
const doctor = runGbrain(s, ["doctor"]);
const dout = (doctor.stdout || "") + (doctor.stderr || "");
// Doctor exits non-zero on error rows; warnings are OK. The
// critical assertion is no dimension mismatch.
expect(dout).not.toContain("DB dimension mismatch");
// Should explicitly mention voyage-code-3 as the live provider.
expect(dout).toMatch(/voyage-code-3/);
// Width consistency check should be green for 1024d.
expect(dout).toMatch(/Schema width \(1024d\)/);
} finally {
s.cleanup();
}
},
120_000,
);
test(
"sync --strategy code generates Voyage embeddings and registers pages + chunks",
() => {
const s = makeSandbox();
try {
// 1. init voyage-code-3 PGLite
const init = runGbrain(s, [
"init",
"--pglite",
"--json",
"--embedding-model",
"voyage:voyage-code-3",
"--embedding-dimensions",
"1024",
]);
expect(init.status).toBe(0);
// 2. register the fixture as a code source
const add = runGbrain(s, [
"sources",
"add",
"fixture-code",
"--path",
s.fixtureDir,
]);
expect(add.status).toBe(0);
// 3. sync with code strategy — this is where Voyage embeddings get
// generated. Use --skip-failed so a single oversized file (which
// can happen in real repos) doesn't block the assertion.
const sync = runGbrain(
s,
[
"sync",
"--source",
"fixture-code",
"--strategy",
"code",
"--skip-failed",
],
{ timeout: 180_000 },
);
if (sync.status !== 0) {
console.error(`[sync FAILED exit=${sync.status}]`);
console.error(`STDOUT:\n${sync.stdout}`);
console.error(`STDERR:\n${sync.stderr}`);
}
expect(sync.status).toBe(0);
const sout = (sync.stdout || "") + (sync.stderr || "");
// The fixture has 3 files; gbrain should import at least the 2 .ts
// files (README.md may or may not be picked up by --strategy code
// depending on gbrain's file-type heuristics).
expect(sout).toMatch(/imported=[1-9]/);
// The "pages embedded" line is the smoking gun: if it's 0,
// embedding generation silently failed (voyage adapter broken,
// dimension mismatch, etc). Anything > 0 means voyage-code-3
// returned 1024-dim vectors and gbrain wrote them.
expect(sout).toMatch(/[1-9]\d* pages embedded/);
// 4. verify the source has pages and chunks
const list = runGbrain(s, ["sources", "list", "--json"]);
expect(list.status).toBe(0);
const sources = JSON.parse(list.stdout) as {
sources: Array<{ id: string; page_count: number }>;
};
const fixture = sources.sources.find((x) => x.id === "fixture-code");
expect(fixture).toBeDefined();
expect(fixture!.page_count).toBeGreaterThanOrEqual(2);
} finally {
s.cleanup();
}
},
300_000,
);
test(
"code-def finds symbols defined in the embedded fixture",
() => {
const s = makeSandbox();
try {
runGbrain(s, [
"init",
"--pglite",
"--json",
"--embedding-model",
"voyage:voyage-code-3",
"--embedding-dimensions",
"1024",
]);
runGbrain(s, ["sources", "add", "fixture-code", "--path", s.fixtureDir]);
runGbrain(
s,
["sync", "--source", "fixture-code", "--strategy", "code", "--skip-failed"],
{ timeout: 180_000 },
);
// code-def is the symbol-aware path. It doesn't strictly need
// embeddings (symbols are extracted by tree-sitter), but the JSON
// shape it returns is the contract gstack's CLAUDE.md guidance
// points the agent at. Verify it works against our PGLite + Voyage
// setup.
const result = runGbrain(s, ["code-def", "fibonacci"]);
expect(result.status).toBe(0);
const parsed = JSON.parse(result.stdout) as {
symbol: string;
count: number;
results: Array<{ file: string; symbol_type: string }>;
};
expect(parsed.symbol).toBe("fibonacci");
expect(parsed.count).toBeGreaterThanOrEqual(1);
expect(parsed.results[0].file).toContain("math.ts");
} finally {
s.cleanup();
}
},
300_000,
);
},
);
// Lightweight always-on guard: even without the integration test running, we
// can still assert that the test file's `describe.skipIf` gate is correctly
// formed. This catches a future edit that accidentally inverts the gate.
test("integration test gate uses the correct skip predicate", () => {
// shouldRun must be the boolean AND of the two pre-checks. If a refactor
// makes it true when either piece is missing, the test below would attempt
// real API calls without a key — undefined behavior.
expect(shouldRun).toBe(gbrainAvailable && voyageKeyPresent);
// When skipping, we logged a reason — basic sanity that the reason string
// matches what shouldRun says.
if (!shouldRun) {
expect(skipReason.length).toBeGreaterThan(0);
}
});