mirror of https://github.com/garrytan/gstack.git
fix(test): E2E privacy gate — ambient env + skill-file prompt
Two fixes to get the E2E actually running end-to-end (the first attempt failed at the SDK auth step, the second at the assertion step):

1. Don't pass an explicit `env:` object to runAgentSdkTest. The SDK's auth pipeline misses ANTHROPIC_API_KEY when env is supplied as an object (verified against the plan-mode-no-op test, which passes no env and authenticates cleanly). Mutate process.env before the call instead, and restore the original values in `finally` so other tests don't inherit the ambient mutation.

2. The "Run /learn with no arguments" user prompt was too narrow — the model reduced it to a direct action and skipped the preamble privacy-gate directives entirely, so zero AskUserQuestions fired. Mirror the plan-mode-no-op pattern: point the model at the skill file on disk and ask it to follow every preamble directive. Bumped maxTurns from 6 to 10 to give the preamble room to execute.

Verified that both tests pass under `EVALS=1 EVALS_TIER=periodic bun test test/skill-e2e-brain-privacy-gate.test.ts` against a real ANTHROPIC_API_KEY. Cost: ~$0.30-$0.50 per test per run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f989fdbf2f
commit
371a7e684a
|
|
@ -59,22 +59,38 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
|
||||||
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
|
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
|
||||||
const binary = resolveClaudeBinary();
|
const binary = resolveClaudeBinary();
|
||||||
|
|
||||||
|
// Ambient env mutations — restored in finally so other tests in the file
|
||||||
|
// don't inherit them.
|
||||||
|
const origGstackHome = process.env.GSTACK_HOME;
|
||||||
|
const origPath = process.env.PATH;
|
||||||
|
process.env.GSTACK_HOME = gstackHome;
|
||||||
|
process.env.PATH = `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Pick a small skill with the preamble. `/learn` is read-only +
|
// Pick a small skill with the preamble and load it via Read to force
|
||||||
// short, which keeps the token cost down. The preamble fires
|
// the model to execute every preamble directive. A narrow "run /learn"
|
||||||
// regardless of which skill we pick.
|
// prompt often gets reduced to a direct action, skipping the preamble
|
||||||
|
// gates. Mirror the plan-mode-no-op test pattern: ask the model to
|
||||||
|
// follow the skill's instructions in full.
|
||||||
|
const learnSkill = path.resolve(
|
||||||
|
import.meta.dir,
|
||||||
|
'..',
|
||||||
|
'learn',
|
||||||
|
'SKILL.md'
|
||||||
|
);
|
||||||
await runAgentSdkTest({
|
await runAgentSdkTest({
|
||||||
systemPrompt: { type: 'preset', preset: 'claude_code' },
|
systemPrompt: { type: 'preset', preset: 'claude_code' },
|
||||||
userPrompt:
|
userPrompt:
|
||||||
'Run /learn with no arguments. Just report the learnings count and answer any AskUserQuestion that fires.',
|
`Read the skill file at ${learnSkill} and follow its instructions from the top, including every preamble directive. Execute every bash block. If any AskUserQuestion fires, present it.`,
|
||||||
workingDirectory: gstackHome,
|
workingDirectory: gstackHome,
|
||||||
maxTurns: 6,
|
maxTurns: 10,
|
||||||
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
|
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
|
||||||
env: {
|
// NOTE: do NOT pass `env:` here. When the Agent SDK gets an explicit
|
||||||
GSTACK_HOME: gstackHome,
|
// env object, its auth pipeline doesn't pick up ANTHROPIC_API_KEY the
|
||||||
// Prepend the fake gbrain to PATH so the preamble's detection wins.
|
// same way as when env is undefined (SDK-internal detail, verified
|
||||||
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
|
// against the plan-mode-no-op test which passes no env and auths
|
||||||
},
|
// cleanly). Instead, mutate process.env before the call so the SDK
|
||||||
|
// inherits our overrides ambiently.
|
||||||
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
|
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
|
||||||
canUseTool: async (toolName, input) => {
|
canUseTool: async (toolName, input) => {
|
||||||
if (toolName === 'AskUserQuestion') {
|
if (toolName === 'AskUserQuestion') {
|
||||||
|
|
@ -125,6 +141,11 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
|
||||||
// (The preamble is supposed to be idempotent within a session.)
|
// (The preamble is supposed to be idempotent within a session.)
|
||||||
expect(privacyQuestions.length).toBe(1);
|
expect(privacyQuestions.length).toBe(1);
|
||||||
} finally {
|
} finally {
|
||||||
|
// Restore ambient env before other tests.
|
||||||
|
if (origGstackHome === undefined) delete process.env.GSTACK_HOME;
|
||||||
|
else process.env.GSTACK_HOME = origGstackHome;
|
||||||
|
if (origPath === undefined) delete process.env.PATH;
|
||||||
|
else process.env.PATH = origPath;
|
||||||
fs.rmSync(gstackHome, { recursive: true, force: true });
|
fs.rmSync(gstackHome, { recursive: true, force: true });
|
||||||
fs.rmSync(fakeBinDir, { recursive: true, force: true });
|
fs.rmSync(fakeBinDir, { recursive: true, force: true });
|
||||||
}
|
}
|
||||||
|
|
@ -150,6 +171,12 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
|
||||||
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
|
const askUserQuestions: Array<{ input: Record<string, unknown> }> = [];
|
||||||
const binary = resolveClaudeBinary();
|
const binary = resolveClaudeBinary();
|
||||||
|
|
||||||
|
// Ambient env mutations (see note on the first test).
|
||||||
|
const origGstackHome = process.env.GSTACK_HOME;
|
||||||
|
const origPath = process.env.PATH;
|
||||||
|
process.env.GSTACK_HOME = gstackHome;
|
||||||
|
process.env.PATH = `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await runAgentSdkTest({
|
await runAgentSdkTest({
|
||||||
systemPrompt: { type: 'preset', preset: 'claude_code' },
|
systemPrompt: { type: 'preset', preset: 'claude_code' },
|
||||||
|
|
@ -158,10 +185,6 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
|
||||||
workingDirectory: gstackHome,
|
workingDirectory: gstackHome,
|
||||||
maxTurns: 4,
|
maxTurns: 4,
|
||||||
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
|
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
|
||||||
env: {
|
|
||||||
GSTACK_HOME: gstackHome,
|
|
||||||
PATH: `${fakeBinDir}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`,
|
|
||||||
},
|
|
||||||
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
|
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
|
||||||
canUseTool: async (toolName, input) => {
|
canUseTool: async (toolName, input) => {
|
||||||
if (toolName === 'AskUserQuestion') {
|
if (toolName === 'AskUserQuestion') {
|
||||||
|
|
@ -193,6 +216,10 @@ describeE2E('gbrain-sync privacy gate fires once via preamble', () => {
|
||||||
});
|
});
|
||||||
expect(privacyQuestions.length).toBe(0);
|
expect(privacyQuestions.length).toBe(0);
|
||||||
} finally {
|
} finally {
|
||||||
|
if (origGstackHome === undefined) delete process.env.GSTACK_HOME;
|
||||||
|
else process.env.GSTACK_HOME = origGstackHome;
|
||||||
|
if (origPath === undefined) delete process.env.PATH;
|
||||||
|
else process.env.PATH = origPath;
|
||||||
fs.rmSync(gstackHome, { recursive: true, force: true });
|
fs.rmSync(gstackHome, { recursive: true, force: true });
|
||||||
fs.rmSync(fakeBinDir, { recursive: true, force: true });
|
fs.rmSync(fakeBinDir, { recursive: true, force: true });
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue