mirror of https://github.com/garrytan/gstack.git
Merge PR #1369: gate domain-skill auto-promote on classifier_score > 0
This commit is contained in:
commit
8529aeeea7
|
|
@ -291,8 +291,20 @@ export async function writeSkill(input: WriteSkillInput): Promise<DomainSkillRow
|
||||||
*
|
*
|
||||||
* Auto-promote logic:
|
* Auto-promote logic:
|
||||||
* - increment use_count
|
* - increment use_count
|
||||||
* - if use_count >= PROMOTE_THRESHOLD AND flag_count == 0 → state:active
|
* - if use_count >= PROMOTE_THRESHOLD AND flag_count == 0 AND L4 has scored
|
||||||
* - else stay quarantined with updated counter
|
* the body (classifier_score > 0) → state:active
|
||||||
|
* - else stay quarantined with updated counter; user must run
|
||||||
|
* `domain-skill promote-to-global` manually
|
||||||
|
*
|
||||||
|
* The classifier_score > 0 gate is load-bearing: handleSave currently writes
|
||||||
|
* classifier_score=0 with the comment "L4 deferred to load-time / sidebar-agent
|
||||||
|
* fills this in on first prompt-injection load," but sidebar-agent was ripped
|
||||||
|
* (CLAUDE.md "Sidebar architecture") and nothing else updates the score, so
|
||||||
|
* skills authored via the production path never had their body scanned by L4.
|
||||||
|
* Without this gate, three benign uses promote any quarantined skill — including
|
||||||
|
* one written under the influence of a poisoned page — into the prompt context
|
||||||
|
* for every subsequent visit. The gate re-opens automatically the day L4 is
|
||||||
|
* rewired and writeSkill starts storing non-zero scores for recordSkillUse to read.
|
||||||
*/
|
*/
|
||||||
export async function recordSkillUse(host: string, projectSlug: string, classifierFlagged: boolean): Promise<DomainSkillRow | null> {
|
export async function recordSkillUse(host: string, projectSlug: string, classifierFlagged: boolean): Promise<DomainSkillRow | null> {
|
||||||
const normalized = normalizeHost(host);
|
const normalized = normalizeHost(host);
|
||||||
|
|
@ -303,7 +315,12 @@ export async function recordSkillUse(host: string, projectSlug: string, classifi
|
||||||
const useCount = current.use_count + 1;
|
const useCount = current.use_count + 1;
|
||||||
const flagCount = current.flag_count + (classifierFlagged ? 1 : 0);
|
const flagCount = current.flag_count + (classifierFlagged ? 1 : 0);
|
||||||
let state: SkillState = current.state;
|
let state: SkillState = current.state;
|
||||||
if (state === 'quarantined' && useCount >= PROMOTE_THRESHOLD && flagCount === 0) {
|
if (
|
||||||
|
state === 'quarantined' &&
|
||||||
|
useCount >= PROMOTE_THRESHOLD &&
|
||||||
|
flagCount === 0 &&
|
||||||
|
current.classifier_score > 0
|
||||||
|
) {
|
||||||
state = 'active';
|
state = 'active';
|
||||||
}
|
}
|
||||||
const updated: DomainSkillRow = {
|
const updated: DomainSkillRow = {
|
||||||
|
|
|
||||||
|
|
@ -106,6 +106,31 @@ describe('domain-skills: state machine (T6)', () => {
|
||||||
})
|
})
|
||||||
).rejects.toThrow(/classifier flagged/);
|
).rejects.toThrow(/classifier flagged/);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// domain-skill-commands.ts:140 (handleSave) writes classifier_score=0 with
|
||||||
|
// the comment "L4 deferred to load-time" — but sidebar-agent (the deferred
|
||||||
|
// scanner) was ripped per CLAUDE.md "Sidebar architecture." Without an
|
||||||
|
// explicit gate, three benign uses promote any quarantined skill, including
|
||||||
|
// one authored under a poisoned page, into prompt context permanently.
|
||||||
|
it('does NOT auto-promote when classifier_score is 0 (production handleSave shape)', async () => {
|
||||||
|
const m = await freshImport();
|
||||||
|
await m.writeSkill({
|
||||||
|
host: 'linkedin.com',
|
||||||
|
body: '# LinkedIn',
|
||||||
|
projectSlug: 'test-slug',
|
||||||
|
source: 'agent',
|
||||||
|
classifierScore: 0, // matches domain-skill-commands.ts:140 production path
|
||||||
|
});
|
||||||
|
const after3 = await m.recordSkillUse('linkedin.com', 'test-slug', false);
|
||||||
|
await m.recordSkillUse('linkedin.com', 'test-slug', false);
|
||||||
|
const final = await m.recordSkillUse('linkedin.com', 'test-slug', false);
|
||||||
|
expect(after3?.state).toBe('quarantined');
|
||||||
|
expect(final?.state).toBe('quarantined');
|
||||||
|
expect(final?.use_count).toBe(3);
|
||||||
|
// readSkill returns null for quarantined skills — they don't fire.
|
||||||
|
const read = await m.readSkill('linkedin.com', 'test-slug');
|
||||||
|
expect(read).toBeNull();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('domain-skills: scope shadowing (T4)', () => {
|
describe('domain-skills: scope shadowing (T4)', () => {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue