Merge PR #1369: gate domain-skill auto-promote on classifier_score > 0

2026-05-08 21:42:09 -07:00 · 2026-05-08 21:42:09 -07:00 · 8529aeeea7
parent 99402350db 01e584253d
commit 8529aeeea7
2 changed files with 45 additions and 3 deletions
--- a/browse/src/domain-skills.ts
+++ b/browse/src/domain-skills.ts
@@ -291,8 +291,20 @@ export async function writeSkill(input: WriteSkillInput): Promise<DomainSkillRow
 *
 * Auto-promote logic:
 *   - increment use_count
- *   - if use_count >= PROMOTE_THRESHOLD AND flag_count == 0 → state:active
- *   - else stay quarantined with updated counter
+ *   - if use_count >= PROMOTE_THRESHOLD AND flag_count == 0 AND L4 has scored
+ *     the body (classifier_score > 0) → state:active
+ *   - else stay quarantined with updated counter; user must run
+ *     `domain-skill promote-to-global` manually
+ *
+ * The classifier_score > 0 gate is load-bearing: handleSave currently writes
+ * classifier_score=0 with the comment "L4 deferred to load-time / sidebar-agent
+ * fills this in on first prompt-injection load," but sidebar-agent was ripped
+ * (CLAUDE.md "Sidebar architecture") and nothing else updates the score, so
+ * skills authored via the production path never had their body scanned by L4.
+ * Without this gate, three benign uses promote any quarantined skill — including
+ * one written under the influence of a poisoned page — into the prompt context
+ * for every subsequent visit. The gate re-opens automatically the day L4 is
+ * rewired and writeSkill / recordSkillUse start receiving non-zero scores.
 */
 export async function recordSkillUse(host: string, projectSlug: string, classifierFlagged: boolean): Promise<DomainSkillRow | null> {
  const normalized = normalizeHost(host);
@@ -303,7 +315,12 @@ export async function recordSkillUse(host: string, projectSlug: string, classifi
  const useCount = current.use_count + 1;
  const flagCount = current.flag_count + (classifierFlagged ? 1 : 0);
  let state: SkillState = current.state;
-  if (state === 'quarantined' && useCount >= PROMOTE_THRESHOLD && flagCount === 0) {
+  if (
+    state === 'quarantined' &&
+    useCount >= PROMOTE_THRESHOLD &&
+    flagCount === 0 &&
+    current.classifier_score > 0
+  ) {
    state = 'active';
  }
  const updated: DomainSkillRow = {
--- a/browse/test/domain-skills-storage.test.ts
+++ b/browse/test/domain-skills-storage.test.ts
@@ -106,6 +106,31 @@ describe('domain-skills: state machine (T6)', () => {
      })
    ).rejects.toThrow(/classifier flagged/);
  });
+
+  // domain-skill-commands.ts:140 (handleSave) writes classifier_score=0 with
+  // the comment "L4 deferred to load-time" — but sidebar-agent (the deferred
+  // scanner) was ripped per CLAUDE.md "Sidebar architecture." Without an
+  // explicit gate, three benign uses promote any quarantined skill, including
+  // one authored under a poisoned page, into prompt context permanently.
+  it('does NOT auto-promote when classifier_score is 0 (production handleSave shape)', async () => {
+    const m = await freshImport();
+    await m.writeSkill({
+      host: 'linkedin.com',
+      body: '# LinkedIn',
+      projectSlug: 'test-slug',
+      source: 'agent',
+      classifierScore: 0, // matches domain-skill-commands.ts:140 production path
+    });
+    const afterFirst = await m.recordSkillUse('linkedin.com', 'test-slug', false);
+    await m.recordSkillUse('linkedin.com', 'test-slug', false);
+    const afterThird = await m.recordSkillUse('linkedin.com', 'test-slug', false);
+    expect(afterFirst?.state).toBe('quarantined');
+    expect(afterThird?.state).toBe('quarantined');
+    expect(afterThird?.use_count).toBe(3);
+    // Still quarantined after three unflagged uses, so readSkill yields null.
+    const read = await m.readSkill('linkedin.com', 'test-slug');
+    expect(read).toBeNull();
+  });
 });

 describe('domain-skills: scope shadowing (T4)', () => {