# MicroFish/backend/tests/interviews/test_orchestrator.py
#
# 96 lines
# 4.0 KiB
# Python

from pathlib import Path
import pytest
from app.models.interview import InterviewPhase, SubagentKind
from app.services.interviews.base import PersonaRecord, MemoryDigest
from app.services.interview_orchestrator import (
InterviewOrchestrator, PersonaProvider,
)
# Instrument directory handed to every orchestrator below: three directory
# levels up from this file, then "scripts/instruments".
INST_DIR = Path(__file__).resolve().parents[2].joinpath("scripts", "instruments")
class _Mem:
    """Memory stub that hands out the same available digest for every agent."""

    def get_digest(self, agent_id, max_chars=2000):
        """Return a fixed digest with text "x"; both arguments are ignored."""
        digest = MemoryDigest(text="x", available=True)
        return digest
class _LLM:
    """Fake LLM client that answers longitudinal prompts with canned JSON."""

    # Item ids included in the canned reply; each one is scored 3.
    _ITEM_IDS = (
        "stk_1", "stk_2", "stk_3",
        "gov_1", "gov_2", "gov_3",
        "mkt_1", "mkt_2", "mkt_3",
        "clm_1", "clm_2", "clm_3",
    )

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        """Return the canned reply for a longitudinal prompt, else ``{}``.

        A prompt counts as longitudinal when the first system message
        contains "longitudinal" or the last message contains "stk_".

        Args:
            messages: Sequence of chat message dicts with "role" and
                "content" keys. May be empty, and "content" may be None.
            temperature: Accepted for signature compatibility; unused.
            max_tokens: Accepted for signature compatibility; unused.
            **kw: Extra keyword arguments; ignored.

        Returns:
            A dict with "responses", "confidence" and "open_comment" keys
            for a longitudinal prompt, otherwise an empty dict.
        """
        # Guard both lookups against missing/None content so a sparse
        # message cannot raise TypeError in the substring checks below.
        system_text = next(
            (m.get("content") for m in messages if m.get("role") == "system"),
            None,
        ) or ""
        # An empty message list simply has no last message to inspect.
        last_text = (messages[-1].get("content") or "") if messages else ""

        if "longitudinal" in system_text or "stk_" in last_text:
            return {
                "responses": dict.fromkeys(self._ITEM_IDS, 3),
                "confidence": {},
                "open_comment": "ok",
            }
        return {}
class _Personas(PersonaProvider):
    """Persona provider stub serving a fixed number of generated personas."""

    def __init__(self, n=3):
        """Create ``n`` personas with agent ids 0..n-1, named "A<id>"."""
        records = []
        for idx in range(n):
            records.append(PersonaRecord(agent_id=idx, name=f"A{idx}", persona="p"))
        self._items = records

    def all(self):
        """Return a new list holding the stored persona records."""
        return self._items[:]
class _NoopZep:
    """Zep writer stub: both write methods accept any arguments and do nothing."""

    def write_per_agent(self, *a, **kw):
        """Discard a per-agent write."""
        return None

    def write_aggregate(self, *a, **kw):
        """Discard an aggregate write."""
        return None
def test_pre_phase_runs_longitudinal_only(tmp_path):
    """run_pre() reports three longitudinal responses and no diversity entry."""
    settings = dict(
        llm=_LLM(),
        memory=_Mem(),
        personas=_Personas(3),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim1",
        zep_writer=_NoopZep(),
        max_workers=2,
    )
    orchestrator = InterviewOrchestrator(**settings)

    outcome = orchestrator.run_pre()

    longitudinal = outcome["longitudinal"]
    assert longitudinal["n_responded"] == 3
    # Only the longitudinal instrument runs in the pre-phase.
    assert "diversity" not in outcome
def test_partial_failure_does_not_kill_run(tmp_path):
    """run_pre() still returns a summary when the LLM raises on some calls.

    With four personas and an LLM that fails every second call, the summary
    must show fewer than four responses and at least one failure.
    """
    item_ids = (
        "stk_1", "stk_2", "stk_3",
        "gov_1", "gov_2", "gov_3",
        "mkt_1", "mkt_2", "mkt_3",
        "clm_1", "clm_2", "clm_3",
    )

    class _FlakyLLM:
        """LLM stub that raises RuntimeError on every even-numbered call."""

        def __init__(self):
            self.n = 0

        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            self.n += 1
            if self.n % 2:
                # Odd-numbered calls succeed with the canned reply.
                return {
                    "responses": dict.fromkeys(item_ids, 3),
                    "confidence": {},
                    "open_comment": "ok",
                }
            raise RuntimeError("simulated LLM 5xx")

    orchestrator = InterviewOrchestrator(
        llm=_FlakyLLM(),
        memory=_Mem(),
        personas=_Personas(4),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim2",
        zep_writer=_NoopZep(),
        max_workers=1,
    )

    outcome = orchestrator.run_pre()

    assert outcome["longitudinal"]["n_responded"] < 4
    assert outcome["longitudinal"]["n_failed"] > 0
def test_schema_failure_audit_captures_raw_llm_output(tmp_path):
    """Both raw LLM outputs are kept in the audit log after schema failures.

    The stub LLM always returns a payload without a "responses" key, so the
    single agent fails. The first audit entry must then record two attempts,
    each carrying the raw payload for debugging.
    """
    import json

    bad_response = {"wrong": "shape, no responses key"}

    class _BadLLM:
        """LLM stub that always returns the malformed payload."""

        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            return bad_response

    orchestrator = InterviewOrchestrator(
        llm=_BadLLM(),
        memory=_Mem(),
        personas=_Personas(1),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim3",
        zep_writer=_NoopZep(),
        max_workers=1,
    )

    outcome = orchestrator.run_pre()
    assert outcome["longitudinal"]["n_responded"] == 0
    assert outcome["longitudinal"]["n_failed"] == 1

    # The audit log lives next to the run's other outputs, one JSON per line.
    audit_file = Path(outcome["longitudinal"]["run_dir"]) / "audit.jsonl"
    audit_lines = audit_file.read_text(encoding="utf-8").splitlines()
    assert audit_lines, "audit.jsonl should not be empty"

    first_entry = json.loads(audit_lines[0])
    assert first_entry["event"] == "schema_validation_failure"
    assert first_entry["agent_id"] == 0

    detail = first_entry["detail"]
    assert detail["label"] == "longitudinal_T0"
    attempts = detail["attempts"]
    assert len(attempts) == 2
    for attempt in attempts:
        assert attempt["raw"] == bad_response