# MicroFish/backend/tests/interviews/test_orchestrator.py

from pathlib import Path
import pytest
from app.models.interview import InterviewPhase, SubagentKind
from app.services.interviews.base import PersonaRecord, MemoryDigest
from app.services.interview_orchestrator import (
    InterviewOrchestrator, PersonaProvider,
)

# Instrument directory, located relative to this file: <two directories above
# this file's folder>/scripts/instruments.
INST_DIR = Path(__file__).resolve().parents[2].joinpath("scripts", "instruments")

class _Mem:
    """Memory stub that returns the same available digest for every agent."""

    def get_digest(self, agent_id, max_chars=2000):
        """Return a constant digest; both arguments are ignored."""
        digest = MemoryDigest(text="x", available=True)
        return digest

class _LLM:
    """LLM stub that answers 3 to every item when the prompt looks longitudinal."""

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        """Return a canned response payload, or ``{}`` for unrecognised prompts.

        A prompt is recognised when the first system message contains
        "longitudinal" or, failing that, when the last message's content
        contains "stk_".
        """
        # Content of the first system message, or "" when there is none.
        system_prompt = ""
        for msg in messages:
            if msg["role"] == "system":
                system_prompt = msg["content"]
                break

        if "longitudinal" not in system_prompt:
            # Only fall back to the last message when the system prompt did not match.
            last_content = messages[-1].get("content") or ""
            if "stk_" not in last_content:
                return {}

        item_ids = (
            "stk_1", "stk_2", "stk_3",
            "gov_1", "gov_2", "gov_3",
            "mkt_1", "mkt_2", "mkt_3",
            "clm_1", "clm_2", "clm_3",
        )
        return {
            "responses": dict.fromkeys(item_ids, 3),
            "confidence": {},
            "open_comment": "ok",
        }

class _Personas(PersonaProvider):
    """Persona provider stub holding ``n`` records with ids 0..n-1."""

    def __init__(self, n=3):
        # Each record is named "A<id>" and shares the same placeholder persona text.
        self._items = [
            PersonaRecord(agent_id=idx, name=f"A{idx}", persona="p")
            for idx in range(n)
        ]

    def all(self):
        """Return a fresh list of the stored records."""
        return [*self._items]

class _NoopZep:
    """Zep writer stub: both write methods accept any arguments and do nothing."""

    def write_per_agent(self, *a, **kw):
        return None

    def write_aggregate(self, *a, **kw):
        return None

def test_pre_phase_runs_longitudinal_only(tmp_path):
    """The pre-phase result reports the longitudinal instrument and no diversity entry."""
    orchestrator_kwargs = dict(
        llm=_LLM(),
        memory=_Mem(),
        personas=_Personas(3),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim1",
        zep_writer=_NoopZep(),
        max_workers=2,
    )
    summary = InterviewOrchestrator(**orchestrator_kwargs).run_pre()

    # All three stub personas are expected to be counted as responded.
    assert summary["longitudinal"]["n_responded"] == 3
    # only longitudinal in pre-phase
    assert "diversity" not in summary

def test_partial_failure_does_not_kill_run(tmp_path):
    """An LLM that raises on some calls yields a result with failures, not an aborted run."""

    class _FlakyLLM:
        """Raises on every even-numbered call; otherwise answers 3 to every item."""

        def __init__(self):
            self.n = 0  # number of chat_json calls made so far

        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            self.n += 1
            call_is_even = self.n % 2 == 0
            if call_is_even:
                raise RuntimeError("simulated LLM 5xx")
            item_ids = (
                "stk_1", "stk_2", "stk_3",
                "gov_1", "gov_2", "gov_3",
                "mkt_1", "mkt_2", "mkt_3",
                "clm_1", "clm_2", "clm_3",
            )
            return {
                "responses": dict.fromkeys(item_ids, 3),
                "confidence": {},
                "open_comment": "ok",
            }

    orchestrator_kwargs = dict(
        llm=_FlakyLLM(),
        memory=_Mem(),
        personas=_Personas(4),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim2",
        zep_writer=_NoopZep(),
        max_workers=1,
    )
    summary = InterviewOrchestrator(**orchestrator_kwargs).run_pre()

    # Some, but not all, of the four agents should be reported as failed.
    assert summary["longitudinal"]["n_responded"] < 4
    assert summary["longitudinal"]["n_failed"] > 0


def test_schema_failure_audit_captures_raw_llm_output(tmp_path):
    """When an agent's LLM output fails the schema validator twice, the audit log
    should preserve both raw outputs so we can debug what the model actually said."""
    import json

    bad_response = {"wrong": "shape, no responses key"}

    class _BadLLM:
        """Always returns the same malformed payload (no "responses" key)."""

        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            # always fails Longitudinal validator
            return bad_response

    orchestrator_kwargs = dict(
        llm=_BadLLM(),
        memory=_Mem(),
        personas=_Personas(1),
        instrument_dir=INST_DIR,
        store_root=tmp_path,
        sim_id="sim3",
        zep_writer=_NoopZep(),
        max_workers=1,
    )
    result = InterviewOrchestrator(**orchestrator_kwargs).run_pre()

    # The single agent must be reported as failed, not responded.
    longitudinal = result["longitudinal"]
    assert longitudinal["n_responded"] == 0
    assert longitudinal["n_failed"] == 1

    # The run directory reported in the result holds the JSON-lines audit log.
    audit_file = Path(longitudinal["run_dir"]) / "audit.jsonl"
    audit_lines = audit_file.read_text(encoding="utf-8").splitlines()
    assert audit_lines, "audit.jsonl should not be empty"

    first_entry = json.loads(audit_lines[0])
    assert first_entry["event"] == "schema_validation_failure"
    assert first_entry["agent_id"] == 0

    detail = first_entry["detail"]
    assert detail["label"] == "longitudinal_T0"
    attempts = detail["attempts"]
    assert len(attempts) == 2
    # Both recorded attempts must carry the raw payload the stub returned.
    for attempt_index in (0, 1):
        assert attempts[attempt_index]["raw"] == bad_response