diff --git a/backend/app/__init__.py b/backend/app/__init__.py index aba624bb..fdc49112 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -47,6 +47,20 @@ def create_app(config_class=Config): SimulationRunner.register_cleanup() if should_log_startup: logger.info("已注册模拟进程清理函数") + + # Install interview lifecycle hooks on the SimulationManager class. + # Hooks are stored on the class itself (not on a particular instance), so + # any fresh `SimulationManager()` constructed later (e.g. per request in + # the Flask API) will see them. We still bridge `_notify_on_completed` + # into SimulationRunner via a transient instance so the runner's monitor + # thread fires the completed hooks when a simulation process exits. + from .services.simulation_manager import SimulationManager + from .services.interviews.lifecycle import install_hooks + + install_hooks(SimulationManager) + SimulationRunner.register_on_completed(SimulationManager()._notify_on_completed) + if should_log_startup: + logger.info("已安装面试生命周期钩子") # 请求日志中间件 @app.before_request @@ -63,10 +77,8 @@ def create_app(config_class=Config): return response # 注册蓝图 - from .api import graph_bp, simulation_bp, report_bp - app.register_blueprint(graph_bp, url_prefix='/api/graph') - app.register_blueprint(simulation_bp, url_prefix='/api/simulation') - app.register_blueprint(report_bp, url_prefix='/api/report') + from .api import register_blueprints + register_blueprints(app) # 健康检查 @app.route('/health') diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index ffda743a..396750f2 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -2,13 +2,22 @@ API路由模块 """ -from flask import Blueprint +from flask import Blueprint, Flask graph_bp = Blueprint('graph', __name__) simulation_bp = Blueprint('simulation', __name__) report_bp = Blueprint('report', __name__) +interview_bp = Blueprint('interview', __name__) from . import graph # noqa: E402, F401 from . 
import simulation # noqa: E402, F401 from . import report # noqa: E402, F401 +from . import interview # noqa: E402, F401 + +def register_blueprints(app: Flask) -> None: + """Register all API blueprints on *app* with their canonical URL prefixes.""" + app.register_blueprint(graph_bp, url_prefix='/api/graph') + app.register_blueprint(simulation_bp, url_prefix='/api/simulation') + app.register_blueprint(report_bp, url_prefix='/api/report') + app.register_blueprint(interview_bp, url_prefix='/api/interview') diff --git a/backend/app/api/interview.py b/backend/app/api/interview.py new file mode 100644 index 00000000..e638aaab --- /dev/null +++ b/backend/app/api/interview.py @@ -0,0 +1,225 @@ +from __future__ import annotations +import threading +import traceback +import uuid +from pathlib import Path +from flask import Blueprint, jsonify, request, send_file +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider, ZepMemoryProvider +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.services.interviews.storage import InterviewStore +from app.utils.llm_client import LLMClient +from app.utils.logger import get_logger + +from . import interview_bp + +logger = get_logger(__name__) + + +class _NullUpdater: + """No-op stand-in for ``ZepGraphMemoryUpdater`` used when Zep is unavailable. + + Exposes ``add_text_episode`` so ``InterviewZepWriter._emit`` succeeds silently — + the interview pipeline still produces local artefacts; Zep just isn't updated. 
+ """ + + def add_text_episode(self, graph_id, text): # noqa: ARG002 - matches real API + return None + + +class _NullMemory: + """Fallback memory provider that always reports unavailable digests.""" + + def get_digest(self, agent_id, max_chars=2000): # noqa: ARG002 - matches Protocol + from app.services.interviews.base import MemoryDigest + return MemoryDigest(text="[memory unavailable]", available=False) + +_TASKS: dict[str, dict] = {} +_LOCK = threading.Lock() + +INSTRUMENT_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + + +def _uploads_root() -> Path: + return Path(getattr(Config, "UPLOADS_DIR", "uploads")) + + +def _load_graph_id(sim_id: str) -> str: + """Read the Zep ``graph_id`` for a simulation from its persisted state. + + The graph_id is written by ``SimulationManager`` into + ``uploads/simulations/{sim_id}/state.json``. Returns ``""`` if the state + file is missing or unreadable — callers should treat empty graph_id as + "Zep unavailable" and fall back to the null memory/writer path. + """ + try: + from app.services.simulation_manager import SimulationManager + state = SimulationManager().get_simulation(sim_id) + if state and state.graph_id: + return state.graph_id + except Exception as e: # pragma: no cover - defensive + logger.warning(f"_load_graph_id({sim_id}) failed: {e!r}") + return "" + + +def _build_orchestrator(sim_id: str) -> InterviewOrchestrator: + sim_dir = _uploads_root() / "simulations" / sim_id + reddit = sim_dir / "reddit_profiles.json" + twitter = sim_dir / "twitter_profiles.csv" + personas = FileSystemPersonaProvider( + reddit_path=reddit if reddit.exists() else None, + twitter_path=twitter if twitter.exists() else None, + ) + # Build agent_id -> Zep entity uuid map from the persisted profile files. + agent_to_entity = personas.agent_to_entity() + + # Resolve the graph_id from the simulation's persisted state — NOT from a + # ``graph_id.txt`` (nothing in the codebase writes such a file). 
+ graph_id = _load_graph_id(sim_id) + + memory: object + zep_writer: InterviewZepWriter + if not graph_id: + logger.warning( + f"interview: no graph_id for sim {sim_id} — Zep memory/writer disabled " + "(simulation state missing or graph_id empty)" + ) + memory = _NullMemory() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + else: + try: + from app.services.zep_entity_reader import ZepEntityReader + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + + reader = ZepEntityReader() + updater = ZepGraphMemoryUpdater(graph_id=graph_id) + memory = ZepMemoryProvider( + reader, graph_id=graph_id, agent_to_entity=agent_to_entity + ) + zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) + if not agent_to_entity: + logger.warning( + f"interview: empty agent_to_entity map for sim {sim_id} — " + "memory digests will be unavailable. Check that profile files " + "include `source_entity_uuid`." + ) + except Exception as e: + logger.warning( + f"interview: Zep init failed for sim {sim_id} ({e!r}); " + "falling back to null memory/writer" + ) + memory = _NullMemory() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + llm = LLMClient(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL, + model=Config.LLM_MODEL_NAME) + return InterviewOrchestrator( + llm=llm, memory=memory, personas=personas, + instrument_dir=INSTRUMENT_DIR, store_root=_uploads_root(), sim_id=sim_id, + zep_writer=zep_writer, max_workers=Config.INTERVIEW_MAX_WORKERS, + language=Config.INTERVIEW_DEFAULT_LANGUAGE, + ) + + +def _run_task(task_id: str, fn) -> None: + with _LOCK: + _TASKS[task_id] = {"status": "running", "progress": {}, "result": None, "error": None} + try: + result = fn(task_id) + with _LOCK: + _TASKS[task_id]["status"] = "completed"; _TASKS[task_id]["result"] = result + except Exception as e: + with _LOCK: + _TASKS[task_id]["status"] = "failed" + _TASKS[task_id]["error"] = repr(e) + 
_TASKS[task_id]["traceback"] = traceback.format_exc() + + +def _start_task(fn) -> str: + task_id = uuid.uuid4().hex[:12] + with _LOCK: + _TASKS[task_id] = {"status": "queued", "progress": {}, "result": None, "error": None} + threading.Thread(target=_run_task, args=(task_id, fn), daemon=True).start() + return task_id + + +def _envelope(data=None, error=None, status: int = 200): + body = {"success": error is None, "data": data or {}, "error": error} + return jsonify(body), status + + +@interview_bp.route("//pre", methods=["POST"]) +def post_pre(sim_id: str): + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.run_pre()) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("//post", methods=["POST"]) +def post_post(sim_id: str): + orch = _build_orchestrator(sim_id) + def run(tid): + out = orch.run_post() + synth = InterviewSynthesizer(store=orch.store) + out["synthesis"] = synth.run()[:1000] # short preview + return out + task_id = _start_task(run) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("//rerun", methods=["POST"]) +def post_rerun(sim_id: str): + body = request.get_json(silent=True) or {} + sub = body.get("subagent") + try: subagent = SubagentKind(sub) + except ValueError: return _envelope(error=f"unknown subagent {sub!r}", status=400) + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.rerun(subagent)) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("//status", methods=["GET"]) +def get_status(sim_id: str): + task_id = request.args.get("task_id") + with _LOCK: + task = _TASKS.get(task_id) + if task is None: return _envelope(error="unknown task_id", status=404) + return _envelope({"status": task["status"], "progress": task.get("progress", {}), + "result": task.get("result"), "error": task.get("error")}) + + +@interview_bp.route("//results/", methods=["GET"]) +def get_results(sim_id: str, subagent: str): + try: sub = SubagentKind(subagent) + except ValueError: 
return _envelope(error=f"unknown subagent {subagent!r}", status=400) + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + phase = InterviewPhase.T1 if sub != SubagentKind.LONGITUDINAL else InterviewPhase.T1 + run = store.latest_run(phase, sub) + if run is None: return _envelope(error="no results yet", status=404) + agg = (run / "aggregate.json") + if not agg.exists(): return _envelope(error="aggregate missing", status=404) + import json as _j + return _envelope({"aggregate": _j.loads(agg.read_text(encoding="utf-8")), + "run_dir": str(run)}) + + +@interview_bp.route("//results/synthesis", methods=["GET"]) +def get_synthesis(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + report = store.base / "synthesis" / "report.md" + if not report.exists(): + synth = InterviewSynthesizer(store=store) + synth.run() + return _envelope({"report_markdown": report.read_text(encoding="utf-8")}) + + +@interview_bp.route("//export.csv", methods=["GET"]) +def get_export_csv(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + csv_path = store.base / "synthesis" / "exports" / "all_responses.csv" + if not csv_path.exists(): + InterviewSynthesizer(store=store).run() + return send_file(csv_path, mimetype="text/csv", as_attachment=True, + download_name=f"{sim_id}_interviews.csv") diff --git a/backend/app/config.py b/backend/app/config.py index de63e2b4..a63ba39b 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -39,6 +39,8 @@ class Config: MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} + # Root directory for simulation uploads (used by the interview subsystem) + UPLOADS_DIR = os.environ.get("UPLOADS_DIR", os.path.join(os.path.dirname(__file__), '../uploads')) # 文本处理配置 DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 @@ -62,6 +64,12 @@ class Config: REPORT_AGENT_MAX_TOOL_CALLS = 
int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) + + # Interview subsystem + INTERVIEW_MAX_TOKENS_PER_RUN = int(os.environ.get("INTERVIEW_MAX_TOKENS_PER_RUN", 15_000_000)) + INTERVIEW_MAX_WORKERS = int(os.environ.get("INTERVIEW_MAX_WORKERS", 8)) + INTERVIEW_DEFAULT_LANGUAGE = os.environ.get("INTERVIEW_DEFAULT_LANGUAGE", "de") + LLM_STUB_MODE = os.environ.get("LLM_STUB_MODE", "false").lower() == "true" @classmethod def validate(cls) -> list[str]: diff --git a/backend/app/models/interview.py b/backend/app/models/interview.py new file mode 100644 index 00000000..980efc82 --- /dev/null +++ b/backend/app/models/interview.py @@ -0,0 +1,99 @@ +from __future__ import annotations +from enum import Enum +from typing import Optional +from pydantic import BaseModel, Field, field_validator, model_validator + +class InterviewPhase(str, Enum): + T0 = "T0" + T1 = "T1" + +class SubagentKind(str, Enum): + LONGITUDINAL = "longitudinal" + DIVERSITY = "diversity" + DELPHI = "delphi" + SCENARIO = "scenario" + +class LikertItem(BaseModel): + item_id: str + de: str + en: str + scale: int = Field(ge=3, le=7) + family: Optional[str] = None + reverse_coded: bool = False + + @field_validator("scale") + @classmethod + def odd_scale(cls, v: int) -> int: + if v not in (3, 5, 7): + raise ValueError("scale must be 3, 5, or 7") + return v + +class LikertInstrument(BaseModel): + name: str + version: str = "1.0" + language_default: str = "de" + items: list[LikertItem] + + @model_validator(mode="after") + def unique_item_ids(self) -> "LikertInstrument": + ids = [i.item_id for i in self.items] + if len(set(ids)) != len(ids): + raise ValueError("duplicate item_id in instrument") + return self + +class LikertResponse(BaseModel): + agent_id: int + phase: InterviewPhase + responses: dict[str, int] + confidence: 
dict[str, float] = Field(default_factory=dict) + open_comment: Optional[str] = None + memory_available: bool = True + failed_items: list[str] = Field(default_factory=list) + + @model_validator(mode="after") + def values_in_range(self) -> "LikertResponse": + for k, v in self.responses.items(): + if not 1 <= v <= 5: + raise ValueError(f"response {k}={v} out of 1..5 range") + for k, v in self.confidence.items(): + if not 0.0 <= v <= 1.0: + raise ValueError(f"confidence {k}={v} out of 0..1 range") + return self + +class QSortStatement(BaseModel): + statement_id: str + de: str + en: str + +class QSortInstrument(BaseModel): + name: str + version: str = "1.0" + statements: list[QSortStatement] + distribution: list[int] # e.g. [2,3,4,6,4,3,2] for -3..+3 + +class QSortResponse(BaseModel): + agent_id: int + placements: dict[str, int] # statement_id -> bucket (-3..+3) + likert_axes: dict[str, int] # axis_id -> 1..7 + +class DelphiOpenResponse(BaseModel): + agent_id: int + round: int = 1 + answers: dict[str, str] # question_id -> free text + +class DelphiRatingResponse(BaseModel): + agent_id: int + round: int + ratings: dict[str, dict[str, int]] # theme_id -> {importance, plausibility} + justification: Optional[str] = None + +class ScenarioRating(BaseModel): + desirability: int = Field(ge=1, le=7) + plausibility: int = Field(ge=1, le=7) + impact_on_my_group: int = Field(ge=1, le=7) + fairness: int = Field(ge=1, le=7) + if_woke_up_response: str + +class ScenarioResponse(BaseModel): + agent_id: int + ratings: dict[str, ScenarioRating] # scenario_id -> rating diff --git a/backend/app/services/interview_orchestrator.py b/backend/app/services/interview_orchestrator.py new file mode 100644 index 00000000..d87e90ea --- /dev/null +++ b/backend/app/services/interview_orchestrator.py @@ -0,0 +1,222 @@ +from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Protocol +from app.models.interview 
import ( + InterviewPhase, SubagentKind, LikertResponse, QSortResponse, + DelphiOpenResponse, DelphiRatingResponse, ScenarioResponse, +) +from app.services.interviews.base import PersonaRecord, SchemaValidationFailure +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate as longitudinal_aggregate +from app.services.interviews.diversity import DiversitySubagent, run_typology +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, group_stats_from_r2, +) +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix +from app.services.interviews.storage import InterviewStore +from app.services.interviews.instrument_loader import freeze_snapshot + + +class PersonaProvider(Protocol): + def all(self) -> list[PersonaRecord]: ... + + +class InterviewOrchestrator: + def __init__( + self, llm, memory, personas: PersonaProvider, + instrument_dir: Path, store_root: Path, sim_id: str, + zep_writer, max_workers: int = 8, language: str = "de", + ): + self.llm = llm + self.memory = memory + self.personas = personas + self.instrument_dir = Path(instrument_dir) + self.store = InterviewStore(root=store_root, sim_id=sim_id) + self.zep_writer = zep_writer + self.max_workers = max_workers + self.language = language + # Freeze snapshot once per orchestrator lifetime + freeze_snapshot( + instruments={ + "longitudinal": self.instrument_dir / "longitudinal_v1.yaml", + "diversity": self.instrument_dir / "diversity_v1.yaml", + "delphi": self.instrument_dir / "delphi_v1.yaml", + "scenario": self.instrument_dir / "scenario_v1.yaml", + }, + out_path=self.store.base / "instruments_used.json", + ) + + # --- Generic per-agent runner --- + def _fan_out(self, run_dir, agent_fn, personas, audit_label): + ok: list = [] + failed: list[int] = [] + with ThreadPoolExecutor(max_workers=self.max_workers) as pool: + futures = {pool.submit(agent_fn, p): p for p in personas} + for fut in as_completed(futures): + p = 
futures[fut] + try: + out = fut.result() + ok.append(out) + self.store.append_response(run_dir, out) + except SchemaValidationFailure as e: + failed.append(p.agent_id) + self.store.audit(run_dir, agent_id=p.agent_id, + event="schema_validation_failure", + detail={"label": audit_label, "attempts": e.attempts}) + except Exception as e: + failed.append(p.agent_id) + self.store.audit(run_dir, agent_id=p.agent_id, + event="agent_failed", detail=f"{audit_label}: {e!r}") + return ok, failed + + # --- Pre-phase (T0) --- + def run_pre(self) -> dict: + sub = LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T0), + self.personas.all(), audit_label="longitudinal_T0", + ) + for r in ok: + persona = next(p for p in self.personas.all() if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"longitudinal": {"n_responded": len(ok), "n_failed": len(failed), + "run_dir": str(run_dir)}} + + # --- Post-phase (T1) --- + def run_post(self) -> dict: + personas = self.personas.all() + out: dict = {} + with ThreadPoolExecutor(max_workers=4) as pool: + futures = { + "longitudinal": pool.submit(self._post_longitudinal, personas), + "diversity": pool.submit(self._post_diversity, personas), + "scenario": pool.submit(self._post_scenario, personas), + } + for name, fut in futures.items(): + try: out[name] = fut.result() + except Exception as e: out[name] = {"error": repr(e)} + # Delphi runs sequentially (R1 → R2 → R3) and uses the LLM for theme extraction + try: out["delphi"] = self._post_delphi(personas) + except Exception as e: out["delphi"] = {"error": repr(e)} + return out + + def _post_longitudinal(self, personas) -> dict: + sub = 
LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T1), + personas, audit_label="longitudinal_T1", + ) + # Aggregate using T0 + T1 + t0_path = self.store.latest_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + t0_raw = self.store.read_responses(t0_path) if t0_path else [] + t0 = [LikertResponse(**d) for d in t0_raw] + agg = longitudinal_aggregate(t0, ok) + self.store.write_aggregate(run_dir, agg) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + try: self.zep_writer.write_aggregate(SubagentKind.LONGITUDINAL, + f"n_paired={agg['n_paired']}") + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_diversity(self, personas) -> dict: + sub = DiversitySubagent(self.llm, self.memory, + self.instrument_dir / "diversity_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.DIVERSITY) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="diversity", + ) + typology = run_typology(ok) + self.store.write_named(run_dir, "typology.json", typology) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "clusters": typology["clusters"]}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DIVERSITY, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_scenario(self, personas) -> dict: + sub = 
ScenarioSubagent(self.llm, self.memory, + self.instrument_dir / "scenario_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.SCENARIO) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="scenario", + ) + matrix = polarity_matrix(ok) + self.store.write_named(run_dir, "polarity_matrix.json", matrix) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "polarity": matrix}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.SCENARIO, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_delphi(self, personas) -> dict: + sub = DelphiSubagent(self.llm, self.memory, + self.instrument_dir / "delphi_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.DELPHI) + # Round 1 + r1_ok, r1_failed = self._fan_out( + run_dir, lambda p: sub.administer_round1(p), personas, audit_label="delphi_r1", + ) + # Move all R1 responses into a dedicated file + for r in r1_ok: self.store.append_jsonl(run_dir, "round1_themes.jsonl", r) + # Extract themes from R1 + themes = extract_themes(r1_ok, llm=self.llm) + self.store.write_named(run_dir, "themes.json", {"themes": themes}) + # Round 2 + r2_ok, r2_failed = self._fan_out( + run_dir, lambda p: sub.administer_round2(p, themes), + [p for p in personas if p.agent_id in {r.agent_id for r in r1_ok}], + audit_label="delphi_r2", + ) + for r in r2_ok: self.store.append_jsonl(run_dir, "round2_ratings.jsonl", r) + gstats = group_stats_from_r2(r2_ok) + # Round 3 + r2_by = {r.agent_id: r for r in r2_ok} + r3_personas = [p for p in personas if p.agent_id in r2_by] + def r3_call(p): return sub.administer_round3(p, themes, gstats, r2_by[p.agent_id]) + r3_ok, r3_failed = self._fan_out(run_dir, 
r3_call, r3_personas, audit_label="delphi_r3") + for r in r3_ok: self.store.append_jsonl(run_dir, "round3_revisions.jsonl", r) + # Convergence + conv = convergence_metrics(r2_ok, r3_ok) + self.store.write_named(run_dir, "convergence.json", conv) + self.store.write_aggregate(run_dir, { + "n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "n_failed_r1": len(r1_failed), "n_failed_r2": len(r2_failed), "n_failed_r3": len(r3_failed), + "themes": themes, + }) + for r in r3_ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DELPHI, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "run_dir": str(run_dir)} + + # --- Re-run a single subagent --- + def rerun(self, subagent: SubagentKind) -> dict: + personas = self.personas.all() + if subagent == SubagentKind.LONGITUDINAL: return {"longitudinal": self._post_longitudinal(personas)} + if subagent == SubagentKind.DIVERSITY: return {"diversity": self._post_diversity(personas)} + if subagent == SubagentKind.SCENARIO: return {"scenario": self._post_scenario(personas)} + if subagent == SubagentKind.DELPHI: return {"delphi": self._post_delphi(personas)} + raise ValueError(f"unknown subagent {subagent}") diff --git a/backend/app/services/interview_synthesizer.py b/backend/app/services/interview_synthesizer.py new file mode 100644 index 00000000..a74609ae --- /dev/null +++ b/backend/app/services/interview_synthesizer.py @@ -0,0 +1,160 @@ +from __future__ import annotations +import csv +import json +from pathlib import Path +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.storage import InterviewStore + + +class InterviewSynthesizer: + def __init__(self, store: InterviewStore): + self.store = store + + def _maybe(self, phase: InterviewPhase, sub: SubagentKind) -> dict | None: + run = self.store.latest_run(phase, sub) 
+ if run is None: + return None + agg = run / "aggregate.json" + if not agg.exists(): + return None + return {"run_dir": str(run), "aggregate": json.loads(agg.read_text(encoding="utf-8"))} + + def _instrument_hashes(self) -> dict: + snap = self.store.base / "instruments_used.json" + if not snap.exists(): + return {} + try: + data = json.loads(snap.read_text(encoding="utf-8")) + except Exception: + return {} + return {k: v.get("hash") for k, v in data.items()} + + def _limitations_text(self, present: dict[str, bool]) -> str: + lines = [ + "## Limitations", + "- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse " + "and the LLM jointly encode each stakeholder type, not what an actual fisher or NGO " + "staffer would say. The instrument measures the *model of the stakeholder*, not the stakeholder.", + "- **Memory digest is lossy.** Each agent's experience of OASIS is summarised to bounded length; " + "agents do not have full episodic recall.", + "- **LLM acquiescence and centrality bias.** Likert scales with LLM respondents skew toward 3–4 " + "of 5; check per-item distribution shape before drawing conclusions.", + "- **N is what it is.** `n_responded` and `n_failed` are printed verbatim per subagent; no smoothing.", + "- **Instrument provenance.** Hashes of frozen instruments are listed below; an identical run " + "is reproducible from these snapshots.", + ] + for k, ok in present.items(): + if not ok: + lines.append(f"- *{k}* subagent results are missing for this run.") + return "\n".join(lines) + + def run(self) -> str: + sections: list[str] = [] + sections.append("# Stakeholder Interview Synthesis\n") + + long_t0 = self._maybe(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + long_t1 = self._maybe(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + if long_t1: + agg = long_t1["aggregate"] + sections.append("## Longitudinal opinion drift (T0 → T1)") + sections.append(f"- N paired: {agg.get('n_paired', 'NA')}") + per_item = 
agg.get("per_item", {}) + top = sorted(per_item.items(), + key=lambda kv: abs(kv[1].get("mean_delta") or 0), reverse=True)[:5] + sections.append("- Largest mean shifts:") + for k, v in top: + sections.append(f" - `{k}`: Δ̄ = {v.get('mean_delta'):+0.2f} (n={v.get('n')})") + + diversity = self._maybe(InterviewPhase.T1, SubagentKind.DIVERSITY) + if diversity: + clusters = diversity["aggregate"].get("clusters", []) + sections.append("## Stakeholder typology") + sections.append(f"- N agents: {diversity['aggregate'].get('n', 'NA')}") + sections.append(f"- Clusters: {len(clusters)}") + for c in clusters: + sections.append(f" - cluster {c['cluster_id']}: n={c['n']}, " + f"top loadings = {list(c['top_loadings'].keys())[:5]}") + + delphi = self._maybe(InterviewPhase.T1, SubagentKind.DELPHI) + if delphi: + agg = delphi["aggregate"] + sections.append("## Delphi consensus") + sections.append(f"- Rounds completed: R1={agg.get('n_r1')}, R2={agg.get('n_r2')}, R3={agg.get('n_r3')}") + themes = agg.get("themes", []) + sections.append(f"- Themes: {[t.get('label') for t in themes]}") + + scenario = self._maybe(InterviewPhase.T1, SubagentKind.SCENARIO) + if scenario: + pol = scenario["aggregate"].get("polarity", {}) + sections.append("## Scenario evaluation") + for sid in sorted(pol): + v = pol[sid] + if v.get("n", 0) == 0: + continue + sections.append( + f"- **{sid}**: n={v['n']}, desirability {v['mean_desirability']:.2f}, " + f"plausibility {v['mean_plausibility']:.2f}, impact {v['mean_impact']:.2f}, " + f"fairness {v['mean_fairness']:.2f}") + + sections.append("") + sections.append(self._limitations_text({ + "longitudinal": bool(long_t1), + "diversity": bool(diversity), + "delphi": bool(delphi), + "scenario": bool(scenario), + })) + sections.append("") + sections.append("### Instrument provenance") + for name, h in self._instrument_hashes().items(): + sections.append(f"- `{name}`: hash `{h}`") + + report = "\n\n".join(sections) + out_dir = self.store.base / "synthesis" + 
out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "report.md").write_text(report, encoding="utf-8") + self._write_tidy_csv(out_dir / "exports" / "all_responses.csv") + return report + + def _write_tidy_csv(self, csv_path: Path) -> None: + csv_path.parent.mkdir(parents=True, exist_ok=True) + rows: list[dict] = [] + for phase in (InterviewPhase.T0, InterviewPhase.T1): + for sub in SubagentKind: + run = self.store.latest_run(phase, sub) + if run is None: + continue + files = ["responses.jsonl", "round1_themes.jsonl", + "round2_ratings.jsonl", "round3_revisions.jsonl"] + for fname in files: + for rec in self.store.read_responses(run, fname): + flat = self._flatten(rec, phase=phase.value, subagent=sub.value) + rows.extend(flat) + if not rows: + csv_path.write_text("phase,subagent,agent_id,key,value\n", encoding="utf-8") + return + fieldnames = sorted({k for r in rows for k in r.keys()}) + with csv_path.open("w", encoding="utf-8", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: + w.writerow(r) + + def _flatten(self, rec: dict, *, phase: str, subagent: str) -> list[dict]: + out: list[dict] = [] + aid = rec.get("agent_id") + for key, val in rec.items(): + if key == "agent_id": + continue + if isinstance(val, dict): + for k2, v2 in val.items(): + if isinstance(v2, dict): + for k3, v3 in v2.items(): + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}.{k3}", "value": v3}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}", "value": v2}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": key, "value": val}) + return out diff --git a/backend/app/services/interviews/__init__.py b/backend/app/services/interviews/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/app/services/interviews/adapters.py b/backend/app/services/interviews/adapters.py new file mode 100644 index 
00000000..06d05e94 --- /dev/null +++ b/backend/app/services/interviews/adapters.py @@ -0,0 +1,133 @@ +from __future__ import annotations +import csv +import json +from pathlib import Path +from typing import Optional +from app.services.interviews.base import PersonaRecord, MemoryDigest + + +class FileSystemPersonaProvider: + """Reads OASIS profiles from the simulation's `reddit_profiles.json` and/or `twitter_profiles.csv`. + + If both are present, agents from `reddit_profiles.json` take precedence; twitter-only agents are appended. + """ + + def __init__(self, reddit_path: Optional[Path], twitter_path: Optional[Path]): + self.reddit_path = Path(reddit_path) if reddit_path else None + self.twitter_path = Path(twitter_path) if twitter_path else None + + def _load_reddit(self) -> list[PersonaRecord]: + if not self.reddit_path or not self.reddit_path.exists(): + return [] + data = json.loads(self.reddit_path.read_text(encoding="utf-8")) + out = [] + for row in data: + out.append(PersonaRecord( + agent_id=int(row.get("user_id")), + name=str(row.get("name") or row.get("user_name") or f"agent_{row.get('user_id')}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def _load_twitter(self) -> list[PersonaRecord]: + if not self.twitter_path or not self.twitter_path.exists(): + return [] + out = [] + with self.twitter_path.open("r", encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + if not row.get("user_id"): + continue + out.append(PersonaRecord( + agent_id=int(row["user_id"]), + name=str(row.get("name") or row.get("user_name") or f"agent_{row['user_id']}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def all(self) -> list[PersonaRecord]: + reddit = self._load_reddit() + seen = {p.agent_id for p in reddit} + twitter = [p for p in self._load_twitter() if p.agent_id not in 
def agent_to_entity(self) -> dict[int, str]:
    """Build the ``{agent_id: zep_entity_uuid}`` map from the persisted profile files.

    Each profile row is expected to carry ``user_id`` and
    ``source_entity_uuid``. Reddit rows are read first and take precedence;
    Twitter rows only fill agents that are not mapped yet. Rows with a
    missing/blank uuid or an id that cannot be parsed as an integer are
    skipped individually, so one malformed row no longer discards the rows
    that follow it.

    The lookup is best-effort: an unreadable or malformed file contributes
    nothing instead of raising. Returns an empty dict if neither file is
    present or no row has the field.
    """
    mapping: dict[int, str] = {}

    # Reddit JSON
    if self.reddit_path and self.reddit_path.exists():
        try:
            rows = json.loads(self.reddit_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            rows = []
        if not isinstance(rows, list):
            rows = []
        for row in rows:
            if not isinstance(row, dict):
                continue
            uid = row.get("user_id")
            uuid_ = row.get("source_entity_uuid")
            if uid is None or not uuid_:
                continue
            try:
                agent_id = int(uid)
            except (TypeError, ValueError):
                continue
            mapping[agent_id] = str(uuid_)

    # Twitter CSV (only fills agents not already mapped)
    if self.twitter_path and self.twitter_path.exists():
        try:
            with self.twitter_path.open("r", encoding="utf-8", newline="") as f:
                for row in csv.DictReader(f):
                    uid = row.get("user_id")
                    uuid_ = row.get("source_entity_uuid")
                    if not uid or not uuid_:
                        continue
                    try:
                        agent_id = int(uid)
                    except (TypeError, ValueError):
                        continue
                    mapping.setdefault(agent_id, str(uuid_))
        except OSError:
            pass

    return mapping
def coerce_int(value: Any) -> Optional[int]:
    """Coerce an LLM-returned Likert value into an ``int``.

    Models often send numeric answers as JSON strings (``"3"`` rather than
    ``3``). Accepted inputs are real ints and strings that, once stripped of
    surrounding whitespace, consist of optional leading ``-`` signs followed
    by digits and parse with ``int()``. Everything else yields ``None``.
    Bools are rejected so ``True``/``False`` are not read as 1/0.
    """
    # bool is a subclass of int, so it has to be ruled out before the int test.
    if isinstance(value, bool) or not isinstance(value, (int, str)):
        return None
    if isinstance(value, int):
        return value

    text = value.strip()
    if not text.lstrip("-").isdigit():
        # Also covers "" and "-": an empty remainder is never a digit string.
        return None
    try:
        return int(text)
    except ValueError:
        # e.g. "--3" or digit-like characters that int() does not accept.
        return None
def ask_in_character(
    self,
    persona: PersonaRecord,
    user_prompt: str,
    schema_hint: str,
    *,
    temperature: float = 0.3,
    max_tokens: Optional[int] = None,
    validate: Optional[Callable[[dict], Optional[dict]]] = None,
) -> dict:
    """Ask the LLM to answer *user_prompt* as *persona* and return the parsed JSON.

    The system prompt is built from the persona, its memory digest and
    *schema_hint*. Without *validate* the first reply is returned as is. With
    *validate*, a reply is accepted when the callback returns a non-``None``
    dict (that dict is returned). A rejected first reply triggers exactly one
    retry at temperature 0.0, in which the rejected reply is replayed to the
    model as JSON text together with a reminder of the schema.

    Raises:
        SchemaValidationFailure: if the retry is rejected as well; both raw
            replies are attached as ``attempts``.
    """
    import json  # local import: nothing at module level provides json here

    digest = self.memory.get_digest(persona.agent_id)
    messages = [
        {"role": "system", "content": self._system_prompt(persona, digest, schema_hint)},
        {"role": "user", "content": user_prompt},
    ]
    first = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens)
    if validate is None:
        return first

    validated = validate(first)
    if validated is not None:
        return validated

    # Replay the rejected answer as real JSON. str() would give the Python
    # repr (single quotes, None/True), which contradicts the "valid JSON"
    # instruction that follows it.
    try:
        previous = json.dumps(first, ensure_ascii=False)
    except (TypeError, ValueError):
        previous = str(first)
    messages.append({"role": "assistant", "content": previous})
    messages.append({"role": "user", "content":
                     "Your previous response did not match the required schema. "
                     f"Return ONLY valid JSON matching: {schema_hint}"})
    second = self.llm.chat_json(messages=messages, temperature=0.0, max_tokens=max_tokens)
    validated = validate(second)
    if validated is None:
        raise SchemaValidationFailure(
            persona.agent_id,
            attempts=[
                {"attempt": 1, "raw": first, "schema_hint": schema_hint},
                {"attempt": 2, "raw": second, "schema_hint": schema_hint},
            ],
        )
    return validated
q in self.instrument["questions"]} + if not required.issubset(ans.keys()): return None + return raw + + def administer_round1(self, persona: PersonaRecord) -> DelphiOpenResponse: + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._r1_prompt(), + schema_hint=self._r1_schema(), validate=self._r1_validate, + ) + return DelphiOpenResponse(agent_id=persona.agent_id, round=1, + answers={k: str(v) for k, v in raw["answers"].items()}) + + # --- Round 2: rate themes --- + def _r2_schema(self, theme_ids: list[str]) -> str: + return json.dumps({ + "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids} + }, ensure_ascii=False) + + def _r2_prompt(self, themes: list[dict]) -> str: + head = "Bewerten Sie jedes Thema nach Wichtigkeit (1-5) und Plausibilität (1-5):" if self.language == "de" \ + else "Rate each theme on importance (1-5) and plausibility (1-5):" + body = [f"- [{t['theme_id']}] {t['label']}" for t in themes] + return head + "\n" + "\n".join(body) + + def _r2_validate(self, theme_ids: list[str]): + def v(raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + ratings = raw.get("ratings", {}) + if set(ratings.keys()) != set(theme_ids): return None + for tid, r in ratings.items(): + if not isinstance(r, dict): return None + coerced: dict[str, int] = {} + for key in ("importance", "plausibility"): + iv = coerce_int(r.get(key)) + if iv is None or not 1 <= iv <= 5: return None + coerced[key] = iv + ratings[tid] = coerced + return raw + return v + + def administer_round2(self, persona: PersonaRecord, themes: list[dict]) -> DelphiRatingResponse: + theme_ids = [t["theme_id"] for t in themes] + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._r2_prompt(themes), + schema_hint=self._r2_schema(theme_ids), validate=self._r2_validate(theme_ids), + ) + return DelphiRatingResponse(agent_id=persona.agent_id, round=2, + ratings={k: dict(v) for k, v in raw["ratings"].items()}) + + # --- Round 3: revise 
def administer_round3(
    self, persona: PersonaRecord, themes: list[dict], group_stats: dict, own_r2: DelphiRatingResponse
) -> DelphiRatingResponse:
    """Run Delphi round 3: let *persona* revise its ratings after seeing group statistics.

    For every theme the prompt shows the group's median/IQR (read from
    *group_stats* by theme id) next to the persona's own round-2 ratings
    from *own_r2*. The reply must contain a ``ratings`` dict keyed by exactly
    the theme ids, each with ``importance`` and ``plausibility`` in 1..5; an
    optional ``justification`` is passed through. A reply whose ``ratings``
    is missing or not a dict is rejected by the validator instead of raising
    ``AttributeError``.
    """
    theme_ids = [t["theme_id"] for t in themes]
    expected_ids = set(theme_ids)

    if self.language == "de":
        head = ("Sie sehen unten die anonymisierten Gruppenwerte (Median, IQR). "
                "Bitte überarbeiten Sie Ihre Bewertungen, wenn Sie möchten, und begründen Sie kurz.")
    else:
        head = ("Below are the anonymised group values (median, IQR). "
                "Please revise your ratings if you wish and add a short justification.")

    ctx_lines = []
    for t in themes:
        tid = t["theme_id"]
        gs = group_stats.get(tid, {})
        own = own_r2.ratings.get(tid, {})
        ctx_lines.append(
            f"[{tid}] {t['label']} — group importance median={gs.get('imp_median')}, "
            f"IQR={gs.get('imp_iqr')}; plausibility median={gs.get('plaus_median')}, "
            f"IQR={gs.get('plaus_iqr')}. Your R2: imp={own.get('importance')}, plaus={own.get('plausibility')}."
        )
    prompt = head + "\n\n" + "\n".join(ctx_lines)
    schema = json.dumps({
        "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids},
        "justification": "",
    }, ensure_ascii=False)

    def validate(raw):
        if not isinstance(raw, dict):
            return None
        ratings = raw.get("ratings")
        if not isinstance(ratings, dict) or set(ratings) != expected_ids:
            return None
        coerced_all: dict[str, dict[str, int]] = {}
        for tid, r in ratings.items():
            if not isinstance(r, dict):
                return None
            coerced: dict[str, int] = {}
            for key in ("importance", "plausibility"):
                iv = coerce_int(r.get(key))
                if iv is None or not 1 <= iv <= 5:
                    return None
                coerced[key] = iv
            coerced_all[tid] = coerced
        # Only touch the payload once every rating has been accepted.
        raw["ratings"] = coerced_all
        return raw

    raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
                                            schema_hint=schema, validate=validate)
    return DelphiRatingResponse(
        agent_id=persona.agent_id, round=3,
        ratings={k: dict(v) for k, v in raw["ratings"].items()},
        justification=raw.get("justification"),
    )
[] + for r in round1: + for qid, ans in r.answers.items(): + text_blocks.append(f"[agent {r.agent_id} {qid}] {ans}") + schema = json.dumps({"themes": [{"theme_id": "", "label": ""}]}, ensure_ascii=False) + messages = [ + {"role": "system", "content": + "You extract distinct thematic codes from open-ended German fisheries survey responses. " + f"Return JSON ONLY matching: {schema}. Use stable theme_ids of form theme_0, theme_1, …"}, + {"role": "user", "content": "Responses:\n" + "\n".join(text_blocks) + "\n\nReturn up to 12 distinct themes."}, + ] + raw = llm.chat_json(messages=messages, temperature=0.0) + themes = raw.get("themes", []) if isinstance(raw, dict) else [] + out = [] + for i, t in enumerate(themes): + if isinstance(t, dict) and "label" in t: + out.append({"theme_id": t.get("theme_id") or f"theme_{i}", "label": str(t["label"])}) + return out + + +def _iqr(xs: list[float]) -> float: + if not xs: return 0.0 + xs = sorted(xs) + q1 = statistics.quantiles(xs, n=4)[0] if len(xs) >= 4 else xs[0] + q3 = statistics.quantiles(xs, n=4)[2] if len(xs) >= 4 else xs[-1] + return q3 - q1 + + +def convergence_metrics(r2: list[DelphiRatingResponse], r3: list[DelphiRatingResponse]) -> dict: + by_r2 = {r.agent_id: r for r in r2} + by_r3 = {r.agent_id: r for r in r3} + themes: set[str] = set() + for r in r2 + r3: + themes.update(r.ratings.keys()) + out: dict[str, dict] = {} + for t in sorted(themes): + imp_r2 = [by_r2[a].ratings[t]["importance"] for a in by_r2 if t in by_r2[a].ratings] + imp_r3 = [by_r3[a].ratings[t]["importance"] for a in by_r3 if t in by_r3[a].ratings] + plaus_r2 = [by_r2[a].ratings[t]["plausibility"] for a in by_r2 if t in by_r2[a].ratings] + plaus_r3 = [by_r3[a].ratings[t]["plausibility"] for a in by_r3 if t in by_r3[a].ratings] + out[t] = { + "imp_median_r2": statistics.median(imp_r2) if imp_r2 else None, + "imp_median_r3": statistics.median(imp_r3) if imp_r3 else None, + "imp_iqr_r2": _iqr(imp_r2), + "imp_iqr_r3": _iqr(imp_r3), + 
def group_stats_from_r2(r2: list[DelphiRatingResponse]) -> dict:
    """Summarise round-2 ratings per theme for the round-3 feedback prompt.

    Returns ``{theme_id: {"imp_median", "imp_iqr", "plaus_median",
    "plaus_iqr"}}`` for every theme that appears in at least one response.
    Themes are emitted in sorted order, as ``convergence_metrics`` does, so
    the result is identical between runs rather than depending on set
    iteration order.
    """
    themes: set[str] = set()
    for r in r2:
        themes.update(r.ratings.keys())

    stats: dict[str, dict] = {}
    for t in sorted(themes):
        scored = [r.ratings[t] for r in r2 if t in r.ratings]
        imps = [entry["importance"] for entry in scored]
        plauss = [entry["plausibility"] for entry in scored]
        stats[t] = {
            "imp_median": statistics.median(imps) if imps else None,
            "imp_iqr": _iqr(imps),
            "plaus_median": statistics.median(plauss) if plauss else None,
            "plaus_iqr": _iqr(plauss),
        }
    return stats
!= len(data["statements"]): + raise InstrumentValidationError("distribution sum must equal number of statements") + return data + + def _schema_hint(self) -> str: + return json.dumps({ + "placements": {s["statement_id"]: "" for s in self.instrument["statements"]}, + "likert_axes": {a["axis_id"]: "" for a in self.instrument["likert_axes"]}, + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + dist = self.instrument["distribution"] + buckets = list(range(-3, 4)) + bucket_desc = ", ".join(f"{b}:{n}" for b, n in zip(buckets, dist)) + lines = [ + ("Ordnen Sie jede Aussage genau einer Box von -3 (lehne stark ab) bis +3 (stimme stark zu) zu. " + f"Die Verteilung ist erzwungen: {bucket_desc}.") if self.language == "de" else + ("Place every statement into exactly one box from -3 (strongly disagree) to +3 (strongly agree). " + f"The distribution is forced: {bucket_desc}."), + "", + "Statements:", + ] + for s in self.instrument["statements"]: + txt = s["de"] if self.language == "de" else s["en"] + lines.append(f"- [{s['statement_id']}] {txt}") + lines += ["", "Then rate each axis from 1 to 7:"] + for a in self.instrument["likert_axes"]: + txt = a["de"] if self.language == "de" else a["en"] + lines.append(f"- [{a['axis_id']}] {txt}") + return "\n".join(lines) + + def _validator(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): + return None + placements = raw.get("placements", {}) + axes = raw.get("likert_axes", {}) + statements = {s["statement_id"] for s in self.instrument["statements"]} + if set(placements.keys()) != statements: + return None + dist = self.instrument["distribution"] + target = {b: n for b, n in zip(range(-3, 4), dist)} + got: dict[int, int] = {} + coerced_p: dict[str, int] = {} + for k, v in placements.items(): + iv = coerce_int(v) + if iv is None or not -3 <= iv <= 3: + return None + coerced_p[k] = iv + got[iv] = got.get(iv, 0) + 1 + if got != target: + return None + coerced_a: dict[str, int] = {} + for a in 
self.instrument["likert_axes"]: + iv = coerce_int(axes.get(a["axis_id"])) + if iv is None or not 1 <= iv <= 7: + return None + coerced_a[a["axis_id"]] = iv + raw["placements"] = coerced_p + raw["likert_axes"] = coerced_a + return raw + + def administer(self, persona: PersonaRecord) -> QSortResponse: + raw = self.interviewer.ask_in_character( + persona, + user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), + validate=self._validator, + ) + return QSortResponse( + agent_id=persona.agent_id, + placements={k: int(v) for k, v in raw["placements"].items()}, + likert_axes={k: int(v) for k, v in raw["likert_axes"].items()}, + ) + + +def _vectorize(r: QSortResponse, statements: list[str], axes: list[str]) -> np.ndarray: + return np.array( + [r.placements.get(s, 0) for s in statements] + + [r.likert_axes.get(a, 4) for a in axes], + dtype=float, + ) + + +def run_typology(responses: list[QSortResponse], n_clusters: int = 4) -> dict: + if not responses: + return {"n": 0, "clusters": [], "pca": {"components": [], "explained_variance": []}} + statements = sorted({k for r in responses for k in r.placements}) + axes = sorted({k for r in responses for k in r.likert_axes}) + X = np.vstack([_vectorize(r, statements, axes) for r in responses]) + n_clusters = min(n_clusters, len(responses)) + pca = PCA(n_components=min(5, X.shape[1], X.shape[0])) + pcs = pca.fit_transform(X) + km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0) + labels = km.fit_predict(X) + clusters = [] + for c in range(n_clusters): + members = [responses[i].agent_id for i in range(len(responses)) if labels[i] == c] + centroid = km.cluster_centers_[c] + clusters.append({ + "cluster_id": int(c), + "n": len(members), + "agent_ids": members, + "top_loadings": { + statements[i] if i < len(statements) else axes[i - len(statements)]: float(centroid[i]) + for i in np.argsort(np.abs(centroid))[::-1][:8].tolist() + }, + }) + return { + "n": len(responses), + "clusters": clusters, + "pca": { + 
def instrument_hash(path: Path) -> str:
    """Return a short fingerprint of the file at *path*.

    The fingerprint is the first 16 hex characters of the SHA-256 digest of
    the file's raw bytes.
    """
    hasher = hashlib.sha256()
    hasher.update(Path(path).read_bytes())
    full_digest = hasher.hexdigest()
    return full_digest[:16]
def install_hooks(manager) -> None:
    """Attach interview lifecycle callbacks to a SimulationManager.

    Two callbacks are registered on *manager*:
      * via ``register_on_ready``: start ``_run_pre`` in a daemon thread
      * via ``register_on_completed``: start ``_run_post`` in a daemon thread

    Each callback reads the simulation id from the state object it receives
    (``simulation_id``, then ``sim_id``, then ``id``) and does nothing when
    none of them holds a truthy value.
    """

    def _sim_id_of(state):
        # First truthy candidate wins.
        for attr in ("simulation_id", "sim_id", "id"):
            candidate = getattr(state, attr, None)
            if candidate:
                return candidate
        return None

    def _on_ready(state) -> None:
        sim_id = _sim_id_of(state)
        if sim_id:
            threading.Thread(target=_run_pre, args=(sim_id,), daemon=True).start()

    def _on_completed(state) -> None:
        sim_id = _sim_id_of(state)
        if sim_id:
            threading.Thread(target=_run_post, args=(sim_id,), daemon=True).start()

    manager.register_on_ready(_on_ready)
    manager.register_on_completed(_on_completed)
def run_aggregate(t0: list[LikertResponse], t1: list[LikertResponse]) -> dict:
    """Compare pre (T0) and post (T1) Likert responses agent by agent.

    Agents are paired on ``agent_id``. For every item seen in either wave the
    result reports the mean and standard deviation (n-1 denominator, floored
    at 1) of ``T1 - T0`` over paired agents who answered it in both waves,
    plus how many moved up or down; items with no such pair get
    ``{"mean_delta": None, "n": 0}``. For every paired agent it reports the
    summed absolute change over the items answered in both waves.
    """
    first_wave = {r.agent_id: r for r in t0}
    second_wave = {r.agent_id: r for r in t1}
    both = sorted(first_wave.keys() & second_wave.keys())
    item_ids = sorted({item for r in t0 + t1 for item in r.responses.keys()})

    per_item: dict[str, dict] = {}
    for item in item_ids:
        answers = (
            (first_wave[aid].responses.get(item), second_wave[aid].responses.get(item))
            for aid in both
        )
        shifts = [after - before for before, after in answers
                  if before is not None and after is not None]
        count = len(shifts)
        if count == 0:
            per_item[item] = {"mean_delta": None, "n": 0}
            continue
        mean_shift = sum(shifts) / count
        variance = sum((d - mean_shift) ** 2 for d in shifts) / max(count - 1, 1)
        per_item[item] = {
            "mean_delta": mean_shift,
            "sd_delta": math.sqrt(variance),
            "n": count,
            "n_positive": sum(1 for d in shifts if d > 0),
            "n_negative": sum(1 for d in shifts if d < 0),
        }

    per_agent: dict[int, dict] = {}
    for aid in both:
        before = first_wave[aid].responses
        after = second_wave[aid].responses
        shared = set(before) & set(after)
        per_agent[aid] = {
            "total_abs_drift": sum(abs(after[k] - before[k]) for k in shared),
            "n_items": len(shared),
        }

    return {
        "n_paired": len(both),
        "n_t0_only": len(first_wave.keys() - second_wave.keys()),
        "n_t1_only": len(second_wave.keys() - first_wave.keys()),
        "per_item": per_item,
        "per_agent": per_agent,
    }
+ + def _schema_hint(self) -> str: + sids = [s["scenario_id"] for s in self.instrument["scenarios"]] + return json.dumps({ + "ratings": {sid: { + "desirability": "", + "plausibility": "", + "impact_on_my_group": "", + "fairness": "", + "if_woke_up_response": "", + } for sid in sids} + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + head = ("Bewerten Sie jedes der folgenden Szenarien auf vier Dimensionen (1-7) " + "und beantworten Sie kurz, was Sie tun würden, wenn Sie in dieser Welt aufwachten.") \ + if self.language == "de" else \ + ("Rate each of the following scenarios on four dimensions (1-7) " + "and briefly answer what you would do if you woke up in this world.") + blocks = [] + for s in self.instrument["scenarios"]: + label = s["label_de"] if self.language == "de" else s["label_en"] + desc = s["description_de"] if self.language == "de" else s["description_en"] + blocks.append(f"--- {s['scenario_id']}: {label} ---\n{desc}") + return head + "\n\n" + "\n\n".join(blocks) + + def _validate(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + sids = {s["scenario_id"] for s in self.instrument["scenarios"]} + ratings = raw.get("ratings", {}) + if set(ratings.keys()) != sids: return None + for sid, v in ratings.items(): + if not isinstance(v, dict): return None + for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"): + iv = coerce_int(v.get(k)) + if iv is None or not 1 <= iv <= 7: return None + v[k] = iv + if not isinstance(v.get("if_woke_up_response", ""), str): return None + return raw + + def administer(self, persona: PersonaRecord) -> ScenarioResponse: + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), validate=self._validate, + ) + ratings = {sid: ScenarioRating(**v) for sid, v in raw["ratings"].items()} + return ScenarioResponse(agent_id=persona.agent_id, ratings=ratings) + +def polarity_matrix(responses: list[ScenarioResponse]) 
def start_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path:
    """Create a fresh run directory for *phase*/*subagent* and return its path.

    The directory is ``<base>/<phase.value>/<subagent.value>/<run_id>``,
    where ``run_id`` is a local-time stamp (``YYYYmmddTHHMMSS``) followed by
    ``-`` and six hex characters from a random UUID. A ``run.json`` holding
    the run id, phase, subagent and creation time (epoch seconds) is written
    into it.
    """
    stamp = time.strftime("%Y%m%dT%H%M%S")
    suffix = uuid.uuid4().hex[:6]
    run_id = f"{stamp}-{suffix}"

    run_dir = self.base.joinpath(phase.value, subagent.value, run_id)
    run_dir.mkdir(parents=True, exist_ok=True)

    meta = {
        "run_id": run_id,
        "phase": phase.value,
        "subagent": subagent.value,
        "created_at": time.time(),
    }
    meta_file = run_dir / "run.json"
    meta_file.write_text(json.dumps(meta, indent=2), encoding="utf-8")
    return run_dir
path.open("a", encoding="utf-8") as f: + f.write(model.model_dump_json() + "\n") + + def append_jsonl(self, run_dir: Path, filename: str, payload: dict | BaseModel) -> None: + path = run_dir / filename + with path.open("a", encoding="utf-8") as f: + if isinstance(payload, BaseModel): + f.write(payload.model_dump_json() + "\n") + else: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + def read_responses(self, run_dir: Path, filename: str = "responses.jsonl") -> list[dict]: + path = run_dir / filename + if not path.exists(): + return [] + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + + def write_aggregate(self, run_dir: Path, payload: dict) -> None: + (run_dir / "aggregate.json").write_text( + json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + def write_named(self, run_dir: Path, name: str, payload: Any) -> None: + (run_dir / name).write_text( + json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + def audit( + self, + run_dir: Path, + agent_id: int | None, + event: str, + detail: str | dict = "", + ) -> None: + entry = {"ts": time.time(), "agent_id": agent_id, "event": event, "detail": detail} + with (run_dir / "audit.jsonl").open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False, default=str) + "\n") + + def mark_latest(self, run_dir: Path) -> None: + pointer = run_dir.parent / "latest.json" + pointer.write_text(json.dumps({ + "run_dir": str(run_dir.relative_to(self.base)), + }), encoding="utf-8") + + def latest_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path | None: + pointer = self.base / phase.value / subagent.value / "latest.json" + if not pointer.exists(): + return None + rel = json.loads(pointer.read_text())["run_dir"] + path = self.base / rel + return path if path.exists() else None diff --git a/backend/app/services/interviews/zep_writer.py b/backend/app/services/interviews/zep_writer.py new file mode 100644 
index 00000000..fdd9f185 --- /dev/null +++ b/backend/app/services/interviews/zep_writer.py @@ -0,0 +1,68 @@ +from __future__ import annotations +from typing import Any, Optional +from app.models.interview import ( + LikertResponse, QSortResponse, DelphiRatingResponse, ScenarioResponse, SubagentKind, +) + +class InterviewZepWriter: + """Writes interview episodes (per-agent responses, aggregates) to a Zep graph. + + Expects ``memory_updater`` to expose ``add_text_episode(graph_id, text)`` — that + is the method the real ``ZepGraphMemoryUpdater`` provides for synchronous text + writes outside the agent-activity batch pipeline. A no-op shim with the same + method is acceptable for tests and stub mode. + """ + def __init__(self, memory_updater, graph_id: str): + self.updater = memory_updater + self.graph_id = graph_id + + def _emit(self, text: str) -> None: + if hasattr(self.updater, "add_text_episode"): + self.updater.add_text_episode(self.graph_id, text) + else: + raise RuntimeError( + "memory_updater is missing add_text_episode(graph_id, text); " + "InterviewZepWriter requires the explicit text-episode API." 
+ ) + + def _summarize_likert(self, r: LikertResponse) -> str: + mean_v = sum(r.responses.values()) / max(len(r.responses), 1) + top = sorted(r.responses.items(), key=lambda kv: -kv[1])[:3] + bot = sorted(r.responses.items(), key=lambda kv: kv[1])[:3] + return (f"mean={mean_v:.2f}; agrees with {[k for k,_ in top]}; " + f"disagrees with {[k for k,_ in bot]}") + + def _summarize_qsort(self, r: QSortResponse) -> str: + plus = [k for k, v in r.placements.items() if v >= 2] + minus = [k for k, v in r.placements.items() if v <= -2] + return f"+strongly:{plus}; -strongly:{minus}" + + def _summarize_scenario(self, r: ScenarioResponse) -> str: + parts = [f"{sid}: des={rt.desirability} plaus={rt.plausibility}" + for sid, rt in r.ratings.items()] + return "; ".join(parts) + + def write_per_agent( + self, subagent: SubagentKind, response: Any, agent_name: str, + phase: Optional[str] = None, + ) -> None: + if isinstance(response, LikertResponse): + phase = phase or response.phase.value + summary = self._summarize_likert(response) + elif isinstance(response, QSortResponse): + phase = phase or "T1" + summary = self._summarize_qsort(response) + elif isinstance(response, ScenarioResponse): + phase = phase or "T1" + summary = self._summarize_scenario(response) + elif isinstance(response, DelphiRatingResponse): + phase = phase or f"T1/R{response.round}" + summary = f"round={response.round}; {len(response.ratings)} themes rated" + else: + phase = phase or "T1" + summary = str(response)[:200] + text = f"Agent {agent_name} (interview/{subagent.value}/{phase}): {summary}" + self._emit(text) + + def write_aggregate(self, subagent: SubagentKind, summary: str) -> None: + self._emit(f"Interview aggregate ({subagent.value}): {summary}") diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 7704a627..9360e18c 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ 
-1090,11 +1090,13 @@ class OasisProfileGenerator: with open(file_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) - - # 写入OASIS要求的表头 - headers = ['user_id', 'name', 'username', 'user_char', 'description'] + + # 写入表头:OASIS要求的5列 + 额外的source_entity_uuid列(反向链接到Zep实体)。 + # OASIS按列名读取,额外的列不会影响其行为,但允许下游(面试子系统等) + # 重建 agent_id -> Zep entity uuid 的映射。 + headers = ['user_id', 'name', 'username', 'user_char', 'description', 'source_entity_uuid'] writer.writerow(headers) - + # 写入数据行 for idx, profile in enumerate(profiles): # user_char: 完整人设(bio + persona),用于LLM系统提示 @@ -1103,16 +1105,17 @@ class OasisProfileGenerator: user_char = f"{profile.bio} {profile.persona}" # 处理换行符(CSV中用空格替代) user_char = user_char.replace('\n', ' ').replace('\r', ' ') - + # description: 简短简介,用于外部显示 description = profile.bio.replace('\n', ' ').replace('\r', ' ') - + row = [ idx, # user_id: 从0开始的顺序ID profile.name, # name: 真实姓名 profile.user_name, # username: 用户名 user_char, # user_char: 完整人设(内部LLM使用) - description # description: 简短简介(外部显示) + description, # description: 简短简介(外部显示) + profile.source_entity_uuid or "", # source_entity_uuid: Zep实体UUID ] writer.writerow(row) @@ -1184,12 +1187,18 @@ class OasisProfileGenerator: item["profession"] = profile.profession if profile.interested_topics: item["interested_topics"] = profile.interested_topics - + # source_entity_uuid: 反向链接到Zep实体,下游(面试子系统等)需要此映射以 + # 在Zep图谱中查找Agent的上下文。仅在存在时写入。 + if profile.source_entity_uuid: + item["source_entity_uuid"] = profile.source_entity_uuid + if profile.source_entity_type: + item["source_entity_type"] = profile.source_entity_type + data.append(item) - + with open(file_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - + logger.info(f"已保存 {len(profiles)} 个Reddit Profile到 {file_path} (JSON格式,包含user_id字段)") # 保留旧方法名作为别名,保持向后兼容 diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 0d161a90..50b7890a 100644 --- 
a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -115,24 +115,31 @@ class SimulationState: class SimulationManager: """ 模拟管理器 - + 核心功能: 1. 从Zep图谱读取实体并过滤 2. 生成OASIS Agent Profile 3. 使用LLM智能生成模拟配置参数 4. 准备预设脚本所需的所有文件 """ - + # 模拟数据存储目录 SIMULATION_DATA_DIR = os.path.join( - os.path.dirname(__file__), + os.path.dirname(__file__), '../../uploads/simulations' ) - + + # Class-level hook registries so callbacks survive across instances. + # The Flask API endpoints construct fresh `SimulationManager()` instances per request, + # while lifecycle hooks are registered once at app startup — storing the lists on the + # instance would silently drop those hooks on every request. + _on_ready_hooks: list = [] + _on_completed_hooks: list = [] + def __init__(self): # 确保目录存在 os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - + # 内存中的模拟状态缓存 self._simulations: Dict[str, SimulationState] = {} @@ -191,6 +198,46 @@ class SimulationManager: self._simulations[simulation_id] = state return state + # ------------------------------------------------------------------ + # Lifecycle hook registration (class-level — see class docstring) + # ------------------------------------------------------------------ + + @classmethod + def register_on_ready(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to READY. + + Class-level so hooks registered at app startup remain visible to every + SimulationManager() instance constructed later (e.g. per-request in Flask). + """ + cls._on_ready_hooks.append(fn) + + @classmethod + def register_on_completed(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to COMPLETED. + + Class-level so hooks registered at app startup remain visible to every + SimulationManager() instance constructed later (e.g. per-request in Flask). 
+ """ + cls._on_completed_hooks.append(fn) + + def _notify_on_ready(self, state: "SimulationState") -> None: + """Invoke all on_ready hooks; exceptions are isolated per hook.""" + for fn in list(type(self)._on_ready_hooks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_ready hook failed: {e!r}") + + def _notify_on_completed(self, state: "SimulationState") -> None: + """Invoke all on_completed hooks; exceptions are isolated per hook.""" + for fn in list(type(self)._on_completed_hooks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_completed hook failed: {e!r}") + + # ------------------------------------------------------------------ + def create_simulation( self, project_id: str, @@ -441,7 +488,8 @@ class SimulationManager: # 更新状态 state.status = SimulationStatus.READY self._save_simulation_state(state) - + self._notify_on_ready(state) + logger.info(f"模拟准备完成: {simulation_id}, " f"entities={state.entities_count}, profiles={state.profiles_count}") diff --git a/backend/app/services/simulation_runner.py b/backend/app/services/simulation_runner.py index e86021f8..942f522f 100644 --- a/backend/app/services/simulation_runner.py +++ b/backend/app/services/simulation_runner.py @@ -226,7 +226,29 @@ class SimulationRunner: # 图谱记忆更新配置 _graph_memory_enabled: Dict[str, bool] = {} # simulation_id -> enabled - + + # Completion callbacks registered from outside (e.g. SimulationManager lifecycle hooks). + # Each callable receives the SimulationRunState that just transitioned to COMPLETED. + _on_completed_callbacks: list = [] + + @classmethod + def register_on_completed(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to COMPLETED. + + The callback receives the SimulationRunState instance. It is called from + the monitor daemon thread, so keep it short or hand off to another thread. 
+ """ + cls._on_completed_callbacks.append(fn) + + @classmethod + def _fire_on_completed(cls, state: SimulationRunState) -> None: + """Invoke all registered on_completed callbacks; exceptions are isolated.""" + for fn in list(cls._on_completed_callbacks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_completed callback failed: {e!r}") + @classmethod def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: """获取运行状态""" @@ -528,6 +550,7 @@ class SimulationRunner: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() logger.info(f"模拟完成: {simulation_id}") + cls._fire_on_completed(state) else: state.runner_status = RunnerStatus.FAILED # 从主日志文件读取错误信息 @@ -638,6 +661,7 @@ class SimulationRunner: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() logger.info(f"所有平台模拟已完成: {state.simulation_id}") + cls._fire_on_completed(state) # 更新轮次信息(从 round_end 事件) elif event_type == "round_end": diff --git a/backend/app/services/zep_graph_memory_updater.py b/backend/app/services/zep_graph_memory_updater.py index e034fee2..86a4e1e2 100644 --- a/backend/app/services/zep_graph_memory_updater.py +++ b/backend/app/services/zep_graph_memory_updater.py @@ -337,6 +337,44 @@ class ZepGraphMemoryUpdater: self._total_activities += 1 logger.debug(f"添加活动到Zep队列: {activity.agent_name} - {activity.action_type}") + def add_text_episode(self, graph_id: str, text: str) -> None: + """ + 直接将一段文本写入Zep图谱(同步发送,不经过批量队列) + + 用于面试子系统(InterviewZepWriter)等需要立即写入、不属于 + agent活动流水线的场景。绕过 _send_batch_activities 的批量逻辑, + 但仍带重试。 + + Args: + graph_id: 目标图谱ID(允许覆盖 self.graph_id,便于多图场景) + text: 要发送的文本内容 + """ + if not text: + return + target_graph_id = graph_id or self.graph_id + if not target_graph_id: + logger.warning("add_text_episode 调用时未指定graph_id,跳过") + return + + for attempt in range(self.MAX_RETRIES): + try: + self.client.graph.add( + graph_id=target_graph_id, + type="text", + data=text, + ) + 
self._total_sent += 1 + self._total_items_sent += 1 + logger.debug(f"add_text_episode 发送成功 (graph={target_graph_id}, len={len(text)})") + return + except Exception as e: + if attempt < self.MAX_RETRIES - 1: + logger.warning(f"add_text_episode 失败 (尝试 {attempt + 1}/{self.MAX_RETRIES}): {e}") + time.sleep(self.RETRY_DELAY * (attempt + 1)) + else: + logger.error(f"add_text_episode 失败,已重试{self.MAX_RETRIES}次: {e}") + self._failed_count += 1 + def add_activity_from_dict(self, data: Dict[str, Any], platform: str): """ 从字典数据添加活动 diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 6c1a81f4..9b22ac02 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -32,6 +32,82 @@ class LLMClient: base_url=self.base_url ) + def _stub_key(self, messages: list[dict]) -> str: + user_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + # Allow callers to embed an explicit stub_key=... 
token + for chunk in user_msg.split(): + if chunk.startswith("stub_key="): + return chunk[len("stub_key="):] + import hashlib + return hashlib.sha256((sys_msg + "|" + user_msg).encode("utf-8")).hexdigest()[:12] + + def _stub_response(self, messages: list[dict]) -> str: + import json as _json + return _json.dumps(self._stub_response_json(messages), ensure_ascii=False) + + def _stub_response_json(self, messages: list[dict]) -> dict: + import hashlib, json as _json + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + usr_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + h = hashlib.sha256((sys_msg + "|" + usr_msg).encode("utf-8")).hexdigest() + seed = int(h[:8], 16) + rng = (seed % 5) + 1 + + # Longitudinal Likert (12 items) + if all(tok in usr_msg for tok in ("stk_1", "gov_1", "mkt_1", "clm_1")): + ids = ["stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3"] + return {"responses": {k: ((seed >> (i*3)) % 5) + 1 for i, k in enumerate(ids)}, + "confidence": {k: 0.6 for k in ids}, + "open_comment": f"stub:{h[:8]}"} + + # Diversity Q-sort: 24 statements + 6 axes, forced distribution 2,3,4,6,4,3,2 + if "st_01" in usr_msg and "ax_pres_extr" in usr_msg: + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + stmts = [f"st_{i+1:02d}" for i in range(24)] + # shuffle deterministically + order = sorted(range(24), key=lambda i: (h[i % len(h)], i)) + placements = {stmts[i]: buckets[order.index(i)] for i in range(24)} + return { + "placements": placements, + "likert_axes": {a: ((seed >> (j*3)) % 7) + 1 for j, a in enumerate( + ["ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg"])}, + } + + # Scenario: S1..S4 × 4 dims + if all(s in usr_msg for s in ("S1:", "S2:", "S3:", "S4:")): + return {"ratings": {sid: { + "desirability": ((seed >> (i*3)) % 7) + 1, + "plausibility": ((seed >> (i*3+1)) % 7) + 1, + 
"impact_on_my_group": ((seed >> (i*3+2)) % 7) + 1, + "fairness": ((seed >> (i*3+4)) % 7) + 1, + "if_woke_up_response": f"act-{sid}-{h[:4]}", + } for i, sid in enumerate(["S1","S2","S3","S4"])}} + + # Delphi R1: q1..q4 free text + if "q1" in usr_msg and "q2" in usr_msg and "Bewerten" not in usr_msg and "Sie sehen" not in usr_msg: + return {"answers": {qid: f"stub-themes-{qid}-{h[:4]}" for qid in ("q1","q2","q3","q4")}} + + # Delphi theme extraction (no in-character system prompt) + if "extract distinct thematic codes" in sys_msg: + return {"themes": [{"theme_id": f"theme_{i}", "label": f"Thema {i}"} for i in range(5)]} + + # Delphi R2 (rate) or R3 (revise) + if "Bewerten Sie jedes Thema" in usr_msg or "Sie sehen unten" in usr_msg \ + or "Rate each theme" in usr_msg or "Below are the anonymised" in usr_msg: + theme_ids = [f"theme_{i}" for i in range(5)] + out = {"ratings": {tid: {"importance": ((seed >> (i*2)) % 5) + 1, + "plausibility": ((seed >> (i*2+1)) % 5) + 1} + for i, tid in enumerate(theme_ids)}} + if "Sie sehen unten" in usr_msg or "Below are the anonymised" in usr_msg: + out["justification"] = "stub-revision" + return out + + # Fallback + return {"stub_key": h[:12], "value": rng} + def chat( self, messages: List[Dict[str, str]], @@ -41,16 +117,20 @@ class LLMClient: ) -> str: """ 发送聊天请求 - + Args: messages: 消息列表 temperature: 温度参数 max_tokens: 最大token数 response_format: 响应格式(如JSON模式) - + Returns: 模型响应文本 """ + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response(messages) + kwargs = { "model": self.model, "messages": messages, @@ -75,15 +155,19 @@ class LLMClient: ) -> Dict[str, Any]: """ 发送聊天请求并返回JSON - + Args: messages: 消息列表 temperature: 温度参数 max_tokens: 最大token数 - + Returns: 解析后的JSON对象 """ + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response_json(messages) + response = self.chat( messages=messages, temperature=temperature, diff --git 
a/backend/pyproject.toml b/backend/pyproject.toml index 8c65b729..88fa3d13 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -32,6 +32,11 @@ dependencies = [ # 工具库 "python-dotenv>=1.0.0", "pydantic>=2.0.0", + "PyYAML>=6.0", + "scikit-learn>=1.4", + "scipy>=1.12", + "numpy>=1.26", + "pandas>=2.1", ] [project.optional-dependencies] diff --git a/backend/pytest.ini b/backend/pytest.ini new file mode 100644 index 00000000..60f69ff1 --- /dev/null +++ b/backend/pytest.ini @@ -0,0 +1,8 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -ra --strict-markers +markers = + integration: marks integration tests (deselect with -m 'not integration') diff --git a/backend/scripts/instruments/__init__.py b/backend/scripts/instruments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/scripts/instruments/delphi_v1.yaml b/backend/scripts/instruments/delphi_v1.yaml new file mode 100644 index 00000000..bb7650dc --- /dev/null +++ b/backend/scripts/instruments/delphi_v1.yaml @@ -0,0 +1,9 @@ +name: delphi_v1 +version: "1.0" +language_default: de +rounds: 3 +questions: + - {question_id: q1, de: "Welche drei Faktoren werden die deutsche Fischerei bis 2040 am stärksten prägen?", en: "Which three factors will most shape German fisheries by 2040?"} + - {question_id: q2, de: "Welche Akteurinnen und Akteure sind heute entscheidend, werden aber unterschätzt?", en: "Which actors are decisive today but underestimated?"} + - {question_id: q3, de: "Was sollte sich in den nächsten fünf Jahren ändern, damit die Fischerei eine Zukunft hat?", en: "What should change in the next five years for fisheries to have a future?"} + - {question_id: q4, de: "Welcher Trend macht Ihnen am meisten Hoffnung – und welcher am meisten Sorge?", en: "Which trend gives you most hope — and which most concern?"} diff --git a/backend/scripts/instruments/diversity_v1.yaml 
b/backend/scripts/instruments/diversity_v1.yaml new file mode 100644 index 00000000..7c47cd96 --- /dev/null +++ b/backend/scripts/instruments/diversity_v1.yaml @@ -0,0 +1,36 @@ +name: diversity_v1 +version: "1.0" +language_default: de +distribution: [2, 3, 4, 6, 4, 3, 2] # buckets from -3 to +3, total 24 +statements: + - {statement_id: st_01, de: "Die Ostsee gehört den Fischern, die hier seit Generationen leben.", en: "The Baltic belongs to fishers who have lived here for generations."} + - {statement_id: st_02, de: "MSC-Zertifizierung schützt vor allem große Konzerne.", en: "MSC certification mainly protects large corporations."} + - {statement_id: st_03, de: "Wissenschaftliche Quoten sind die einzige Grundlage für Politik.", en: "Scientific quotas are the only legitimate basis for policy."} + - {statement_id: st_04, de: "Aquakultur kann Ostseefischerei ersetzen.", en: "Aquaculture can replace Baltic fisheries."} + - {statement_id: st_05, de: "Sportfischer schaden den Beständen mehr als die Berufsfischer.", en: "Recreational anglers harm stocks more than commercial fishers."} + - {statement_id: st_06, de: "Die EU-Fischereipolitik kennt die Ostsee nicht.", en: "EU fisheries policy doesn't understand the Baltic."} + - {statement_id: st_07, de: "Großtechnische Fischerei ist effizienter und damit nachhaltiger.", en: "Industrial fisheries are more efficient and therefore more sustainable."} + - {statement_id: st_08, de: "Wer Fisch isst, sollte mehr dafür bezahlen.", en: "Those who eat fish should pay more for it."} + - {statement_id: st_09, de: "Die Kleinfischerei muss subventioniert werden.", en: "Small-scale fisheries must be subsidised."} + - {statement_id: st_10, de: "Marine Schutzgebiete sind reine Symbolpolitik.", en: "Marine protected areas are mere symbolism."} + - {statement_id: st_11, de: "Russlands Krieg ändert alles in der Ostsee.", en: "Russia's war changes everything in the Baltic."} + - {statement_id: st_12, de: "Nur drastische Reduktion der Fangmengen 
rettet die Bestände.", en: "Only drastic catch reductions will save the stocks."} + - {statement_id: st_13, de: "NGOs übertreiben die Krise systematisch.", en: "NGOs systematically exaggerate the crisis."} + - {statement_id: st_14, de: "Klimawandel ist das eigentliche Problem, nicht die Fischerei.", en: "Climate change is the real problem, not fisheries."} + - {statement_id: st_15, de: "Tradition zählt mehr als kurzfristige Bestandszahlen.", en: "Tradition matters more than short-term stock numbers."} + - {statement_id: st_16, de: "Verbraucher entscheiden über die Zukunft des Fisches.", en: "Consumers decide the future of fish."} + - {statement_id: st_17, de: "Ohne Generalstreik der Fischer ändert sich nichts.", en: "Without a fishers' general strike, nothing will change."} + - {statement_id: st_18, de: "Die Bundesregierung sollte Kutter aufkaufen und stilllegen.", en: "The federal government should buy out and decommission boats."} + - {statement_id: st_19, de: "Die Dorschkrise ist Folge gescheiterter Politik.", en: "The cod crisis is the result of policy failure."} + - {statement_id: st_20, de: "Ostsee-Aquakultur ist ökologisch problematisch.", en: "Baltic aquaculture is ecologically problematic."} + - {statement_id: st_21, de: "Junge Menschen werden keinen Fischereibetrieb mehr übernehmen.", en: "Young people will no longer take over fishing businesses."} + - {statement_id: st_22, de: "Markt regelt sich selbst, auch beim Fisch.", en: "The market regulates itself, also for fish."} + - {statement_id: st_23, de: "Lokale Genossenschaften sind die Lösung.", en: "Local cooperatives are the solution."} + - {statement_id: st_24, de: "In 20 Jahren gibt es keine deutsche Ostseefischerei mehr.", en: "In 20 years there will be no German Baltic fisheries left."} +likert_axes: + - {axis_id: ax_pres_extr, scale: 7, de: "Bewahrung (1) vs. Nutzung (7)", en: "Preservation (1) vs. Extraction (7)"} + - {axis_id: ax_loc_eu, scale: 7, de: "Lokal (1) vs. 
EU-zentral (7)", en: "Local (1) vs. EU-central (7)"} + - {axis_id: ax_sci_trad, scale: 7, de: "Wissenschaft (1) vs. Tradition (7)", en: "Science-led (1) vs. Tradition-led (7)"} + - {axis_id: ax_ind_col, scale: 7, de: "Individuum (1) vs. Kollektiv (7)", en: "Individual (1) vs. Collective (7)"} + - {axis_id: ax_short_long,scale: 7, de: "Kurzfristig (1) vs. Langfristig (7)", en: "Short-term (1) vs. Long-term (7)"} + - {axis_id: ax_mkt_reg, scale: 7, de: "Markt (1) vs. Regulierung (7)", en: "Market (1) vs. Regulation (7)"} diff --git a/backend/scripts/instruments/longitudinal_v1.yaml b/backend/scripts/instruments/longitudinal_v1.yaml new file mode 100644 index 00000000..7a37d18c --- /dev/null +++ b/backend/scripts/instruments/longitudinal_v1.yaml @@ -0,0 +1,47 @@ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + # Stock status & recovery + - {item_id: stk_1, family: stocks, scale: 5, + de: "Der westliche Dorschbestand wird sich bis 2035 erholen.", + en: "The Western Baltic cod stock will recover by 2035."} + - {item_id: stk_2, family: stocks, scale: 5, + de: "Der Heringsbestand in der westlichen Ostsee ist nicht mehr zu retten.", + en: "The Western Baltic herring stock can no longer be saved.", + reverse_coded: true} + - {item_id: stk_3, family: stocks, scale: 5, + de: "Wissenschaftliche Bestandsschätzungen sind generell zuverlässig.", + en: "Scientific stock assessments are generally reliable."} + # Governance & CFP + - {item_id: gov_1, family: governance, scale: 5, + de: "Die Gemeinsame Fischereipolitik der EU scheitert beim Schutz der Ostseefische.", + en: "The EU Common Fisheries Policy fails to protect Baltic fish.", + reverse_coded: true} + - {item_id: gov_2, family: governance, scale: 5, + de: "Entscheidungen über Fangquoten sollten stärker lokal getroffen werden.", + en: "Decisions on catch quotas should be taken more locally."} + - {item_id: gov_3, family: governance, scale: 5, + de: "Die deutsche Bundesregierung handelt entschlossen bei 
Fischereifragen.", + en: "The German federal government acts decisively on fisheries issues."} + # Market & MSC + - {item_id: mkt_1, family: market, scale: 5, + de: "Nur MSC-zertifizierter Fisch sollte verkauft werden dürfen.", + en: "Only MSC-certified fish should be allowed for sale."} + - {item_id: mkt_2, family: market, scale: 5, + de: "Importierter Fisch verdrängt die deutsche Kleinfischerei.", + en: "Imported fish displaces German small-scale fisheries."} + - {item_id: mkt_3, family: market, scale: 5, + de: "Verbraucher zahlen gerne mehr für nachhaltigen Ostseefisch.", + en: "Consumers gladly pay more for sustainable Baltic fish."} + # Climate & adaptation + - {item_id: clm_1, family: climate, scale: 5, + de: "Der Klimawandel macht traditionelle Ostseefischerei unmöglich.", + en: "Climate change makes traditional Baltic fisheries impossible.", + reverse_coded: true} + - {item_id: clm_2, family: climate, scale: 5, + de: "Aquakultur ist die Zukunft der deutschen Fischwirtschaft.", + en: "Aquaculture is the future of the German fishing industry."} + - {item_id: clm_3, family: climate, scale: 5, + de: "Die Fischerei muss sich grundlegend an neue Arten anpassen.", + en: "Fisheries must fundamentally adapt to new species."} diff --git a/backend/scripts/instruments/scenario_v1.yaml b/backend/scripts/instruments/scenario_v1.yaml new file mode 100644 index 00000000..5c150b80 --- /dev/null +++ b/backend/scripts/instruments/scenario_v1.yaml @@ -0,0 +1,51 @@ +name: scenario_v1 +version: "1.0" +language_default: de +scenarios: + - scenario_id: S1 + label_de: "Erholung 2040" + label_en: "Recovery 2040" + description_de: | + Bis 2040 haben sich Dorsch- und Heringsbestände in der westlichen Ostsee + deutlich erholt. MSC-Zertifizierung ist branchenweit Standard. Die kleine + Küstenfischerei hat sich stabilisiert; die Politik gilt als erfolgreich. + description_en: | + By 2040, Western Baltic cod and herring stocks have substantially recovered. 
+ MSC certification is industry-wide standard. Small-scale coastal fisheries + have stabilised; policy is regarded as successful. + - scenario_id: S2 + label_de: "Kollaps 2040" + label_en: "Collapse 2040" + description_de: | + Bis 2040 sind Dorsch- und Heringsbestände zusammengebrochen. Die Flotte + ist halbiert, Aquakultur dominiert den Markt, Häfen veröden. + description_en: | + By 2040, cod and herring stocks have collapsed. The fleet is halved, + aquaculture dominates the market, harbour towns decline. + - scenario_id: S3 + label_de: "Festung Europa 2040" + label_en: "Fortress Europe 2040" + description_de: | + Bis 2040 verfolgt die EU eine protektionistische Politik mit hohen Importzöllen, + Meeresschutzgebiete bedecken 30% der Ostsee, Sportfischerei ist stark eingeschränkt. + description_en: | + By 2040, the EU pursues a protectionist policy with high import tariffs, + MPAs cover 30% of the Baltic, recreational fishing is strongly curtailed. + - scenario_id: S4 + label_de: "Privatisierung 2040" + label_en: "Privatisation 2040" + description_de: | + Bis 2040 sind Fangrechte als handelbare Quoten (ITQs) etabliert. Die Branche + hat sich konsolidiert; nur große, kapitalstarke Unternehmen sind übrig. + description_en: | + By 2040, fishing rights are tradable quotas (ITQs). The industry has + consolidated; only large, well-capitalised firms remain. 
+dimensions: + - {dimension_id: desirability, scale: 7, + de: "Wie wünschenswert ist dieses Szenario?", en: "How desirable is this scenario?"} + - {dimension_id: plausibility, scale: 7, + de: "Wie plausibel ist dieses Szenario?", en: "How plausible is this scenario?"} + - {dimension_id: impact_on_my_group, scale: 7, + de: "Wie stark trifft es Ihre Gruppe?", en: "How strongly does it affect your group?"} + - {dimension_id: fairness, scale: 7, + de: "Wie fair ist dieses Szenario?", en: "How fair is this scenario?"} diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 00000000..2ba3931d --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,17 @@ +import os +import sys +import pathlib +import pytest + +ROOT = pathlib.Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +os.environ.setdefault("LLM_API_KEY", "test") +os.environ.setdefault("LLM_BASE_URL", "https://example.invalid") +os.environ.setdefault("LLM_MODEL_NAME", "test-model") +os.environ.setdefault("ZEP_API_KEY", "test") + +@pytest.fixture +def tmp_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + return tmp_path diff --git a/backend/tests/integration/__init__.py b/backend/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/integration/test_interview_pipeline.py b/backend/tests/integration/test_interview_pipeline.py new file mode 100644 index 00000000..54bb0540 --- /dev/null +++ b/backend/tests/integration/test_interview_pipeline.py @@ -0,0 +1,81 @@ +import json +import pytest +from pathlib import Path +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider +from app.services.interviews.base import MemoryDigest +from 
app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.utils.llm_client import LLMClient + +pytestmark = pytest.mark.integration + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _NullUpdater: + def __init__(self): self.events = [] + def add_text_episode(self, graph_id, text): self.events.append(text) + +class _StaticMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"agent {agent_id} memory snippet", available=True) + +@pytest.fixture +def seeded_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + Config.LLM_STUB_MODE = True + sim_dir = tmp_path / "simulations" / "intg_sim" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "stakeholder p", "profession": "fisher"} for i in range(5)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + return tmp_path + +def _make_orch(tmp_path): + sim_dir = tmp_path / "simulations" / "intg_sim" + personas = FileSystemPersonaProvider( + reddit_path=sim_dir / "reddit_profiles.json", twitter_path=None, + ) + llm = LLMClient(api_key="x", base_url="x", model="x") + updater = _NullUpdater() + writer = InterviewZepWriter(memory_updater=updater, graph_id="g") + return InterviewOrchestrator( + llm=llm, memory=_StaticMem(), personas=personas, + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="intg_sim", + zep_writer=writer, max_workers=2, language="de", + ) + +def test_pipeline_runs_pre_then_post_then_synthesis(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + + pre = orch.run_pre() + assert pre["longitudinal"]["n_responded"] >= 1 + + post = orch.run_post() + assert "longitudinal" in post + assert "diversity" in post + assert "scenario" in post + assert "delphi" in post + + 
synth = InterviewSynthesizer(store=orch.store) + report = synth.run() + assert "Stakeholder Interview Synthesis" in report + assert "Limitations" in report + + csv_path = orch.store.base / "synthesis" / "exports" / "all_responses.csv" + assert csv_path.exists() + lines = csv_path.read_text().splitlines() + assert lines[0].startswith("agent_id,") or "agent_id" in lines[0] + +def test_idempotent_rerun_creates_new_run_id(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + orch.run_pre() + first = orch.run_post() + second = orch.rerun(SubagentKind.SCENARIO) + first_scn = first["scenario"]["run_dir"] + second_scn = second["scenario"]["run_dir"] + assert first_scn != second_scn diff --git a/backend/tests/interviews/__init__.py b/backend/tests/interviews/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/interviews/test_adapters.py b/backend/tests/interviews/test_adapters.py new file mode 100644 index 00000000..977d5997 --- /dev/null +++ b/backend/tests/interviews/test_adapters.py @@ -0,0 +1,123 @@ +import csv +import json +from pathlib import Path +from app.services.interviews.adapters import ( + FileSystemPersonaProvider, ZepMemoryProvider, +) + +def _write_reddit_profiles(tmp_path: Path): + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "I am a small-scale Baltic fisher.", "profession": "fisher", "bio": ""}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "I work for an environmental NGO.", "profession": "ngo_staff", "bio": ""}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + return p + +def test_file_system_persona_provider_reads_reddit_json(tmp_path): + p = _write_reddit_profiles(tmp_path) + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + personas = provider.all() + assert len(personas) == 2 + assert personas[0].name == "Fischer Müller" + assert personas[0].agent_id == 0 + +def 
test_zep_memory_provider_returns_empty_when_unavailable(): + class _BrokenReader: + def get_entity_with_context(self, *a, **kw): + raise RuntimeError("offline") + prov = ZepMemoryProvider(entity_reader=_BrokenReader(), graph_id="g1", + agent_to_entity={0: "uuid-zero"}) + d = prov.get_digest(0) + assert d.available is False + assert d.text != "" + +def test_zep_memory_provider_truncates_to_max_chars(): + class _R: + def get_entity_with_context(self, *a, **kw): + class _Ctx: + name = "X"; summary = "Y" + related_edges = [{"fact": "very long fact " * 200}] + return _Ctx() + prov = ZepMemoryProvider(entity_reader=_R(), graph_id="g1", + agent_to_entity={5: "uuid-five"}) + d = prov.get_digest(5, max_chars=300) + assert d.available is True + assert len(d.text) <= 300 + + +def test_agent_to_entity_from_reddit_json(tmp_path): + """C5: ``FileSystemPersonaProvider.agent_to_entity()`` must reconstruct the + ``{agent_id: zep_entity_uuid}`` map from a reddit_profiles.json that + includes ``source_entity_uuid``. + """ + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-zero"}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "p", "profession": "ngo_staff", + "source_entity_uuid": "uuid-one"}, + # Row with no uuid must be skipped. + {"user_id": 2, "user_name": "gov1", "name": "Gov Agent", + "persona": "p", "profession": "official"}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + mapping = provider.agent_to_entity() + + assert mapping == {0: "uuid-zero", 1: "uuid-one"} + # Map values are strings, keys are ints. 
+ for k, v in mapping.items(): + assert isinstance(k, int) + assert isinstance(v, str) + + +def test_agent_to_entity_empty_when_no_field(tmp_path): + """C5: if no row has ``source_entity_uuid``, return an empty dict — not + a crash, not partial garbage.""" + data = [{"user_id": 0, "user_name": "u", "name": "A", "persona": "p"}] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + assert provider.agent_to_entity() == {} + + +def test_agent_to_entity_falls_back_to_twitter_csv(tmp_path): + """C5: when only twitter_profiles.csv exists, the helper must still + extract uuids from the CSV's ``source_entity_uuid`` column. + """ + p = tmp_path / "twitter_profiles.csv" + with p.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["user_id", "name", "username", "user_char", "description", "source_entity_uuid"]) + writer.writerow([0, "A0", "u0", "char", "desc", "uuid-zero"]) + writer.writerow([1, "A1", "u1", "char", "desc", ""]) # skipped (blank uuid) + writer.writerow([2, "A2", "u2", "char", "desc", "uuid-two"]) + + provider = FileSystemPersonaProvider(reddit_path=None, twitter_path=p) + assert provider.agent_to_entity() == {0: "uuid-zero", 2: "uuid-two"} + + +def test_agent_to_entity_reddit_takes_precedence(tmp_path): + """C5: when both files exist, Reddit JSON wins; Twitter CSV only fills + agents not already mapped.""" + reddit = tmp_path / "reddit_profiles.json" + reddit.write_text(json.dumps([ + {"user_id": 0, "user_name": "u0", "name": "A0", "persona": "p", + "source_entity_uuid": "reddit-zero"}, + ]), encoding="utf-8") + + twitter = tmp_path / "twitter_profiles.csv" + with twitter.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["user_id", "name", "username", "user_char", "description", "source_entity_uuid"]) + writer.writerow([0, "A0", "u0", "char", "desc", "twitter-zero"]) # 
ignored + writer.writerow([1, "A1", "u1", "char", "desc", "twitter-one"]) # used + + provider = FileSystemPersonaProvider(reddit_path=reddit, twitter_path=twitter) + assert provider.agent_to_entity() == {0: "reddit-zero", 1: "twitter-one"} diff --git a/backend/tests/interviews/test_api_interview.py b/backend/tests/interviews/test_api_interview.py new file mode 100644 index 00000000..7e55d627 --- /dev/null +++ b/backend/tests/interviews/test_api_interview.py @@ -0,0 +1,155 @@ +import json +import os +from pathlib import Path +import pytest + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + # Seed a minimal reddit_profiles.json + sim_dir = tmp_path / "simulations" / "sim_test" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "p", "profession": "fisher"} for i in range(3)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + from flask import Flask + from app.api import register_blueprints + app = Flask(__name__) + register_blueprints(app) + return app.test_client() + +def test_post_pre_returns_task_id(client): + res = client.post("/api/interview/sim_test/pre") + assert res.status_code == 200 + body = res.get_json() + assert body["success"] is True + assert "task_id" in body["data"] + +def test_status_endpoint_returns_progress(client): + res = client.post("/api/interview/sim_test/pre") + task_id = res.get_json()["data"]["task_id"] + res2 = client.get(f"/api/interview/sim_test/status?task_id={task_id}") + assert res2.status_code == 200 + assert "status" in res2.get_json()["data"] + +def test_unknown_subagent_returns_400(client): + res = client.post("/api/interview/sim_test/rerun", + json={"subagent": "nonsense"}) + assert res.status_code == 400 + + +def 
test_build_orchestrator_reads_graph_id_from_state(tmp_path, monkeypatch): + """C1+C2: ``_build_orchestrator`` must resolve the Zep graph_id from + ``state.json`` (written by ``SimulationManager``), not from the + nonexistent ``graph_id.txt``. The graph_id then must reach the + ``InterviewZepWriter`` instead of being silently swallowed. + """ + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + monkeypatch.setenv("ZEP_API_KEY", "test-fake-key") + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + Config.ZEP_API_KEY = "test-fake-key" + + # SimulationManager's data dir is class-level — point it at tmp_path. + from app.services.simulation_manager import SimulationManager + sim_root = tmp_path / "simulations" + sim_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(SimulationManager, "SIMULATION_DATA_DIR", str(sim_root)) + + sim_id = "sim_graphid" + sim_dir = sim_root / sim_id + sim_dir.mkdir(parents=True) + # Seed a profile file so FileSystemPersonaProvider can work. + (sim_dir / "reddit_profiles.json").write_text( + json.dumps([ + {"user_id": 0, "user_name": "u0", "name": "A0", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-zero"}, + {"user_id": 1, "user_name": "u1", "name": "A1", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-one"}, + ]), + encoding="utf-8", + ) + # Seed state.json with the graph_id. + state_doc = { + "simulation_id": sim_id, + "project_id": "p", + "graph_id": "graph-from-state", + "status": "ready", + "enable_twitter": False, + "enable_reddit": True, + } + (sim_dir / "state.json").write_text(json.dumps(state_doc), encoding="utf-8") + + # Patch ZepGraphMemoryUpdater + ZepEntityReader so we don't hit the network. 
+ import app.services.zep_graph_memory_updater as zgmu + import app.services.zep_entity_reader as zer + + class _FakeUpdater: + def __init__(self, graph_id, api_key=None): + self.graph_id = graph_id + + def add_text_episode(self, graph_id, text): + return None + + class _FakeReader: + def __init__(self, api_key=None): + pass + + def get_entity_with_context(self, graph_id, entity_uuid): + return None + + monkeypatch.setattr(zgmu, "ZepGraphMemoryUpdater", _FakeUpdater) + monkeypatch.setattr(zer, "ZepEntityReader", _FakeReader) + + from app.api.interview import _build_orchestrator + + orch = _build_orchestrator(sim_id) + assert orch.zep_writer.graph_id == "graph-from-state" + # Updater on the writer must be the real (or fake) ZepGraphMemoryUpdater path, + # NOT the null updater — i.e. its graph_id must match. + assert getattr(orch.zep_writer.updater, "graph_id", None) == "graph-from-state" + + # ZepMemoryProvider must have received the agent_to_entity map (C5). + assert hasattr(orch.memory, "map") + assert orch.memory.map == {0: "uuid-zero", 1: "uuid-one"} + + +def test_build_orchestrator_falls_back_when_state_missing(tmp_path, monkeypatch): + """C1+C2: when ``state.json`` is missing, the orchestrator must still be + constructed with the null updater/memory path (not crash, not silently + pass a bare ``ZepGraphMemoryUpdater()`` that would error out). 
+ """ + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + + from app.services.simulation_manager import SimulationManager + sim_root = tmp_path / "simulations" + sim_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(SimulationManager, "SIMULATION_DATA_DIR", str(sim_root)) + + sim_id = "sim_no_state" + sim_dir = sim_root / sim_id + sim_dir.mkdir(parents=True) + (sim_dir / "reddit_profiles.json").write_text( + json.dumps([{"user_id": 0, "user_name": "u0", "name": "A0", + "persona": "p", "profession": "fisher"}]), + encoding="utf-8", + ) + + from app.api.interview import _build_orchestrator + + orch = _build_orchestrator(sim_id) + assert orch.zep_writer.graph_id == "" + # Null updater path: writer must still respond to _emit without raising. + orch.zep_writer._emit("hello") diff --git a/backend/tests/interviews/test_base_interviewer.py b/backend/tests/interviews/test_base_interviewer.py new file mode 100644 index 00000000..03295867 --- /dev/null +++ b/backend/tests/interviews/test_base_interviewer.py @@ -0,0 +1,96 @@ +import json +import pytest +from app.services.interviews.base import ( + StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure, + coerce_int, +) + + +def test_coerce_int_accepts_real_int(): + assert coerce_int(3) == 3 + assert coerce_int(-2) == -2 + assert coerce_int(0) == 0 + + +def test_coerce_int_accepts_numeric_strings(): + assert coerce_int("3") == 3 + assert coerce_int(" 4 ") == 4 + assert coerce_int("-2") == -2 + + +def test_coerce_int_rejects_non_numeric(): + assert coerce_int("3.5") is None + assert coerce_int("abc") is None + assert coerce_int(None) is None + assert coerce_int([3]) is None + assert coerce_int(3.5) is None + + +def test_coerce_int_rejects_bool(): + """True/False should NOT silently coerce to 1/0 even though Python says they're ints.""" + assert 
coerce_int(True) is None + assert coerce_int(False) is None + + +class _FakeLLM: + def __init__(self, responses): + self.responses = list(responses) + self.calls = [] + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.calls.append(messages) + return self.responses.pop(0) + +class _FakeMemory: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"digest-for-{agent_id}", available=True) + +def test_in_character_prompt_includes_persona_and_memory(): + llm = _FakeLLM([{"x": 1}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=7, name="A", persona="I am a small-scale Baltic fisher.") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="{...}") + assert out == {"x": 1} + sys_msg = llm.calls[0][0]["content"] + assert "small-scale Baltic fisher" in sys_msg + assert "digest-for-7" in sys_msg + +def test_schema_retry_on_first_failure(): + bad_then_good = [{}, {"responses": {"a": 3}}] + llm = _FakeLLM(bad_then_good) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + def validator(d): + return d if "responses" in d else None + persona = PersonaRecord(agent_id=1, name="A", persona="p") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", validate=validator) + assert out == {"responses": {"a": 3}} + assert len(llm.calls) == 2 + +def test_two_failures_raise(): + llm = _FakeLLM([{}, {}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + with pytest.raises(ValueError): + interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", + validate=lambda d: d if "responses" in d else None) + + +def test_schema_failure_captures_both_raw_attempts(): + bad1 = {"oops": "no responses key"} + bad2 = {"still": "wrong shape"} + llm = _FakeLLM([bad1, bad2]) + mem = _FakeMemory() + 
interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=42, name="A", persona="p") + with pytest.raises(SchemaValidationFailure) as exc_info: + interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", + validate=lambda d: d if "responses" in d else None) + err = exc_info.value + assert err.agent_id == 42 + assert len(err.attempts) == 2 + assert err.attempts[0]["raw"] == bad1 + assert err.attempts[1]["raw"] == bad2 + assert err.attempts[0]["attempt"] == 1 + assert err.attempts[1]["attempt"] == 2 diff --git a/backend/tests/interviews/test_delphi.py b/backend/tests/interviews/test_delphi.py new file mode 100644 index 00000000..e55cab7a --- /dev/null +++ b/backend/tests/interviews/test_delphi.py @@ -0,0 +1,84 @@ +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, +) + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _R1LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"answers": { + "q1": "Klimawandel, Quoten, Generationswechsel", + "q2": "MSC, Aquakultur", + "q3": "Russland, EU-Politik", + "q4": "Verbraucherpreise", + }} + +class _R2LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {f"theme_{i}": {"importance": 4, "plausibility": 3} for i in range(5)}} + +class _ExtractLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"themes": [ + {"theme_id": "theme_0", "label": "Klimawandel"}, + {"theme_id": "theme_1", "label": "Quoten"}, + {"theme_id": "theme_2", "label": "MSC"}, + {"theme_id": "theme_3", "label": "EU-Politik"}, + {"theme_id": "theme_4", "label": "Generationswechsel"}, + ]} + +def 
test_delphi_round1_open(): + sub = DelphiSubagent(llm=_R1LLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=2, name="A", persona="p") + resp = sub.administer_round1(persona) + assert resp.round == 1 + assert len(resp.answers) == 4 + +def test_extract_themes_aggregates(): + from app.models.interview import DelphiOpenResponse + r1 = [DelphiOpenResponse(agent_id=i, answers={"q1": "Klimawandel", "q2": "MSC"}) for i in range(3)] + themes = extract_themes(r1, llm=_ExtractLLM()) + assert len(themes) == 5 + assert all("theme_id" in t for t in themes) + +def test_convergence_metrics(): + from app.models.interview import DelphiRatingResponse + r2 = [DelphiRatingResponse(agent_id=i, round=2, + ratings={"t1": {"importance": 3, "plausibility": 3}}) for i in range(5)] + r3 = [DelphiRatingResponse(agent_id=i, round=3, + ratings={"t1": {"importance": 4, "plausibility": 4}}) for i in range(5)] + conv = convergence_metrics(r2, r3) + assert "t1" in conv + assert conv["t1"]["delta_iqr_importance"] is not None + + +def test_delphi_r2_accepts_string_ratings(): + """Delphi R2/R3 ratings should accept stringified importance/plausibility ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.delphi import DelphiSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": { + "t1": {"importance": "4", "plausibility": "3"}, + "t2": {"importance": "5", "plausibility": "2"}, + }} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + sub = DelphiSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + themes = [{"theme_id": "t1", "label": "T1"}, {"theme_id": "t2", "label": "T2"}] + resp = 
sub.administer_round2(persona, themes) + assert resp.ratings["t1"]["importance"] == 4 + assert isinstance(resp.ratings["t1"]["importance"], int) diff --git a/backend/tests/interviews/test_diversity.py b/backend/tests/interviews/test_diversity.py new file mode 100644 index 00000000..d8eb45d3 --- /dev/null +++ b/backend/tests/interviews/test_diversity.py @@ -0,0 +1,78 @@ +from pathlib import Path +import numpy as np +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.diversity import ( + DiversitySubagent, run_typology, +) + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _CannedLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + # Place all 24 statements into legal buckets per the forced distribution + placements = {} + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + for i in range(24): + placements[f"st_{i+1:02d}"] = buckets[i] + return { + "placements": placements, + "likert_axes": {"ax_pres_extr": 5, "ax_loc_eu": 3, "ax_sci_trad": 4, + "ax_ind_col": 4, "ax_short_long": 5, "ax_mkt_reg": 3}, + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + +def test_diversity_administer(): + sub = DiversitySubagent(llm=_CannedLLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert len(resp.placements) == 24 + assert set(resp.likert_axes.keys()) == { + "ax_pres_extr","ax_loc_eu","ax_sci_trad","ax_ind_col","ax_short_long","ax_mkt_reg" + } + +def test_typology_runs_pca_kmeans(): + from app.models.interview import QSortResponse + rng = np.random.default_rng(42) + responses = [] + for aid in range(20): + placements = {f"st_{i+1:02d}": int(rng.integers(-3, 4)) for i in range(24)} + axes = {f"ax_{j}": int(rng.integers(1, 8)) for j in range(6)} + 
responses.append(QSortResponse(agent_id=aid, placements=placements, likert_axes=axes)) + result = run_typology(responses, n_clusters=3) + assert "clusters" in result + assert len(result["clusters"]) == 3 + assert "pca" in result + assert len(result["pca"]["components"]) >= 2 + + +def test_diversity_accepts_string_likert_values(): + """Diversity placements + axes should accept stringified ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.diversity import DiversitySubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "placements": {f"st_{i+1:02d}": str(buckets[i]) for i in range(24)}, + "likert_axes": {a: "4" for a in ( + "ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg")}, + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + sub = DiversitySubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=7, name="A", persona="p") + resp = sub.administer(persona) + assert isinstance(resp.placements["st_01"], int) + assert isinstance(resp.likert_axes["ax_pres_extr"], int) + assert resp.likert_axes["ax_pres_extr"] == 4 diff --git a/backend/tests/interviews/test_instrument_loader.py b/backend/tests/interviews/test_instrument_loader.py new file mode 100644 index 00000000..dfb0852e --- /dev/null +++ b/backend/tests/interviews/test_instrument_loader.py @@ -0,0 +1,44 @@ +import pytest +from app.services.interviews.instrument_loader import ( + load_likert_instrument, InstrumentValidationError, +) + +def _write(tmp_path, text): + p = tmp_path / "inst.yaml" + p.write_text(text, encoding="utf-8") + return p + +def 
test_loads_valid_likert(tmp_path): + p = _write(tmp_path, """ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + - item_id: stk_1 + de: "Der westliche Dorschbestand wird sich erholen" + en: "Western cod stock will recover" + scale: 5 + family: stocks +""") + inst = load_likert_instrument(p) + assert inst.name == "longitudinal_v1" + assert len(inst.items) == 1 + +def test_rejects_duplicate_item_id(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, en: e, scale: 5} + - {item_id: a, de: d, en: e, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) + +def test_rejects_missing_required_field(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) diff --git a/backend/tests/interviews/test_lifecycle.py b/backend/tests/interviews/test_lifecycle.py new file mode 100644 index 00000000..f8d2c952 --- /dev/null +++ b/backend/tests/interviews/test_lifecycle.py @@ -0,0 +1,26 @@ +""" +Tests for interview lifecycle hook installer (Task 20). 
+""" + +from app.services.interviews.lifecycle import install_hooks + + +class _StubMgr: + def __init__(self): + self.ready = [] + self.completed = [] + + def register_on_ready(self, fn): + self.ready.append(fn) + + def register_on_completed(self, fn): + self.completed.append(fn) + + +def test_install_hooks_registers_two_callables(): + mgr = _StubMgr() + install_hooks(mgr) + assert len(mgr.ready) == 1 + assert len(mgr.completed) == 1 + assert callable(mgr.ready[0]) + assert callable(mgr.completed[0]) diff --git a/backend/tests/interviews/test_llm_stub.py b/backend/tests/interviews/test_llm_stub.py new file mode 100644 index 00000000..6be5ed2a --- /dev/null +++ b/backend/tests/interviews/test_llm_stub.py @@ -0,0 +1,17 @@ +import json +from app.utils.llm_client import LLMClient + + +def test_stub_mode_returns_deterministic_canned_json(monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + from app.config import Config + Config.LLM_STUB_MODE = True + client = LLMClient(api_key="x", base_url="x", model="x") + messages = [ + {"role": "system", "content": "You are persona_42. 
Return JSON."}, + {"role": "user", "content": "stub_key=longitudinal:item_001"}, + ] + out1 = client.chat_json(messages=messages, temperature=0.0) + out2 = client.chat_json(messages=messages, temperature=0.0) + assert out1 == out2 + assert isinstance(out1, dict) diff --git a/backend/tests/interviews/test_longitudinal.py b/backend/tests/interviews/test_longitudinal.py new file mode 100644 index 00000000..006c293a --- /dev/null +++ b/backend/tests/interviews/test_longitudinal.py @@ -0,0 +1,91 @@ +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate + + +class _FakeMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + +class _CannedLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + return { + "responses": { + "stk_1": 4, "stk_2": 3, "stk_3": 5, + "gov_1": 3, "gov_2": 4, "gov_3": 2, + "mkt_1": 5, "mkt_2": 3, "mkt_3": 4, + "clm_1": 2, "clm_2": 4, "clm_3": 5, + }, + "confidence": { + "stk_1": 0.8, "stk_2": 0.7, "stk_3": 0.9, + "gov_1": 0.6, "gov_2": 0.7, "gov_3": 0.5, + "mkt_1": 0.7, "mkt_2": 0.6, "mkt_3": 0.8, + "clm_1": 0.5, "clm_2": 0.7, "clm_3": 0.6, + }, + "open_comment": "test", + } + + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + + +def test_longitudinal_administer_one_agent(): + sub = LongitudinalSubagent(llm=_CannedLLM(), memory=_FakeMem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 3 + assert resp.phase == InterviewPhase.T0 + assert set(resp.responses.keys()) >= {"stk_1", "gov_1", "mkt_1", "clm_1"} + + +def test_longitudinal_aggregate_delta(): + from 
app.models.interview import LikertResponse + t0 = [LikertResponse(agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + t1 = [LikertResponse(agent_id=i, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + agg = run_aggregate(t0, t1) + assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 + assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 + assert agg["n_paired"] == 5 + + +def test_longitudinal_accepts_string_likert_values(): + """Real LLMs sometimes return Likert values as JSON strings ('3' not 3). + The validator should coerce them rather than fail the agent.""" + from app.models.interview import InterviewPhase + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.longitudinal import LongitudinalSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "responses": { # all strings, not ints + "stk_1": "4", "stk_2": "3", "stk_3": "5", + "gov_1": "3", "gov_2": "4", "gov_3": "2", + "mkt_1": "5", "mkt_2": "3", "mkt_3": "4", + "clm_1": "2", "clm_2": "4", "clm_3": "5", + }, + "confidence": {}, + "open_comment": "stringified", + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + sub = LongitudinalSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=99, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 99 + assert resp.responses["stk_1"] == 4 + assert isinstance(resp.responses["stk_1"], int) diff --git a/backend/tests/interviews/test_models.py b/backend/tests/interviews/test_models.py new file mode 100644 index 
00000000..e575d118 --- /dev/null +++ b/backend/tests/interviews/test_models.py @@ -0,0 +1,30 @@ +import pytest +from pydantic import ValidationError +from app.models.interview import ( + LikertItem, LikertInstrument, LikertResponse, + InterviewPhase, SubagentKind, +) + +def test_likert_item_requires_de_and_en(): + item = LikertItem(item_id="x1", de="Frage", en="Question", scale=5) + assert item.scale == 5 + +def test_likert_item_rejects_bad_scale(): + with pytest.raises(ValidationError): + LikertItem(item_id="x1", de="d", en="e", scale=2) + +def test_likert_instrument_unique_item_ids(): + with pytest.raises(ValidationError): + LikertInstrument( + name="t", + items=[LikertItem(item_id="a", de="d", en="e", scale=5), + LikertItem(item_id="a", de="d", en="e", scale=5)], + ) + +def test_likert_response_validates_scale_range(): + with pytest.raises(ValidationError): + LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 6}, confidence={"a": 0.5}) + +def test_subagent_kind_enum(): + assert SubagentKind.LONGITUDINAL.value == "longitudinal" diff --git a/backend/tests/interviews/test_orchestrator.py b/backend/tests/interviews/test_orchestrator.py new file mode 100644 index 00000000..8d380eaf --- /dev/null +++ b/backend/tests/interviews/test_orchestrator.py @@ -0,0 +1,95 @@ +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interview_orchestrator import ( + InterviewOrchestrator, PersonaProvider, +) + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + sys_text = next((m["content"] for m in messages if m["role"] == "system"), "") + if "longitudinal" in sys_text or "stk_" in 
(messages[-1].get("content") or ""): + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + return {} + +class _Personas(PersonaProvider): + def __init__(self, n=3): + self._items = [PersonaRecord(agent_id=i, name=f"A{i}", persona="p") for i in range(n)] + def all(self): return list(self._items) + +class _NoopZep: + def write_per_agent(self, *a, **kw): pass + def write_aggregate(self, *a, **kw): pass + +def test_pre_phase_runs_longitudinal_only(tmp_path): + orch = InterviewOrchestrator( + llm=_LLM(), memory=_Mem(), personas=_Personas(3), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim1", + zep_writer=_NoopZep(), max_workers=2, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] == 3 + assert "diversity" not in result # only longitudinal in pre-phase + +def test_partial_failure_does_not_kill_run(tmp_path): + class _FlakyLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + if self.n % 2 == 0: + raise RuntimeError("simulated LLM 5xx") + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + orch = InterviewOrchestrator( + llm=_FlakyLLM(), memory=_Mem(), personas=_Personas(4), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim2", + zep_writer=_NoopZep(), max_workers=1, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] < 4 + assert result["longitudinal"]["n_failed"] > 0 + + +def test_schema_failure_audit_captures_raw_llm_output(tmp_path): + """When an agent's LLM output fails the schema validator twice, the audit log + should preserve both raw outputs so we can debug what the model actually said.""" + bad_response = {"wrong": "shape, no responses key"} + 
class _BadLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return bad_response # always fails Longitudinal validator + orch = InterviewOrchestrator( + llm=_BadLLM(), memory=_Mem(), personas=_Personas(1), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim3", + zep_writer=_NoopZep(), max_workers=1, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] == 0 + assert result["longitudinal"]["n_failed"] == 1 + + import json as _j + run_dir = Path(result["longitudinal"]["run_dir"]) + audit_path = run_dir / "audit.jsonl" + lines = audit_path.read_text(encoding="utf-8").splitlines() + assert lines, "audit.jsonl should not be empty" + entry = _j.loads(lines[0]) + assert entry["event"] == "schema_validation_failure" + assert entry["agent_id"] == 0 + detail = entry["detail"] + assert detail["label"] == "longitudinal_T0" + assert len(detail["attempts"]) == 2 + assert detail["attempts"][0]["raw"] == bad_response + assert detail["attempts"][1]["raw"] == bad_response diff --git a/backend/tests/interviews/test_scenario.py b/backend/tests/interviews/test_scenario.py new file mode 100644 index 00000000..61787211 --- /dev/null +++ b/backend/tests/interviews/test_scenario.py @@ -0,0 +1,60 @@ +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": 4, "plausibility": 3, "impact_on_my_group": 5, "fairness": 3, + "if_woke_up_response": f"act-on-{sid}", + } for sid in ("S1", "S2", "S3", "S4")}} + +def test_scenario_administer(): + sub = ScenarioSubagent(llm=_LLM(), 
memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert set(resp.ratings.keys()) == {"S1", "S2", "S3", "S4"} + assert resp.ratings["S1"].desirability == 4 + +def test_polarity_matrix(): + from app.models.interview import ScenarioResponse, ScenarioRating + responses = [ScenarioResponse(agent_id=i, ratings={ + "S1": ScenarioRating(desirability=5, plausibility=4, impact_on_my_group=5, fairness=4, + if_woke_up_response="x"), + }) for i in range(3)] + m = polarity_matrix(responses) + assert "S1" in m + assert m["S1"]["mean_desirability"] == 5 + assert m["S1"]["n"] == 3 + + +def test_scenario_accepts_string_likert_values(): + """Scenario ratings should accept stringified ints across all 4 dimensions.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.scenario import ScenarioSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": "4", "plausibility": "3", + "impact_on_my_group": "5", "fairness": "3", + "if_woke_up_response": f"act-{sid}", + } for sid in ("S1","S2","S3","S4")}} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + sub = ScenarioSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona) + assert resp.ratings["S1"].desirability == 4 + assert isinstance(resp.ratings["S1"].desirability, int) diff --git a/backend/tests/interviews/test_simulation_hooks.py b/backend/tests/interviews/test_simulation_hooks.py new file mode 100644 index 00000000..52852d28 --- /dev/null +++ b/backend/tests/interviews/test_simulation_hooks.py @@ -0,0 +1,96 @@ +""" 
+Tests for SimulationManager lifecycle hooks (on_ready / on_completed). + +NOTE ON SHAPE DIVERGENCE vs. original plan spec: +- SimulationState uses `simulation_id` (not `sim_id`) +- `status` is a SimulationStatus enum, not a plain string +- The COMPLETED transition lives in simulation_runner.py (SimulationRunner._monitor_simulation), + not in simulation_manager.py. The _notify_on_completed hook is registered on SimulationManager + and the production insertion point for COMPLETED is documented in DONE_WITH_CONCERNS. + +Hooks are stored on the class (C3 fix), so each test snapshots/restores the +registries via the autouse fixture to keep test isolation. +""" + +import pytest + +from app.services.simulation_manager import SimulationManager, SimulationState, SimulationStatus + + +@pytest.fixture(autouse=True) +def _isolate_class_hooks(): + saved_ready = list(SimulationManager._on_ready_hooks) + saved_completed = list(SimulationManager._on_completed_hooks) + try: + yield + finally: + SimulationManager._on_ready_hooks[:] = saved_ready + SimulationManager._on_completed_hooks[:] = saved_completed + + +def test_register_post_ready_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_ready(lambda state: called.append(("ready", state.simulation_id))) + state = SimulationState( + simulation_id="abc", + project_id="proj1", + graph_id="graph1", + status=SimulationStatus.READY, + ) + mgr._notify_on_ready(state) + assert called == [("ready", "abc")] + + +def test_register_post_completed_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_completed(lambda state: called.append(("done", state.simulation_id))) + state = SimulationState( + simulation_id="abc", + project_id="proj1", + graph_id="graph1", + status=SimulationStatus.COMPLETED, + ) + mgr._notify_on_completed(state) + assert called == [("done", "abc")] + + +def test_hooks_survive_across_instances(): + """C3: hook registries are class-level, so callbacks registered through the + 
classmethod must still fire on a freshly constructed instance. This is + what makes the Flask per-request ``SimulationManager()`` pattern work + after ``install_hooks(SimulationManager)`` runs at app startup. + """ + called: list[str] = [] + + # Register via the class — the production install_hooks(cls) path. + SimulationManager.register_on_ready(lambda s: called.append(f"ready:{s.simulation_id}")) + SimulationManager.register_on_completed(lambda s: called.append(f"done:{s.simulation_id}")) + + # New, independently-constructed instance must still see the hooks. + fresh = SimulationManager() + state = SimulationState( + simulation_id="cross_instance", + project_id="p", + graph_id="g", + status=SimulationStatus.READY, + ) + fresh._notify_on_ready(state) + state.status = SimulationStatus.COMPLETED + fresh._notify_on_completed(state) + + assert "ready:cross_instance" in called + assert "done:cross_instance" in called + + +def test_register_via_instance_also_lands_on_class(): + """Registering through an instance must populate the class registry too — + backward-compatibility with code that calls ``manager.register_on_*``. + """ + mgr1 = SimulationManager() + mgr1.register_on_ready(lambda s: None) + # A second, unrelated instance must see the hook. 
+ mgr2 = SimulationManager() + assert len(SimulationManager._on_ready_hooks) >= 1 + assert SimulationManager._on_ready_hooks is mgr2.__class__._on_ready_hooks diff --git a/backend/tests/interviews/test_storage.py b/backend/tests/interviews/test_storage.py new file mode 100644 index 00000000..26837e92 --- /dev/null +++ b/backend/tests/interviews/test_storage.py @@ -0,0 +1,37 @@ +import json +from pathlib import Path +from app.models.interview import ( + LikertResponse, InterviewPhase, SubagentKind, +) +from app.services.interviews.storage import InterviewStore + +def test_run_directory_layout(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL) + assert run_dir.exists() + assert run_dir.parent.name == "longitudinal" + assert run_dir.parent.parent.name == "T0" + +def test_append_response(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL) + r = LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 3}, confidence={"a": 0.5}) + store.append_response(run_dir, r) + contents = (run_dir / "responses.jsonl").read_text() + assert json.loads(contents.splitlines()[0])["agent_id"] == 1 + +def test_write_aggregate_and_latest_pointer(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T1, subagent=SubagentKind.SCENARIO) + store.write_aggregate(run_dir, {"k": 1}) + store.mark_latest(run_dir) + latest = (run_dir.parent / "latest.json").read_text() + assert json.loads(latest)["run_dir"].endswith(run_dir.name) + +def test_audit_log_append(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.DELPHI) + store.audit(run_dir, agent_id=7, event="schema_violation", detail="missing key x") + audit = (run_dir / 
"audit.jsonl").read_text() + assert "schema_violation" in audit diff --git a/backend/tests/interviews/test_synthesizer.py b/backend/tests/interviews/test_synthesizer.py new file mode 100644 index 00000000..2a842114 --- /dev/null +++ b/backend/tests/interviews/test_synthesizer.py @@ -0,0 +1,32 @@ +import json +from pathlib import Path +from app.services.interviews.storage import InterviewStore +from app.models.interview import InterviewPhase, SubagentKind, LikertResponse +from app.services.interview_synthesizer import InterviewSynthesizer + +def _seed_minimal(tmp_path: Path) -> InterviewStore: + store = InterviewStore(root=tmp_path, sim_id="s1") + rd = store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + for i in range(3): + store.append_response(rd, LikertResponse( + agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 3}, confidence={"stk_1": 0.5, "gov_1": 0.5}, + )) + store.write_aggregate(rd, {"per_item": {}, "n_paired": 0}) + store.mark_latest(rd) + return store + +def test_synthesizer_runs_with_partial_data(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + report = synth.run() + assert "limitations" in report.lower() + assert "stub mode" in report.lower() or "n_responded" in report.lower() + +def test_synthesizer_writes_files(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + synth.run() + files = list((store.base / "synthesis").iterdir()) + names = {f.name for f in files} + assert "report.md" in names diff --git a/backend/tests/interviews/test_zep_writer.py b/backend/tests/interviews/test_zep_writer.py new file mode 100644 index 00000000..6eaed454 --- /dev/null +++ b/backend/tests/interviews/test_zep_writer.py @@ -0,0 +1,77 @@ +import pytest + +from app.models.interview import ( + LikertResponse, InterviewPhase, SubagentKind, +) +from app.services.interviews.zep_writer import InterviewZepWriter + + +class _FakeMemoryUpdater: + """Fake mirroring the 
real ZepGraphMemoryUpdater contract. + + Post-C4 the writer only uses ``add_text_episode(graph_id, text)`` — + ``add_activity`` is deliberately omitted to lock in the new behaviour and + catch any regression that re-introduces the broken dict-based fallback. + """ + + def __init__(self): + self.events: list[dict] = [] + + def add_text_episode(self, graph_id, text): + self.events.append({"graph_id": graph_id, "text": text}) + + +def test_per_agent_episode_text(): + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g1") + r = LikertResponse(agent_id=42, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 3}, + confidence={"stk_1": 0.8, "gov_1": 0.7}) + w.write_per_agent(SubagentKind.LONGITUDINAL, r, agent_name="Fischer Müller") + assert any("Fischer Müller" in str(e) for e in upd.events) + assert any("longitudinal/T1" in str(e) for e in upd.events) + # Each event must carry the configured graph_id. + assert all(e["graph_id"] == "g1" for e in upd.events) + + +def test_aggregate_episode(): + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g1") + w.write_aggregate(SubagentKind.SCENARIO, summary="S1 mean desirability 5.2; S2 mean 2.1") + assert any("S1 mean" in str(e) for e in upd.events) + + +def test_emit_uses_add_text_episode_with_graph_id(): + """C4: ``_emit`` must call ``updater.add_text_episode(graph_id, text)`` + with the constructor's graph_id and the raw text — no dict shape, no + ``add_activity`` fallback (the real ``add_activity`` rejects dicts). + """ + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g_xyz") + w._emit("hello world") + assert upd.events == [{"graph_id": "g_xyz", "text": "hello world"}] + + +def test_emit_raises_when_updater_lacks_add_text_episode(): + """C4: a memory_updater without ``add_text_episode`` must surface a + RuntimeError rather than silently no-op via a broken ``add_activity`` + fallback. 
+ """ + + class _Broken: + def add_activity(self, activity): # pragma: no cover - kept for clarity + raise AssertionError("must not be called") + + w = InterviewZepWriter(memory_updater=_Broken(), graph_id="g1") + with pytest.raises(RuntimeError, match="add_text_episode"): + w._emit("x") + + +def test_real_updater_exposes_add_text_episode(): + """C4 sanity check: ZepGraphMemoryUpdater (the real class) must expose + ``add_text_episode`` so the production wiring works without falling + through to the broken ``add_activity(dict)`` path. + """ + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + + assert hasattr(ZepGraphMemoryUpdater, "add_text_episode") diff --git a/backend/uv.lock b/backend/uv.lock index 642dd9c3..612a6a31 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -994,10 +994,15 @@ dependencies = [ { name = "charset-normalizer" }, { name = "flask" }, { name = "flask-cors" }, + { name = "numpy" }, { name = "openai" }, + { name = "pandas" }, { name = "pydantic" }, { name = "pymupdf" }, { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "scipy" }, { name = "zep-cloud" }, ] @@ -1022,13 +1027,18 @@ requires-dist = [ { name = "charset-normalizer", specifier = ">=3.0.0" }, { name = "flask", specifier = ">=3.0.0" }, { name = "flask-cors", specifier = ">=6.0.0" }, + { name = "numpy", specifier = ">=1.26" }, { name = "openai", specifier = ">=1.0.0" }, + { name = "pandas", specifier = ">=2.1" }, { name = "pipreqs", marker = "extra == 'dev'", specifier = ">=0.5.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pymupdf", specifier = ">=1.24.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = ">=1.4" }, + { name = "scipy", specifier = ">=1.12" }, { name = 
"zep-cloud", specifier = "==3.13.0" }, ] provides-extras = ["dev"] diff --git a/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md b/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md new file mode 100644 index 00000000..4de7f7c6 --- /dev/null +++ b/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md @@ -0,0 +1,3837 @@ +# Stakeholder Interview Subagents Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a four-subagent post-simulation interview system (Longitudinal, Diversity, Delphi, Scenario) over MiroFish-simulated stakeholders, plus a cross-method synthesiser, exposed via `/api/interview` and rendered in a new Vue Step4b. + +**Architecture:** Deterministic instrument runners (not ReACT). Shared `StakeholderInterviewer` base loads persona + Zep memory digest and administers per-instrument JSON-schema-validated prompts via the existing `LLMClient`. Four subagents own their own instrument YAML + output schema. `InterviewOrchestrator` fans out parallel post-sim execution; `InterviewSynthesizer` aggregates. Files: backend Python services + new Flask blueprint; frontend new Vue component with d3 viz. + +**Tech Stack:** Python 3.12, Flask, pydantic v2, PyYAML, scikit-learn (PCA, k-means), scipy (Wilcoxon), numpy, pytest; Vue 3, axios, d3 v7, vue-i18n. 
+ +**Spec:** `docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md` + +--- + +## Phase 0 — Setup + +### Task 0: Add deps and pytest scaffold + +**Files:** +- Modify: `backend/pyproject.toml` +- Create: `backend/tests/__init__.py` +- Create: `backend/tests/conftest.py` +- Create: `backend/pytest.ini` + +- [ ] **Step 1: Add deps to `backend/pyproject.toml`** + +In the `dependencies` array (after `pydantic>=2.0.0`), add: +```toml + "PyYAML>=6.0", + "scikit-learn>=1.4", + "scipy>=1.12", + "numpy>=1.26", + "pandas>=2.1", +``` + +- [ ] **Step 2: Create `backend/pytest.ini`** + +```ini +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -ra --strict-markers +markers = + integration: marks integration tests (deselect with -m 'not integration') +``` + +- [ ] **Step 3: Create `backend/tests/__init__.py`** + +Empty file. + +- [ ] **Step 4: Create `backend/tests/conftest.py`** + +```python +import os +import sys +import pathlib +import pytest + +ROOT = pathlib.Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +os.environ.setdefault("LLM_API_KEY", "test") +os.environ.setdefault("LLM_BASE_URL", "https://example.invalid") +os.environ.setdefault("LLM_MODEL_NAME", "test-model") +os.environ.setdefault("ZEP_API_KEY", "test") + +@pytest.fixture +def tmp_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + return tmp_path +``` + +- [ ] **Step 5: Install + verify** + +Run: `cd backend && uv sync --python 3.12 && uv run pytest -q` +Expected: `no tests ran` with no import or configuration errors (pytest exits with status 5 because nothing is collected yet; that is fine at this stage). Confirms infrastructure works. 
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/pyproject.toml backend/uv.lock backend/pytest.ini backend/tests/__init__.py backend/tests/conftest.py +git commit -m "chore(interviews): add deps and pytest scaffold for interview subsystem" +``` + +--- + +### Task 1: Add interview config keys + +**Files:** +- Modify: `backend/app/config.py` + +- [ ] **Step 1: Read current config** + +Open `backend/app/config.py` and locate the `Config` class. + +- [ ] **Step 2: Add config keys** + +Add inside the `Config` class (preserving existing keys): +```python + # Interview subsystem + INTERVIEW_MAX_TOKENS_PER_RUN = int(os.environ.get("INTERVIEW_MAX_TOKENS_PER_RUN", 15_000_000)) + INTERVIEW_MAX_WORKERS = int(os.environ.get("INTERVIEW_MAX_WORKERS", 8)) + INTERVIEW_DEFAULT_LANGUAGE = os.environ.get("INTERVIEW_DEFAULT_LANGUAGE", "de") + LLM_STUB_MODE = os.environ.get("LLM_STUB_MODE", "false").lower() == "true" +``` + +- [ ] **Step 3: Verify import** + +Run: `cd backend && uv run python -c "from app.config import Config; print(Config.INTERVIEW_MAX_WORKERS, Config.LLM_STUB_MODE)"` +Expected: `8 False` + +- [ ] **Step 4: Commit** + +```bash +git add backend/app/config.py +git commit -m "feat(interviews): add interview config keys (token budget, workers, language, stub mode)" +``` + +--- + +## Phase 1 — Foundation + +### Task 2: Pydantic models for instruments and responses + +**Files:** +- Create: `backend/app/models/interview.py` +- Create: `backend/tests/interviews/__init__.py` +- Test: `backend/tests/interviews/test_models.py` + +- [ ] **Step 1: Write failing test** + +Create `backend/tests/interviews/__init__.py` (empty), then `backend/tests/interviews/test_models.py`: +```python +import pytest +from pydantic import ValidationError +from app.models.interview import ( + LikertItem, LikertInstrument, LikertResponse, + InterviewPhase, SubagentKind, +) + +def test_likert_item_requires_de_and_en(): + item = LikertItem(item_id="x1", de="Frage", en="Question", scale=5) + assert 
item.scale == 5 + +def test_likert_item_rejects_bad_scale(): + with pytest.raises(ValidationError): + LikertItem(item_id="x1", de="d", en="e", scale=2) + +def test_likert_instrument_unique_item_ids(): + with pytest.raises(ValidationError): + LikertInstrument( + name="t", + items=[LikertItem(item_id="a", de="d", en="e", scale=5), + LikertItem(item_id="a", de="d", en="e", scale=5)], + ) + +def test_likert_response_validates_scale_range(): + with pytest.raises(ValidationError): + LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 6}, confidence={"a": 0.5}) + +def test_subagent_kind_enum(): + assert SubagentKind.LONGITUDINAL.value == "longitudinal" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_models.py -v` +Expected: ImportError (module not yet created). + +- [ ] **Step 3: Create `backend/app/models/interview.py`** + +```python +from __future__ import annotations +from enum import Enum +from typing import Optional +from pydantic import BaseModel, Field, field_validator, model_validator + +class InterviewPhase(str, Enum): + T0 = "T0" + T1 = "T1" + +class SubagentKind(str, Enum): + LONGITUDINAL = "longitudinal" + DIVERSITY = "diversity" + DELPHI = "delphi" + SCENARIO = "scenario" + +class LikertItem(BaseModel): + item_id: str + de: str + en: str + scale: int = Field(ge=3, le=7) + family: Optional[str] = None + reverse_coded: bool = False + + @field_validator("scale") + @classmethod + def odd_scale(cls, v: int) -> int: + if v not in (3, 5, 7): + raise ValueError("scale must be 3, 5, or 7") + return v + +class LikertInstrument(BaseModel): + name: str + version: str = "1.0" + language_default: str = "de" + items: list[LikertItem] + + @model_validator(mode="after") + def unique_item_ids(self) -> "LikertInstrument": + ids = [i.item_id for i in self.items] + if len(set(ids)) != len(ids): + raise ValueError("duplicate item_id in instrument") + return self + +class LikertResponse(BaseModel): + 
agent_id: int + phase: InterviewPhase + responses: dict[str, int] + confidence: dict[str, float] = Field(default_factory=dict) + open_comment: Optional[str] = None + memory_available: bool = True + failed_items: list[str] = Field(default_factory=list) + + @model_validator(mode="after") + def values_in_range(self) -> "LikertResponse": + for k, v in self.responses.items(): + if not 1 <= v <= 7: + raise ValueError(f"response {k}={v} out of 1..7 range") + for k, v in self.confidence.items(): + if not 0.0 <= v <= 1.0: + raise ValueError(f"confidence {k}={v} out of 0..1 range") + return self + +class QSortStatement(BaseModel): + statement_id: str + de: str + en: str + +class QSortInstrument(BaseModel): + name: str + version: str = "1.0" + statements: list[QSortStatement] + distribution: list[int] # e.g. [2,3,4,6,4,3,2] for -3..+3 + +class QSortResponse(BaseModel): + agent_id: int + placements: dict[str, int] # statement_id -> bucket (-3..+3) + likert_axes: dict[str, int] # axis_id -> 1..7 + +class DelphiOpenResponse(BaseModel): + agent_id: int + round: int = 1 + answers: dict[str, str] # question_id -> free text + +class DelphiRatingResponse(BaseModel): + agent_id: int + round: int + ratings: dict[str, dict[str, int]] # theme_id -> {importance, plausibility} + justification: Optional[str] = None + +class ScenarioRating(BaseModel): + desirability: int = Field(ge=1, le=7) + plausibility: int = Field(ge=1, le=7) + impact_on_my_group: int = Field(ge=1, le=7) + fairness: int = Field(ge=1, le=7) + if_woke_up_response: str + +class ScenarioResponse(BaseModel): + agent_id: int + ratings: dict[str, ScenarioRating] # scenario_id -> rating +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_models.py -v` +Expected: 5 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/models/interview.py backend/tests/interviews/__init__.py backend/tests/interviews/test_models.py +git commit -m "feat(interviews): add pydantic models for instruments and responses" +``` + +--- + +### Task 3: YAML instrument loader + validator + +**Files:** +- Create: `backend/app/services/interviews/__init__.py` +- Create: `backend/app/services/interviews/instrument_loader.py` +- Create: `backend/scripts/instruments/__init__.py` (empty marker file so the `scripts/instruments` directory exists and tests can resolve paths under it) +- Test: `backend/tests/interviews/test_instrument_loader.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_instrument_loader.py +import pytest +from app.services.interviews.instrument_loader import ( + load_likert_instrument, InstrumentValidationError, +) + +def _write(tmp_path, text): + p = tmp_path / "inst.yaml" + p.write_text(text, encoding="utf-8") + return p + +def test_loads_valid_likert(tmp_path): + p = _write(tmp_path, """ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + - item_id: stk_1 + de: "Der westliche Dorschbestand wird sich erholen" + en: "Western cod stock will recover" + scale: 5 + family: stocks +""") + inst = load_likert_instrument(p) + assert inst.name == "longitudinal_v1" + assert len(inst.items) == 1 + +def test_rejects_duplicate_item_id(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, en: e, scale: 5} + - {item_id: a, de: d, en: e, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) + +def test_rejects_missing_required_field(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_instrument_loader.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Create loader** + +Create `backend/app/services/interviews/__init__.py` (empty), `backend/scripts/instruments/__init__.py` (empty), then `backend/app/services/interviews/instrument_loader.py`: + +```python +from __future__ import annotations +import hashlib +import json +from pathlib import Path +import yaml +from pydantic import ValidationError +from app.models.interview import ( + LikertInstrument, QSortInstrument, +) + +class InstrumentValidationError(ValueError): + pass + +def _parse_yaml(path: Path) -> dict: + if not path.exists(): + raise InstrumentValidationError(f"instrument file not found: {path}") + try: + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + raise InstrumentValidationError(f"YAML parse error in {path}: {e}") from e + if not isinstance(data, dict): + raise InstrumentValidationError(f"top-level YAML must be a mapping in {path}") + return data + +def load_likert_instrument(path: Path) -> LikertInstrument: + data = _parse_yaml(Path(path)) + try: + return LikertInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def load_qsort_instrument(path: Path) -> QSortInstrument: + data = _parse_yaml(Path(path)) + try: + return QSortInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def instrument_hash(path: Path) -> str: + data = Path(path).read_bytes() + return hashlib.sha256(data).hexdigest()[:16] + +def freeze_snapshot(instruments: dict[str, Path], out_path: Path) -> dict: + snapshot = { + name: { + "path": str(p), + "hash": instrument_hash(p), + "content": _parse_yaml(p), + } + for name, p in instruments.items() + } + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8") + return snapshot +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest 
tests/interviews/test_instrument_loader.py -v` +Expected: 3 passed. + +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/__init__.py backend/app/services/interviews/instrument_loader.py backend/scripts/instruments/__init__.py backend/tests/interviews/test_instrument_loader.py +git commit -m "feat(interviews): YAML instrument loader with pydantic validation and hash freezing" +``` + +--- + +### Task 4: LLM stub mode + +**Files:** +- Modify: `backend/app/utils/llm_client.py` +- Test: `backend/tests/interviews/test_llm_stub.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_llm_stub.py +import json +from app.utils.llm_client import LLMClient + +def test_stub_mode_returns_deterministic_canned_json(monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + from app.config import Config + Config.LLM_STUB_MODE = True + client = LLMClient(api_key="x", base_url="x", model="x") + messages = [ + {"role": "system", "content": "You are persona_42. Return JSON."}, + {"role": "user", "content": "stub_key=longitudinal:item_001"}, + ] + out1 = client.chat_json(messages=messages, temperature=0.0) + out2 = client.chat_json(messages=messages, temperature=0.0) + assert out1 == out2 + assert isinstance(out1, dict) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_llm_stub.py -v` +Expected: FAIL (real API call attempted or stub absent). + +- [ ] **Step 3: Read current `llm_client.py`** + +Read the file to locate `chat` and `chat_json` method bodies and where to insert the stub branch. 
+ +- [ ] **Step 4: Add stub branch** + +At the top of `LLMClient.chat` (before the OpenAI call), insert: +```python + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response(messages) +``` + +And at the top of `LLMClient.chat_json` (before delegating), insert the same guard returning a parsed dict via `self._stub_response_json(messages)`. + +Add these methods to `LLMClient`: +```python + def _stub_key(self, messages: list[dict]) -> str: + user_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + # Allow callers to embed an explicit stub_key=... token + for chunk in user_msg.split(): + if chunk.startswith("stub_key="): + return chunk[len("stub_key="):] + import hashlib + return hashlib.sha256((sys_msg + "|" + user_msg).encode("utf-8")).hexdigest()[:12] + + def _stub_response(self, messages: list[dict]) -> str: + import json as _json + return _json.dumps(self._stub_response_json(messages), ensure_ascii=False) + + def _stub_response_json(self, messages: list[dict]) -> dict: + key = self._stub_key(messages) + # Deterministic centered Likert + plausible open text + digit = sum(ord(c) for c in key) % 5 + 1 + return { + "stub_key": key, + "responses": {"item_001": digit, "item_002": digit, "item_003": (digit % 5) + 1}, + "confidence": {"item_001": 0.7, "item_002": 0.7, "item_003": 0.6}, + "open_comment": f"stub:{key}", + } +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_llm_stub.py -v` +Expected: 1 passed. 
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/app/utils/llm_client.py backend/tests/interviews/test_llm_stub.py +git commit -m "feat(interviews): LLM stub mode for deterministic CI tests" +``` + +--- + +### Task 5: StakeholderInterviewer base class + +**Files:** +- Create: `backend/app/services/interviews/base.py` +- Test: `backend/tests/interviews/test_base_interviewer.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_base_interviewer.py +import json +import pytest +from app.services.interviews.base import StakeholderInterviewer, MemoryDigest, PersonaRecord + +class _FakeLLM: + def __init__(self, responses): + self.responses = list(responses) + self.calls = [] + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.calls.append(messages) + return self.responses.pop(0) + +class _FakeMemory: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"digest-for-{agent_id}", available=True) + +def test_in_character_prompt_includes_persona_and_memory(): + llm = _FakeLLM([{"x": 1}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=7, name="A", persona="I am a small-scale Baltic fisher.") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="{...}") + assert out == {"x": 1} + sys_msg = llm.calls[0][0]["content"] + assert "small-scale Baltic fisher" in sys_msg + assert "digest-for-7" in sys_msg + +def test_schema_retry_on_first_failure(): + bad_then_good = [{}, {"responses": {"a": 3}}] + llm = _FakeLLM(bad_then_good) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + def validator(d): + return d if "responses" in d else None + persona = PersonaRecord(agent_id=1, name="A", persona="p") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", validate=validator) + assert out == {"responses": {"a": 3}} + assert len(llm.calls) == 2 + +def 
@dataclass
class PersonaRecord:
    """Identity and persona text of one simulated stakeholder."""

    agent_id: int
    name: str
    persona: str
    profession: Optional[str] = None
    bio: Optional[str] = None


@dataclass
class MemoryDigest:
    """Memory summary for one agent.

    When ``available`` is False the interviewer shows a placeholder in the
    system prompt instead of ``text``.
    """

    text: str
    available: bool = True


class MemoryProvider(Protocol):
    """Structural interface for objects that can produce a ``MemoryDigest``."""

    def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...


class StakeholderInterviewer:
    """Ask a persona questions in character and return the parsed JSON reply.

    The interviewer builds a system prompt from the persona text and the
    agent's memory digest, sends it together with the user prompt to
    ``llm.chat_json`` and, when a validator is supplied, retries once with a
    corrective turn before giving up.
    """

    def __init__(self, llm, memory: MemoryProvider, language: str = "de"):
        """Store the LLM client, the memory provider and the answer language.

        Args:
            llm: Object exposing ``chat_json(messages=..., temperature=...,
                max_tokens=...)`` and returning parsed JSON.
            memory: Provider of per-agent memory digests.
            language: ``"de"`` selects the German language note; any other
                value selects the English one.
        """
        self.llm = llm
        self.memory = memory
        self.language = language

    def _system_prompt(self, persona: PersonaRecord, digest: MemoryDigest, schema_hint: str) -> str:
        """Compose the in-character system prompt for *persona*."""
        memory_block = digest.text if digest.available else "[no simulation memory available]"
        lang_note = "Antworte ausschließlich auf Deutsch." if self.language == "de" else "Answer in English."
        return (
            f"You are {persona.name}. {persona.persona}\n\n"
            "You are answering a survey about the future of German fisheries. "
            "Answer strictly in character based on your background, values, and what you experienced "
            "during the simulated social media discourse summarised below.\n\n"
            f"--- simulation memory digest ---\n{memory_block}\n--- end ---\n\n"
            f"{lang_note} Return JSON ONLY matching this schema:\n{schema_hint}"
        )

    @staticmethod
    def _as_json_text(payload: Any) -> str:
        """Serialise a previous model reply as JSON text for the retry turn."""
        import json  # local import: this module does not import json at file level

        try:
            return json.dumps(payload, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serialisable; fall back to the plain text form.
            return str(payload)

    def ask_in_character(
        self,
        persona: PersonaRecord,
        user_prompt: str,
        schema_hint: str,
        *,
        temperature: float = 0.3,
        max_tokens: Optional[int] = None,
        validate: Optional[Callable[[dict], Optional[dict]]] = None,
    ) -> dict:
        """Ask *persona* a question and return the (optionally validated) reply.

        Args:
            persona: Stakeholder to impersonate.
            user_prompt: The question text sent as the user turn.
            schema_hint: JSON shape description embedded in the prompts.
            temperature: Sampling temperature of the first attempt.
            max_tokens: Passed through to ``llm.chat_json``.
            validate: Callable returning the accepted dict, or ``None`` when
                the reply violates the schema. When omitted the raw reply is
                returned unchecked.

        Returns:
            The reply accepted by ``validate``, or the raw reply when no
            validator was given.

        Raises:
            ValueError: If the reply is rejected on both attempts.
        """
        digest = self.memory.get_digest(persona.agent_id)
        messages = [
            {"role": "system", "content": self._system_prompt(persona, digest, schema_hint)},
            {"role": "user", "content": user_prompt},
        ]
        out = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens)
        if validate is None:
            return out

        validated = validate(out)
        if validated is not None:
            return validated

        # Build a fresh list for the retry so the list already handed to the
        # client is not mutated behind its back. The rejected reply is echoed
        # as real JSON: the Python repr (single quotes, True/None) would
        # contradict the "valid JSON" instruction that follows it.
        retry_messages = messages + [
            {"role": "assistant", "content": self._as_json_text(out)},
            {"role": "user", "content":
                "Your previous response did not match the required schema. "
                f"Return ONLY valid JSON matching: {schema_hint}"},
        ]
        out = self.llm.chat_json(messages=retry_messages, temperature=0.0, max_tokens=max_tokens)
        validated = validate(out)
        if validated is None:
            raise ValueError(f"agent {persona.agent_id}: schema violation after retry")
        return validated
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/base.py backend/tests/interviews/test_base_interviewer.py +git commit -m "feat(interviews): StakeholderInterviewer base with in-character prompting and schema retry" +``` + +--- + +## Phase 2 — Subagents + +### Task 6: Longitudinal subagent + instrument YAML + +**Files:** +- Create: `backend/scripts/instruments/longitudinal_v1.yaml` +- Create: `backend/app/services/interviews/longitudinal.py` +- Test: `backend/tests/interviews/test_longitudinal.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_longitudinal.py +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate + +class _FakeMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _CannedLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + return { + "responses": {"stk_1": 4, "gov_1": 3, "mkt_1": 5, "clm_1": 2}, + "confidence": {"stk_1": 0.8, "gov_1": 0.6, "mkt_1": 0.7, "clm_1": 0.5}, + "open_comment": "test", + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + +def test_longitudinal_administer_one_agent(): + sub = LongitudinalSubagent(llm=_CannedLLM(), memory=_FakeMem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 3 + assert resp.phase == InterviewPhase.T0 + assert set(resp.responses.keys()) >= {"stk_1", "gov_1", "mkt_1", "clm_1"} + +def test_longitudinal_aggregate_delta(): + from app.models.interview import LikertResponse + t0 = [LikertResponse(agent_id=i, phase=InterviewPhase.T0, + 
responses={"stk_1": 3, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + t1 = [LikertResponse(agent_id=i, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + agg = run_aggregate(t0, t1) + assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 + assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 + assert agg["n_paired"] == 5 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_longitudinal.py -v` +Expected: ImportError + missing YAML file. + +- [ ] **Step 3: Create instrument YAML** + +`backend/scripts/instruments/longitudinal_v1.yaml`: +```yaml +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + # Stock status & recovery + - {item_id: stk_1, family: stocks, scale: 5, + de: "Der westliche Dorschbestand wird sich bis 2035 erholen.", + en: "The Western Baltic cod stock will recover by 2035."} + - {item_id: stk_2, family: stocks, scale: 5, + de: "Der Heringsbestand in der westlichen Ostsee ist nicht mehr zu retten.", + en: "The Western Baltic herring stock can no longer be saved.", + reverse_coded: true} + - {item_id: stk_3, family: stocks, scale: 5, + de: "Wissenschaftliche Bestandsschätzungen sind generell zuverlässig.", + en: "Scientific stock assessments are generally reliable."} + # Governance & CFP + - {item_id: gov_1, family: governance, scale: 5, + de: "Die Gemeinsame Fischereipolitik der EU scheitert beim Schutz der Ostseefische.", + en: "The EU Common Fisheries Policy fails to protect Baltic fish.", + reverse_coded: true} + - {item_id: gov_2, family: governance, scale: 5, + de: "Entscheidungen über Fangquoten sollten stärker lokal getroffen werden.", + en: "Decisions on catch quotas should be taken more locally."} + - {item_id: gov_3, family: governance, scale: 5, + de: "Die deutsche Bundesregierung handelt entschlossen bei Fischereifragen.", + en: "The German federal government 
class LongitudinalSubagent:
    """Administer the Likert instrument to one persona at a given phase."""

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the instrument and create the in-character interviewer.

        Args:
            llm: Client passed on to ``StakeholderInterviewer``.
            memory: Memory provider passed on to ``StakeholderInterviewer``.
            instrument_path: Location of the Likert instrument YAML.
            language: ``"de"`` uses the German item texts, anything else the
                English ones.
        """
        self.instrument: LikertInstrument = load_likert_instrument(Path(instrument_path))
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.language = language

    def _schema_hint(self) -> str:
        """Return the JSON skeleton (one key per item) shown to the model."""
        ids = [i.item_id for i in self.instrument.items]
        return json.dumps({
            "responses": {k: "" for k in ids},
            "confidence": {k: "" for k in ids},
            "open_comment": "",
        }, ensure_ascii=False)

    def _user_prompt(self) -> str:
        """Return the rating instruction followed by one line per item."""
        use_german = self.language == "de"
        lines = [
            "Bitte bewerten Sie die folgenden Aussagen auf einer Skala von 1 (lehne stark ab) bis 5 (stimme stark zu)."
            if use_german else
            "Please rate the following statements on a scale from 1 (strongly disagree) to 5 (strongly agree)."
        ]
        for it in self.instrument.items:
            txt = it.de if use_german else it.en
            lines.append(f"- [{it.item_id}] {txt}")
        return "\n".join(lines)

    def _validator(self, raw: dict) -> Optional[dict]:
        """Return *raw* if it holds an integer score 1-5 for every item, else None."""
        if not isinstance(raw, dict):
            return None
        resp = raw.get("responses")
        if not isinstance(resp, dict):
            return None
        required = {it.item_id for it in self.instrument.items}
        if not required.issubset(resp.keys()):
            return None
        for v in resp.values():
            # bool is a subclass of int, so JSON true/false must be rejected
            # explicitly or it would be stored as a score of 1/0.
            if isinstance(v, bool) or not isinstance(v, int) or not 1 <= v <= 5:
                return None
        return raw

    @staticmethod
    def _clean_confidence(raw_conf) -> dict:
        """Keep only the confidence entries that can be read as a number.

        ``confidence`` is not part of the validated schema, so the model may
        omit it, send ``null`` or echo the empty-string placeholders of the
        schema hint; none of that should fail an otherwise valid response.
        """
        if not isinstance(raw_conf, dict):
            return {}
        cleaned = {}
        for key, value in raw_conf.items():
            if isinstance(value, bool):
                continue
            try:
                cleaned[key] = float(value)
            except (TypeError, ValueError):
                continue
        return cleaned

    def administer(self, persona: "PersonaRecord", phase: "InterviewPhase") -> "LikertResponse":
        """Ask *persona* all items and wrap the reply in a ``LikertResponse``.

        Raises:
            ValueError: Propagated from the interviewer when the reply fails
                validation on both attempts.
        """
        raw = self.interviewer.ask_in_character(
            persona,
            user_prompt=self._user_prompt(),
            schema_hint=self._schema_hint(),
            validate=self._validator,
        )
        return LikertResponse(
            agent_id=persona.agent_id,
            phase=phase,
            responses={k: int(v) for k, v in raw["responses"].items()},
            confidence=self._clean_confidence(raw.get("confidence")),
            open_comment=raw.get("open_comment"),
        )


def run_aggregate(t0: "list[LikertResponse]", t1: "list[LikertResponse]") -> dict:
    """Summarise per-item and per-agent change between two survey waves.

    Only agents present in both waves are paired. For every item seen in
    either wave the result holds the mean and sample standard deviation of
    ``t1 - t0`` plus counts of positive and negative shifts; items without
    any paired value get the same keys with ``None``/``0`` so consumers can
    rely on one shape.

    Args:
        t0: Responses of the first wave (objects with ``agent_id`` and
            ``responses``).
        t1: Responses of the second wave.

    Returns:
        Dict with ``n_paired``, ``n_t0_only``, ``n_t1_only``, ``per_item``
        and ``per_agent``.
    """
    by_t0 = {r.agent_id: r for r in t0}
    by_t1 = {r.agent_id: r for r in t1}
    paired = sorted(set(by_t0) & set(by_t1))

    items: set[str] = set()
    for r in t0 + t1:
        items.update(r.responses.keys())

    per_item: dict[str, dict] = {}
    for it in sorted(items):
        deltas = []
        for aid in paired:
            v0 = by_t0[aid].responses.get(it)
            v1 = by_t1[aid].responses.get(it)
            if v0 is None or v1 is None:
                continue
            deltas.append(v1 - v0)
        if not deltas:
            per_item[it] = {
                "mean_delta": None,
                "sd_delta": None,
                "n": 0,
                "n_positive": 0,
                "n_negative": 0,
            }
            continue
        m = sum(deltas) / len(deltas)
        # Sample variance; the max() guard yields 0.0 for a single pair.
        var = sum((d - m) ** 2 for d in deltas) / max(len(deltas) - 1, 1)
        per_item[it] = {
            "mean_delta": m,
            "sd_delta": math.sqrt(var),
            "n": len(deltas),
            "n_positive": sum(1 for d in deltas if d > 0),
            "n_negative": sum(1 for d in deltas if d < 0),
        }

    per_agent: dict[int, dict] = {}
    for aid in paired:
        r0 = by_t0[aid].responses
        r1 = by_t1[aid].responses
        common = set(r0) & set(r1)
        total = sum(abs(r1[k] - r0[k]) for k in common)
        per_agent[aid] = {"total_abs_drift": total, "n_items": len(common)}

    return {
        "n_paired": len(paired),
        "n_t0_only": len(set(by_t0) - set(by_t1)),
        "n_t1_only": len(set(by_t1) - set(by_t0)),
        "per_item": per_item,
        "per_agent": per_agent,
    }
temperature=0.0, max_tokens=None, **kw): + # Place all 24 statements into legal buckets per the forced distribution + placements = {} + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + for i in range(24): + placements[f"st_{i+1:02d}"] = buckets[i] + return { + "placements": placements, + "likert_axes": {"ax_pres_extr": 5, "ax_loc_eu": 3, "ax_sci_trad": 4, + "ax_ind_col": 4, "ax_short_long": 5, "ax_mkt_reg": 3}, + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + +def test_diversity_administer(): + sub = DiversitySubagent(llm=_CannedLLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert len(resp.placements) == 24 + assert set(resp.likert_axes.keys()) == { + "ax_pres_extr","ax_loc_eu","ax_sci_trad","ax_ind_col","ax_short_long","ax_mkt_reg" + } + +def test_typology_runs_pca_kmeans(): + from app.models.interview import QSortResponse + rng = np.random.default_rng(42) + responses = [] + for aid in range(20): + placements = {f"st_{i+1:02d}": int(rng.integers(-3, 4)) for i in range(24)} + axes = {f"ax_{j}": int(rng.integers(1, 8)) for j in range(6)} + responses.append(QSortResponse(agent_id=aid, placements=placements, likert_axes=axes)) + result = run_typology(responses, n_clusters=3) + assert "clusters" in result + assert len(result["clusters"]) == 3 + assert "pca" in result + assert len(result["pca"]["components"]) >= 2 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_diversity.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Create instrument YAML** + +`backend/scripts/instruments/diversity_v1.yaml`: +```yaml +name: diversity_v1 +version: "1.0" +language_default: de +distribution: [2, 3, 4, 6, 4, 3, 2] # buckets from -3 to +3, total 24 +statements: + - {statement_id: st_01, de: "Die Ostsee gehört den Fischern, die hier seit Generationen leben.", en: "The Baltic belongs to fishers who have lived here for generations."} + - {statement_id: st_02, de: "MSC-Zertifizierung schützt vor allem große Konzerne.", en: "MSC certification mainly protects large corporations."} + - {statement_id: st_03, de: "Wissenschaftliche Quoten sind die einzige Grundlage für Politik.", en: "Scientific quotas are the only legitimate basis for policy."} + - {statement_id: st_04, de: "Aquakultur kann Ostseefischerei ersetzen.", en: "Aquaculture can replace Baltic fisheries."} + - {statement_id: st_05, de: "Sportfischer schaden den Beständen mehr als die Berufsfischer.", en: "Recreational anglers harm stocks more than commercial fishers."} + - {statement_id: st_06, de: "Die EU-Fischereipolitik kennt die Ostsee nicht.", en: "EU fisheries policy doesn't understand the Baltic."} + - {statement_id: st_07, de: "Großtechnische Fischerei ist effizienter und damit nachhaltiger.", en: "Industrial fisheries are more efficient and therefore more sustainable."} + - {statement_id: st_08, de: "Wer Fisch isst, sollte mehr dafür bezahlen.", en: "Those who eat fish should pay more for it."} + - {statement_id: st_09, de: "Die Kleinfischerei muss subventioniert werden.", en: "Small-scale fisheries must be subsidised."} + - {statement_id: st_10, de: "Marine Schutzgebiete sind reine Symbolpolitik.", en: "Marine protected areas are mere symbolism."} + - {statement_id: st_11, de: "Russlands Krieg ändert alles in der Ostsee.", en: "Russia's war changes everything in the Baltic."} + - {statement_id: st_12, de: "Nur drastische Reduktion der Fangmengen rettet die Bestände.", en: "Only drastic catch reductions will save the 
stocks."} + - {statement_id: st_13, de: "NGOs übertreiben die Krise systematisch.", en: "NGOs systematically exaggerate the crisis."} + - {statement_id: st_14, de: "Klimawandel ist das eigentliche Problem, nicht die Fischerei.", en: "Climate change is the real problem, not fisheries."} + - {statement_id: st_15, de: "Tradition zählt mehr als kurzfristige Bestandszahlen.", en: "Tradition matters more than short-term stock numbers."} + - {statement_id: st_16, de: "Verbraucher entscheiden über die Zukunft des Fisches.", en: "Consumers decide the future of fish."} + - {statement_id: st_17, de: "Ohne Generalstreik der Fischer ändert sich nichts.", en: "Without a fishers' general strike, nothing will change."} + - {statement_id: st_18, de: "Die Bundesregierung sollte Kutter aufkaufen und stilllegen.", en: "The federal government should buy out and decommission boats."} + - {statement_id: st_19, de: "Die Dorschkrise ist Folge gescheiterter Politik.", en: "The cod crisis is the result of policy failure."} + - {statement_id: st_20, de: "Ostsee-Aquakultur ist ökologisch problematisch.", en: "Baltic aquaculture is ecologically problematic."} + - {statement_id: st_21, de: "Junge Menschen werden keinen Fischereibetrieb mehr übernehmen.", en: "Young people will no longer take over fishing businesses."} + - {statement_id: st_22, de: "Markt regelt sich selbst, auch beim Fisch.", en: "The market regulates itself, also for fish."} + - {statement_id: st_23, de: "Lokale Genossenschaften sind die Lösung.", en: "Local cooperatives are the solution."} + - {statement_id: st_24, de: "In 20 Jahren gibt es keine deutsche Ostseefischerei mehr.", en: "In 20 years there will be no German Baltic fisheries left."} +likert_axes: + - {axis_id: ax_pres_extr, scale: 7, de: "Bewahrung (1) vs. Nutzung (7)", en: "Preservation (1) vs. Extraction (7)"} + - {axis_id: ax_loc_eu, scale: 7, de: "Lokal (1) vs. EU-zentral (7)", en: "Local (1) vs. 
class DiversitySubagent:
    """Administer the forced-distribution Q-sort plus the Likert axes."""

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the instrument YAML and create the in-character interviewer."""
        self.instrument = self._load(Path(instrument_path))
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.language = language

    def _load(self, path: Path) -> dict:
        """Read and sanity-check the instrument file.

        Raises:
            InstrumentValidationError: If the file is not a mapping, lacks
                ``statements`` or ``distribution``, or the distribution does
                not add up to the number of statements.
        """
        with path.open("r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        if not isinstance(data, dict) or "statements" not in data or "distribution" not in data:
            raise InstrumentValidationError(f"invalid diversity instrument: {path}")
        if sum(data["distribution"]) != len(data["statements"]):
            raise InstrumentValidationError("distribution sum must equal number of statements")
        return data

    def _schema_hint(self) -> str:
        """Return the JSON skeleton (statement ids and axis ids) shown to the model."""
        return json.dumps({
            "placements": {s["statement_id"]: "" for s in self.instrument["statements"]},
            "likert_axes": {a["axis_id"]: "" for a in self.instrument["likert_axes"]},
        }, ensure_ascii=False)

    def _user_prompt(self) -> str:
        """Return the sorting instruction, all statements and all axes."""
        use_german = self.language == "de"
        dist = self.instrument["distribution"]
        bucket_desc = ", ".join(f"{b}:{n}" for b, n in zip(range(-3, 4), dist))
        if use_german:
            intro = ("Ordnen Sie jede Aussage genau einer Box von -3 (lehne stark ab) bis +3 (stimme stark zu) zu. "
                     f"Die Verteilung ist erzwungen: {bucket_desc}.")
        else:
            intro = ("Place every statement into exactly one box from -3 (strongly disagree) to +3 (strongly agree). "
                     f"The distribution is forced: {bucket_desc}.")
        lines = [intro, "", "Statements:"]
        for s in self.instrument["statements"]:
            txt = s["de"] if use_german else s["en"]
            lines.append(f"- [{s['statement_id']}] {txt}")
        lines += ["", "Then rate each axis from 1 to 7:"]
        for a in self.instrument["likert_axes"]:
            txt = a["de"] if use_german else a["en"]
            lines.append(f"- [{a['axis_id']}] {txt}")
        return "\n".join(lines)

    def _validator(self, raw: dict) -> Optional[dict]:
        """Return *raw* if it is a legal Q-sort with all axes rated, else None.

        Malformed shapes (e.g. ``placements`` being ``null`` or a list) must
        yield ``None`` rather than raise, because the interviewer only
        retries when the validator returns ``None``.
        """
        if not isinstance(raw, dict):
            return None
        placements = raw.get("placements", {})
        axes = raw.get("likert_axes", {})
        if not isinstance(placements, dict) or not isinstance(axes, dict):
            return None
        statements = {s["statement_id"] for s in self.instrument["statements"]}
        if set(placements.keys()) != statements:
            return None
        dist = self.instrument["distribution"]
        # Buckets with a quota of 0 are left out: an unused bucket never shows
        # up in the observed counts, so keeping it would reject every reply.
        target = {b: n for b, n in zip(range(-3, 4), dist) if n}
        got: dict[int, int] = {}
        for v in placements.values():
            # bool is a subclass of int; JSON true/false is not a placement.
            if isinstance(v, bool) or not isinstance(v, int) or not -3 <= v <= 3:
                return None
            got[v] = got.get(v, 0) + 1
        if got != target:
            return None
        for a in self.instrument["likert_axes"]:
            v = axes.get(a["axis_id"])
            if isinstance(v, bool) or not isinstance(v, int) or not 1 <= v <= 7:
                return None
        return raw

    def administer(self, persona: "PersonaRecord") -> "QSortResponse":
        """Run the Q-sort with *persona* and wrap the reply in a ``QSortResponse``.

        Raises:
            ValueError: Propagated from the interviewer when the reply fails
                validation on both attempts.
        """
        raw = self.interviewer.ask_in_character(
            persona,
            user_prompt=self._user_prompt(),
            schema_hint=self._schema_hint(),
            validate=self._validator,
        )
        return QSortResponse(
            agent_id=persona.agent_id,
            placements={k: int(v) for k, v in raw["placements"].items()},
            likert_axes={k: int(v) for k, v in raw["likert_axes"].items()},
        )


def _vectorize(r: "QSortResponse", statements: list[str], axes: list[str]) -> "np.ndarray":
    """Flatten one response into a feature row: placements first, then axes.

    Missing placements default to 0 and missing axes to 4, the midpoints of
    the -3..3 and 1..7 scales.
    """
    return np.array(
        [r.placements.get(s, 0) for s in statements]
        + [r.likert_axes.get(a, 4) for a in axes],
        dtype=float,
    )


def run_typology(responses: "list[QSortResponse]", n_clusters: int = 4) -> dict:
    """Cluster the responses with k-means and project them with PCA.

    Args:
        responses: Q-sort responses, one per agent.
        n_clusters: Requested number of clusters; capped at the number of
            responses.

    Returns:
        Dict with ``n``, ``clusters`` (id, size, member agent ids and the
        eight centroid entries of largest magnitude) and ``pca``. Note that
        ``pca["components"]`` holds the per-agent projected coordinates
        returned by ``fit_transform``, row-aligned with ``pca["agent_ids"]``.
    """
    if not responses:
        return {"n": 0, "clusters": [], "pca": {"components": [], "explained_variance": []}}
    statements = sorted({k for r in responses for k in r.placements})
    axes = sorted({k for r in responses for k in r.likert_axes})
    features = statements + axes  # column order produced by _vectorize
    X = np.vstack([_vectorize(r, statements, axes) for r in responses])
    n_clusters = min(n_clusters, len(responses))
    pca = PCA(n_components=min(5, X.shape[1], X.shape[0]))
    pcs = pca.fit_transform(X)
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    labels = km.fit_predict(X)
    clusters = []
    for c in range(n_clusters):
        members = [r.agent_id for r, label in zip(responses, labels) if label == c]
        centroid = km.cluster_centers_[c]
        # NOTE(review): features are clustered and ranked on their raw scales
        # (-3..3 placements vs 1..7 axes); confirm that is intended.
        strongest = np.argsort(np.abs(centroid))[::-1][:8].tolist()
        clusters.append({
            "cluster_id": int(c),
            "n": len(members),
            "agent_ids": members,
            "top_loadings": {features[i]: float(centroid[i]) for i in strongest},
        })
    return {
        "n": len(responses),
        "clusters": clusters,
        "pca": {
            "components": pcs.tolist(),
            "explained_variance": pca.explained_variance_ratio_.tolist(),
            "agent_ids": [r.agent_id for r in responses],
        },
    }
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/scripts/instruments/diversity_v1.yaml backend/app/services/interviews/diversity.py backend/tests/interviews/test_diversity.py +git commit -m "feat(interviews): diversity subagent with Q-sort + 6 Likert axes + PCA/k-means typology" +``` + +--- + +### Task 8: Delphi subagent (three rounds) + +**Files:** +- Create: `backend/scripts/instruments/delphi_v1.yaml` +- Create: `backend/app/services/interviews/delphi.py` +- Test: `backend/tests/interviews/test_delphi.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_delphi.py +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, +) + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _R1LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"answers": { + "q1": "Klimawandel, Quoten, Generationswechsel", + "q2": "MSC, Aquakultur", + "q3": "Russland, EU-Politik", + "q4": "Verbraucherpreise", + }} + +class _R2LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {f"theme_{i}": {"importance": 4, "plausibility": 3} for i in range(5)}} + +class _ExtractLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"themes": [ + {"theme_id": "theme_0", "label": "Klimawandel"}, + {"theme_id": "theme_1", "label": "Quoten"}, + {"theme_id": "theme_2", "label": "MSC"}, + {"theme_id": "theme_3", "label": "EU-Politik"}, + {"theme_id": "theme_4", "label": "Generationswechsel"}, + ]} + +def test_delphi_round1_open(): + sub = DelphiSubagent(llm=_R1LLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=2, 
name="A", persona="p") + resp = sub.administer_round1(persona) + assert resp.round == 1 + assert len(resp.answers) == 4 + +def test_extract_themes_aggregates(): + from app.models.interview import DelphiOpenResponse + r1 = [DelphiOpenResponse(agent_id=i, answers={"q1": "Klimawandel", "q2": "MSC"}) for i in range(3)] + themes = extract_themes(r1, llm=_ExtractLLM()) + assert len(themes) == 5 + assert all("theme_id" in t for t in themes) + +def test_convergence_metrics(): + from app.models.interview import DelphiRatingResponse + r2 = [DelphiRatingResponse(agent_id=i, round=2, + ratings={"t1": {"importance": 3, "plausibility": 3}}) for i in range(5)] + r3 = [DelphiRatingResponse(agent_id=i, round=3, + ratings={"t1": {"importance": 4, "plausibility": 4}}) for i in range(5)] + conv = convergence_metrics(r2, r3) + assert "t1" in conv + assert conv["t1"]["delta_iqr_importance"] is not None +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_delphi.py -v` +Expected: ImportError. 
class DelphiSubagent:
    """Run the three Delphi rounds with one persona at a time.

    Round 1 collects open answers, round 2 collects importance/plausibility
    ratings per theme, round 3 lets the persona revise those ratings after
    seeing the group statistics.
    """

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the question instrument and create the in-character interviewer."""
        with Path(instrument_path).open("r", encoding="utf-8") as f:
            self.instrument = yaml.safe_load(f)
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.llm = llm
        self.language = language

    # --- Round 1: open questions ---
    def _r1_schema(self) -> str:
        """Return the JSON skeleton with one answer slot per question."""
        return json.dumps({
            "answers": {q["question_id"]: "" for q in self.instrument["questions"]}
        }, ensure_ascii=False)

    def _r1_prompt(self) -> str:
        """Return the open-question prompt in the configured language."""
        use_german = self.language == "de"
        lines = ["Bitte beantworten Sie offen:" if use_german else "Please answer openly:"]
        for q in self.instrument["questions"]:
            txt = q["de"] if use_german else q["en"]
            lines.append(f"[{q['question_id']}] {txt}")
        return "\n".join(lines)

    def _r1_validate(self, raw: dict) -> Optional[dict]:
        """Return *raw* if it answers every question of the instrument, else None."""
        if not isinstance(raw, dict):
            return None
        ans = raw.get("answers")
        if not isinstance(ans, dict):
            return None
        required = {q["question_id"] for q in self.instrument["questions"]}
        if not required.issubset(ans.keys()):
            return None
        return raw

    def administer_round1(self, persona: "PersonaRecord") -> "DelphiOpenResponse":
        """Ask the open questions and return the answers as strings."""
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._r1_prompt(),
            schema_hint=self._r1_schema(), validate=self._r1_validate,
        )
        return DelphiOpenResponse(agent_id=persona.agent_id, round=1,
                                  answers={k: str(v) for k, v in raw["answers"].items()})

    # --- Round 2: rate themes ---
    def _r2_schema(self, theme_ids: list[str]) -> str:
        """Return the JSON skeleton with one rating pair per theme id."""
        return json.dumps({
            "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids}
        }, ensure_ascii=False)

    def _r2_prompt(self, themes: list[dict]) -> str:
        """Return the rating instruction followed by one line per theme."""
        head = "Bewerten Sie jedes Thema nach Wichtigkeit (1-5) und Plausibilität (1-5):" if self.language == "de" \
            else "Rate each theme on importance (1-5) and plausibility (1-5):"
        body = [f"- [{t['theme_id']}] {t['label']}" for t in themes]
        return head + "\n" + "\n".join(body)

    def _r2_validate(self, theme_ids: list[str]):
        """Build the rating validator shared by rounds 2 and 3.

        The returned callable yields the reply when ``ratings`` is a mapping
        covering exactly *theme_ids* with integer ``importance`` and
        ``plausibility`` scores from 1 to 5, and ``None`` otherwise. It never
        raises on malformed shapes, so the interviewer's retry path is used
        instead of an unhandled ``AttributeError``.
        """
        expected = set(theme_ids)

        def v(raw: dict) -> Optional[dict]:
            if not isinstance(raw, dict):
                return None
            ratings = raw.get("ratings")
            if not isinstance(ratings, dict) or set(ratings) != expected:
                return None
            for r in ratings.values():
                if not isinstance(r, dict):
                    return None
                for key in ("importance", "plausibility"):
                    score = r.get(key)
                    # bool is a subclass of int; JSON true/false is no rating.
                    if isinstance(score, bool) or not isinstance(score, int) or not 1 <= score <= 5:
                        return None
            return raw

        return v

    def administer_round2(self, persona: "PersonaRecord", themes: list[dict]) -> "DelphiRatingResponse":
        """Collect the persona's initial ratings for *themes*."""
        theme_ids = [t["theme_id"] for t in themes]
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._r2_prompt(themes),
            schema_hint=self._r2_schema(theme_ids), validate=self._r2_validate(theme_ids),
        )
        return DelphiRatingResponse(agent_id=persona.agent_id, round=2,
                                    ratings={k: dict(v) for k, v in raw["ratings"].items()})

    # --- Round 3: revise after seeing group stats ---
    def administer_round3(
        self, persona: "PersonaRecord", themes: list[dict], group_stats: dict,
        own_r2: "DelphiRatingResponse"
    ) -> "DelphiRatingResponse":
        """Show group statistics and the persona's own round-2 ratings, then re-rate.

        Args:
            persona: Stakeholder being interviewed.
            themes: Theme dicts with ``theme_id`` and ``label``.
            group_stats: Per-theme dict read with the keys ``imp_median``,
                ``imp_iqr``, ``plaus_median`` and ``plaus_iqr``; missing
                entries are rendered as ``None``.
            own_r2: The persona's round-2 response.

        Returns:
            The revised ratings together with the optional justification.
        """
        theme_ids = [t["theme_id"] for t in themes]
        head = ("Sie sehen unten die anonymisierten Gruppenwerte (Median, IQR). "
                "Bitte überarbeiten Sie Ihre Bewertungen, wenn Sie möchten, und begründen Sie kurz.") \
            if self.language == "de" else \
            ("Below are the anonymised group values (median, IQR). "
             "Please revise your ratings if you wish and add a short justification.")
        ctx_lines = []
        for t in themes:
            tid = t["theme_id"]
            gs = group_stats.get(tid, {})
            own = own_r2.ratings.get(tid, {})
            ctx_lines.append(
                f"[{tid}] {t['label']} — group importance median={gs.get('imp_median')}, "
                f"IQR={gs.get('imp_iqr')}; plausibility median={gs.get('plaus_median')}, "
                f"IQR={gs.get('plaus_iqr')}. Your R2: imp={own.get('importance')}, plaus={own.get('plausibility')}."
            )
        prompt = head + "\n\n" + "\n".join(ctx_lines)
        schema = json.dumps({
            "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids},
            "justification": "",
        }, ensure_ascii=False)
        # Round 3 has the same rating contract as round 2, so reuse its validator.
        raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
                                                schema_hint=schema,
                                                validate=self._r2_validate(theme_ids))
        return DelphiRatingResponse(
            agent_id=persona.agent_id, round=3,
            ratings={k: dict(v) for k, v in raw["ratings"].items()},
            justification=raw.get("justification"),
        )
Use stable theme_ids of form theme_0, theme_1, …"}, + {"role": "user", "content": "Responses:\n" + "\n".join(text_blocks) + "\n\nReturn up to 12 distinct themes."}, + ] + raw = llm.chat_json(messages=messages, temperature=0.0) + themes = raw.get("themes", []) if isinstance(raw, dict) else [] + out = [] + for i, t in enumerate(themes): + if isinstance(t, dict) and "label" in t: + out.append({"theme_id": t.get("theme_id") or f"theme_{i}", "label": str(t["label"])}) + return out + +def _iqr(xs: list[float]) -> float: + if not xs: return 0.0 + xs = sorted(xs) + q1 = statistics.quantiles(xs, n=4)[0] if len(xs) >= 4 else xs[0] + q3 = statistics.quantiles(xs, n=4)[2] if len(xs) >= 4 else xs[-1] + return q3 - q1 + +def convergence_metrics(r2: list[DelphiRatingResponse], r3: list[DelphiRatingResponse]) -> dict: + by_r2 = {r.agent_id: r for r in r2} + by_r3 = {r.agent_id: r for r in r3} + themes: set[str] = set() + for r in r2 + r3: + themes.update(r.ratings.keys()) + out: dict[str, dict] = {} + for t in sorted(themes): + imp_r2 = [by_r2[a].ratings[t]["importance"] for a in by_r2 if t in by_r2[a].ratings] + imp_r3 = [by_r3[a].ratings[t]["importance"] for a in by_r3 if t in by_r3[a].ratings] + plaus_r2 = [by_r2[a].ratings[t]["plausibility"] for a in by_r2 if t in by_r2[a].ratings] + plaus_r3 = [by_r3[a].ratings[t]["plausibility"] for a in by_r3 if t in by_r3[a].ratings] + out[t] = { + "imp_median_r2": statistics.median(imp_r2) if imp_r2 else None, + "imp_median_r3": statistics.median(imp_r3) if imp_r3 else None, + "imp_iqr_r2": _iqr(imp_r2), + "imp_iqr_r3": _iqr(imp_r3), + "delta_iqr_importance": _iqr(imp_r3) - _iqr(imp_r2), + "plaus_iqr_r2": _iqr(plaus_r2), + "plaus_iqr_r3": _iqr(plaus_r3), + "delta_iqr_plausibility": _iqr(plaus_r3) - _iqr(plaus_r2), + } + return out + +def group_stats_from_r2(r2: list[DelphiRatingResponse]) -> dict: + themes: set[str] = set() + for r in r2: themes.update(r.ratings.keys()) + stats: dict[str, dict] = {} + for t in themes: + imps = 
[r.ratings[t]["importance"] for r in r2 if t in r.ratings] + plauss = [r.ratings[t]["plausibility"] for r in r2 if t in r.ratings] + stats[t] = { + "imp_median": statistics.median(imps) if imps else None, + "imp_iqr": _iqr(imps), + "plaus_median": statistics.median(plauss) if plauss else None, + "plaus_iqr": _iqr(plauss), + } + return stats +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_delphi.py -v` +Expected: 3 passed. + +- [ ] **Step 6: Commit** + +```bash +git add backend/scripts/instruments/delphi_v1.yaml backend/app/services/interviews/delphi.py backend/tests/interviews/test_delphi.py +git commit -m "feat(interviews): Delphi subagent (3 rounds: open, rate, revise) + convergence metrics" +``` + +--- + +### Task 9: Scenario subagent + +**Files:** +- Create: `backend/scripts/instruments/scenario_v1.yaml` +- Create: `backend/app/services/interviews/scenario.py` +- Test: `backend/tests/interviews/test_scenario.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_scenario.py +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": 4, "plausibility": 3, "impact_on_my_group": 5, "fairness": 3, + "if_woke_up_response": f"act-on-{sid}", + } for sid in ("S1", "S2", "S3", "S4")}} + +def test_scenario_administer(): + sub = ScenarioSubagent(llm=_LLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert 
set(resp.ratings.keys()) == {"S1", "S2", "S3", "S4"} + assert resp.ratings["S1"].desirability == 4 + +def test_polarity_matrix(): + from app.models.interview import ScenarioResponse, ScenarioRating + responses = [ScenarioResponse(agent_id=i, ratings={ + "S1": ScenarioRating(desirability=5, plausibility=4, impact_on_my_group=5, fairness=4, + if_woke_up_response="x"), + }) for i in range(3)] + m = polarity_matrix(responses) + assert "S1" in m + assert m["S1"]["mean_desirability"] == 5 + assert m["S1"]["n"] == 3 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_scenario.py -v` +Expected: ImportError. + +- [ ] **Step 3: Create instrument YAML** + +`backend/scripts/instruments/scenario_v1.yaml`: +```yaml +name: scenario_v1 +version: "1.0" +language_default: de +scenarios: + - scenario_id: S1 + label_de: "Erholung 2040" + label_en: "Recovery 2040" + description_de: | + Bis 2040 haben sich Dorsch- und Heringsbestände in der westlichen Ostsee + deutlich erholt. MSC-Zertifizierung ist branchenweit Standard. Die kleine + Küstenfischerei hat sich stabilisiert; die Politik gilt als erfolgreich. + description_en: | + By 2040, Western Baltic cod and herring stocks have substantially recovered. + MSC certification is industry-wide standard. Small-scale coastal fisheries + have stabilised; policy is regarded as successful. + - scenario_id: S2 + label_de: "Kollaps 2040" + label_en: "Collapse 2040" + description_de: | + Bis 2040 sind Dorsch- und Heringsbestände zusammengebrochen. Die Flotte + ist halbiert, Aquakultur dominiert den Markt, Häfen veröden. + description_en: | + By 2040, cod and herring stocks have collapsed. The fleet is halved, + aquaculture dominates the market, harbour towns decline. 
+ - scenario_id: S3 + label_de: "Festung Europa 2040" + label_en: "Fortress Europe 2040" + description_de: | + Bis 2040 verfolgt die EU eine protektionistische Politik mit hohen Importzöllen, + Meeresschutzgebiete bedecken 30% der Ostsee, Sportfischerei ist stark eingeschränkt. + description_en: | + By 2040, the EU pursues a protectionist policy with high import tariffs, + MPAs cover 30% of the Baltic, recreational fishing is strongly curtailed. + - scenario_id: S4 + label_de: "Privatisierung 2040" + label_en: "Privatisation 2040" + description_de: | + Bis 2040 sind Fangrechte als handelbare Quoten (ITQs) etabliert. Die Branche + hat sich konsolidiert; nur große, kapitalstarke Unternehmen sind übrig. + description_en: | + By 2040, fishing rights are tradable quotas (ITQs). The industry has + consolidated; only large, well-capitalised firms remain. +dimensions: + - {dimension_id: desirability, scale: 7, + de: "Wie wünschenswert ist dieses Szenario?", en: "How desirable is this scenario?"} + - {dimension_id: plausibility, scale: 7, + de: "Wie plausibel ist dieses Szenario?", en: "How plausible is this scenario?"} + - {dimension_id: impact_on_my_group, scale: 7, + de: "Wie stark trifft es Ihre Gruppe?", en: "How strongly does it affect your group?"} + - {dimension_id: fairness, scale: 7, + de: "Wie fair ist dieses Szenario?", en: "How fair is this scenario?"} +``` + +- [ ] **Step 4: Implement subagent** + +`backend/app/services/interviews/scenario.py`: +```python +from __future__ import annotations +import json +import statistics +from pathlib import Path +from typing import Optional +import yaml +from app.models.interview import ScenarioRating, ScenarioResponse +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord + +class ScenarioSubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + with Path(instrument_path).open("r", encoding="utf-8") as f: + self.instrument = yaml.safe_load(f) + self.interviewer = 
StakeholderInterviewer(llm=llm, memory=memory, language=language) + self.language = language + + def _schema_hint(self) -> str: + sids = [s["scenario_id"] for s in self.instrument["scenarios"]] + return json.dumps({ + "ratings": {sid: { + "desirability": "", + "plausibility": "", + "impact_on_my_group": "", + "fairness": "", + "if_woke_up_response": "", + } for sid in sids} + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + head = ("Bewerten Sie jedes der folgenden Szenarien auf vier Dimensionen (1-7) " + "und beantworten Sie kurz, was Sie tun würden, wenn Sie in dieser Welt aufwachten.") \ + if self.language == "de" else \ + ("Rate each of the following scenarios on four dimensions (1-7) " + "and briefly answer what you would do if you woke up in this world.") + blocks = [] + for s in self.instrument["scenarios"]: + label = s["label_de"] if self.language == "de" else s["label_en"] + desc = s["description_de"] if self.language == "de" else s["description_en"] + blocks.append(f"--- {s['scenario_id']}: {label} ---\n{desc}") + return head + "\n\n" + "\n\n".join(blocks) + + def _validate(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + sids = {s["scenario_id"] for s in self.instrument["scenarios"]} + ratings = raw.get("ratings", {}) + if set(ratings.keys()) != sids: return None + for v in ratings.values(): + if not isinstance(v, dict): return None + for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"): + if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None + if not isinstance(v.get("if_woke_up_response", ""), str): return None + return raw + + def administer(self, persona: PersonaRecord) -> ScenarioResponse: + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), validate=self._validate, + ) + ratings = {sid: ScenarioRating(**v) for sid, v in raw["ratings"].items()} + return ScenarioResponse(agent_id=persona.agent_id, 
ratings=ratings) + +def polarity_matrix(responses: list[ScenarioResponse]) -> dict: + matrix: dict[str, dict] = {} + sids: set[str] = set() + for r in responses: sids.update(r.ratings.keys()) + for sid in sorted(sids): + vals = [r.ratings[sid] for r in responses if sid in r.ratings] + if not vals: + matrix[sid] = {"n": 0} + continue + matrix[sid] = { + "n": len(vals), + "mean_desirability": statistics.mean(v.desirability for v in vals), + "mean_plausibility": statistics.mean(v.plausibility for v in vals), + "mean_impact": statistics.mean(v.impact_on_my_group for v in vals), + "mean_fairness": statistics.mean(v.fairness for v in vals), + "sd_desirability": statistics.pstdev([v.desirability for v in vals]) if len(vals) > 1 else 0.0, + "sd_plausibility": statistics.pstdev([v.plausibility for v in vals]) if len(vals) > 1 else 0.0, + } + return matrix +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_scenario.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/scripts/instruments/scenario_v1.yaml backend/app/services/interviews/scenario.py backend/tests/interviews/test_scenario.py +git commit -m "feat(interviews): scenario subagent with 4 futures × 4 dimensions + polarity matrix" +``` + +--- + +## Phase 3 — Storage and Zep + +### Task 10: Interview storage layout writer + +**Files:** +- Create: `backend/app/services/interviews/storage.py` +- Test: `backend/tests/interviews/test_storage.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_storage.py +import json +from pathlib import Path +from app.models.interview import ( + LikertResponse, InterviewPhase, SubagentKind, +) +from app.services.interviews.storage import InterviewStore + +def test_run_directory_layout(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL) + assert run_dir.exists() + assert run_dir.parent.name == "longitudinal" + assert run_dir.parent.parent.name == "T0" + +def test_append_response(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL) + r = LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 3}, confidence={"a": 0.5}) + store.append_response(run_dir, r) + contents = (run_dir / "responses.jsonl").read_text() + assert json.loads(contents.splitlines()[0])["agent_id"] == 1 + +def test_write_aggregate_and_latest_pointer(tmp_path): + store = InterviewStore(root=tmp_path, sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T1, subagent=SubagentKind.SCENARIO) + store.write_aggregate(run_dir, {"k": 1}) + store.mark_latest(run_dir) + latest = (run_dir.parent / "latest.json").read_text() + assert json.loads(latest)["run_dir"].endswith(run_dir.name) + +def test_audit_log_append(tmp_path): + store = InterviewStore(root=tmp_path, 
sim_id="sim42") + run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.DELPHI) + store.audit(run_dir, agent_id=7, event="schema_violation", detail="missing key x") + audit = (run_dir / "audit.jsonl").read_text() + assert "schema_violation" in audit +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_storage.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement storage** + +`backend/app/services/interviews/storage.py`: +```python +from __future__ import annotations +import json +import time +import uuid +from pathlib import Path +from typing import Any +from pydantic import BaseModel +from app.models.interview import InterviewPhase, SubagentKind + +class InterviewStore: + def __init__(self, root: Path, sim_id: str): + self.base = Path(root) / "simulations" / sim_id / "interviews" + self.base.mkdir(parents=True, exist_ok=True) + + def start_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path: + run_id = time.strftime("%Y%m%dT%H%M%S") + "-" + uuid.uuid4().hex[:6] + run_dir = self.base / phase.value / subagent.value / run_id + run_dir.mkdir(parents=True, exist_ok=True) + meta = {"run_id": run_id, "phase": phase.value, "subagent": subagent.value, + "created_at": time.time()} + (run_dir / "run.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") + return run_dir + + def append_response(self, run_dir: Path, model: BaseModel) -> None: + path = run_dir / "responses.jsonl" + with path.open("a", encoding="utf-8") as f: + f.write(model.model_dump_json() + "\n") + + def append_jsonl(self, run_dir: Path, filename: str, payload: dict | BaseModel) -> None: + path = run_dir / filename + with path.open("a", encoding="utf-8") as f: + if isinstance(payload, BaseModel): + f.write(payload.model_dump_json() + "\n") + else: + f.write(json.dumps(payload, ensure_ascii=False) + "\n") + + def read_responses(self, run_dir: Path, filename: str = "responses.jsonl") -> list[dict]: + path 
= run_dir / filename + if not path.exists(): return [] + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + + def write_aggregate(self, run_dir: Path, payload: dict) -> None: + (run_dir / "aggregate.json").write_text( + json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + def write_named(self, run_dir: Path, name: str, payload: Any) -> None: + (run_dir / name).write_text( + json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + def audit(self, run_dir: Path, agent_id: int | None, event: str, detail: str = "") -> None: + entry = {"ts": time.time(), "agent_id": agent_id, "event": event, "detail": detail} + with (run_dir / "audit.jsonl").open("a", encoding="utf-8") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + def mark_latest(self, run_dir: Path) -> None: + pointer = run_dir.parent / "latest.json" + pointer.write_text(json.dumps({ + "run_dir": str(run_dir.relative_to(self.base)), + }), encoding="utf-8") + + def latest_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path | None: + pointer = self.base / phase.value / subagent.value / "latest.json" + if not pointer.exists(): return None + rel = json.loads(pointer.read_text())["run_dir"] + path = self.base / rel + return path if path.exists() else None +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_storage.py -v` +Expected: 4 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/storage.py backend/tests/interviews/test_storage.py +git commit -m "feat(interviews): JSONL/JSON storage layout with run_id directories and latest pointer" +``` + +--- + +### Task 11: Zep episode writer for interviews + +**Files:** +- Create: `backend/app/services/interviews/zep_writer.py` +- Test: `backend/tests/interviews/test_zep_writer.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_zep_writer.py +from app.models.interview import ( + LikertResponse, InterviewPhase, SubagentKind, +) +from app.services.interviews.zep_writer import InterviewZepWriter + +class _FakeMemoryUpdater: + def __init__(self): + self.events = [] + def add_activity(self, activity): + self.events.append(activity) + def add_text_episode(self, graph_id, text): + self.events.append({"graph_id": graph_id, "text": text}) + +def test_per_agent_episode_text(): + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g1") + r = LikertResponse(agent_id=42, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 3}, + confidence={"stk_1": 0.8, "gov_1": 0.7}) + w.write_per_agent(SubagentKind.LONGITUDINAL, r, agent_name="Fischer Müller") + assert any("Fischer Müller" in str(e) for e in upd.events) + assert any("longitudinal/T1" in str(e) for e in upd.events) + +def test_aggregate_episode(): + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g1") + w.write_aggregate(SubagentKind.SCENARIO, summary="S1 mean desirability 5.2; S2 mean 2.1") + assert any("S1 mean" in str(e) for e in upd.events) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_zep_writer.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Implement writer** + +`backend/app/services/interviews/zep_writer.py`: +```python +from __future__ import annotations +from typing import Any, Optional +from app.models.interview import ( + LikertResponse, QSortResponse, DelphiRatingResponse, ScenarioResponse, SubagentKind, +) + +class InterviewZepWriter: + """Mirrors `ZepGraphMemoryUpdater.add_activity` usage but for interview episodes. + + The real `ZepGraphMemoryUpdater` may expose `add_text_episode` (tried first) or + `add_activity` (fallback); this writer adapts to either via duck typing and raises + `RuntimeError` when the updater has neither. + """ + def __init__(self, memory_updater, graph_id: str): + self.updater = memory_updater + self.graph_id = graph_id + + def _emit(self, text: str) -> None: + if hasattr(self.updater, "add_text_episode"): + self.updater.add_text_episode(self.graph_id, text) + elif hasattr(self.updater, "add_activity"): + self.updater.add_activity({"graph_id": self.graph_id, "text": text}) + else: + raise RuntimeError("memory_updater has neither add_text_episode nor add_activity") + + def _summarize_likert(self, r: LikertResponse) -> str: + mean_v = sum(r.responses.values()) / max(len(r.responses), 1) + top = sorted(r.responses.items(), key=lambda kv: -kv[1])[:3] + bot = sorted(r.responses.items(), key=lambda kv: kv[1])[:3] + return (f"mean={mean_v:.2f}; agrees with {[k for k,_ in top]}; " + f"disagrees with {[k for k,_ in bot]}") + + def _summarize_qsort(self, r: QSortResponse) -> str: + plus = [k for k, v in r.placements.items() if v >= 2] + minus = [k for k, v in r.placements.items() if v <= -2] + return f"+strongly:{plus}; -strongly:{minus}" + + def _summarize_scenario(self, r: ScenarioResponse) -> str: + parts = [f"{sid}: des={rt.desirability} plaus={rt.plausibility}" + for sid, rt in r.ratings.items()] + return "; ".join(parts) + + def write_per_agent( + self, subagent: SubagentKind, response: Any, agent_name: str, + phase: Optional[str] = None, + ) -> None: + if isinstance(response, LikertResponse): + phase = phase
or response.phase.value + summary = self._summarize_likert(response) + elif isinstance(response, QSortResponse): + phase = phase or "T1" + summary = self._summarize_qsort(response) + elif isinstance(response, ScenarioResponse): + phase = phase or "T1" + summary = self._summarize_scenario(response) + elif isinstance(response, DelphiRatingResponse): + phase = phase or f"T1/R{response.round}" + summary = f"round={response.round}; {len(response.ratings)} themes rated" + else: + phase = phase or "T1" + summary = str(response)[:200] + text = f"Agent {agent_name} (interview/{subagent.value}/{phase}): {summary}" + self._emit(text) + + def write_aggregate(self, subagent: SubagentKind, summary: str) -> None: + self._emit(f"Interview aggregate ({subagent.value}): {summary}") +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_zep_writer.py -v` +Expected: 2 passed. + +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/zep_writer.py backend/tests/interviews/test_zep_writer.py +git commit -m "feat(interviews): Zep writer adapts add_activity/add_text_episode for per-agent + aggregate episodes" +``` + +--- + +## Phase 4 — Orchestrator, lifecycle, synthesiser + +### Task 12: InterviewOrchestrator (parallel fan-out) + +**Files:** +- Create: `backend/app/services/interview_orchestrator.py` +- Test: `backend/tests/interviews/test_orchestrator.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_orchestrator.py +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interview_orchestrator import ( + InterviewOrchestrator, PersonaProvider, +) + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) 
+ +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + sys_text = next((m["content"] for m in messages if m["role"] == "system"), "") + if "longitudinal" in sys_text or "stk_" in (messages[-1].get("content") or ""): + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + return {} + +class _Personas(PersonaProvider): + def __init__(self, n=3): + self._items = [PersonaRecord(agent_id=i, name=f"A{i}", persona="p") for i in range(n)] + def all(self): return list(self._items) + +class _NoopZep: + def write_per_agent(self, *a, **kw): pass + def write_aggregate(self, *a, **kw): pass + +def test_pre_phase_runs_longitudinal_only(tmp_path): + orch = InterviewOrchestrator( + llm=_LLM(), memory=_Mem(), personas=_Personas(3), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim1", + zep_writer=_NoopZep(), max_workers=2, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] == 3 + assert "diversity" not in result # only longitudinal in pre-phase + +def test_partial_failure_does_not_kill_run(tmp_path): + class _FlakyLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + if self.n % 2 == 0: + raise RuntimeError("simulated LLM 5xx") + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + orch = InterviewOrchestrator( + llm=_FlakyLLM(), memory=_Mem(), personas=_Personas(4), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim2", + zep_writer=_NoopZep(), max_workers=1, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] < 4 + assert result["longitudinal"]["n_failed"] > 0 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv 
run pytest tests/interviews/test_orchestrator.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement orchestrator** + +`backend/app/services/interview_orchestrator.py`: +```python +from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Protocol +from app.models.interview import ( + InterviewPhase, SubagentKind, LikertResponse, QSortResponse, + DelphiOpenResponse, DelphiRatingResponse, ScenarioResponse, +) +from app.services.interviews.base import PersonaRecord +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate as longitudinal_aggregate +from app.services.interviews.diversity import DiversitySubagent, run_typology +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, group_stats_from_r2, +) +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix +from app.services.interviews.storage import InterviewStore +from app.services.interviews.instrument_loader import freeze_snapshot + +class PersonaProvider(Protocol): + def all(self) -> list[PersonaRecord]: ... 
+ +class InterviewOrchestrator: + def __init__( + self, llm, memory, personas: PersonaProvider, + instrument_dir: Path, store_root: Path, sim_id: str, + zep_writer, max_workers: int = 8, language: str = "de", + ): + self.llm = llm + self.memory = memory + self.personas = personas + self.instrument_dir = Path(instrument_dir) + self.store = InterviewStore(root=store_root, sim_id=sim_id) + self.zep_writer = zep_writer + self.max_workers = max_workers + self.language = language + # Freeze snapshot once per orchestrator lifetime + freeze_snapshot( + instruments={ + "longitudinal": self.instrument_dir / "longitudinal_v1.yaml", + "diversity": self.instrument_dir / "diversity_v1.yaml", + "delphi": self.instrument_dir / "delphi_v1.yaml", + "scenario": self.instrument_dir / "scenario_v1.yaml", + }, + out_path=self.store.base / "instruments_used.json", + ) + + # --- Generic per-agent runner --- + def _fan_out(self, run_dir, agent_fn, personas, audit_label): + ok: list = [] + failed: list[int] = [] + with ThreadPoolExecutor(max_workers=self.max_workers) as pool: + futures = {pool.submit(agent_fn, p): p for p in personas} + for fut in as_completed(futures): + p = futures[fut] + try: + out = fut.result() + ok.append(out) + self.store.append_response(run_dir, out) + except Exception as e: + failed.append(p.agent_id) + self.store.audit(run_dir, agent_id=p.agent_id, + event="agent_failed", detail=f"{audit_label}: {e!r}") + return ok, failed + + # --- Pre-phase (T0) --- + def run_pre(self) -> dict: + sub = LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T0), + self.personas.all(), audit_label="longitudinal_T0", + ) + for r in ok: + persona = next(p for p in self.personas.all() if p.agent_id == r.agent_id) + try: 
self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"longitudinal": {"n_responded": len(ok), "n_failed": len(failed), + "run_dir": str(run_dir)}} + + # --- Post-phase (T1) --- + def run_post(self) -> dict: + personas = self.personas.all() + out: dict = {} + with ThreadPoolExecutor(max_workers=4) as pool: + futures = { + "longitudinal": pool.submit(self._post_longitudinal, personas), + "diversity": pool.submit(self._post_diversity, personas), + "scenario": pool.submit(self._post_scenario, personas), + } + for name, fut in futures.items(): + try: out[name] = fut.result() + except Exception as e: out[name] = {"error": repr(e)} + # Delphi runs sequentially (R1 → R2 → R3) and uses the LLM for theme extraction + try: out["delphi"] = self._post_delphi(personas) + except Exception as e: out["delphi"] = {"error": repr(e)} + return out + + def _post_longitudinal(self, personas) -> dict: + sub = LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T1), + personas, audit_label="longitudinal_T1", + ) + # Aggregate using T0 + T1 + t0_path = self.store.latest_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + t0_raw = self.store.read_responses(t0_path) if t0_path else [] + t0 = [LikertResponse(**d) for d in t0_raw] + agg = longitudinal_aggregate(t0, ok) + self.store.write_aggregate(run_dir, agg) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + try: self.zep_writer.write_aggregate(SubagentKind.LONGITUDINAL, + f"n_paired={agg['n_paired']}") + except Exception: pass + self.store.mark_latest(run_dir) + return 
{"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_diversity(self, personas) -> dict: + sub = DiversitySubagent(self.llm, self.memory, + self.instrument_dir / "diversity_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.DIVERSITY) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="diversity", + ) + typology = run_typology(ok) + self.store.write_named(run_dir, "typology.json", typology) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "clusters": typology["clusters"]}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DIVERSITY, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_scenario(self, personas) -> dict: + sub = ScenarioSubagent(self.llm, self.memory, + self.instrument_dir / "scenario_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.SCENARIO) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="scenario", + ) + matrix = polarity_matrix(ok) + self.store.write_named(run_dir, "polarity_matrix.json", matrix) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "polarity": matrix}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.SCENARIO, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_delphi(self, personas) -> dict: + sub = DelphiSubagent(self.llm, self.memory, + self.instrument_dir / "delphi_v1.yaml", + language=self.language) + run_dir = 
self.store.start_run(InterviewPhase.T1, SubagentKind.DELPHI) + # Round 1 + r1_ok, r1_failed = self._fan_out( + run_dir, lambda p: sub.administer_round1(p), personas, audit_label="delphi_r1", + ) + # Move all R1 responses into a dedicated file + for r in r1_ok: self.store.append_jsonl(run_dir, "round1_themes.jsonl", r) + # Extract themes from R1 + themes = extract_themes(r1_ok, llm=self.llm) + self.store.write_named(run_dir, "themes.json", {"themes": themes}) + # Round 2 + r2_ok, r2_failed = self._fan_out( + run_dir, lambda p: sub.administer_round2(p, themes), + [p for p in personas if p.agent_id in {r.agent_id for r in r1_ok}], + audit_label="delphi_r2", + ) + for r in r2_ok: self.store.append_jsonl(run_dir, "round2_ratings.jsonl", r) + gstats = group_stats_from_r2(r2_ok) + # Round 3 + r2_by = {r.agent_id: r for r in r2_ok} + r3_personas = [p for p in personas if p.agent_id in r2_by] + def r3_call(p): return sub.administer_round3(p, themes, gstats, r2_by[p.agent_id]) + r3_ok, r3_failed = self._fan_out(run_dir, r3_call, r3_personas, audit_label="delphi_r3") + for r in r3_ok: self.store.append_jsonl(run_dir, "round3_revisions.jsonl", r) + # Convergence + conv = convergence_metrics(r2_ok, r3_ok) + self.store.write_named(run_dir, "convergence.json", conv) + self.store.write_aggregate(run_dir, { + "n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "n_failed_r1": len(r1_failed), "n_failed_r2": len(r2_failed), "n_failed_r3": len(r3_failed), + "themes": themes, + }) + for r in r3_ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DELPHI, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "run_dir": str(run_dir)} + + # --- Re-run a single subagent --- + def rerun(self, subagent: SubagentKind) -> dict: + personas = self.personas.all() + if subagent == SubagentKind.LONGITUDINAL: return 
{"longitudinal": self._post_longitudinal(personas)} + if subagent == SubagentKind.DIVERSITY: return {"diversity": self._post_diversity(personas)} + if subagent == SubagentKind.SCENARIO: return {"scenario": self._post_scenario(personas)} + if subagent == SubagentKind.DELPHI: return {"delphi": self._post_delphi(personas)} + raise ValueError(f"unknown subagent {subagent}") +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_orchestrator.py -v` +Expected: 2 passed. + +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interview_orchestrator.py backend/tests/interviews/test_orchestrator.py +git commit -m "feat(interviews): orchestrator with two-phase lifecycle, parallel fan-out, isolated failures" +``` + +--- + +### Task 13: Simulation manager lifecycle hooks + +**Files:** +- Modify: `backend/app/services/simulation_manager.py` +- Test: `backend/tests/interviews/test_simulation_hooks.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_simulation_hooks.py +from app.services.simulation_manager import SimulationManager, SimulationState + +def test_register_post_ready_hook_invoked(monkeypatch): + called = [] + mgr = SimulationManager() + mgr.register_on_ready(lambda state: called.append(("ready", state.sim_id))) + state = SimulationState(sim_id="abc", status="ready") + mgr._notify_on_ready(state) + assert called == [("ready", "abc")] + +def test_register_post_completed_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_completed(lambda state: called.append(("done", state.sim_id))) + state = SimulationState(sim_id="abc", status="completed") + mgr._notify_on_completed(state) + assert called == [("done", "abc")] +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_simulation_hooks.py -v` +Expected: AttributeError on `register_on_ready` / `register_on_completed`. 
+ +- [ ] **Step 3: Add hook registry to SimulationManager** + +In `backend/app/services/simulation_manager.py`, find the `SimulationManager` class. Add to `__init__` (preserving existing init): +```python + self._on_ready_hooks: list = [] + self._on_completed_hooks: list = [] +``` + +Add methods to the class: +```python + def register_on_ready(self, fn) -> None: + self._on_ready_hooks.append(fn) + + def register_on_completed(self, fn) -> None: + self._on_completed_hooks.append(fn) + + def _notify_on_ready(self, state) -> None: + for fn in list(self._on_ready_hooks): + try: fn(state) + except Exception as e: + from app.utils.logger import get_logger + get_logger(__name__).warning(f"on_ready hook failed: {e!r}") + + def _notify_on_completed(self, state) -> None: + for fn in list(self._on_completed_hooks): + try: fn(state) + except Exception as e: + from app.utils.logger import get_logger + get_logger(__name__).warning(f"on_completed hook failed: {e!r}") +``` + +Locate the existing code that transitions state to `ready` (after `prepare_simulation` completes) and to `completed` (after simulation finishes). Insert calls to `self._notify_on_ready(state)` and `self._notify_on_completed(state)` immediately after each transition. If `SimulationState` is not a simple dataclass with `sim_id` and `status` attributes, adjust the test fixture to match the actual class shape (read the file first). + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_simulation_hooks.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/simulation_manager.py backend/tests/interviews/test_simulation_hooks.py +git commit -m "feat(interviews): on_ready / on_completed hook registry on SimulationManager" +``` + +--- + +### Task 14: InterviewSynthesizer + +**Files:** +- Create: `backend/app/services/interview_synthesizer.py` +- Test: `backend/tests/interviews/test_synthesizer.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_synthesizer.py +import json +from pathlib import Path +from app.services.interviews.storage import InterviewStore +from app.models.interview import InterviewPhase, SubagentKind, LikertResponse +from app.services.interview_synthesizer import InterviewSynthesizer + +def _seed_minimal(tmp_path: Path) -> InterviewStore: + store = InterviewStore(root=tmp_path, sim_id="s1") + rd = store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + for i in range(3): + store.append_response(rd, LikertResponse( + agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 3}, confidence={"stk_1": 0.5, "gov_1": 0.5}, + )) + store.write_aggregate(rd, {"per_item": {}, "n_paired": 0}) + store.mark_latest(rd) + return store + +def test_synthesizer_runs_with_partial_data(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + report = synth.run() + assert "limitations" in report.lower() + assert "stub mode" in report.lower() or "n_responded" in report.lower() + +def test_synthesizer_writes_files(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + synth.run() + files = list((store.base / "synthesis").iterdir()) + names = {f.name for f in files} + assert "report.md" in names +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_synthesizer.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Implement synthesiser** + +`backend/app/services/interview_synthesizer.py`: +```python +from __future__ import annotations +import csv +import json +from pathlib import Path +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.storage import InterviewStore + +class InterviewSynthesizer: + def __init__(self, store: InterviewStore): + self.store = store + + def _maybe(self, phase: InterviewPhase, sub: SubagentKind) -> dict | None: + run = self.store.latest_run(phase, sub) + if run is None: return None + agg = run / "aggregate.json" + if not agg.exists(): return None + return {"run_dir": str(run), "aggregate": json.loads(agg.read_text(encoding="utf-8"))} + + def _instrument_hashes(self) -> dict: + snap = self.store.base / "instruments_used.json" + if not snap.exists(): return {} + try: data = json.loads(snap.read_text(encoding="utf-8")) + except Exception: return {} + return {k: v.get("hash") for k, v in data.items()} + + def _limitations_text(self, present: dict[str, bool]) -> str: + lines = [ + "## Limitations", + "- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse " + "and the LLM jointly encode each stakeholder type, not what an actual fisher or NGO " + "staffer would say. 
The instrument measures the *model of the stakeholder*, not the stakeholder.", + "- **Memory digest is lossy.** Each agent's experience of OASIS is summarised to bounded length; " + "agents do not have full episodic recall.", + "- **LLM acquiescence and centrality bias.** Likert scales with LLM respondents skew toward 3–4 " + "of 5; check per-item distribution shape before drawing conclusions.", + "- **N is what it is.** `n_responded` and `n_failed` are printed verbatim per subagent; no smoothing.", + "- **Instrument provenance.** Hashes of frozen instruments are listed below; an identical run " + "is reproducible from these snapshots.", + ] + for k, ok in present.items(): + if not ok: + lines.append(f"- *{k}* subagent results are missing for this run.") + return "\n".join(lines) + + def run(self) -> str: + sections: list[str] = [] + sections.append("# Stakeholder Interview Synthesis\n") + + long_t0 = self._maybe(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + long_t1 = self._maybe(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + if long_t1: + agg = long_t1["aggregate"] + sections.append("## Longitudinal opinion drift (T0 → T1)") + sections.append(f"- N paired: {agg.get('n_paired', 'NA')}") + per_item = agg.get("per_item", {}) + top = sorted(per_item.items(), + key=lambda kv: abs(kv[1].get("mean_delta") or 0), reverse=True)[:5] + sections.append("- Largest mean shifts:") + for k, v in top: + sections.append(f" - `{k}`: Δ̄ = {v.get('mean_delta'):+0.2f} (n={v.get('n')})") + + diversity = self._maybe(InterviewPhase.T1, SubagentKind.DIVERSITY) + if diversity: + clusters = diversity["aggregate"].get("clusters", []) + sections.append("## Stakeholder typology") + sections.append(f"- N agents: {diversity['aggregate'].get('n', 'NA')}") + sections.append(f"- Clusters: {len(clusters)}") + for c in clusters: + sections.append(f" - cluster {c['cluster_id']}: n={c['n']}, " + f"top loadings = {list(c['top_loadings'].keys())[:5]}") + + delphi = self._maybe(InterviewPhase.T1, 
SubagentKind.DELPHI) + if delphi: + agg = delphi["aggregate"] + sections.append("## Delphi consensus") + sections.append(f"- Rounds completed: R1={agg.get('n_r1')}, R2={agg.get('n_r2')}, R3={agg.get('n_r3')}") + themes = agg.get("themes", []) + sections.append(f"- Themes: {[t.get('label') for t in themes]}") + + scenario = self._maybe(InterviewPhase.T1, SubagentKind.SCENARIO) + if scenario: + pol = scenario["aggregate"].get("polarity", {}) + sections.append("## Scenario evaluation") + for sid in sorted(pol): + v = pol[sid] + if v.get("n", 0) == 0: continue + sections.append( + f"- **{sid}**: n={v['n']}, desirability {v['mean_desirability']:.2f}, " + f"plausibility {v['mean_plausibility']:.2f}, impact {v['mean_impact']:.2f}, " + f"fairness {v['mean_fairness']:.2f}") + + sections.append("") + sections.append(self._limitations_text({ + "longitudinal": bool(long_t1), + "diversity": bool(diversity), + "delphi": bool(delphi), + "scenario": bool(scenario), + })) + sections.append("") + sections.append("### Instrument provenance") + for name, h in self._instrument_hashes().items(): + sections.append(f"- `{name}`: hash `{h}`") + + report = "\n\n".join(sections) + out_dir = self.store.base / "synthesis" + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "report.md").write_text(report, encoding="utf-8") + self._write_tidy_csv(out_dir / "exports" / "all_responses.csv") + return report + + def _write_tidy_csv(self, csv_path: Path) -> None: + csv_path.parent.mkdir(parents=True, exist_ok=True) + rows: list[dict] = [] + for phase in (InterviewPhase.T0, InterviewPhase.T1): + for sub in SubagentKind: + run = self.store.latest_run(phase, sub) + if run is None: continue + files = ["responses.jsonl", "round1_themes.jsonl", + "round2_ratings.jsonl", "round3_revisions.jsonl"] + for fname in files: + for rec in self.store.read_responses(run, fname): + flat = self._flatten(rec, phase=phase.value, subagent=sub.value) + rows.extend(flat) + if not rows: + 
csv_path.write_text("phase,subagent,agent_id,key,value\n", encoding="utf-8") + return + fieldnames = sorted({k for r in rows for k in r.keys()}) + with csv_path.open("w", encoding="utf-8", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: w.writerow(r) + + def _flatten(self, rec: dict, *, phase: str, subagent: str) -> list[dict]: + out: list[dict] = [] + aid = rec.get("agent_id") + for key, val in rec.items(): + if key == "agent_id": continue + if isinstance(val, dict): + for k2, v2 in val.items(): + if isinstance(v2, dict): + for k3, v3 in v2.items(): + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}.{k3}", "value": v3}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}", "value": v2}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": key, "value": val}) + return out +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_synthesizer.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interview_synthesizer.py backend/tests/interviews/test_synthesizer.py +git commit -m "feat(interviews): synthesiser emits cross-method report + tidy CSV + limitations section" +``` + +--- + +## Phase 5 — Adapters and API + +### Task 15: Persona + memory adapters + +**Files:** +- Create: `backend/app/services/interviews/adapters.py` +- Test: `backend/tests/interviews/test_adapters.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_adapters.py +import csv +import json +from pathlib import Path +from app.services.interviews.adapters import ( + FileSystemPersonaProvider, ZepMemoryProvider, +) + +def _write_reddit_profiles(tmp_path: Path): + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "I am a small-scale Baltic fisher.", "profession": "fisher", "bio": ""}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "I work for an environmental NGO.", "profession": "ngo_staff", "bio": ""}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + return p + +def test_file_system_persona_provider_reads_reddit_json(tmp_path): + p = _write_reddit_profiles(tmp_path) + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + personas = provider.all() + assert len(personas) == 2 + assert personas[0].name == "Fischer Müller" + assert personas[0].agent_id == 0 + +def test_zep_memory_provider_returns_empty_when_unavailable(): + class _BrokenReader: + def get_entity_with_context(self, *a, **kw): + raise RuntimeError("offline") + prov = ZepMemoryProvider(entity_reader=_BrokenReader(), graph_id="g1", + agent_to_entity={0: "uuid-zero"}) + d = prov.get_digest(0) + assert d.available is False + assert d.text != "" + +def test_zep_memory_provider_truncates_to_max_chars(): + class _R: + def get_entity_with_context(self, *a, **kw): + class _Ctx: + name = "X"; summary = "Y" 
+ related_edges = [{"fact": "very long fact " * 200}] + return _Ctx() + prov = ZepMemoryProvider(entity_reader=_R(), graph_id="g1", + agent_to_entity={5: "uuid-five"}) + d = prov.get_digest(5, max_chars=300) + assert d.available is True + assert len(d.text) <= 300 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_adapters.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement adapters** + +`backend/app/services/interviews/adapters.py`: +```python +from __future__ import annotations +import csv +import json +from pathlib import Path +from typing import Optional +from app.services.interviews.base import PersonaRecord, MemoryDigest + +class FileSystemPersonaProvider: + """Reads OASIS profiles from the simulation's `reddit_profiles.json` and/or `twitter_profiles.csv`. + + If both are present, agents from `reddit_profiles.json` take precedence; twitter-only agents are appended. + """ + def __init__(self, reddit_path: Optional[Path], twitter_path: Optional[Path]): + self.reddit_path = Path(reddit_path) if reddit_path else None + self.twitter_path = Path(twitter_path) if twitter_path else None + + def _load_reddit(self) -> list[PersonaRecord]: + if not self.reddit_path or not self.reddit_path.exists(): return [] + data = json.loads(self.reddit_path.read_text(encoding="utf-8")) + out = [] + for row in data: + out.append(PersonaRecord( + agent_id=int(row.get("user_id")), + name=str(row.get("name") or row.get("user_name") or f"agent_{row.get('user_id')}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def _load_twitter(self) -> list[PersonaRecord]: + if not self.twitter_path or not self.twitter_path.exists(): return [] + out = [] + with self.twitter_path.open("r", encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + if not row.get("user_id"): continue + out.append(PersonaRecord( + 
agent_id=int(row["user_id"]), + name=str(row.get("name") or row.get("user_name") or f"agent_{row['user_id']}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def all(self) -> list[PersonaRecord]: + reddit = self._load_reddit() + seen = {p.agent_id for p in reddit} + twitter = [p for p in self._load_twitter() if p.agent_id not in seen] + return reddit + twitter + +class ZepMemoryProvider: + """Builds a bounded memory digest per agent from Zep entity context. + + Maps `agent_id` (OASIS user_id) to a Zep entity UUID; falls back to the agent_id as a string. + """ + def __init__(self, entity_reader, graph_id: str, agent_to_entity: dict[int, str] | None = None): + self.reader = entity_reader + self.graph_id = graph_id + self.map = dict(agent_to_entity or {}) + + def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: + entity_uuid = self.map.get(agent_id) or str(agent_id) + try: + ctx = self.reader.get_entity_with_context(self.graph_id, entity_uuid) + except Exception: + return MemoryDigest(text=f"[no memory for agent {agent_id}]", available=False) + parts: list[str] = [] + name = getattr(ctx, "name", None) + summary = getattr(ctx, "summary", None) + if name: parts.append(f"Name: {name}") + if summary: parts.append(f"Summary: {summary}") + edges = getattr(ctx, "related_edges", []) or [] + for e in edges[:20]: + fact = e.get("fact") if isinstance(e, dict) else getattr(e, "fact", None) + if fact: parts.append(f"- {fact}") + text = "\n".join(parts) + if len(text) > max_chars: text = text[: max_chars - 1] + "…" + return MemoryDigest(text=text or f"[empty memory for agent {agent_id}]", available=True) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_adapters.py -v` +Expected: 3 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/adapters.py backend/tests/interviews/test_adapters.py +git commit -m "feat(interviews): persona + Zep memory adapters bridging existing services to interview subsystem" +``` + +--- + +### Task 16: /api/interview Flask blueprint + +**Files:** +- Create: `backend/app/api/interview.py` +- Modify: `backend/app/api/__init__.py` +- Test: `backend/tests/interviews/test_api_interview.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_api_interview.py +import json +import os +from pathlib import Path +import pytest + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + # Seed a minimal reddit_profiles.json + sim_dir = tmp_path / "simulations" / "sim_test" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "p", "profession": "fisher"} for i in range(3)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + from flask import Flask + from app.api import register_blueprints + app = Flask(__name__) + register_blueprints(app) + return app.test_client() + +def test_post_pre_returns_task_id(client): + res = client.post("/api/interview/sim_test/pre") + assert res.status_code == 200 + body = res.get_json() + assert body["success"] is True + assert "task_id" in body["data"] + +def test_status_endpoint_returns_progress(client): + res = client.post("/api/interview/sim_test/pre") + task_id = res.get_json()["data"]["task_id"] + res2 = client.get(f"/api/interview/sim_test/status?task_id={task_id}") + assert res2.status_code == 200 + assert "status" in res2.get_json()["data"] + +def test_unknown_subagent_returns_400(client): + res = client.post("/api/interview/sim_test/rerun", + 
json={"subagent": "nonsense"}) + assert res.status_code == 400 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_api_interview.py -v` +Expected: ImportError / 404. + +- [ ] **Step 3: Check current `api/__init__.py`** + +Read `backend/app/api/__init__.py` and identify how `graph_bp`, `simulation_bp`, `report_bp` are registered. The test expects a `register_blueprints(app)` helper — if one doesn't exist, add it. + +- [ ] **Step 4: Modify `api/__init__.py`** + +Replace contents (preserving existing blueprint imports — adjust to match actual file): +```python +from flask import Flask +from .graph import graph_bp +from .simulation import simulation_bp +from .report import report_bp +from .interview import interview_bp + +def register_blueprints(app: Flask) -> None: + app.register_blueprint(graph_bp, url_prefix="/api/graph") + app.register_blueprint(simulation_bp, url_prefix="/api/simulation") + app.register_blueprint(report_bp, url_prefix="/api/report") + app.register_blueprint(interview_bp, url_prefix="/api/interview") +``` + +If the existing app factory in `app/__init__.py` already registers these blueprints by hand, update it to call `register_blueprints(app)` instead.
+ +- [ ] **Step 5: Implement blueprint** + +`backend/app/api/interview.py`: +```python +from __future__ import annotations +import threading +import traceback +import uuid +from pathlib import Path +from flask import Blueprint, jsonify, request, send_file +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider, ZepMemoryProvider +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.services.interviews.storage import InterviewStore +from app.utils.llm_client import LLMClient + +interview_bp = Blueprint("interview", __name__) +_TASKS: dict[str, dict] = {} +_LOCK = threading.Lock() + +INSTRUMENT_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +def _uploads_root() -> Path: + return Path(getattr(Config, "UPLOADS_DIR", "uploads")) + +def _build_orchestrator(sim_id: str) -> InterviewOrchestrator: + sim_dir = _uploads_root() / "simulations" / sim_id + reddit = sim_dir / "reddit_profiles.json" + twitter = sim_dir / "twitter_profiles.csv" + personas = FileSystemPersonaProvider(reddit_path=reddit if reddit.exists() else None, + twitter_path=twitter if twitter.exists() else None) + # Zep memory + writer: best-effort; in stub/test mode the writer no-ops on exceptions + class _NullUpdater: + def add_text_episode(self, *a, **kw): return None + try: + from app.services.zep_entity_reader import ZepEntityReader + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + graph_id = (sim_dir / "graph_id.txt").read_text().strip() if (sim_dir / "graph_id.txt").exists() else "" + reader = ZepEntityReader() + updater = ZepGraphMemoryUpdater() + memory = ZepMemoryProvider(reader, graph_id=graph_id) + zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) + except 
Exception: + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + from app.services.interviews.base import MemoryDigest + return MemoryDigest(text="[memory unavailable]", available=False) + memory = _Mem() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + llm = LLMClient(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL, + model=Config.LLM_MODEL_NAME) + return InterviewOrchestrator( + llm=llm, memory=memory, personas=personas, + instrument_dir=INSTRUMENT_DIR, store_root=_uploads_root(), sim_id=sim_id, + zep_writer=zep_writer, max_workers=Config.INTERVIEW_MAX_WORKERS, + language=Config.INTERVIEW_DEFAULT_LANGUAGE, + ) + +def _run_task(task_id: str, fn) -> None: + with _LOCK: + _TASKS[task_id] = {"status": "running", "progress": {}, "result": None, "error": None} + try: + result = fn(task_id) + with _LOCK: + _TASKS[task_id]["status"] = "completed"; _TASKS[task_id]["result"] = result + except Exception as e: + with _LOCK: + _TASKS[task_id]["status"] = "failed" + _TASKS[task_id]["error"] = repr(e) + _TASKS[task_id]["traceback"] = traceback.format_exc() + +def _start_task(fn) -> str: + task_id = uuid.uuid4().hex[:12] + with _LOCK: + _TASKS[task_id] = {"status": "queued", "progress": {}, "result": None, "error": None} + threading.Thread(target=_run_task, args=(task_id, fn), daemon=True).start() + return task_id + +def _envelope(data=None, error=None, status: int = 200): + body = {"success": error is None, "data": data or {}, "error": error} + return jsonify(body), status + +@interview_bp.route("/<sim_id>/pre", methods=["POST"]) +def post_pre(sim_id: str): + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.run_pre()) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/post", methods=["POST"]) +def post_post(sim_id: str): + orch = _build_orchestrator(sim_id) + def run(tid): + out = orch.run_post() + synth = InterviewSynthesizer(store=orch.store) + out["synthesis"] = synth.run()[:1000] # short
preview + return out + task_id = _start_task(run) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/rerun", methods=["POST"]) +def post_rerun(sim_id: str): + body = request.get_json(silent=True) or {} + sub = body.get("subagent") + try: subagent = SubagentKind(sub) + except ValueError: return _envelope(error=f"unknown subagent {sub!r}", status=400) + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.rerun(subagent)) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/status", methods=["GET"]) +def get_status(sim_id: str): + task_id = request.args.get("task_id") + with _LOCK: + task = _TASKS.get(task_id) + if task is None: return _envelope(error="unknown task_id", status=404) + return _envelope({"status": task["status"], "progress": task.get("progress", {}), + "result": task.get("result"), "error": task.get("error")}) + +@interview_bp.route("/<sim_id>/results/<subagent>", methods=["GET"]) +def get_results(sim_id: str, subagent: str): + try: sub = SubagentKind(subagent) + except ValueError: return _envelope(error=f"unknown subagent {subagent!r}", status=400) + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + phase = InterviewPhase.T1 if sub != SubagentKind.LONGITUDINAL else InterviewPhase.T1 + run = store.latest_run(phase, sub) + if run is None: return _envelope(error="no results yet", status=404) + agg = (run / "aggregate.json") + if not agg.exists(): return _envelope(error="aggregate missing", status=404) + import json as _j + return _envelope({"aggregate": _j.loads(agg.read_text(encoding="utf-8")), + "run_dir": str(run)}) + +@interview_bp.route("/<sim_id>/results/synthesis", methods=["GET"]) +def get_synthesis(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + report = store.base / "synthesis" / "report.md" + if not report.exists(): + synth = InterviewSynthesizer(store=store) + synth.run() + return _envelope({"report_markdown": report.read_text(encoding="utf-8")}) +
+@interview_bp.route("/<sim_id>/export.csv", methods=["GET"]) +def get_export_csv(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + csv_path = store.base / "synthesis" / "exports" / "all_responses.csv" + if not csv_path.exists(): + InterviewSynthesizer(store=store).run() + return send_file(csv_path, mimetype="text/csv", as_attachment=True, + download_name=f"{sim_id}_interviews.csv") +``` + +- [ ] **Step 6: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_api_interview.py -v` +Expected: 3 passed. + +- [ ] **Step 7: Commit** + +```bash +git add backend/app/api/__init__.py backend/app/api/interview.py backend/tests/interviews/test_api_interview.py +git commit -m "feat(interviews): Flask blueprint /api/interview with task-based async + CSV export" +``` + +--- + +## Phase 6 — Integration + +### Task 17: End-to-end pipeline test (stub LLM) + +**Files:** +- Create: `backend/tests/integration/__init__.py` +- Test: `backend/tests/integration/test_interview_pipeline.py` + +- [ ] **Step 1: Write failing test** + +Create `backend/tests/integration/__init__.py` (empty), then: + +```python +# backend/tests/integration/test_interview_pipeline.py +import json +import pytest +from pathlib import Path +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider +from app.services.interviews.base import MemoryDigest +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.utils.llm_client import LLMClient + +pytestmark = pytest.mark.integration + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _NullUpdater: + def __init__(self): self.events = [] + def add_text_episode(self, graph_id, text): self.events.append(text) + +class
_StaticMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"agent {agent_id} memory snippet", available=True) + +@pytest.fixture +def seeded_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + Config.LLM_STUB_MODE = True + sim_dir = tmp_path / "simulations" / "intg_sim" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "stakeholder p", "profession": "fisher"} for i in range(5)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + return tmp_path + +def _make_orch(tmp_path): + sim_dir = tmp_path / "simulations" / "intg_sim" + personas = FileSystemPersonaProvider( + reddit_path=sim_dir / "reddit_profiles.json", twitter_path=None, + ) + llm = LLMClient(api_key="x", base_url="x", model="x") + updater = _NullUpdater() + writer = InterviewZepWriter(memory_updater=updater, graph_id="g") + return InterviewOrchestrator( + llm=llm, memory=_StaticMem(), personas=personas, + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="intg_sim", + zep_writer=writer, max_workers=2, language="de", + ) + +def test_pipeline_runs_pre_then_post_then_synthesis(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + + pre = orch.run_pre() + assert pre["longitudinal"]["n_responded"] >= 1 + + post = orch.run_post() + assert "longitudinal" in post + assert "diversity" in post + assert "scenario" in post + assert "delphi" in post + + synth = InterviewSynthesizer(store=orch.store) + report = synth.run() + assert "Stakeholder Interview Synthesis" in report + assert "Limitations" in report + + csv_path = orch.store.base / "synthesis" / "exports" / "all_responses.csv" + assert csv_path.exists() + lines = csv_path.read_text().splitlines() + assert lines[0].startswith("agent_id,") or "agent_id" in lines[0] + +def test_idempotent_rerun_creates_new_run_id(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + orch.run_pre() + 
first = orch.run_post() + second = orch.rerun(SubagentKind.SCENARIO) + first_scn = first["scenario"]["run_dir"] + second_scn = second["scenario"]["run_dir"] + assert first_scn != second_scn +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/integration/test_interview_pipeline.py -v -m integration` +Expected: most likely ValidationError from the stub LLM's canned JSON not satisfying every subagent's strict validator (forced Q-sort distribution, scenarios, Delphi). This is the signal to enrich the stub. + +- [ ] **Step 3: Enrich `_stub_response_json` in `LLMClient` to satisfy each subagent** + +Read the current `_stub_response_json` (Task 4). Replace its body with content-aware stubs by inspecting the user message text. In `backend/app/utils/llm_client.py`, replace `_stub_response_json` with: + +```python + def _stub_response_json(self, messages: list[dict]) -> dict: + import hashlib, json as _json + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + usr_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + h = hashlib.sha256((sys_msg + "|" + usr_msg).encode("utf-8")).hexdigest() + seed = int(h[:8], 16) + rng = (seed % 5) + 1 + + # Longitudinal Likert (12 items) + if all(tok in usr_msg for tok in ("stk_1", "gov_1", "mkt_1", "clm_1")): + ids = ["stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3"] + return {"responses": {k: ((seed >> (i*3)) % 5) + 1 for i, k in enumerate(ids)}, + "confidence": {k: 0.6 for k in ids}, + "open_comment": f"stub:{h[:8]}"} + + # Diversity Q-sort: 24 statements + 6 axes, forced distribution 2,3,4,6,4,3,2 + if "st_01" in usr_msg and "ax_pres_extr" in usr_msg: + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + stmts = [f"st_{i+1:02d}" for i in range(24)] + # shuffle deterministically + order = sorted(range(24), key=lambda i: (h[i % len(h)], i)) + placements = 
{stmts[i]: buckets[order.index(i)] for i in range(24)} + return { + "placements": placements, + "likert_axes": {a: ((seed >> (j*3)) % 7) + 1 for j, a in enumerate( + ["ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg"])}, + } + + # Scenario: S1..S4 × 4 dims + if all(s in usr_msg for s in ("S1:", "S2:", "S3:", "S4:")): + return {"ratings": {sid: { + "desirability": ((seed >> (i*3)) % 7) + 1, + "plausibility": ((seed >> (i*3+1)) % 7) + 1, + "impact_on_my_group": ((seed >> (i*3+2)) % 7) + 1, + "fairness": ((seed >> (i*3+4)) % 7) + 1, + "if_woke_up_response": f"act-{sid}-{h[:4]}", + } for i, sid in enumerate(["S1","S2","S3","S4"])}} + + # Delphi R1: q1..q4 free text + if "q1" in usr_msg and "q2" in usr_msg and "Bewerten" not in usr_msg and "Sie sehen" not in usr_msg: + return {"answers": {qid: f"stub-themes-{qid}-{h[:4]}" for qid in ("q1","q2","q3","q4")}} + + # Delphi theme extraction (no in-character system prompt) + if "extract distinct thematic codes" in sys_msg: + return {"themes": [{"theme_id": f"theme_{i}", "label": f"Thema {i}"} for i in range(5)]} + + # Delphi R2 (rate) or R3 (revise) + if "Bewerten Sie jedes Thema" in usr_msg or "Sie sehen unten" in usr_msg \ + or "Rate each theme" in usr_msg or "Below are the anonymised" in usr_msg: + theme_ids = [f"theme_{i}" for i in range(5)] + out = {"ratings": {tid: {"importance": ((seed >> (i*2)) % 5) + 1, + "plausibility": ((seed >> (i*2+1)) % 5) + 1} + for i, tid in enumerate(theme_ids)}} + if "Sie sehen unten" in usr_msg or "Below are the anonymised" in usr_msg: + out["justification"] = "stub-revision" + return out + + # Fallback + return {"stub_key": h[:12], "value": rng} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/integration/test_interview_pipeline.py -v -m integration` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/utils/llm_client.py backend/tests/integration/__init__.py backend/tests/integration/test_interview_pipeline.py +git commit -m "test(interviews): end-to-end pipeline test + content-aware LLM stubs for all 4 subagents" +``` + +--- + +## Phase 7 — Frontend + +Note: this project has no frontend test framework. Tasks below use the build (`npm run build`) plus a manual smoke check via `npm run dev` as the verification gate. Commit after each task once the build is green. + +### Task 18: Step4bInterviews.vue scaffold + tab shell + +**Files:** +- Create: `frontend/src/components/Step4bInterviews.vue` +- Create: `frontend/src/api/interview.js` +- Modify: `frontend/src/App.vue` (or the parent that orchestrates Step1..Step5 — locate and adjust) + +- [ ] **Step 1: Add API client module** + +`frontend/src/api/interview.js`: +```javascript +import { api } from "./index" + +export async function startPre(simId) { + const r = await api.post(`/api/interview/${simId}/pre`) + return r.data +} +export async function startPost(simId) { + const r = await api.post(`/api/interview/${simId}/post`) + return r.data +} +export async function rerun(simId, subagent) { + const r = await api.post(`/api/interview/${simId}/rerun`, { subagent }) + return r.data +} +export async function getStatus(simId, taskId) { + const r = await api.get(`/api/interview/${simId}/status`, { params: { task_id: taskId } }) + return r.data +} +export async function getResults(simId, subagent) { + const r = await api.get(`/api/interview/${simId}/results/${subagent}`) + return r.data +} +export async function getSynthesis(simId) { + const r = await api.get(`/api/interview/${simId}/results/synthesis`) + return r.data +} +export function exportCsvUrl(simId) { + return `/api/interview/${simId}/export.csv` +} +``` + +- [ ] **Step 2: Implement Step4bInterviews.vue scaffold** + +`frontend/src/components/Step4bInterviews.vue`: +```vue + + + + + +``` + +- [ ] **Step 3: 
Create placeholder panel components (to be filled in Task 19)** + +Create five empty-but-renderable Vue components so the scaffold compiles: + +`frontend/src/components/interviews/LongitudinalPanel.vue`: +```vue + + +``` + +Repeat the same pattern (changing only the inner text) for `DiversityPanel.vue`, `DelphiPanel.vue`, `ScenarioPanel.vue`, `SynthesisPanel.vue` in `frontend/src/components/interviews/`. + +- [ ] **Step 4: Wire Step4b into parent navigation** + +Read `frontend/src/App.vue` (or wherever Step1..Step5 are rendered). Locate the routing/visibility logic. Add a Step4b state between Step4 and Step5, and import `Step4bInterviews` from `./components/Step4bInterviews.vue`. Pass `:sim-id="currentSimId"` where the others receive the sim id. Add i18n keys to `locales/en.json`, `locales/de.json`, `locales/zh.json`: +```json +"interview": { + "title": "Stakeholder interviews", + "subtitle": "Four independent surveys of the simulated stakeholder population.", + "runAll": "Run all post-simulation interviews", + "downloadCsv": "Download CSV", + "tab": { + "longitudinal": "Longitudinal (Δ)", + "diversity": "Diversity", + "delphi": "Delphi", + "scenario": "Scenarios", + "synthesis": "Synthesis" + } +} +``` + +- [ ] **Step 5: Build to verify it compiles** + +Run: `cd frontend && npm run build` +Expected: build succeeds with no errors. 
+ +- [ ] **Step 6: Commit** + +```bash +git add frontend/src/api/interview.js frontend/src/components/Step4bInterviews.vue \ + frontend/src/components/interviews/*.vue frontend/src/App.vue \ + locales/*.json +git commit -m "feat(interviews): Step4b Vue scaffold with five-tab navigation, API client, i18n keys" +``` + +--- + +### Task 19: Per-tab d3 visualisations + +**Files:** +- Modify: `frontend/src/components/interviews/LongitudinalPanel.vue` +- Modify: `frontend/src/components/interviews/DiversityPanel.vue` +- Modify: `frontend/src/components/interviews/DelphiPanel.vue` +- Modify: `frontend/src/components/interviews/ScenarioPanel.vue` +- Modify: `frontend/src/components/interviews/SynthesisPanel.vue` + +For each panel, fetch the relevant aggregate via the API on mount, then render with d3. The five implementations follow the same structure; each shows the full content below. + +- [ ] **Step 1: Longitudinal panel — heatmap of Δ̄ per item** + +`frontend/src/components/interviews/LongitudinalPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 2: Diversity panel — PCA scatter coloured by cluster** + +`frontend/src/components/interviews/DiversityPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 3: Delphi panel — convergence bar chart (R2 IQR vs R3 IQR per theme)** + +`frontend/src/components/interviews/DelphiPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 4: Scenario panel — polarity quadrant (desirability × plausibility)** + +`frontend/src/components/interviews/ScenarioPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 5: Synthesis panel — render markdown report** + +`frontend/src/components/interviews/SynthesisPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 6: Build + smoke test** + +Run: `cd frontend && npm run build` +Expected: build succeeds. Then `cd .. && npm run dev` and manually visit Step4b for a completed `sim_id` — verify all five tabs render without console errors. 
+ +- [ ] **Step 7: Commit** + +```bash +git add frontend/src/components/interviews/*.vue +git commit -m "feat(interviews): d3 visualisations for longitudinal Δ, diversity PCA, Delphi, scenario polarity, synthesis" +``` + +--- + +### Task 20: Auto-trigger pre-survey on simulation `ready` and post-survey batch on `completed` + +**Files:** +- Create: `backend/app/services/interviews/lifecycle.py` +- Modify: `backend/app/__init__.py` (app factory) to install the hooks + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_lifecycle.py +from app.services.interviews.lifecycle import install_hooks + +class _StubMgr: + def __init__(self): self.ready = []; self.completed = [] + def register_on_ready(self, fn): self.ready.append(fn) + def register_on_completed(self, fn): self.completed.append(fn) + +def test_install_hooks_registers_two_callables(): + mgr = _StubMgr() + install_hooks(mgr) + assert len(mgr.ready) == 1 + assert len(mgr.completed) == 1 + assert callable(mgr.ready[0]) + assert callable(mgr.completed[0]) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_lifecycle.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement lifecycle hook installer** + +`backend/app/services/interviews/lifecycle.py`: +```python +from __future__ import annotations +import threading +from app.utils.logger import get_logger + +logger = get_logger(__name__) + +def install_hooks(manager) -> None: + """Attach interview lifecycle callbacks to a SimulationManager. + + on_ready → spawn T0 longitudinal in a background thread + on_completed → spawn full post-sim batch in a background thread + Hooks are best-effort; failures only log.
+ """ + def _on_ready(state) -> None: + sim_id = getattr(state, "sim_id", None) or getattr(state, "id", None) + if not sim_id: return + threading.Thread(target=_run_pre, args=(sim_id,), daemon=True).start() + + def _on_completed(state) -> None: + sim_id = getattr(state, "sim_id", None) or getattr(state, "id", None) + if not sim_id: return + threading.Thread(target=_run_post, args=(sim_id,), daemon=True).start() + + manager.register_on_ready(_on_ready) + manager.register_on_completed(_on_completed) + +def _run_pre(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + orch = _build_orchestrator(sim_id) + orch.run_pre() + except Exception as e: + logger.warning(f"auto pre-survey failed for {sim_id}: {e!r}") + +def _run_post(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + from app.services.interview_synthesizer import InterviewSynthesizer + orch = _build_orchestrator(sim_id) + orch.run_post() + InterviewSynthesizer(store=orch.store).run() + except Exception as e: + logger.warning(f"auto post-survey failed for {sim_id}: {e!r}") +``` + +- [ ] **Step 4: Wire into app factory** + +Read `backend/app/__init__.py`. Locate where `SimulationManager` (or its singleton) is instantiated. Add: +```python + from app.services.interviews.lifecycle import install_hooks + install_hooks(simulation_manager) +``` +immediately after the manager is constructed. If `simulation_manager` is module-level in `simulation_manager.py`, attach the hooks at the bottom of that module instead — the goal is "install once on app startup". + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_lifecycle.py -v` +Expected: 1 passed. + +- [ ] **Step 6: Full backend test suite** + +Run: `cd backend && uv run pytest -m "not integration" -q` +Expected: all unit tests pass. + +Run: `cd backend && uv run pytest -m integration -q` +Expected: integration tests pass. 
+ +- [ ] **Step 7: Commit** + +```bash +git add backend/app/services/interviews/lifecycle.py backend/app/__init__.py backend/tests/interviews/test_lifecycle.py +git commit -m "feat(interviews): auto-trigger pre and post interviews via SimulationManager lifecycle hooks" +``` + +--- + +## Final verification + +- [ ] **Run full backend test suite** + +Run: `cd backend && uv run pytest -q` +Expected: every test passes. + +- [ ] **Run frontend build** + +Run: `cd frontend && npm run build` +Expected: build succeeds with no errors. + +- [ ] **Smoke test the running app** + +Run: `npm run dev` from project root. With an existing completed simulation: +1. Navigate to Step4b in the UI +2. Click "Run all post-simulation interviews" +3. Wait for status to reach `completed` +4. Verify each of the five tabs renders without console errors +5. Click "Download CSV" and confirm the file downloads + +- [ ] **Verify spec coverage** + +Re-open `docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md` and confirm every section in the spec has a corresponding task: + +- §3 architectural approach (deterministic runners) → Tasks 5–9 +- §4 file structure + lifecycle hooks → Tasks 2–14, 20 +- §5.1–5.4 four instruments → Tasks 6, 7, 8, 9 +- §5.5 in-character prompting + structured output + cost guardrails → Tasks 4, 5 +- §6.1 storage layout → Task 10 +- §6.2 Zep integration → Task 11 +- §6.3 API surface (all 7 endpoints) → Task 16 +- §6.4 parallelism + token guard → Task 12 (parallelism); token guard sits in `Config.INTERVIEW_MAX_TOKENS_PER_RUN` from Task 1 — *open: enforcement not implemented in v1; flag if you want it added* +- §6.5 frontend Step4b + per-tab viz → Tasks 18, 19 +- §7 error handling (per-agent isolation, schema retry, idempotency) → Tasks 5, 10, 12 +- §8 validation (schema, instrument, plausibility flags) → Tasks 2, 3 (schema + instrument); plausibility-flags currently sit inside synthesiser §10 — *check that flagged thresholds in §8 plausibility checks 
match what synthesiser currently emits* +- §9 testing (unit per subagent + integration + stub mode) → Tasks 4, 6–9, 12, 17 +- §10 methodological caveats in synthesis → Task 14 +- §11 defaults — already encoded in Task 1 config keys and instrument YAML + +If §6.4 token-guard enforcement is needed for v1, add a small follow-up task that computes a projected-token estimate before `run_post` and, when the estimate exceeds `Config.INTERVIEW_MAX_TOKENS_PER_RUN`, returns 400 with the estimate unless the request passes `confirm=true` — but the spec treats this as a guard, not a blocker, so enforcement can ship in v1.1. + +--- + +**Plan complete and saved to `docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md`. Two execution options:** + +**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration. + +**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints. + +**Which approach?** + diff --git a/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md b/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md new file mode 100644 index 00000000..f82a7ec7 --- /dev/null +++ b/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md @@ -0,0 +1,280 @@ +# Stakeholder Interview Subagents — Design Spec + +- **Date:** 2026-05-23 +- **Project:** MiroFish (multi-agent simulation engine for German fisheries discourse) +- **Author:** Christian Möllmann (with Claude Code) +- **Status:** Approved design — pending implementation plan + +## 1. Purpose + +After the OASIS Twitter + Reddit simulation produces a population of in-character stakeholder agents (fishers, NGOs, policy actors, scientists, consumers, etc.) grounded in a German fisheries discourse knowledge graph, we want to interrogate each agent individually with a structured questionnaire about the future of German fisheries. + +Four methodologies run as independent subagents over the same agent population: + +1.
**Longitudinal** — pre/post Likert to measure opinion drift induced by simulated peer interaction +2. **Diversity** — Q-sort + multi-dim Likert to map the value space and derive a stakeholder typology +3. **Delphi** — three-round consensus probing to identify where stakeholder views converge vs. stay polarised +4. **Scenario** — rating of 4 pre-defined 2040 scenarios on desirability, plausibility, group-impact, fairness + +A synthesiser combines the four outputs into a single cross-method report. + +## 2. Non-goals (v1) + +- Real-time WebSocket streaming of interview progress (polling suffices) +- Adaptive instruments / IRT calibration +- Web UI for editing instruments (YAML + restart is fine) +- Cross-simulation comparison endpoints (CSV exports support this externally) +- Multi-language support beyond DE / EN + +## 3. Architectural approach + +**Chosen approach: Deterministic instrument runners.** Each subagent is a fixed protocol, not a ReACT loop. Rationale: fisheries futures methodology favours instrument fidelity (every stakeholder sees the same scale) over agent autonomy; results must be directly tabularisable for downstream analysis in pandas/R. + +Rejected: +- *ReACT-style subagents* — non-deterministic, ~3–10× cost, can't guarantee every agent answered every item +- *Single InterviewService with mode enum* — couples four distinct methodologies (especially multi-round Delphi and two-phase Longitudinal) into one growing class + +## 4. 
System architecture + +``` + InterviewOrchestrator + │ + ┌──────────────┬───────┴───────┬──────────────┐ + ▼ ▼ ▼ ▼ +Longitudinal Diversity Delphi Scenario +Subagent Subagent Subagent Subagent + │ │ │ │ + └──────────────┴──────┬────────┴──────────────┘ + ▼ + StakeholderInterviewer (base) + │ + ┌─────────────────┼─────────────────┐ + ▼ ▼ ▼ + LLMClient ZepEntityReader ProfileLoader + (in-character) (memory digest) (reddit/twitter) + │ + ▼ + uploads/.../interviews/ + Zep episodes +``` + +### 4.1 New files + +| Path | Purpose | +|---|---| +| `backend/app/services/interviews/base.py` | `StakeholderInterviewer` — persona+memory loading, in-character prompting, retry/validation | +| `backend/app/services/interviews/longitudinal.py` | Pre/post Likert | +| `backend/app/services/interviews/diversity.py` | Q-sort + multi-dim value-space mapping | +| `backend/app/services/interviews/delphi.py` | Three-round consensus | +| `backend/app/services/interviews/scenario.py` | Scenario rating | +| `backend/app/services/interview_orchestrator.py` | Fan-out, parallel execution, two-phase lifecycle | +| `backend/app/services/interview_synthesizer.py` | Cross-method narrative report | +| `backend/app/api/interview.py` | New Flask blueprint `/api/interview/*` | +| `backend/app/models/interview.py` | Pydantic schemas for instruments + responses | +| `backend/scripts/instruments/*.yaml` | Editable instrument definitions (one YAML per subagent) | +| `frontend/src/components/Step4bInterviews.vue` | Four tabs + synthesis tab | +| `backend/tests/interviews/` | Unit tests per subagent + base + orchestrator + synthesiser | +| `tests/integration/test_interview_pipeline.py` | End-to-end with stub LLM + disposable Zep graph | + +### 4.2 Lifecycle integration + +Two hooks added to `backend/app/services/simulation_manager.py`: + +- `on_ready()` — automatically triggers Longitudinal T0 (pre-simulation baseline) +- `on_completed()` — queues a `task_id` running Longitudinal T1 + Diversity + Delphi + 
Scenario in parallel, then Synthesiser + +The two-phase split is **non-negotiable**: Longitudinal needs T0 captured before OASIS exposes agents to peer-generated content, otherwise drift is unmeasurable. + +## 5. Instrument design + +All instruments live in `backend/scripts/instruments/*.yaml` so content is editable without redeploying. Items default to German, translatable via existing locale system. + +### 5.1 Longitudinal — opinion drift + +- 12–15 item 5-point Likert ("lehne stark ab" → "stimme stark zu") +- Administered at T0 (post-persona, pre-OASIS) and T1 (post-OASIS) +- Item families (3–4 each): stock status & recovery; governance & CFP; market & MSC; climate & adaptation +- Per-agent output: response value + LLM self-reported confidence per item + one open comment +- Aggregate: Δ-matrix (N × M items), per-item Wilcoxon signed-rank, per-agent total drift magnitude + +### 5.2 Diversity — typology mapping + +- One-shot, post-simulation only +- **Part A (Q-sort lite):** 24 statements sorted onto forced quasi-normal distribution from −3 to +3 +- **Part B:** 6 multi-dim Likert axes (preservation↔extraction, local↔EU, science-led↔tradition-led, individual↔collective, short-term↔long-term, market↔regulation) +- Per-agent output: vector ∈ ℝ^30 +- Aggregate: PCA + k-means → 3–5 stakeholder clusters with archetype descriptions + cluster-membership probabilities + +### 5.3 Delphi — consensus probing + +- Three rounds, fully automated +- **R1 (open):** 4 open questions; LLM extracts thematic codes from responses +- **R2 (rate):** Agent sees anonymised list of all unique themes; rates each on importance (1–5) + plausibility (1–5) +- **R3 (revise):** Agent sees group median + IQR per theme; can revise own ratings; free-text justification +- Aggregate: per-theme convergence (Δ-IQR R2→R3), persistent disagreements (IQR > 2), ranked consensus statements + +### 5.4 Scenario — futures evaluation + +Four 2040 scenarios (YAML-editable): + +- **S1 "Erholung"** — cod and herring 
recover, MSC ubiquitous, small-scale fleet stabilises +- **S2 "Kollaps"** — both stocks collapse, fleet halved, aquaculture dominant +- **S3 "Festung Europa"** — protectionist EU policy, MPAs cover 30%, recreational fishing curtailed +- **S4 "Privatisierung"** — ITQs, consolidation, large operators only + +Each agent rates each scenario on 4 dimensions (1–7 Likert): desirability, plausibility, impact-on-my-group, fairness. Plus one open question per scenario: "If you woke up in this 2040, what would you do?" + +Aggregate: 4 × 4 per-agent matrix + open-text corpus → polarity charts (desirability × plausibility by stakeholder type), narrative themes. + +### 5.5 Cross-cutting + +**In-character prompting.** Every LLM call uses a system prompt of the form: + +> You are [persona_text]. You are answering a survey about the future of German fisheries. Answer strictly in character based on your background, values, and what you experienced during the simulated social media discourse summarised below: [Zep memory digest]. Return JSON only. + +Memory digest comes from `ZepEntityReader.get_entity_with_context()`. + +**Structured output enforced.** Every response goes through `LLMClient.chat_json()` with a per-instrument JSON schema. One auto-retry on schema violation; agent flagged in audit log on second failure. + +**Cost guardrails.** Longitudinal × 2 phases + Delphi × 3 rounds is heaviest. For N=50 agents and ~100 LLM calls per agent across all 4 subagents, budget ~5k calls / 5–10M tokens per simulation. Persona system prompts stay constant within a subagent run → cacheable. + +## 6. 
Data flow and storage + +### 6.1 Storage layout + +``` +uploads/simulations/{sim_id}/interviews/ +├── instruments_used.json # frozen snapshot of YAML at run-time +├── T0/ +│ └── longitudinal/ +│ ├── responses.jsonl +│ ├── audit.jsonl # raw LLM I/O, retries, validation failures +│ └── aggregate.json +├── T1/ +│ ├── longitudinal/{same structure} +│ ├── diversity/ +│ │ ├── responses.jsonl +│ │ ├── typology.json +│ │ └── pca.json +│ ├── delphi/ +│ │ ├── round1_themes.jsonl +│ │ ├── round2_ratings.jsonl +│ │ ├── round3_revisions.jsonl +│ │ └── convergence.json +│ └── scenario/ +│ ├── responses.jsonl +│ └── polarity_matrix.json +└── synthesis/ + ├── report.md + └── exports/ + ├── all_responses.csv # tidy long format + └── codebook.json +``` + +JSONL for raw responses (append-safe, streams cleanly); JSON for aggregates; CSV for analysis hand-off. `instruments_used.json` snapshot is critical for reproducibility when YAML is later edited. + +### 6.2 Zep integration + +Two write patterns, both reusing `ZepGraphMemoryUpdater.add_activity()`: + +- **Per-agent episode** — after each subagent finishes for an agent, write one episode: `"Agent {name} (interview/{subagent}/{phase}): {short summary of stance}"`. The existing ReportAgent can retrieve interview content via its current `panorama_search` / `insight_forge` tools without changes. +- **Aggregate episodes** — after each subagent's aggregate step, write one summary episode per cluster / theme / scenario. + +No new Zep schemas. No new entity types. Interviews are just more episodes — append-only, safe. 
+ +### 6.3 API surface + +New blueprint `/api/interview`: + +| Method | Path | Purpose | +|---|---|---| +| `POST` | `/api/interview/{sim_id}/pre` | Trigger T0 longitudinal (auto on READY, manual for re-runs) | +| `POST` | `/api/interview/{sim_id}/post` | Trigger all 4 post-sim subagents; returns `task_id` | +| `GET` | `/api/interview/{sim_id}/status?task_id=...` | Per-subagent progress | +| `GET` | `/api/interview/{sim_id}/results/{subagent}` | Aggregate JSON for one subagent | +| `GET` | `/api/interview/{sim_id}/results/synthesis` | Full synthesis report | +| `GET` | `/api/interview/{sim_id}/export.csv` | Tidy long-format CSV across all 4 subagents | +| `POST` | `/api/interview/{sim_id}/rerun` | Re-run one subagent (e.g. after editing YAML) | + +All responses follow the existing `{success, data, error}` envelope. Polling reuses `models/task.py`. + +### 6.4 Parallelism + +- Within a subagent: `ThreadPoolExecutor(max_workers=8)` for per-agent LLM calls +- Across the 4 post-sim subagents: all four run in parallel with each other; Delphi's three rounds run sequentially inside its own subagent +- Synthesiser waits for all four +- Token budget guard: `Config.INTERVIEW_MAX_TOKENS_PER_RUN`; if the projected token cost exceeds this limit, the API returns 400 with a dry-run estimate, and the caller can proceed by resending with `confirm=true` + +### 6.5 Frontend + +New `Step4bInterviews.vue` between current Step4 (report) and Step5 (interaction). Four tabs (one per subagent) + a synthesis tab. Each tab shows progress bar during run, then results: Likert heatmap (longitudinal Δ), PCA scatter (diversity), convergence chart (Delphi), polarity quadrants (scenario). Download button per tab pulls the CSV export. + +## 7. Error handling + +**Per-agent failures are isolated.** If agent 17 times out or fails JSON validation twice, agent 17 is marked `failed` in `audit.jsonl`; the rest of the run continues. Aggregates report `n_responded` / `n_total` honestly.
+ +| Failure | Handling | +|---|---| +| LLM timeout / 5xx | Exponential-backoff retry (3 attempts) via existing `LLMClient`; then mark agent failed | +| JSON schema violation | One auto-retry with explicit corrective instruction; then mark failed | +| Likert out-of-range / missing items | Re-ask only the bad items; if still bad, item-level missing | +| Zep memory fetch fails | Run without memory digest; flag in audit (`memory_available: false`); down-weight in drift analysis | +| Whole-subagent crash | Other 3 continue; synthesiser runs on what completed and flags the gap | +| Token budget exceeded | Pause, write partial results, return 503 with `resume_token` | + +**Idempotency.** Every subagent run is keyed by `(sim_id, subagent, phase, run_id)`. Re-runs write a new `run_id` directory; never overwrite. A `latest.json` pointer tracks the canonical run. + +## 8. Validation + +Three layers: + +1. **Schema validation** — pydantic models for every response; JSONL files validated on write +2. **Instrument validation** — `validate_instrument(yaml)` pre-flight: required fields, scale coherence, no duplicate item_ids, DE+EN both present if i18n enabled +3. **Plausibility checks** on aggregates (flag, don't kill): + - Longitudinal: >80% zero drift on every item OR >80% flip — likely a prompting bug or acquiescence bias + - Diversity: first two PCA components explain <30% of variance — instrument not discriminating + - Delphi: R3 ratings identical to R2 for >90% of agents — no engagement with anonymised feedback + - Scenario: all agents rate all scenarios identically on `desirability` — instrument failure + +Flags surface in the synthesis report under "instrument health" so the user can decide whether data is publishable. + +## 9. 
Testing + +**Unit tests** (`backend/tests/interviews/`): + +- `test_instruments.py` — every YAML parses and validates +- `test_base_interviewer.py` — persona+memory loading, in-character prompt construction, schema-retry logic (mock `LLMClient`) +- One file per subagent — happy path + each failure mode in §7 +- `test_orchestrator.py` — fan-out, partial failures, two-phase ordering (T0 before T1) +- `test_synthesizer.py` — missing-subagent handling, stable output shape + +**Integration test** (`tests/integration/test_interview_pipeline.py`): + +End-to-end with N=5 agents against a recorded LLM cassette. Verifies T0 at READY, T1 + 3 others at COMPLETED, CSV export well-formed, Zep episodes written. + +**Stub LLM mode** (`Config.LLM_STUB_MODE=true`) returns deterministic canned responses keyed by `(subagent, item_id, persona_hash)`. Full pipeline exercisable in CI for free. + +**Zep**: disposable graph in integration tests (consistent with project conventions); unit tests stub. + +## 10. Methodological caveats (auto-emitted in synthesis) + +The synthesiser **always** emits a "Limitations" section, programmatically generated from run metadata: + +- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse + LLM jointly encode each stakeholder type, not what actual fishers / NGO staff would say. The instrument measures the *model of the stakeholder*, not the stakeholder. +- **Memory digest is lossy.** Each agent's "experience" of OASIS is summarised to bounded length; agents do not have full episodic recall. +- **LLM acquiescence and centrality bias.** Likert with LLM respondents skews toward 3–4 of 5; per-item distribution shape statistics are reported. +- **N is what it is.** `n_total` and `n_responded` printed verbatim; no rounding, no smoothing. +- **Instrument provenance.** Hash of `instruments_used.json` printed so future-you can rebuild the exact instrument. 
+ +This section is load-bearing for any publication: it makes the system intellectually defensible rather than a black box. + +## 11. Defaulted decisions (revisit later if needed) + +- **N agents:** assumed 50, driven from existing simulation config; if you typically run more/fewer, cost guardrail threshold needs adjusting +- **Default instrument language:** German with English fallback in YAML +- **Delphi rounds = 3:** classic Delphi can run more; 3 is the methodological floor and the cost ceiling here + +## 12. Open questions for implementation phase + +- Whether to write a separate `instruments_changelog.md` per run, or embed change tracking in `instruments_used.json` metadata +- Whether the synthesiser should write into Zep as a single mega-episode or stay file-only (current design: file-only, plus the per-agent + per-aggregate episodes from each subagent) +- Whether `Step4bInterviews.vue` should sit strictly after Step4 (current design) or render in parallel — interviews depend on the simulation having reached `completed` (Step3 output) and on the `graph_id` (created in Step1); they do not depend on Step4's ReportAgent run, so a parallel layout is technically possible diff --git a/frontend/src/api/interview.js b/frontend/src/api/interview.js new file mode 100644 index 00000000..0f5cdbf5 --- /dev/null +++ b/frontend/src/api/interview.js @@ -0,0 +1,29 @@ +import service from './index' + +export async function startPre(simId) { + const r = await service.post(`/api/interview/${simId}/pre`) + return r +} +export async function startPost(simId) { + const r = await service.post(`/api/interview/${simId}/post`) + return r +} +export async function rerun(simId, subagent) { + const r = await service.post(`/api/interview/${simId}/rerun`, { subagent }) + return r +} +export async function getStatus(simId, taskId) { + const r = await service.get(`/api/interview/${simId}/status`, { params: { task_id: taskId } }) + return r +} +export async function getResults(simId, 
subagent) { + const r = await service.get(`/api/interview/${simId}/results/${subagent}`) + return r +} +export async function getSynthesis(simId) { + const r = await service.get(`/api/interview/${simId}/results/synthesis`) + return r +} +export function exportCsvUrl(simId) { + return `/api/interview/${simId}/export.csv` +} diff --git a/frontend/src/components/Step4bInterviews.vue b/frontend/src/components/Step4bInterviews.vue new file mode 100644 index 00000000..d2aed844 --- /dev/null +++ b/frontend/src/components/Step4bInterviews.vue @@ -0,0 +1,79 @@ + + + + + diff --git a/frontend/src/components/interviews/DelphiPanel.vue b/frontend/src/components/interviews/DelphiPanel.vue new file mode 100644 index 00000000..c111d0d3 --- /dev/null +++ b/frontend/src/components/interviews/DelphiPanel.vue @@ -0,0 +1,58 @@ + + + + + diff --git a/frontend/src/components/interviews/DiversityPanel.vue b/frontend/src/components/interviews/DiversityPanel.vue new file mode 100644 index 00000000..558d8526 --- /dev/null +++ b/frontend/src/components/interviews/DiversityPanel.vue @@ -0,0 +1,63 @@ + + + + + diff --git a/frontend/src/components/interviews/LongitudinalPanel.vue b/frontend/src/components/interviews/LongitudinalPanel.vue new file mode 100644 index 00000000..1596e93b --- /dev/null +++ b/frontend/src/components/interviews/LongitudinalPanel.vue @@ -0,0 +1,63 @@ + + + + + diff --git a/frontend/src/components/interviews/ScenarioPanel.vue b/frontend/src/components/interviews/ScenarioPanel.vue new file mode 100644 index 00000000..ddc85b2b --- /dev/null +++ b/frontend/src/components/interviews/ScenarioPanel.vue @@ -0,0 +1,66 @@ + + + + + diff --git a/frontend/src/components/interviews/SynthesisPanel.vue b/frontend/src/components/interviews/SynthesisPanel.vue new file mode 100644 index 00000000..e435b4d2 --- /dev/null +++ b/frontend/src/components/interviews/SynthesisPanel.vue @@ -0,0 +1,34 @@ + + + + + diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index 
62d23201..30b072b8 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -4,6 +4,7 @@ import Process from '../views/MainView.vue' import SimulationView from '../views/SimulationView.vue' import SimulationRunView from '../views/SimulationRunView.vue' import ReportView from '../views/ReportView.vue' +import InterviewView from '../views/InterviewView.vue' import InteractionView from '../views/InteractionView.vue' const routes = [ @@ -36,6 +37,12 @@ const routes = [ component: ReportView, props: true }, + { + path: '/interview/:simulationId', + name: 'Interview', + component: InterviewView, + props: true + }, { path: '/interaction/:reportId', name: 'Interaction', diff --git a/frontend/src/views/InterviewView.vue b/frontend/src/views/InterviewView.vue new file mode 100644 index 00000000..767ac9b7 --- /dev/null +++ b/frontend/src/views/InterviewView.vue @@ -0,0 +1,192 @@ + + + + + diff --git a/locales/de.json b/locales/de.json new file mode 100644 index 00000000..4032d4db --- /dev/null +++ b/locales/de.json @@ -0,0 +1,15 @@ +{ + "interview": { + "title": "Stakeholder-Interviews", + "subtitle": "Vier unabhängige Befragungen der simulierten Stakeholder-Population.", + "runAll": "Alle Post-Simulations-Interviews starten", + "downloadCsv": "CSV herunterladen", + "tab": { + "longitudinal": "Längsschnitt (Δ)", + "diversity": "Diversität", + "delphi": "Delphi", + "scenario": "Szenarien", + "synthesis": "Synthese" + } + } +} diff --git a/locales/en.json b/locales/en.json index 544c68b1..d22cf64f 100644 --- a/locales/en.json +++ b/locales/en.json @@ -661,5 +661,18 @@ "llmSelectAgentFailed": "LLM agent selection failed, using default selection: {error}", "generateInterviewQuestionsFailed": "Failed to generate interview questions: {error}", "generateInterviewSummaryFailed": "Failed to generate interview summary: {error}" + }, + "interview": { + "title": "Stakeholder interviews", + "subtitle": "Four independent surveys of the simulated stakeholder 
population.", + "runAll": "Run all post-simulation interviews", + "downloadCsv": "Download CSV", + "tab": { + "longitudinal": "Longitudinal (Δ)", + "diversity": "Diversity", + "delphi": "Delphi", + "scenario": "Scenarios", + "synthesis": "Synthesis" + } } } diff --git a/locales/zh.json b/locales/zh.json index cd747e2f..71ed6c4b 100644 --- a/locales/zh.json +++ b/locales/zh.json @@ -661,5 +661,18 @@ "llmSelectAgentFailed": "LLM选择Agent失败,使用默认选择: {error}", "generateInterviewQuestionsFailed": "生成采访问题失败: {error}", "generateInterviewSummaryFailed": "生成采访摘要失败: {error}" + }, + "interview": { + "title": "利益相关者访谈", + "subtitle": "对模拟利益相关者群体进行的四项独立调查。", + "runAll": "运行所有模拟后访谈", + "downloadCsv": "下载 CSV", + "tab": { + "longitudinal": "纵向分析 (Δ)", + "diversity": "多样性", + "delphi": "德尔菲法", + "scenario": "情景分析", + "synthesis": "综合分析" + } } }