From bf058080ac1647f7f7010b392904102fbbc05eba Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 10:53:28 +0200 Subject: [PATCH 01/26] docs(spec): stakeholder interview subagents design Approved design for a four-subagent post-simulation interview system (Longitudinal, Diversity, Delphi, Scenario) over MiroFish-simulated German fisheries stakeholders, with cross-method synthesiser. Includes architecture, instrument design, data flow, API surface, error handling, validation, testing, and methodological caveats. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-stakeholder-interview-subagents-design.md | 280 ++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md diff --git a/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md b/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md new file mode 100644 index 00000000..f82a7ec7 --- /dev/null +++ b/docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md @@ -0,0 +1,280 @@ +# Stakeholder Interview Subagents — Design Spec + +- **Date:** 2026-05-23 +- **Project:** MiroFish (multi-agent simulation engine for German fisheries discourse) +- **Author:** Christian Möllmann (with Claude Code) +- **Status:** Approved design — pending implementation plan + +## 1. Purpose + +After the OASIS Twitter + Reddit simulation produces a population of in-character stakeholder agents (fishers, NGOs, policy actors, scientists, consumers, etc.) grounded in a German fisheries discourse knowledge graph, we want to interrogate each agent individually with a structured questionnaire about the future of German fisheries. + +Four methodologies run as independent subagents over the same agent population: + +1. **Longitudinal** — pre/post Likert to measure opinion drift induced by simulated peer interaction +2. 
**Diversity** — Q-sort + multi-dim Likert to map the value space and derive a stakeholder typology +3. **Delphi** — three-round consensus probing to identify where stakeholder views converge vs. stay polarised +4. **Scenario** — rating of 4 pre-defined 2040 scenarios on desirability, plausibility, group-impact, fairness + +A synthesiser combines the four outputs into a single cross-method report. + +## 2. Non-goals (v1) + +- Real-time WebSocket streaming of interview progress (polling suffices) +- Adaptive instruments / IRT calibration +- Web UI for editing instruments (YAML + restart is fine) +- Cross-simulation comparison endpoints (CSV exports support this externally) +- Multi-language support beyond DE / EN + +## 3. Architectural approach + +**Chosen approach: Deterministic instrument runners.** Each subagent is a fixed protocol, not a ReACT loop. Rationale: fisheries futures methodology favours instrument fidelity (every stakeholder sees the same scale) over agent autonomy; results must be directly tabularisable for downstream analysis in pandas/R. + +Rejected: +- *ReACT-style subagents* — non-deterministic, ~3–10× cost, can't guarantee every agent answered every item +- *Single InterviewService with mode enum* — couples four distinct methodologies (especially multi-round Delphi and two-phase Longitudinal) into one growing class + +## 4. 
System architecture + +``` + InterviewOrchestrator + │ + ┌──────────────┬───────┴───────┬──────────────┐ + ▼ ▼ ▼ ▼ +Longitudinal Diversity Delphi Scenario +Subagent Subagent Subagent Subagent + │ │ │ │ + └──────────────┴──────┬────────┴──────────────┘ + ▼ + StakeholderInterviewer (base) + │ + ┌─────────────────┼─────────────────┐ + ▼ ▼ ▼ + LLMClient ZepEntityReader ProfileLoader + (in-character) (memory digest) (reddit/twitter) + │ + ▼ + uploads/.../interviews/ + Zep episodes +``` + +### 4.1 New files + +| Path | Purpose | +|---|---| +| `backend/app/services/interviews/base.py` | `StakeholderInterviewer` — persona+memory loading, in-character prompting, retry/validation | +| `backend/app/services/interviews/longitudinal.py` | Pre/post Likert | +| `backend/app/services/interviews/diversity.py` | Q-sort + multi-dim value-space mapping | +| `backend/app/services/interviews/delphi.py` | Three-round consensus | +| `backend/app/services/interviews/scenario.py` | Scenario rating | +| `backend/app/services/interview_orchestrator.py` | Fan-out, parallel execution, two-phase lifecycle | +| `backend/app/services/interview_synthesizer.py` | Cross-method narrative report | +| `backend/app/api/interview.py` | New Flask blueprint `/api/interview/*` | +| `backend/app/models/interview.py` | Pydantic schemas for instruments + responses | +| `backend/scripts/instruments/*.yaml` | Editable instrument definitions (one YAML per subagent) | +| `frontend/src/components/Step4bInterviews.vue` | Four tabs + synthesis tab | +| `backend/tests/interviews/` | Unit tests per subagent + base + orchestrator + synthesiser | +| `tests/integration/test_interview_pipeline.py` | End-to-end with stub LLM + disposable Zep graph | + +### 4.2 Lifecycle integration + +Two hooks added to `backend/app/services/simulation_manager.py`: + +- `on_ready()` — automatically triggers Longitudinal T0 (pre-simulation baseline) +- `on_completed()` — queues a `task_id` running Longitudinal T1 + Diversity + Delphi + 
Scenario in parallel, then Synthesiser + +The two-phase split is **non-negotiable**: Longitudinal needs T0 captured before OASIS exposes agents to peer-generated content, otherwise drift is unmeasurable. + +## 5. Instrument design + +All instruments live in `backend/scripts/instruments/*.yaml` so content is editable without redeploying. Items default to German, translatable via existing locale system. + +### 5.1 Longitudinal — opinion drift + +- 12–15 item 5-point Likert ("lehne stark ab" → "stimme stark zu") +- Administered at T0 (post-persona, pre-OASIS) and T1 (post-OASIS) +- Item families (3–4 each): stock status & recovery; governance & CFP; market & MSC; climate & adaptation +- Per-agent output: response value + LLM self-reported confidence per item + one open comment +- Aggregate: Δ-matrix (N × M items), per-item Wilcoxon signed-rank, per-agent total drift magnitude + +### 5.2 Diversity — typology mapping + +- One-shot, post-simulation only +- **Part A (Q-sort lite):** 24 statements sorted onto forced quasi-normal distribution from −3 to +3 +- **Part B:** 6 multi-dim Likert axes (preservation↔extraction, local↔EU, science-led↔tradition-led, individual↔collective, short-term↔long-term, market↔regulation) +- Per-agent output: vector ∈ ℝ^30 +- Aggregate: PCA + k-means → 3–5 stakeholder clusters with archetype descriptions + cluster-membership probabilities + +### 5.3 Delphi — consensus probing + +- Three rounds, fully automated +- **R1 (open):** 4 open questions; LLM extracts thematic codes from responses +- **R2 (rate):** Agent sees anonymised list of all unique themes; rates each on importance (1–5) + plausibility (1–5) +- **R3 (revise):** Agent sees group median + IQR per theme; can revise own ratings; free-text justification +- Aggregate: per-theme convergence (Δ-IQR R2→R3), persistent disagreements (IQR > 2), ranked consensus statements + +### 5.4 Scenario — futures evaluation + +Four 2040 scenarios (YAML-editable): + +- **S1 "Erholung"** — cod and herring 
recover, MSC ubiquitous, small-scale fleet stabilises +- **S2 "Kollaps"** — both stocks collapse, fleet halved, aquaculture dominant +- **S3 "Festung Europa"** — protectionist EU policy, MPAs cover 30%, recreational fishing curtailed +- **S4 "Privatisierung"** — ITQs, consolidation, large operators only + +Each agent rates each scenario on 4 dimensions (1–7 Likert): desirability, plausibility, impact-on-my-group, fairness. Plus one open question per scenario: "If you woke up in this 2040, what would you do?" + +Aggregate: 4 × 4 per-agent matrix + open-text corpus → polarity charts (desirability × plausibility by stakeholder type), narrative themes. + +### 5.5 Cross-cutting + +**In-character prompting.** Every LLM call uses a system prompt of the form: + +> You are [persona_text]. You are answering a survey about the future of German fisheries. Answer strictly in character based on your background, values, and what you experienced during the simulated social media discourse summarised below: [Zep memory digest]. Return JSON only. + +Memory digest comes from `ZepEntityReader.get_entity_with_context()`. + +**Structured output enforced.** Every response goes through `LLMClient.chat_json()` with a per-instrument JSON schema. One auto-retry on schema violation; agent flagged in audit log on second failure. + +**Cost guardrails.** Longitudinal × 2 phases + Delphi × 3 rounds is heaviest. For N=50 agents and ~100 LLM calls per agent across all 4 subagents, budget ~5k calls / 5–10M tokens per simulation. Persona system prompts stay constant within a subagent run → cacheable. + +## 6. 
Data flow and storage + +### 6.1 Storage layout + +``` +uploads/simulations/{sim_id}/interviews/ +├── instruments_used.json # frozen snapshot of YAML at run-time +├── T0/ +│ └── longitudinal/ +│ ├── responses.jsonl +│ ├── audit.jsonl # raw LLM I/O, retries, validation failures +│ └── aggregate.json +├── T1/ +│ ├── longitudinal/{same structure} +│ ├── diversity/ +│ │ ├── responses.jsonl +│ │ ├── typology.json +│ │ └── pca.json +│ ├── delphi/ +│ │ ├── round1_themes.jsonl +│ │ ├── round2_ratings.jsonl +│ │ ├── round3_revisions.jsonl +│ │ └── convergence.json +│ └── scenario/ +│ ├── responses.jsonl +│ └── polarity_matrix.json +└── synthesis/ + ├── report.md + └── exports/ + ├── all_responses.csv # tidy long format + └── codebook.json +``` + +JSONL for raw responses (append-safe, streams cleanly); JSON for aggregates; CSV for analysis hand-off. `instruments_used.json` snapshot is critical for reproducibility when YAML is later edited. + +### 6.2 Zep integration + +Two write patterns, both reusing `ZepGraphMemoryUpdater.add_activity()`: + +- **Per-agent episode** — after each subagent finishes for an agent, write one episode: `"Agent {name} (interview/{subagent}/{phase}): {short summary of stance}"`. The existing ReportAgent can retrieve interview content via its current `panorama_search` / `insight_forge` tools without changes. +- **Aggregate episodes** — after each subagent's aggregate step, write one summary episode per cluster / theme / scenario. + +No new Zep schemas. No new entity types. Interviews are just more episodes — append-only, safe. 
+ +### 6.3 API surface + +New blueprint `/api/interview`: + +| Method | Path | Purpose | +|---|---|---| +| `POST` | `/api/interview/{sim_id}/pre` | Trigger T0 longitudinal (auto on READY, manual for re-runs) | +| `POST` | `/api/interview/{sim_id}/post` | Trigger all 4 post-sim subagents; returns `task_id` | +| `GET` | `/api/interview/{sim_id}/status?task_id=...` | Per-subagent progress | +| `GET` | `/api/interview/{sim_id}/results/{subagent}` | Aggregate JSON for one subagent | +| `GET` | `/api/interview/{sim_id}/results/synthesis` | Full synthesis report | +| `GET` | `/api/interview/{sim_id}/export.csv` | Tidy long-format CSV across all 4 subagents | +| `POST` | `/api/interview/{sim_id}/rerun` | Re-run one subagent (e.g. after editing YAML) | + +All responses follow the existing `{success, data, error}` envelope. Polling reuses `models/task.py`. + +### 6.4 Parallelism + +- Within a subagent: `ThreadPoolExecutor(max_workers=8)` for per-agent LLM calls +- Across the 4 post-sim subagents: parallel, except Delphi (sequential rounds internally) +- Synthesiser waits for all four +- Token budget guard: `Config.INTERVIEW_MAX_TOKENS_PER_RUN`; if projected cost exceeds, API returns 400 with dry-run estimate and `confirm=true` override + +### 6.5 Frontend + +New `Step4bInterviews.vue` between current Step4 (report) and Step5 (interaction). Four tabs (one per subagent) + a synthesis tab. Each tab shows progress bar during run, then results: Likert heatmap (longitudinal Δ), PCA scatter (diversity), convergence chart (Delphi), polarity quadrants (scenario). Download button per tab pulls the CSV export. + +## 7. Error handling + +**Per-agent failures are isolated.** If agent 17 times out or fails JSON validation twice, agent 17 is marked `failed` in `audit.jsonl`; the rest of the run continues. Aggregates report `n_responded` / `n_total` honestly. 
+ +| Failure | Handling | +|---|---| +| LLM timeout / 5xx | Exponential-backoff retry (3 attempts) via existing `LLMClient`; then mark agent failed | +| JSON schema violation | One auto-retry with explicit corrective instruction; then mark failed | +| Likert out-of-range / missing items | Re-ask only the bad items; if still bad, item-level missing | +| Zep memory fetch fails | Run without memory digest; flag in audit (`memory_available: false`); down-weight in drift analysis | +| Whole-subagent crash | Other 3 continue; synthesiser runs on what completed and flags the gap | +| Token budget exceeded | Pause, write partial results, return 503 with `resume_token` | + +**Idempotency.** Every subagent run is keyed by `(sim_id, subagent, phase, run_id)`. Re-runs write a new `run_id` directory; never overwrite. A `latest.json` pointer tracks the canonical run. + +## 8. Validation + +Three layers: + +1. **Schema validation** — pydantic models for every response; JSONL files validated on write +2. **Instrument validation** — `validate_instrument(yaml)` pre-flight: required fields, scale coherence, no duplicate item_ids, DE+EN both present if i18n enabled +3. **Plausibility checks** on aggregates (flag, don't kill): + - Longitudinal: >80% zero drift on every item OR >80% flip — likely a prompting bug or acquiescence bias + - Diversity: first two PCA components explain <30% of variance — instrument not discriminating + - Delphi: R3 ratings identical to R2 for >90% of agents — no engagement with anonymised feedback + - Scenario: all agents rate all scenarios identically on `desirability` — instrument failure + +Flags surface in the synthesis report under "instrument health" so the user can decide whether data is publishable. + +## 9. 
Testing + +**Unit tests** (`backend/tests/interviews/`): + +- `test_instruments.py` — every YAML parses and validates +- `test_base_interviewer.py` — persona+memory loading, in-character prompt construction, schema-retry logic (mock `LLMClient`) +- One file per subagent — happy path + each failure mode in §7 +- `test_orchestrator.py` — fan-out, partial failures, two-phase ordering (T0 before T1) +- `test_synthesizer.py` — missing-subagent handling, stable output shape + +**Integration test** (`tests/integration/test_interview_pipeline.py`): + +End-to-end with N=5 agents against a recorded LLM cassette. Verifies T0 at READY, T1 + 3 others at COMPLETED, CSV export well-formed, Zep episodes written. + +**Stub LLM mode** (`Config.LLM_STUB_MODE=true`) returns deterministic canned responses keyed by `(subagent, item_id, persona_hash)`. Full pipeline exercisable in CI for free. + +**Zep**: disposable graph in integration tests (consistent with project conventions); unit tests stub. + +## 10. Methodological caveats (auto-emitted in synthesis) + +The synthesiser **always** emits a "Limitations" section, programmatically generated from run metadata: + +- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse + LLM jointly encode each stakeholder type, not what actual fishers / NGO staff would say. The instrument measures the *model of the stakeholder*, not the stakeholder. +- **Memory digest is lossy.** Each agent's "experience" of OASIS is summarised to bounded length; agents do not have full episodic recall. +- **LLM acquiescence and centrality bias.** Likert with LLM respondents skews toward 3–4 of 5; per-item distribution shape statistics are reported. +- **N is what it is.** `n_total` and `n_responded` printed verbatim; no rounding, no smoothing. +- **Instrument provenance.** Hash of `instruments_used.json` printed so future-you can rebuild the exact instrument. 
+ +This section is load-bearing for any publication: it makes the system intellectually defensible rather than a black box. + +## 11. Defaulted decisions (revisit later if needed) + +- **N agents:** assumed 50, driven from existing simulation config; if you typically run more/fewer, cost guardrail threshold needs adjusting +- **Default instrument language:** German with English fallback in YAML +- **Delphi rounds = 3:** classic Delphi can run more; 3 is the methodological floor and the cost ceiling here + +## 12. Open questions for implementation phase + +- Whether to write a separate `instruments_changelog.md` per run, or embed change tracking in `instruments_used.json` metadata +- Whether the synthesiser should write into Zep as a single mega-episode or stay file-only (current design: file-only, plus the per-agent + per-aggregate episodes from each subagent) +- Whether `Step4bInterviews.vue` should sit strictly after Step4 (current design) or render in parallel — interviews depend on the simulation having reached `completed` (Step3 output) and on the `graph_id` (created in Step1); they do not depend on Step4's ReportAgent run, so a parallel layout is technically possible From 815e4758b277f6fa3d0bcb00ceff7d894403ce17 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 11:19:02 +0200 Subject: [PATCH 02/26] docs(plan): stakeholder interview subagents implementation plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bite-sized TDD plan covering 21 tasks across 7 phases: setup → foundation (models, YAML loader, LLM stub, base interviewer) → 4 subagents (longitudinal, diversity Q-sort+PCA, Delphi 3-round, scenario) → storage + Zep writer → orchestrator + sim lifecycle hooks + synthesiser → Flask /api/interview blueprint → end-to-end integration test → Vue Step4b with d3 visualisations. Each task lists exact files, failing test code, implementation code, run commands, and commit message. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...6-05-23-stakeholder-interview-subagents.md | 3837 +++++++++++++++++ 1 file changed, 3837 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md diff --git a/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md b/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md new file mode 100644 index 00000000..4de7f7c6 --- /dev/null +++ b/docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md @@ -0,0 +1,3837 @@ +# Stakeholder Interview Subagents Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a four-subagent post-simulation interview system (Longitudinal, Diversity, Delphi, Scenario) over MiroFish-simulated stakeholders, plus a cross-method synthesiser, exposed via `/api/interview` and rendered in a new Vue Step4b. + +**Architecture:** Deterministic instrument runners (not ReACT). Shared `StakeholderInterviewer` base loads persona + Zep memory digest and administers per-instrument JSON-schema-validated prompts via the existing `LLMClient`. Four subagents own their own instrument YAML + output schema. `InterviewOrchestrator` fans out parallel post-sim execution; `InterviewSynthesizer` aggregates. Files: backend Python services + new Flask blueprint; frontend new Vue component with d3 viz. + +**Tech Stack:** Python 3.12, Flask, pydantic v2, PyYAML, scikit-learn (PCA, k-means), scipy (Wilcoxon), numpy, pytest; Vue 3, axios, d3 v7, vue-i18n. 
+ +**Spec:** `docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md` + +--- + +## Phase 0 — Setup + +### Task 0: Add deps and pytest scaffold + +**Files:** +- Modify: `backend/pyproject.toml` +- Create: `backend/tests/__init__.py` +- Create: `backend/tests/conftest.py` +- Create: `backend/pytest.ini` + +- [ ] **Step 1: Add deps to `backend/pyproject.toml`** + +In the `dependencies` array (after `pydantic>=2.0.0`), add: +```toml + "PyYAML>=6.0", + "scikit-learn>=1.4", + "scipy>=1.12", + "numpy>=1.26", + "pandas>=2.1", +``` + +- [ ] **Step 2: Create `backend/pytest.ini`** + +```ini +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -ra --strict-markers +markers = + integration: marks integration tests (deselect with -m 'not integration') +``` + +- [ ] **Step 3: Create `backend/tests/__init__.py`** + +Empty file. + +- [ ] **Step 4: Create `backend/tests/conftest.py`** + +```python +import os +import sys +import pathlib +import pytest + +ROOT = pathlib.Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +os.environ.setdefault("LLM_API_KEY", "test") +os.environ.setdefault("LLM_BASE_URL", "https://example.invalid") +os.environ.setdefault("LLM_MODEL_NAME", "test-model") +os.environ.setdefault("ZEP_API_KEY", "test") + +@pytest.fixture +def tmp_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + return tmp_path +``` + +- [ ] **Step 5: Install + verify** + +Run: `cd backend && uv sync --python 3.12 && uv run pytest -q` +Expected: `no tests ran` with no errors or failures. Note that pytest exits with code 5 when nothing is collected; at this stage that is expected and is not a failure. Confirms infrastructure works.
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/pyproject.toml backend/uv.lock backend/pytest.ini backend/tests/__init__.py backend/tests/conftest.py +git commit -m "chore(interviews): add deps and pytest scaffold for interview subsystem" +``` + +--- + +### Task 1: Add interview config keys + +**Files:** +- Modify: `backend/app/config.py` + +- [ ] **Step 1: Read current config** + +Open `backend/app/config.py` and locate the `Config` class. + +- [ ] **Step 2: Add config keys** + +Add inside the `Config` class (preserving existing keys): +```python + # Interview subsystem + INTERVIEW_MAX_TOKENS_PER_RUN = int(os.environ.get("INTERVIEW_MAX_TOKENS_PER_RUN", 15_000_000)) + INTERVIEW_MAX_WORKERS = int(os.environ.get("INTERVIEW_MAX_WORKERS", 8)) + INTERVIEW_DEFAULT_LANGUAGE = os.environ.get("INTERVIEW_DEFAULT_LANGUAGE", "de") + LLM_STUB_MODE = os.environ.get("LLM_STUB_MODE", "false").lower() == "true" +``` + +- [ ] **Step 3: Verify import** + +Run: `cd backend && uv run python -c "from app.config import Config; print(Config.INTERVIEW_MAX_WORKERS, Config.LLM_STUB_MODE)"` +Expected: `8 False` + +- [ ] **Step 4: Commit** + +```bash +git add backend/app/config.py +git commit -m "feat(interviews): add interview config keys (token budget, workers, language, stub mode)" +``` + +--- + +## Phase 1 — Foundation + +### Task 2: Pydantic models for instruments and responses + +**Files:** +- Create: `backend/app/models/interview.py` +- Create: `backend/tests/interviews/__init__.py` +- Test: `backend/tests/interviews/test_models.py` + +- [ ] **Step 1: Write failing test** + +Create `backend/tests/interviews/__init__.py` (empty), then `backend/tests/interviews/test_models.py`: +```python +import pytest +from pydantic import ValidationError +from app.models.interview import ( + LikertItem, LikertInstrument, LikertResponse, + InterviewPhase, SubagentKind, +) + +def test_likert_item_requires_de_and_en(): + item = LikertItem(item_id="x1", de="Frage", en="Question", scale=5) + assert 
item.scale == 5 + +def test_likert_item_rejects_bad_scale(): + with pytest.raises(ValidationError): + LikertItem(item_id="x1", de="d", en="e", scale=2) + +def test_likert_instrument_unique_item_ids(): + with pytest.raises(ValidationError): + LikertInstrument( + name="t", + items=[LikertItem(item_id="a", de="d", en="e", scale=5), + LikertItem(item_id="a", de="d", en="e", scale=5)], + ) + +def test_likert_response_validates_scale_range(): + with pytest.raises(ValidationError): + LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 8}, confidence={"a": 0.5}) + +def test_subagent_kind_enum(): + assert SubagentKind.LONGITUDINAL.value == "longitudinal" +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_models.py -v` +Expected: ImportError (module not yet created). + +- [ ] **Step 3: Create `backend/app/models/interview.py`** + +```python +from __future__ import annotations +from enum import Enum +from typing import Optional +from pydantic import BaseModel, Field, field_validator, model_validator + +class InterviewPhase(str, Enum): + T0 = "T0" + T1 = "T1" + +class SubagentKind(str, Enum): + LONGITUDINAL = "longitudinal" + DIVERSITY = "diversity" + DELPHI = "delphi" + SCENARIO = "scenario" + +class LikertItem(BaseModel): + item_id: str + de: str + en: str + scale: int = Field(ge=3, le=7) + family: Optional[str] = None + reverse_coded: bool = False + + @field_validator("scale") + @classmethod + def odd_scale(cls, v: int) -> int: + if v not in (3, 5, 7): + raise ValueError("scale must be 3, 5, or 7") + return v + +class LikertInstrument(BaseModel): + name: str + version: str = "1.0" + language_default: str = "de" + items: list[LikertItem] + + @model_validator(mode="after") + def unique_item_ids(self) -> "LikertInstrument": + ids = [i.item_id for i in self.items] + if len(set(ids)) != len(ids): + raise ValueError("duplicate item_id in instrument") + return self + +class LikertResponse(BaseModel): +
agent_id: int + phase: InterviewPhase + responses: dict[str, int] + confidence: dict[str, float] = Field(default_factory=dict) + open_comment: Optional[str] = None + memory_available: bool = True + failed_items: list[str] = Field(default_factory=list) + + @model_validator(mode="after") + def values_in_range(self) -> "LikertResponse": + for k, v in self.responses.items(): + if not 1 <= v <= 7: + raise ValueError(f"response {k}={v} out of 1..7 range") + for k, v in self.confidence.items(): + if not 0.0 <= v <= 1.0: + raise ValueError(f"confidence {k}={v} out of 0..1 range") + return self + +class QSortStatement(BaseModel): + statement_id: str + de: str + en: str + +class QSortInstrument(BaseModel): + name: str + version: str = "1.0" + statements: list[QSortStatement] + distribution: list[int] # e.g. [2,3,4,6,4,3,2] for -3..+3 + +class QSortResponse(BaseModel): + agent_id: int + placements: dict[str, int] # statement_id -> bucket (-3..+3) + likert_axes: dict[str, int] # axis_id -> 1..7 + +class DelphiOpenResponse(BaseModel): + agent_id: int + round: int = 1 + answers: dict[str, str] # question_id -> free text + +class DelphiRatingResponse(BaseModel): + agent_id: int + round: int + ratings: dict[str, dict[str, int]] # theme_id -> {importance, plausibility} + justification: Optional[str] = None + +class ScenarioRating(BaseModel): + desirability: int = Field(ge=1, le=7) + plausibility: int = Field(ge=1, le=7) + impact_on_my_group: int = Field(ge=1, le=7) + fairness: int = Field(ge=1, le=7) + if_woke_up_response: str + +class ScenarioResponse(BaseModel): + agent_id: int + ratings: dict[str, ScenarioRating] # scenario_id -> rating +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_models.py -v` +Expected: 5 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/models/interview.py backend/tests/interviews/__init__.py backend/tests/interviews/test_models.py +git commit -m "feat(interviews): add pydantic models for instruments and responses" +``` + +--- + +### Task 3: YAML instrument loader + validator + +**Files:** +- Create: `backend/app/services/interviews/__init__.py` +- Create: `backend/app/services/interviews/instrument_loader.py` +- Create: `backend/scripts/instruments/__init__.py` (empty marker so tests can import path) +- Test: `backend/tests/interviews/test_instrument_loader.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_instrument_loader.py +import pytest +from app.services.interviews.instrument_loader import ( + load_likert_instrument, InstrumentValidationError, +) + +def _write(tmp_path, text): + p = tmp_path / "inst.yaml" + p.write_text(text, encoding="utf-8") + return p + +def test_loads_valid_likert(tmp_path): + p = _write(tmp_path, """ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + - item_id: stk_1 + de: "Der westliche Dorschbestand wird sich erholen" + en: "Western cod stock will recover" + scale: 5 + family: stocks +""") + inst = load_likert_instrument(p) + assert inst.name == "longitudinal_v1" + assert len(inst.items) == 1 + +def test_rejects_duplicate_item_id(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, en: e, scale: 5} + - {item_id: a, de: d, en: e, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) + +def test_rejects_missing_required_field(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_instrument_loader.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Create loader** + +Create `backend/app/services/interviews/__init__.py` (empty), `backend/scripts/instruments/__init__.py` (empty), then `backend/app/services/interviews/instrument_loader.py`: + +```python +from __future__ import annotations +import hashlib +import json +from pathlib import Path +import yaml +from pydantic import ValidationError +from app.models.interview import ( + LikertInstrument, QSortInstrument, +) + +class InstrumentValidationError(ValueError): + pass + +def _parse_yaml(path: Path) -> dict: + if not path.exists(): + raise InstrumentValidationError(f"instrument file not found: {path}") + try: + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + raise InstrumentValidationError(f"YAML parse error in {path}: {e}") from e + if not isinstance(data, dict): + raise InstrumentValidationError(f"top-level YAML must be a mapping in {path}") + return data + +def load_likert_instrument(path: Path) -> LikertInstrument: + data = _parse_yaml(Path(path)) + try: + return LikertInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def load_qsort_instrument(path: Path) -> QSortInstrument: + data = _parse_yaml(Path(path)) + try: + return QSortInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def instrument_hash(path: Path) -> str: + data = Path(path).read_bytes() + return hashlib.sha256(data).hexdigest()[:16] + +def freeze_snapshot(instruments: dict[str, Path], out_path: Path) -> dict: + snapshot = { + name: { + "path": str(p), + "hash": instrument_hash(p), + "content": _parse_yaml(p), + } + for name, p in instruments.items() + } + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8") + return snapshot +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest 
tests/interviews/test_instrument_loader.py -v` +Expected: 3 passed. + +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/__init__.py backend/app/services/interviews/instrument_loader.py backend/scripts/instruments/__init__.py backend/tests/interviews/test_instrument_loader.py +git commit -m "feat(interviews): YAML instrument loader with pydantic validation and hash freezing" +``` + +--- + +### Task 4: LLM stub mode + +**Files:** +- Modify: `backend/app/utils/llm_client.py` +- Test: `backend/tests/interviews/test_llm_stub.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_llm_stub.py +import json +from app.utils.llm_client import LLMClient + +def test_stub_mode_returns_deterministic_canned_json(monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + from app.config import Config + # Config reads the env var at import time, so patch the attribute too; + # monkeypatch restores it afterwards so stub mode does not leak into other tests. + monkeypatch.setattr(Config, "LLM_STUB_MODE", True) + client = LLMClient(api_key="x", base_url="x", model="x") + messages = [ + {"role": "system", "content": "You are persona_42. Return JSON."}, + {"role": "user", "content": "stub_key=longitudinal:item_001"}, + ] + out1 = client.chat_json(messages=messages, temperature=0.0) + out2 = client.chat_json(messages=messages, temperature=0.0) + assert out1 == out2 + assert isinstance(out1, dict) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_llm_stub.py -v` +Expected: FAIL (real API call attempted or stub absent). + +- [ ] **Step 3: Read current `llm_client.py`** + +Read the file to locate `chat` and `chat_json` method bodies and where to insert the stub branch.
+ +- [ ] **Step 4: Add stub branch** + +At the top of `LLMClient.chat` (before the OpenAI call), insert: +```python + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response(messages) +``` + +And at the top of `LLMClient.chat_json` (before delegating), insert the same guard, but return the already-parsed dict from `self._stub_response_json(messages)` instead of a JSON string. + +Add these methods to `LLMClient`: +```python + def _stub_key(self, messages: list[dict]) -> str: + user_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + # Allow callers to embed an explicit stub_key=... token + for chunk in user_msg.split(): + if chunk.startswith("stub_key="): + return chunk[len("stub_key="):] + import hashlib + return hashlib.sha256((sys_msg + "|" + user_msg).encode("utf-8")).hexdigest()[:12] + + def _stub_response(self, messages: list[dict]) -> str: + import json as _json + return _json.dumps(self._stub_response_json(messages), ensure_ascii=False) + + def _stub_response_json(self, messages: list[dict]) -> dict: + key = self._stub_key(messages) + # Deterministic Likert value in 1..5 derived from the key + placeholder open text + digit = sum(ord(c) for c in key) % 5 + 1 + return { + "stub_key": key, + "responses": {"item_001": digit, "item_002": digit, "item_003": (digit % 5) + 1}, + "confidence": {"item_001": 0.7, "item_002": 0.7, "item_003": 0.6}, + "open_comment": f"stub:{key}", + } +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_llm_stub.py -v` +Expected: 1 passed. 
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/app/utils/llm_client.py backend/tests/interviews/test_llm_stub.py +git commit -m "feat(interviews): LLM stub mode for deterministic CI tests" +``` + +--- + +### Task 5: StakeholderInterviewer base class + +**Files:** +- Create: `backend/app/services/interviews/base.py` +- Test: `backend/tests/interviews/test_base_interviewer.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_base_interviewer.py +import json +import pytest +from app.services.interviews.base import StakeholderInterviewer, MemoryDigest, PersonaRecord + +class _FakeLLM: + def __init__(self, responses): + self.responses = list(responses) + self.calls = [] + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.calls.append(messages) + return self.responses.pop(0) + +class _FakeMemory: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"digest-for-{agent_id}", available=True) + +def test_in_character_prompt_includes_persona_and_memory(): + llm = _FakeLLM([{"x": 1}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=7, name="A", persona="I am a small-scale Baltic fisher.") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="{...}") + assert out == {"x": 1} + sys_msg = llm.calls[0][0]["content"] + assert "small-scale Baltic fisher" in sys_msg + assert "digest-for-7" in sys_msg + +def test_schema_retry_on_first_failure(): + bad_then_good = [{}, {"responses": {"a": 3}}] + llm = _FakeLLM(bad_then_good) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + def validator(d): + return d if "responses" in d else None + persona = PersonaRecord(agent_id=1, name="A", persona="p") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", validate=validator) + assert out == {"responses": {"a": 3}} + assert len(llm.calls) == 2 + +def 
test_two_failures_raise(): + llm = _FakeLLM([{}, {}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + with pytest.raises(ValueError): + interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", + validate=lambda d: d if "responses" in d else None) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_base_interviewer.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement base** + +`backend/app/services/interviews/base.py`: +```python +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, Callable, Optional, Protocol + +@dataclass +class PersonaRecord: + agent_id: int + name: str + persona: str + profession: Optional[str] = None + bio: Optional[str] = None + +@dataclass +class MemoryDigest: + text: str + available: bool = True + +class MemoryProvider(Protocol): + def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ... + +class StakeholderInterviewer: + def __init__(self, llm, memory: MemoryProvider, language: str = "de"): + self.llm = llm + self.memory = memory + self.language = language + + def _system_prompt(self, persona: PersonaRecord, digest: MemoryDigest, schema_hint: str) -> str: + memory_block = digest.text if digest.available else "[no simulation memory available]" + lang_note = "Antworte ausschließlich auf Deutsch." if self.language == "de" else "Answer in English." + return ( + f"You are {persona.name}. {persona.persona}\n\n" + "You are answering a survey about the future of German fisheries. 
" + "Answer strictly in character based on your background, values, and what you experienced " + "during the simulated social media discourse summarised below.\n\n" + f"--- simulation memory digest ---\n{memory_block}\n--- end ---\n\n" + f"{lang_note} Return JSON ONLY matching this schema:\n{schema_hint}" + ) + + def ask_in_character( + self, + persona: PersonaRecord, + user_prompt: str, + schema_hint: str, + *, + temperature: float = 0.3, + max_tokens: Optional[int] = None, + validate: Optional[Callable[[dict], Optional[dict]]] = None, + ) -> dict: + digest = self.memory.get_digest(persona.agent_id) + messages = [ + {"role": "system", "content": self._system_prompt(persona, digest, schema_hint)}, + {"role": "user", "content": user_prompt}, + ] + out = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens) + if validate is not None: + validated = validate(out) + if validated is not None: + return validated + messages.append({"role": "assistant", "content": str(out)}) + messages.append({"role": "user", "content": + "Your previous response did not match the required schema. " + f"Return ONLY valid JSON matching: {schema_hint}"}) + out = self.llm.chat_json(messages=messages, temperature=0.0, max_tokens=max_tokens) + validated = validate(out) + if validated is None: + raise ValueError(f"agent {persona.agent_id}: schema violation after retry") + return validated + return out +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_base_interviewer.py -v` +Expected: 3 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/base.py backend/tests/interviews/test_base_interviewer.py +git commit -m "feat(interviews): StakeholderInterviewer base with in-character prompting and schema retry" +``` + +--- + +## Phase 2 — Subagents + +### Task 6: Longitudinal subagent + instrument YAML + +**Files:** +- Create: `backend/scripts/instruments/longitudinal_v1.yaml` +- Create: `backend/app/services/interviews/longitudinal.py` +- Test: `backend/tests/interviews/test_longitudinal.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_longitudinal.py +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate + +class _FakeMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _CannedLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + return { + "responses": {"stk_1": 4, "stk_2": 2, "stk_3": 4, "gov_1": 3, "gov_2": 4, "gov_3": 2, "mkt_1": 5, "mkt_2": 4, "mkt_3": 3, "clm_1": 2, "clm_2": 3, "clm_3": 4}, + "confidence": {"stk_1": 0.8, "gov_1": 0.6, "mkt_1": 0.7, "clm_1": 0.5}, + "open_comment": "test", + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + +def test_longitudinal_administer_one_agent(): + sub = LongitudinalSubagent(llm=_CannedLLM(), memory=_FakeMem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 3 + assert resp.phase == InterviewPhase.T0 + assert set(resp.responses.keys()) >= {"stk_1", "gov_1", "mkt_1", "clm_1"} + +def test_longitudinal_aggregate_delta(): + from app.models.interview import LikertResponse + t0 = [LikertResponse(agent_id=i, phase=InterviewPhase.T0, + 
responses={"stk_1": 3, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + t1 = [LikertResponse(agent_id=i, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + agg = run_aggregate(t0, t1) + assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 + assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 + assert agg["n_paired"] == 5 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_longitudinal.py -v` +Expected: ImportError + missing YAML file. + +- [ ] **Step 3: Create instrument YAML** + +`backend/scripts/instruments/longitudinal_v1.yaml`: +```yaml +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + # Stock status & recovery + - {item_id: stk_1, family: stocks, scale: 5, + de: "Der westliche Dorschbestand wird sich bis 2035 erholen.", + en: "The Western Baltic cod stock will recover by 2035."} + - {item_id: stk_2, family: stocks, scale: 5, + de: "Der Heringsbestand in der westlichen Ostsee ist nicht mehr zu retten.", + en: "The Western Baltic herring stock can no longer be saved.", + reverse_coded: true} + - {item_id: stk_3, family: stocks, scale: 5, + de: "Wissenschaftliche Bestandsschätzungen sind generell zuverlässig.", + en: "Scientific stock assessments are generally reliable."} + # Governance & CFP + - {item_id: gov_1, family: governance, scale: 5, + de: "Die Gemeinsame Fischereipolitik der EU scheitert beim Schutz der Ostseefische.", + en: "The EU Common Fisheries Policy fails to protect Baltic fish.", + reverse_coded: true} + - {item_id: gov_2, family: governance, scale: 5, + de: "Entscheidungen über Fangquoten sollten stärker lokal getroffen werden.", + en: "Decisions on catch quotas should be taken more locally."} + - {item_id: gov_3, family: governance, scale: 5, + de: "Die deutsche Bundesregierung handelt entschlossen bei Fischereifragen.", + en: "The German federal government 
acts decisively on fisheries issues."} + # Market & MSC + - {item_id: mkt_1, family: market, scale: 5, + de: "Nur MSC-zertifizierter Fisch sollte verkauft werden dürfen.", + en: "Only MSC-certified fish should be allowed for sale."} + - {item_id: mkt_2, family: market, scale: 5, + de: "Importierter Fisch verdrängt die deutsche Kleinfischerei.", + en: "Imported fish displaces German small-scale fisheries."} + - {item_id: mkt_3, family: market, scale: 5, + de: "Verbraucher zahlen gerne mehr für nachhaltigen Ostseefisch.", + en: "Consumers gladly pay more for sustainable Baltic fish."} + # Climate & adaptation + - {item_id: clm_1, family: climate, scale: 5, + de: "Der Klimawandel macht traditionelle Ostseefischerei unmöglich.", + en: "Climate change makes traditional Baltic fisheries impossible.", + reverse_coded: true} + - {item_id: clm_2, family: climate, scale: 5, + de: "Aquakultur ist die Zukunft der deutschen Fischwirtschaft.", + en: "Aquaculture is the future of the German fishing industry."} + - {item_id: clm_3, family: climate, scale: 5, + de: "Die Fischerei muss sich grundlegend an neue Arten anpassen.", + en: "Fisheries must fundamentally adapt to new species."} +``` + +- [ ] **Step 4: Implement subagent** + +`backend/app/services/interviews/longitudinal.py`: +```python +from __future__ import annotations +import json +import math +from pathlib import Path +from typing import Optional +from app.models.interview import ( + LikertInstrument, LikertResponse, InterviewPhase, +) +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.instrument_loader import load_likert_instrument + +class LongitudinalSubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + self.instrument: LikertInstrument = load_likert_instrument(Path(instrument_path)) + self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language) + self.language = language + + def _schema_hint(self) 
-> str: + ids = [i.item_id for i in self.instrument.items] + return json.dumps({ + "responses": {k: "" for k in ids}, + "confidence": {k: "" for k in ids}, + "open_comment": "", + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + lines = ["Bitte bewerten Sie die folgenden Aussagen auf einer Skala von 1 (lehne stark ab) bis 5 (stimme stark zu)." if self.language == "de" + else "Please rate the following statements on a scale from 1 (strongly disagree) to 5 (strongly agree)."] + for it in self.instrument.items: + txt = it.de if self.language == "de" else it.en + lines.append(f"- [{it.item_id}] {txt}") + return "\n".join(lines) + + def _validator(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + resp = raw.get("responses") + if not isinstance(resp, dict): return None + required = {it.item_id for it in self.instrument.items} + if not required.issubset(resp.keys()): return None + for k, v in resp.items(): + if not isinstance(v, int) or not 1 <= v <= 5: return None + return raw + + def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse: + raw = self.interviewer.ask_in_character( + persona, + user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), + validate=self._validator, + ) + return LikertResponse( + agent_id=persona.agent_id, + phase=phase, + responses={k: int(v) for k, v in raw["responses"].items()}, + confidence={k: float(v) for k, v in raw.get("confidence", {}).items()}, + open_comment=raw.get("open_comment"), + ) + +def run_aggregate(t0: list[LikertResponse], t1: list[LikertResponse]) -> dict: + by_t0 = {r.agent_id: r for r in t0} + by_t1 = {r.agent_id: r for r in t1} + paired = sorted(set(by_t0) & set(by_t1)) + items: set[str] = set() + for r in t0 + t1: + items.update(r.responses.keys()) + per_item: dict[str, dict] = {} + for it in sorted(items): + deltas = [] + for aid in paired: + v0 = by_t0[aid].responses.get(it) + v1 = by_t1[aid].responses.get(it) + if v0 is None or v1 is 
None: continue + deltas.append(v1 - v0) + if not deltas: + per_item[it] = {"mean_delta": None, "n": 0} + continue + m = sum(deltas) / len(deltas) + var = sum((d - m) ** 2 for d in deltas) / max(len(deltas) - 1, 1) + per_item[it] = { + "mean_delta": m, + "sd_delta": math.sqrt(var), + "n": len(deltas), + "n_positive": sum(1 for d in deltas if d > 0), + "n_negative": sum(1 for d in deltas if d < 0), + } + per_agent: dict[int, dict] = {} + for aid in paired: + r0 = by_t0[aid].responses + r1 = by_t1[aid].responses + common = set(r0) & set(r1) + total = sum(abs(r1[k] - r0[k]) for k in common) + per_agent[aid] = {"total_abs_drift": total, "n_items": len(common)} + return { + "n_paired": len(paired), + "n_t0_only": len(set(by_t0) - set(by_t1)), + "n_t1_only": len(set(by_t1) - set(by_t0)), + "per_item": per_item, + "per_agent": per_agent, + } +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_longitudinal.py -v` +Expected: 2 passed. + +- [ ] **Step 6: Commit** + +```bash +git add backend/scripts/instruments/longitudinal_v1.yaml backend/app/services/interviews/longitudinal.py backend/tests/interviews/test_longitudinal.py +git commit -m "feat(interviews): longitudinal subagent + 12-item Likert instrument" +``` + +--- + +### Task 7: Diversity subagent + Q-sort instrument + +**Files:** +- Create: `backend/scripts/instruments/diversity_v1.yaml` +- Create: `backend/app/services/interviews/diversity.py` +- Test: `backend/tests/interviews/test_diversity.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_diversity.py +from pathlib import Path +import numpy as np +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.diversity import ( + DiversitySubagent, run_typology, +) + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _CannedLLM: + def chat_json(self, messages, 
temperature=0.0, max_tokens=None, **kw): + # Place all 24 statements into legal buckets per the forced distribution + placements = {} + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + for i in range(24): + placements[f"st_{i+1:02d}"] = buckets[i] + return { + "placements": placements, + "likert_axes": {"ax_pres_extr": 5, "ax_loc_eu": 3, "ax_sci_trad": 4, + "ax_ind_col": 4, "ax_short_long": 5, "ax_mkt_reg": 3}, + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + +def test_diversity_administer(): + sub = DiversitySubagent(llm=_CannedLLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert len(resp.placements) == 24 + assert set(resp.likert_axes.keys()) == { + "ax_pres_extr","ax_loc_eu","ax_sci_trad","ax_ind_col","ax_short_long","ax_mkt_reg" + } + +def test_typology_runs_pca_kmeans(): + from app.models.interview import QSortResponse + rng = np.random.default_rng(42) + responses = [] + for aid in range(20): + placements = {f"st_{i+1:02d}": int(rng.integers(-3, 4)) for i in range(24)} + axes = {f"ax_{j}": int(rng.integers(1, 8)) for j in range(6)} + responses.append(QSortResponse(agent_id=aid, placements=placements, likert_axes=axes)) + result = run_typology(responses, n_clusters=3) + assert "clusters" in result + assert len(result["clusters"]) == 3 + assert "pca" in result + assert len(result["pca"]["components"]) >= 2 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_diversity.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Create instrument YAML** + +`backend/scripts/instruments/diversity_v1.yaml`: +```yaml +name: diversity_v1 +version: "1.0" +language_default: de +distribution: [2, 3, 4, 6, 4, 3, 2] # buckets from -3 to +3, total 24 +statements: + - {statement_id: st_01, de: "Die Ostsee gehört den Fischern, die hier seit Generationen leben.", en: "The Baltic belongs to fishers who have lived here for generations."} + - {statement_id: st_02, de: "MSC-Zertifizierung schützt vor allem große Konzerne.", en: "MSC certification mainly protects large corporations."} + - {statement_id: st_03, de: "Wissenschaftliche Quoten sind die einzige Grundlage für Politik.", en: "Scientific quotas are the only legitimate basis for policy."} + - {statement_id: st_04, de: "Aquakultur kann Ostseefischerei ersetzen.", en: "Aquaculture can replace Baltic fisheries."} + - {statement_id: st_05, de: "Sportfischer schaden den Beständen mehr als die Berufsfischer.", en: "Recreational anglers harm stocks more than commercial fishers."} + - {statement_id: st_06, de: "Die EU-Fischereipolitik kennt die Ostsee nicht.", en: "EU fisheries policy doesn't understand the Baltic."} + - {statement_id: st_07, de: "Großtechnische Fischerei ist effizienter und damit nachhaltiger.", en: "Industrial fisheries are more efficient and therefore more sustainable."} + - {statement_id: st_08, de: "Wer Fisch isst, sollte mehr dafür bezahlen.", en: "Those who eat fish should pay more for it."} + - {statement_id: st_09, de: "Die Kleinfischerei muss subventioniert werden.", en: "Small-scale fisheries must be subsidised."} + - {statement_id: st_10, de: "Marine Schutzgebiete sind reine Symbolpolitik.", en: "Marine protected areas are mere symbolism."} + - {statement_id: st_11, de: "Russlands Krieg ändert alles in der Ostsee.", en: "Russia's war changes everything in the Baltic."} + - {statement_id: st_12, de: "Nur drastische Reduktion der Fangmengen rettet die Bestände.", en: "Only drastic catch reductions will save the 
stocks."} + - {statement_id: st_13, de: "NGOs übertreiben die Krise systematisch.", en: "NGOs systematically exaggerate the crisis."} + - {statement_id: st_14, de: "Klimawandel ist das eigentliche Problem, nicht die Fischerei.", en: "Climate change is the real problem, not fisheries."} + - {statement_id: st_15, de: "Tradition zählt mehr als kurzfristige Bestandszahlen.", en: "Tradition matters more than short-term stock numbers."} + - {statement_id: st_16, de: "Verbraucher entscheiden über die Zukunft des Fisches.", en: "Consumers decide the future of fish."} + - {statement_id: st_17, de: "Ohne Generalstreik der Fischer ändert sich nichts.", en: "Without a fishers' general strike, nothing will change."} + - {statement_id: st_18, de: "Die Bundesregierung sollte Kutter aufkaufen und stilllegen.", en: "The federal government should buy out and decommission boats."} + - {statement_id: st_19, de: "Die Dorschkrise ist Folge gescheiterter Politik.", en: "The cod crisis is the result of policy failure."} + - {statement_id: st_20, de: "Ostsee-Aquakultur ist ökologisch problematisch.", en: "Baltic aquaculture is ecologically problematic."} + - {statement_id: st_21, de: "Junge Menschen werden keinen Fischereibetrieb mehr übernehmen.", en: "Young people will no longer take over fishing businesses."} + - {statement_id: st_22, de: "Markt regelt sich selbst, auch beim Fisch.", en: "The market regulates itself, also for fish."} + - {statement_id: st_23, de: "Lokale Genossenschaften sind die Lösung.", en: "Local cooperatives are the solution."} + - {statement_id: st_24, de: "In 20 Jahren gibt es keine deutsche Ostseefischerei mehr.", en: "In 20 years there will be no German Baltic fisheries left."} +likert_axes: + - {axis_id: ax_pres_extr, scale: 7, de: "Bewahrung (1) vs. Nutzung (7)", en: "Preservation (1) vs. Extraction (7)"} + - {axis_id: ax_loc_eu, scale: 7, de: "Lokal (1) vs. EU-zentral (7)", en: "Local (1) vs. 
EU-central (7)"} + - {axis_id: ax_sci_trad, scale: 7, de: "Wissenschaft (1) vs. Tradition (7)", en: "Science-led (1) vs. Tradition-led (7)"} + - {axis_id: ax_ind_col, scale: 7, de: "Individuum (1) vs. Kollektiv (7)", en: "Individual (1) vs. Collective (7)"} + - {axis_id: ax_short_long,scale: 7, de: "Kurzfristig (1) vs. Langfristig (7)", en: "Short-term (1) vs. Long-term (7)"} + - {axis_id: ax_mkt_reg, scale: 7, de: "Markt (1) vs. Regulierung (7)", en: "Market (1) vs. Regulation (7)"} +``` + +- [ ] **Step 4: Implement subagent** + +`backend/app/services/interviews/diversity.py`: +```python +from __future__ import annotations +import json +from pathlib import Path +from typing import Optional +import numpy as np +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +import yaml +from app.models.interview import QSortResponse +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.instrument_loader import InstrumentValidationError + +class DiversitySubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + self.instrument = self._load(Path(instrument_path)) + self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language) + self.language = language + + def _load(self, path: Path) -> dict: + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "statements" not in data or "distribution" not in data: + raise InstrumentValidationError(f"invalid diversity instrument: {path}") + if sum(data["distribution"]) != len(data["statements"]): + raise InstrumentValidationError("distribution sum must equal number of statements") + return data + + def _schema_hint(self) -> str: + return json.dumps({ + "placements": {s["statement_id"]: "" for s in self.instrument["statements"]}, + "likert_axes": {a["axis_id"]: "" for a in self.instrument["likert_axes"]}, + }, ensure_ascii=False) + + def _user_prompt(self) 
-> str: + dist = self.instrument["distribution"] + buckets = list(range(-3, 4)) + bucket_desc = ", ".join(f"{b}:{n}" for b, n in zip(buckets, dist)) + lines = [ + ("Ordnen Sie jede Aussage genau einer Box von -3 (lehne stark ab) bis +3 (stimme stark zu) zu. " + f"Die Verteilung ist erzwungen: {bucket_desc}.") if self.language == "de" else + ("Place every statement into exactly one box from -3 (strongly disagree) to +3 (strongly agree). " + f"The distribution is forced: {bucket_desc}."), + "", + "Statements:", + ] + for s in self.instrument["statements"]: + txt = s["de"] if self.language == "de" else s["en"] + lines.append(f"- [{s['statement_id']}] {txt}") + lines += ["", "Then rate each axis from 1 to 7:"] + for a in self.instrument["likert_axes"]: + txt = a["de"] if self.language == "de" else a["en"] + lines.append(f"- [{a['axis_id']}] {txt}") + return "\n".join(lines) + + def _validator(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + placements = raw.get("placements", {}) + axes = raw.get("likert_axes", {}) + statements = {s["statement_id"] for s in self.instrument["statements"]} + if set(placements.keys()) != statements: return None + dist = self.instrument["distribution"] + target = {b: n for b, n in zip(range(-3, 4), dist)} + got: dict[int, int] = {} + for v in placements.values(): + if not isinstance(v, int) or not -3 <= v <= 3: return None + got[v] = got.get(v, 0) + 1 + if got != target: return None + for a in self.instrument["likert_axes"]: + v = axes.get(a["axis_id"]) + if not isinstance(v, int) or not 1 <= v <= 7: return None + return raw + + def administer(self, persona: PersonaRecord) -> QSortResponse: + raw = self.interviewer.ask_in_character( + persona, + user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), + validate=self._validator, + ) + return QSortResponse( + agent_id=persona.agent_id, + placements={k: int(v) for k, v in raw["placements"].items()}, + likert_axes={k: int(v) for k, v in 
raw["likert_axes"].items()}, + ) + +def _vectorize(r: QSortResponse, statements: list[str], axes: list[str]) -> np.ndarray: + return np.array( + [r.placements.get(s, 0) for s in statements] + + [r.likert_axes.get(a, 4) for a in axes], + dtype=float, + ) + +def run_typology(responses: list[QSortResponse], n_clusters: int = 4) -> dict: + if not responses: + return {"n": 0, "clusters": [], "pca": {"components": [], "explained_variance": []}} + statements = sorted({k for r in responses for k in r.placements}) + axes = sorted({k for r in responses for k in r.likert_axes}) + X = np.vstack([_vectorize(r, statements, axes) for r in responses]) + n_clusters = min(n_clusters, len(responses)) + pca = PCA(n_components=min(5, X.shape[1], X.shape[0])) + pcs = pca.fit_transform(X) + km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0) + labels = km.fit_predict(X) + clusters = [] + for c in range(n_clusters): + members = [responses[i].agent_id for i in range(len(responses)) if labels[i] == c] + centroid = km.cluster_centers_[c] + clusters.append({ + "cluster_id": int(c), + "n": len(members), + "agent_ids": members, + "top_loadings": { + statements[i] if i < len(statements) else axes[i - len(statements)]: float(centroid[i]) + for i in np.argsort(np.abs(centroid))[::-1][:8].tolist() + }, + }) + return { + "n": len(responses), + "clusters": clusters, + "pca": { + "components": pcs.tolist(), + "explained_variance": pca.explained_variance_ratio_.tolist(), + "agent_ids": [r.agent_id for r in responses], + }, + } +``` + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_diversity.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 6: Commit** + +```bash +git add backend/scripts/instruments/diversity_v1.yaml backend/app/services/interviews/diversity.py backend/tests/interviews/test_diversity.py +git commit -m "feat(interviews): diversity subagent with Q-sort + 6 Likert axes + PCA/k-means typology" +``` + +--- + +### Task 8: Delphi subagent (three rounds) + +**Files:** +- Create: `backend/scripts/instruments/delphi_v1.yaml` +- Create: `backend/app/services/interviews/delphi.py` +- Test: `backend/tests/interviews/test_delphi.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_delphi.py +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, +) + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _R1LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"answers": { + "q1": "Klimawandel, Quoten, Generationswechsel", + "q2": "MSC, Aquakultur", + "q3": "Russland, EU-Politik", + "q4": "Verbraucherpreise", + }} + +class _R2LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {f"theme_{i}": {"importance": 4, "plausibility": 3} for i in range(5)}} + +class _ExtractLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"themes": [ + {"theme_id": "theme_0", "label": "Klimawandel"}, + {"theme_id": "theme_1", "label": "Quoten"}, + {"theme_id": "theme_2", "label": "MSC"}, + {"theme_id": "theme_3", "label": "EU-Politik"}, + {"theme_id": "theme_4", "label": "Generationswechsel"}, + ]} + +def test_delphi_round1_open(): + sub = DelphiSubagent(llm=_R1LLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=2, 
name="A", persona="p") + resp = sub.administer_round1(persona) + assert resp.round == 1 + assert len(resp.answers) == 4 + +def test_extract_themes_aggregates(): + from app.models.interview import DelphiOpenResponse + r1 = [DelphiOpenResponse(agent_id=i, answers={"q1": "Klimawandel", "q2": "MSC"}) for i in range(3)] + themes = extract_themes(r1, llm=_ExtractLLM()) + assert len(themes) == 5 + assert all("theme_id" in t for t in themes) + +def test_convergence_metrics(): + from app.models.interview import DelphiRatingResponse + r2 = [DelphiRatingResponse(agent_id=i, round=2, + ratings={"t1": {"importance": 3, "plausibility": 3}}) for i in range(5)] + r3 = [DelphiRatingResponse(agent_id=i, round=3, + ratings={"t1": {"importance": 4, "plausibility": 4}}) for i in range(5)] + conv = convergence_metrics(r2, r3) + assert "t1" in conv + assert conv["t1"]["delta_iqr_importance"] is not None +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_delphi.py -v` +Expected: ImportError. 
class DelphiSubagent:
    """Deterministic three-round Delphi instrument for one stakeholder persona.

    Round 1 asks the instrument's open questions, round 2 has the agent rate
    the extracted themes on importance and plausibility (integers 1-5), and
    round 3 lets the agent revise those ratings after seeing group statistics.
    All LLM traffic goes through ``StakeholderInterviewer.ask_in_character``.
    """

    # Every theme is rated on exactly these dimensions, each an integer 1-5.
    _RATING_KEYS = ("importance", "plausibility")
    _RATING_MIN = 1
    _RATING_MAX = 5

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the YAML instrument and build the in-character interviewer."""
        with Path(instrument_path).open("r", encoding="utf-8") as f:
            self.instrument = yaml.safe_load(f)
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.llm = llm
        self.language = language

    # --- Round 1: open questions ---
    def _r1_schema(self) -> str:
        """Return the JSON skeleton (one empty answer per question) shown to the LLM."""
        return json.dumps({
            "answers": {q["question_id"]: "" for q in self.instrument["questions"]}
        }, ensure_ascii=False)

    def _r1_prompt(self) -> str:
        """Return the round-1 prompt: a heading plus one ``[id] text`` line per question."""
        german = self.language == "de"
        lines = ["Bitte beantworten Sie offen:" if german else "Please answer openly:"]
        for q in self.instrument["questions"]:
            txt = q["de"] if german else q["en"]
            lines.append(f"[{q['question_id']}] {txt}")
        return "\n".join(lines)

    def _r1_validate(self, raw: dict) -> Optional[dict]:
        """Return ``raw`` if it answers every instrument question, else ``None``."""
        if not isinstance(raw, dict):
            return None
        ans = raw.get("answers")
        if not isinstance(ans, dict):
            return None
        required = {q["question_id"] for q in self.instrument["questions"]}
        if not required.issubset(ans.keys()):
            return None
        return raw

    def administer_round1(self, persona: PersonaRecord) -> DelphiOpenResponse:
        """Ask the open questions in character and wrap the answers as strings."""
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._r1_prompt(),
            schema_hint=self._r1_schema(), validate=self._r1_validate,
        )
        return DelphiOpenResponse(agent_id=persona.agent_id, round=1,
                                  answers={k: str(v) for k, v in raw["answers"].items()})

    # --- Round 2: rate themes ---
    def _r2_schema(self, theme_ids: list[str]) -> str:
        """Return the JSON skeleton with one empty rating pair per theme."""
        return json.dumps({
            "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids}
        }, ensure_ascii=False)

    def _r2_prompt(self, themes: list[dict]) -> str:
        """Return the round-2 prompt listing every theme as ``- [id] label``."""
        head = "Bewerten Sie jedes Thema nach Wichtigkeit (1-5) und Plausibilität (1-5):" if self.language == "de" \
            else "Rate each theme on importance (1-5) and plausibility (1-5):"
        body = [f"- [{t['theme_id']}] {t['label']}" for t in themes]
        return head + "\n" + "\n".join(body)

    @classmethod
    def _ratings_valid(cls, raw, theme_ids) -> bool:
        """Check that ``raw['ratings']`` rates exactly ``theme_ids`` with in-range ints.

        Shared by rounds 2 and 3. Malformed shapes (non-dict payload, non-dict
        ``ratings``, non-dict entry) are reported as invalid instead of raising,
        in line with ``_r1_validate``.
        """
        if not isinstance(raw, dict):
            return False
        ratings = raw.get("ratings", {})
        if not isinstance(ratings, dict) or set(ratings) != set(theme_ids):
            return False
        for rating in ratings.values():
            if not isinstance(rating, dict):
                return False
            for key in cls._RATING_KEYS:
                value = rating.get(key)
                # bool is a subclass of int; a JSON `true` must not count as 1.
                if isinstance(value, bool) or not isinstance(value, int):
                    return False
                if not cls._RATING_MIN <= value <= cls._RATING_MAX:
                    return False
        return True

    def _r2_validate(self, theme_ids: list[str]):
        """Return a validator that yields the payload if valid, else ``None``."""
        def v(raw: dict) -> Optional[dict]:
            return raw if self._ratings_valid(raw, theme_ids) else None
        return v

    def administer_round2(self, persona: PersonaRecord, themes: list[dict]) -> DelphiRatingResponse:
        """Have the persona rate every theme on importance and plausibility."""
        theme_ids = [t["theme_id"] for t in themes]
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._r2_prompt(themes),
            schema_hint=self._r2_schema(theme_ids), validate=self._r2_validate(theme_ids),
        )
        return DelphiRatingResponse(agent_id=persona.agent_id, round=2,
                                    ratings={k: dict(v) for k, v in raw["ratings"].items()})

    # --- Round 3: revise after seeing group stats ---
    def administer_round3(
        self, persona: PersonaRecord, themes: list[dict], group_stats: dict, own_r2: DelphiRatingResponse
    ) -> DelphiRatingResponse:
        """Show group medians/IQRs next to the persona's own R2 ratings and collect revisions.

        ``group_stats`` is looked up per theme id with the keys ``imp_median``,
        ``imp_iqr``, ``plaus_median`` and ``plaus_iqr``; missing entries render
        as ``None`` in the prompt.
        """
        theme_ids = [t["theme_id"] for t in themes]
        head = ("Sie sehen unten die anonymisierten Gruppenwerte (Median, IQR). "
                "Bitte überarbeiten Sie Ihre Bewertungen, wenn Sie möchten, und begründen Sie kurz.") \
            if self.language == "de" else \
            ("Below are the anonymised group values (median, IQR). "
             "Please revise your ratings if you wish and add a short justification.")
        ctx_lines = []
        for t in themes:
            tid = t["theme_id"]
            gs = group_stats.get(tid, {})
            own = own_r2.ratings.get(tid, {})
            ctx_lines.append(
                f"[{tid}] {t['label']} — group importance median={gs.get('imp_median')}, "
                f"IQR={gs.get('imp_iqr')}; plausibility median={gs.get('plaus_median')}, "
                f"IQR={gs.get('plaus_iqr')}. Your R2: imp={own.get('importance')}, plaus={own.get('plausibility')}."
            )
        prompt = head + "\n\n" + "\n".join(ctx_lines)
        schema = json.dumps({
            "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids},
            "justification": "",
        }, ensure_ascii=False)
        # Round 3 ratings obey the same rules as round 2, so reuse that validator.
        raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
                                                schema_hint=schema,
                                                validate=self._r2_validate(theme_ids))
        return DelphiRatingResponse(
            agent_id=persona.agent_id, round=3,
            ratings={k: dict(v) for k, v in raw["ratings"].items()},
            justification=raw.get("justification"),
        )


def extract_themes(round1: list[DelphiOpenResponse], llm, max_themes: int = 12) -> list[dict]:
    """Ask the LLM to code the round-1 answers into at most ``max_themes`` themes.

    Returns a list of ``{"theme_id": str, "label": str}`` dicts. Theme ids are
    normalised to unique strings: rounds 2 and 3 key their JSON ratings by
    theme id, so a duplicate or non-string id could never pass validation.
    Returns ``[]`` without calling the LLM when there is nothing to code.
    """
    text_blocks = []
    for r in round1:
        for qid, ans in r.answers.items():
            text_blocks.append(f"[agent {r.agent_id} {qid}] {ans}")
    if not text_blocks:
        return []
    schema = json.dumps({"themes": [{"theme_id": "", "label": ""}]}, ensure_ascii=False)
    messages = [
        {"role": "system", "content":
            "You extract distinct thematic codes from open-ended German fisheries survey responses. "
            f"Return JSON ONLY matching: {schema}. Use stable theme_ids of form theme_0, theme_1, …"},
        {"role": "user", "content": "Responses:\n" + "\n".join(text_blocks)
            + f"\n\nReturn up to {max_themes} distinct themes."},
    ]
    raw = llm.chat_json(messages=messages, temperature=0.0)
    candidates = raw.get("themes", []) if isinstance(raw, dict) else []
    if not isinstance(candidates, list):
        candidates = []
    out: list[dict] = []
    used: set[str] = set()
    for i, t in enumerate(candidates):
        if len(out) >= max_themes:
            break  # enforce the cap the prompt only asks for
        if not isinstance(t, dict) or "label" not in t:
            continue
        raw_id = t.get("theme_id")
        theme_id = str(raw_id).strip() if raw_id else ""
        if not theme_id or theme_id in used:
            # Missing or duplicate id: fall back to the positional id, bumping
            # the suffix until it no longer collides with one already taken.
            suffix = i
            theme_id = f"theme_{suffix}"
            while theme_id in used:
                suffix += 1
                theme_id = f"theme_{suffix}"
        used.add(theme_id)
        out.append({"theme_id": theme_id, "label": str(t["label"])})
    return out


def _iqr(xs: list[float]) -> float:
    """Return the interquartile range of ``xs`` (0.0 for an empty list).

    With fewer than four values the quartiles fall back to the minimum and
    maximum, so the result is the plain range.
    """
    if not xs:
        return 0.0
    xs = sorted(xs)
    if len(xs) < 4:
        return xs[-1] - xs[0]
    q1, _, q3 = statistics.quantiles(xs, n=4)
    return q3 - q1


def _theme_values(responses, theme: str, key: str) -> list:
    """Collect ``ratings[theme][key]`` from every response that rated ``theme``."""
    return [r.ratings[theme][key] for r in responses if theme in r.ratings]


def convergence_metrics(r2: list[DelphiRatingResponse], r3: list[DelphiRatingResponse]) -> dict:
    """Compare round-2 and round-3 spread per theme.

    Each round is de-duplicated by ``agent_id`` (the last response wins). For
    every theme seen in either round the result holds medians and IQRs for both
    rounds plus ``delta_iqr_*`` (round 3 minus round 2; negative means the
    group's ratings moved closer together).
    """
    round2 = list({r.agent_id: r for r in r2}.values())
    round3 = list({r.agent_id: r for r in r3}.values())
    themes: set[str] = set()
    for r in (*r2, *r3):
        themes.update(r.ratings.keys())
    out: dict[str, dict] = {}
    for t in sorted(themes):
        imp_r2 = _theme_values(round2, t, "importance")
        imp_r3 = _theme_values(round3, t, "importance")
        plaus_r2 = _theme_values(round2, t, "plausibility")
        plaus_r3 = _theme_values(round3, t, "plausibility")
        # Compute each IQR once; it feeds both the per-round value and the delta.
        imp_iqr_r2, imp_iqr_r3 = _iqr(imp_r2), _iqr(imp_r3)
        plaus_iqr_r2, plaus_iqr_r3 = _iqr(plaus_r2), _iqr(plaus_r3)
        out[t] = {
            "imp_median_r2": statistics.median(imp_r2) if imp_r2 else None,
            "imp_median_r3": statistics.median(imp_r3) if imp_r3 else None,
            "imp_iqr_r2": imp_iqr_r2,
            "imp_iqr_r3": imp_iqr_r3,
            "delta_iqr_importance": imp_iqr_r3 - imp_iqr_r2,
            "plaus_median_r2": statistics.median(plaus_r2) if plaus_r2 else None,
            "plaus_median_r3": statistics.median(plaus_r3) if plaus_r3 else None,
            "plaus_iqr_r2": plaus_iqr_r2,
            "plaus_iqr_r3": plaus_iqr_r3,
            "delta_iqr_plausibility": plaus_iqr_r3 - plaus_iqr_r2,
        }
    return out


def group_stats_from_r2(r2: list[DelphiRatingResponse]) -> dict:
    """Return per-theme median and IQR of the round-2 ratings.

    The keys (``imp_median``, ``imp_iqr``, ``plaus_median``, ``plaus_iqr``) are
    the ones ``DelphiSubagent.administer_round3`` reads when building its prompt.
    """
    themes: set[str] = set()
    for r in r2:
        themes.update(r.ratings.keys())
    stats: dict[str, dict] = {}
    for t in sorted(themes):  # sorted for a reproducible key order
        imps = _theme_values(r2, t, "importance")
        plauss = _theme_values(r2, t, "plausibility")
        stats[t] = {
            "imp_median": statistics.median(imps) if imps else None,
            "imp_iqr": _iqr(imps),
            "plaus_median": statistics.median(plauss) if plauss else None,
            "plaus_iqr": _iqr(plauss),
        }
    return stats
def test_polarity_matrix():
    """Three identical S1 ratings aggregate to n=3 with mean desirability 5."""
    from app.models.interview import ScenarioResponse, ScenarioRating

    responses = []
    for agent_id in range(3):
        rating = ScenarioRating(
            desirability=5, plausibility=4, impact_on_my_group=5, fairness=4,
            if_woke_up_response="x",
        )
        responses.append(ScenarioResponse(agent_id=agent_id, ratings={"S1": rating}))

    matrix = polarity_matrix(responses)
    assert "S1" in matrix
    s1 = matrix["S1"]
    assert s1["mean_desirability"] == 5
    assert s1["n"] == 3
class ScenarioSubagent:
    """Deterministic scenario-rating instrument for one stakeholder persona.

    The persona rates every scenario in the instrument on four dimensions
    (integers 1-7) and gives a short free-text reaction per scenario.
    """

    # Numeric dimensions every scenario rating must carry, each an integer 1-7.
    _DIMENSIONS = ("desirability", "plausibility", "impact_on_my_group", "fairness")
    _SCALE_MIN = 1
    _SCALE_MAX = 7

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the YAML instrument and build the in-character interviewer."""
        with Path(instrument_path).open("r", encoding="utf-8") as f:
            self.instrument = yaml.safe_load(f)
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.language = language

    def _schema_hint(self) -> str:
        """Return the JSON skeleton with one empty rating record per scenario."""
        sids = [s["scenario_id"] for s in self.instrument["scenarios"]]
        return json.dumps({
            "ratings": {sid: {
                "desirability": "",
                "plausibility": "",
                "impact_on_my_group": "",
                "fairness": "",
                "if_woke_up_response": "",
            } for sid in sids}
        }, ensure_ascii=False)

    def _user_prompt(self) -> str:
        """Return the instructions followed by one labelled block per scenario."""
        german = self.language == "de"
        head = ("Bewerten Sie jedes der folgenden Szenarien auf vier Dimensionen (1-7) "
                "und beantworten Sie kurz, was Sie tun würden, wenn Sie in dieser Welt aufwachten.") \
            if german else \
            ("Rate each of the following scenarios on four dimensions (1-7) "
             "and briefly answer what you would do if you woke up in this world.")
        blocks = []
        for s in self.instrument["scenarios"]:
            label = s["label_de"] if german else s["label_en"]
            desc = s["description_de"] if german else s["description_en"]
            blocks.append(f"--- {s['scenario_id']}: {label} ---\n{desc}")
        return head + "\n\n" + "\n\n".join(blocks)

    def _validate(self, raw: dict) -> Optional[dict]:
        """Return ``raw`` if it rates exactly the instrument's scenarios, else ``None``.

        Malformed shapes (non-dict payload, non-dict ``ratings``, non-dict
        entry) are reported as invalid rather than raising, so the caller only
        ever sees "valid payload" or ``None``.
        """
        if not isinstance(raw, dict):
            return None
        sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
        ratings = raw.get("ratings", {})
        if not isinstance(ratings, dict) or set(ratings) != sids:
            return None
        for v in ratings.values():
            if not isinstance(v, dict):
                return None
            for k in self._DIMENSIONS:
                score = v.get(k)
                # bool is a subclass of int; a JSON `true` must not count as 1.
                if isinstance(score, bool) or not isinstance(score, int):
                    return None
                if not self._SCALE_MIN <= score <= self._SCALE_MAX:
                    return None
            if not isinstance(v.get("if_woke_up_response", ""), str):
                return None
        return raw

    def administer(self, persona: PersonaRecord) -> ScenarioResponse:
        """Ask the persona to rate all scenarios and wrap the validated ratings."""
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._user_prompt(),
            schema_hint=self._schema_hint(), validate=self._validate,
        )
        ratings = {sid: ScenarioRating(**v) for sid, v in raw["ratings"].items()}
        return ScenarioResponse(agent_id=persona.agent_id, ratings=ratings)


def polarity_matrix(responses: list[ScenarioResponse]) -> dict:
    """Aggregate scenario ratings into per-scenario means and population SDs.

    Returns ``{scenario_id: {"n", "mean_*", "sd_*"}}`` for every scenario rated
    by at least one response; an empty input gives ``{}``.
    """
    by_scenario: dict[str, list] = {}
    for r in responses:
        for sid, rating in r.ratings.items():
            by_scenario.setdefault(sid, []).append(rating)

    def column(vals, attr):
        return [getattr(v, attr) for v in vals]

    matrix: dict[str, dict] = {}
    # Every key in by_scenario has at least one rating, so no empty-group case.
    for sid in sorted(by_scenario):
        vals = by_scenario[sid]
        des = column(vals, "desirability")
        plaus = column(vals, "plausibility")
        impact = column(vals, "impact_on_my_group")
        fair = column(vals, "fairness")
        matrix[sid] = {
            "n": len(vals),
            "mean_desirability": statistics.mean(des),
            "mean_plausibility": statistics.mean(plaus),
            "mean_impact": statistics.mean(impact),
            "mean_fairness": statistics.mean(fair),
            # pstdev of a single value is 0.0; the guard just keeps that explicit.
            "sd_desirability": statistics.pstdev(des) if len(vals) > 1 else 0.0,
            "sd_plausibility": statistics.pstdev(plaus) if len(vals) > 1 else 0.0,
            "sd_impact": statistics.pstdev(impact) if len(vals) > 1 else 0.0,
            "sd_fairness": statistics.pstdev(fair) if len(vals) > 1 else 0.0,
        }
    return matrix
def test_run_directory_layout(tmp_path):
    """A run directory is created at ``.../<phase>/<subagent>/<run_id>``."""
    store = InterviewStore(root=tmp_path, sim_id="sim42")
    run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL)
    subagent_dir = run_dir.parent
    assert run_dir.exists()
    assert subagent_dir.name == "longitudinal"
    assert subagent_dir.parent.name == "T0"


def test_append_response(tmp_path):
    """An appended response is written as the first JSONL record."""
    store = InterviewStore(root=tmp_path, sim_id="sim42")
    run_dir = store.start_run(phase=InterviewPhase.T0, subagent=SubagentKind.LONGITUDINAL)
    response = LikertResponse(agent_id=1, phase=InterviewPhase.T0,
                              responses={"a": 3}, confidence={"a": 0.5})
    store.append_response(run_dir, response)
    first_line = (run_dir / "responses.jsonl").read_text().splitlines()[0]
    record = json.loads(first_line)
    assert record["agent_id"] == 1


def test_write_aggregate_and_latest_pointer(tmp_path):
    """``mark_latest`` writes a pointer whose ``run_dir`` ends with the run id."""
    store = InterviewStore(root=tmp_path, sim_id="sim42")
    run_dir = store.start_run(phase=InterviewPhase.T1, subagent=SubagentKind.SCENARIO)
    store.write_aggregate(run_dir, {"k": 1})
    store.mark_latest(run_dir)
    pointer = json.loads((run_dir.parent / "latest.json").read_text())
    assert pointer["run_dir"].endswith(run_dir.name)
class InterviewStore:
    """JSON/JSONL persistence for the interview runs of one simulation.

    Everything lives under ``<root>/simulations/<sim_id>/interviews``; each run
    gets its own ``<phase>/<subagent>/<run_id>`` directory, and a
    ``latest.json`` next to the run directories points at the current one.
    """

    def __init__(self, root: Path, sim_id: str):
        """Resolve and create the simulation's ``interviews`` directory."""
        interviews_dir = Path(root) / "simulations" / sim_id / "interviews"
        interviews_dir.mkdir(parents=True, exist_ok=True)
        self.base = interviews_dir

    @staticmethod
    def _append_line(path: Path, line: str) -> None:
        """Append ``line`` plus a newline to the UTF-8 file at ``path``."""
        with path.open("a", encoding="utf-8") as handle:
            handle.write(line + "\n")

    def start_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path:
        """Create a fresh run directory, write its ``run.json`` and return the path."""
        stamp = time.strftime("%Y%m%dT%H%M%S")
        # Timestamp plus a short random suffix so same-second runs do not collide.
        run_id = f"{stamp}-{uuid.uuid4().hex[:6]}"
        run_dir = self.base.joinpath(phase.value, subagent.value, run_id)
        run_dir.mkdir(parents=True, exist_ok=True)
        meta = {
            "run_id": run_id,
            "phase": phase.value,
            "subagent": subagent.value,
            "created_at": time.time(),
        }
        run_dir.joinpath("run.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
        return run_dir

    def append_response(self, run_dir: Path, model: BaseModel) -> None:
        """Append one model as a JSON line to ``responses.jsonl``."""
        self._append_line(run_dir / "responses.jsonl", model.model_dump_json())

    def append_jsonl(self, run_dir: Path, filename: str, payload: dict | BaseModel) -> None:
        """Append a model or plain dict as one JSON line to ``filename``."""
        if isinstance(payload, BaseModel):
            line = payload.model_dump_json()
        else:
            line = json.dumps(payload, ensure_ascii=False)
        self._append_line(run_dir / filename, line)

    def read_responses(self, run_dir: Path, filename: str = "responses.jsonl") -> list[dict]:
        """Return the parsed JSON lines of ``filename`` (``[]`` if it is missing)."""
        path = run_dir / filename
        if not path.exists():
            return []
        records = []
        for line in path.read_text(encoding="utf-8").splitlines():
            if line.strip():  # blank lines are skipped
                records.append(json.loads(line))
        return records

    def write_aggregate(self, run_dir: Path, payload: dict) -> None:
        """Write ``payload`` as pretty-printed JSON to ``aggregate.json``."""
        text = json.dumps(payload, ensure_ascii=False, indent=2)
        (run_dir / "aggregate.json").write_text(text, encoding="utf-8")

    def write_named(self, run_dir: Path, name: str, payload: Any) -> None:
        """Write ``payload`` as pretty-printed JSON to the file ``name``."""
        text = json.dumps(payload, ensure_ascii=False, indent=2)
        (run_dir / name).write_text(text, encoding="utf-8")

    def audit(self, run_dir: Path, agent_id: int | None, event: str, detail: str = "") -> None:
        """Append a timestamped event record to the run's ``audit.jsonl``."""
        record = {"ts": time.time(), "agent_id": agent_id, "event": event, "detail": detail}
        self._append_line(run_dir / "audit.jsonl", json.dumps(record, ensure_ascii=False))

    def mark_latest(self, run_dir: Path) -> None:
        """Point the sibling ``latest.json`` at ``run_dir`` (stored relative to the base)."""
        relative = str(run_dir.relative_to(self.base))
        (run_dir.parent / "latest.json").write_text(
            json.dumps({"run_dir": relative}), encoding="utf-8")

    def latest_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path | None:
        """Return the run directory named by ``latest.json``, or ``None``.

        ``None`` is returned both when no pointer exists and when the
        directory it names is gone.
        """
        pointer = self.base / phase.value / subagent.value / "latest.json"
        if not pointer.exists():
            return None
        target = self.base / json.loads(pointer.read_text())["run_dir"]
        if target.exists():
            return target
        return None
class _FakeMemoryUpdater:
    """Test double that records everything the writer sends to the updater."""

    def __init__(self):
        self.events = []

    def add_activity(self, activity):
        self.events.append(activity)

    def add_text_episode(self, graph_id, text):
        self.events.append({"graph_id": graph_id, "text": text})


def test_per_agent_episode_text():
    """The per-agent episode mentions the agent name and ``<subagent>/<phase>``."""
    updater = _FakeMemoryUpdater()
    writer = InterviewZepWriter(memory_updater=updater, graph_id="g1")
    response = LikertResponse(
        agent_id=42, phase=InterviewPhase.T1,
        responses={"stk_1": 4, "gov_1": 3},
        confidence={"stk_1": 0.8, "gov_1": 0.7},
    )
    writer.write_per_agent(SubagentKind.LONGITUDINAL, response, agent_name="Fischer Müller")
    recorded = [str(event) for event in updater.events]
    assert any("Fischer Müller" in text for text in recorded)
    assert any("longitudinal/T1" in text for text in recorded)


def test_aggregate_episode():
    """The aggregate episode carries the summary text it was given."""
    updater = _FakeMemoryUpdater()
    writer = InterviewZepWriter(memory_updater=updater, graph_id="g1")
    writer.write_aggregate(SubagentKind.SCENARIO, summary="S1 mean desirability 5.2; S2 mean 2.1")
    recorded = [str(event) for event in updater.events]
    assert any("S1 mean" in text for text in recorded)
class InterviewZepWriter:
    """Turns interview responses into short text episodes for a memory updater.

    The updater is duck-typed: ``add_text_episode(graph_id, text)`` is used
    when the updater has it; otherwise ``add_activity`` receives a
    ``{"graph_id", "text"}`` dict. An updater with neither is rejected at
    emit time.
    """

    def __init__(self, memory_updater, graph_id: str):
        self.updater = memory_updater
        self.graph_id = graph_id

    def _emit(self, text: str) -> None:
        """Send ``text`` through whichever episode method the updater offers."""
        add_text = getattr(self.updater, "add_text_episode", None)
        if add_text is not None:
            add_text(self.graph_id, text)
            return
        add_activity = getattr(self.updater, "add_activity", None)
        if add_activity is not None:
            add_activity({"graph_id": self.graph_id, "text": text})
            return
        raise RuntimeError("memory_updater has neither add_text_episode nor add_activity")

    def _summarize_likert(self, r: LikertResponse) -> str:
        """Return the mean score plus the three highest- and lowest-scored items."""
        scores = r.responses
        mean_v = sum(scores.values()) / max(len(scores), 1)  # max() guards an empty response
        descending = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        ascending = sorted(scores.items(), key=lambda kv: kv[1])
        agrees = [item for item, _ in descending[:3]]
        disagrees = [item for item, _ in ascending[:3]]
        return (f"mean={mean_v:.2f}; agrees with {agrees}; "
                f"disagrees with {disagrees}")

    def _summarize_qsort(self, r: QSortResponse) -> str:
        """List the statements placed at +2 or above and at -2 or below."""
        plus, minus = [], []
        for statement, placement in r.placements.items():
            if placement >= 2:
                plus.append(statement)
            if placement <= -2:
                minus.append(statement)
        return f"+strongly:{plus}; -strongly:{minus}"

    def _summarize_scenario(self, r: ScenarioResponse) -> str:
        """Return ``<scenario>: des=… plaus=…`` for every rated scenario."""
        return "; ".join(
            f"{sid}: des={rt.desirability} plaus={rt.plausibility}"
            for sid, rt in r.ratings.items()
        )

    def write_per_agent(
        self, subagent: SubagentKind, response: Any, agent_name: str,
        phase: Optional[str] = None,
    ) -> None:
        """Emit one episode summarising ``response`` for ``agent_name``.

        When ``phase`` is not given it defaults to the Likert response's own
        phase, to ``T1/R<round>`` for Delphi ratings, and to ``T1`` otherwise.
        Unknown response types are summarised by their first 200 characters.
        """
        if isinstance(response, LikertResponse):
            if not phase:
                phase = response.phase.value
            summary = self._summarize_likert(response)
        elif isinstance(response, QSortResponse):
            summary = self._summarize_qsort(response)
        elif isinstance(response, ScenarioResponse):
            summary = self._summarize_scenario(response)
        elif isinstance(response, DelphiRatingResponse):
            if not phase:
                phase = f"T1/R{response.round}"
            summary = f"round={response.round}; {len(response.ratings)} themes rated"
        else:
            summary = str(response)[:200]
        if not phase:
            phase = "T1"
        self._emit(f"Agent {agent_name} (interview/{subagent.value}/{phase}): {summary}")

    def write_aggregate(self, subagent: SubagentKind, summary: str) -> None:
        """Emit one episode carrying a subagent-level aggregate summary."""
        text = f"Interview aggregate ({subagent.value}): {summary}"
        self._emit(text)
class _LLM:
    """Stub LLM that answers every longitudinal item with 3, and ``{}`` otherwise."""

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        sys_text = ""
        for message in messages:
            if message["role"] == "system":
                sys_text = message["content"]
                break
        last_text = messages[-1].get("content") or ""
        if "longitudinal" not in sys_text and "stk_" not in last_text:
            return {}
        item_ids = ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3",
                    "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")
        return {
            "responses": dict.fromkeys(item_ids, 3),
            "confidence": {}, "open_comment": "ok",
        }


class _Personas(PersonaProvider):
    """In-memory persona provider holding ``n`` synthetic personas."""

    def __init__(self, n=3):
        self._items = []
        for i in range(n):
            self._items.append(PersonaRecord(agent_id=i, name=f"A{i}", persona="p"))

    def all(self):
        return list(self._items)


class _NoopZep:
    """Zep writer stand-in that discards every episode."""

    def write_per_agent(self, *a, **kw):
        pass

    def write_aggregate(self, *a, **kw):
        pass


def test_pre_phase_runs_longitudinal_only(tmp_path):
    """The pre-phase interviews every persona with the longitudinal instrument only."""
    orch = InterviewOrchestrator(
        llm=_LLM(), memory=_Mem(), personas=_Personas(3),
        instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim1",
        zep_writer=_NoopZep(), max_workers=2,
    )
    result = orch.run_pre()
    assert result["longitudinal"]["n_responded"] == 3
    assert "diversity" not in result  # only longitudinal in pre-phase


def test_partial_failure_does_not_kill_run(tmp_path):
    """An LLM that fails on every second call yields a partial, non-crashing run."""
    class _FlakyLLM:
        def __init__(self):
            self.n = 0

        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            self.n += 1
            if self.n % 2 == 0:
                raise RuntimeError("simulated LLM 5xx")
            item_ids = ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3",
                        "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")
            return {
                "responses": dict.fromkeys(item_ids, 3),
                "confidence": {}, "open_comment": "ok",
            }

    orch = InterviewOrchestrator(
        llm=_FlakyLLM(), memory=_Mem(), personas=_Personas(4),
        instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim2",
        zep_writer=_NoopZep(), max_workers=1,
    )
    summary = orch.run_pre()["longitudinal"]
    assert summary["n_responded"] < 4
    assert summary["n_failed"] > 0
run pytest tests/interviews/test_orchestrator.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement orchestrator** + +`backend/app/services/interview_orchestrator.py`: +```python +from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Protocol +from app.models.interview import ( + InterviewPhase, SubagentKind, LikertResponse, QSortResponse, + DelphiOpenResponse, DelphiRatingResponse, ScenarioResponse, +) +from app.services.interviews.base import PersonaRecord +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate as longitudinal_aggregate +from app.services.interviews.diversity import DiversitySubagent, run_typology +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, group_stats_from_r2, +) +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix +from app.services.interviews.storage import InterviewStore +from app.services.interviews.instrument_loader import freeze_snapshot + +class PersonaProvider(Protocol): + def all(self) -> list[PersonaRecord]: ... 
def _fan_out(self, run_dir, agent_fn, personas, audit_label):
    """Run ``agent_fn`` for every persona on a thread pool and persist the results.

    Returns ``(ok, failed)``: the responses that were produced *and* written
    to the run's response file, and the agent ids for which either step
    raised. Every failure is recorded in the run's audit log. Results arrive
    in completion order, not persona order.
    """
    ok: list = []
    failed: list[int] = []
    with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
        futures = {pool.submit(agent_fn, p): p for p in personas}
        for fut in as_completed(futures):
            p = futures[fut]
            try:
                out = fut.result()
                # Persist before counting the response as successful: if the
                # write fails the agent must end up in `failed` only, never
                # in both lists.
                self.store.append_response(run_dir, out)
            except Exception as e:
                failed.append(p.agent_id)
                self.store.audit(run_dir, agent_id=p.agent_id,
                                 event="agent_failed", detail=f"{audit_label}: {e!r}")
            else:
                ok.append(out)
    return ok, failed
def run_post(self) -> dict:
    """Run all four post-simulation (T1) subagents and collect their summaries.

    Longitudinal, diversity and scenario are submitted to a thread pool
    together; Delphi is started only after that pool has shut down. A
    subagent that raises is reported as ``{"error": repr(exc)}`` rather than
    aborting the others.
    """
    personas = self.personas.all()

    def settle(produce):
        # Turn an exception from one subagent into an error entry.
        try:
            return produce()
        except Exception as e:
            return {"error": repr(e)}

    parallel_jobs = (
        ("longitudinal", self._post_longitudinal),
        ("diversity", self._post_diversity),
        ("scenario", self._post_scenario),
    )
    out: dict = {}
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [(name, pool.submit(job, personas)) for name, job in parallel_jobs]
        for name, fut in pending:
            out[name] = settle(fut.result)
    # Delphi runs sequentially (R1 → R2 → R3) and uses the LLM for theme extraction
    out["delphi"] = settle(lambda: self._post_delphi(personas))
    return out
{"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_diversity(self, personas) -> dict: + sub = DiversitySubagent(self.llm, self.memory, + self.instrument_dir / "diversity_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.DIVERSITY) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="diversity", + ) + typology = run_typology(ok) + self.store.write_named(run_dir, "typology.json", typology) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "clusters": typology["clusters"]}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DIVERSITY, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_scenario(self, personas) -> dict: + sub = ScenarioSubagent(self.llm, self.memory, + self.instrument_dir / "scenario_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.SCENARIO) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="scenario", + ) + matrix = polarity_matrix(ok) + self.store.write_named(run_dir, "polarity_matrix.json", matrix) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "polarity": matrix}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.SCENARIO, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_delphi(self, personas) -> dict: + sub = DelphiSubagent(self.llm, self.memory, + self.instrument_dir / "delphi_v1.yaml", + language=self.language) + run_dir = 
self.store.start_run(InterviewPhase.T1, SubagentKind.DELPHI) + # Round 1 + r1_ok, r1_failed = self._fan_out( + run_dir, lambda p: sub.administer_round1(p), personas, audit_label="delphi_r1", + ) + # Move all R1 responses into a dedicated file + for r in r1_ok: self.store.append_jsonl(run_dir, "round1_themes.jsonl", r) + # Extract themes from R1 + themes = extract_themes(r1_ok, llm=self.llm) + self.store.write_named(run_dir, "themes.json", {"themes": themes}) + # Round 2 + r2_ok, r2_failed = self._fan_out( + run_dir, lambda p: sub.administer_round2(p, themes), + [p for p in personas if p.agent_id in {r.agent_id for r in r1_ok}], + audit_label="delphi_r2", + ) + for r in r2_ok: self.store.append_jsonl(run_dir, "round2_ratings.jsonl", r) + gstats = group_stats_from_r2(r2_ok) + # Round 3 + r2_by = {r.agent_id: r for r in r2_ok} + r3_personas = [p for p in personas if p.agent_id in r2_by] + def r3_call(p): return sub.administer_round3(p, themes, gstats, r2_by[p.agent_id]) + r3_ok, r3_failed = self._fan_out(run_dir, r3_call, r3_personas, audit_label="delphi_r3") + for r in r3_ok: self.store.append_jsonl(run_dir, "round3_revisions.jsonl", r) + # Convergence + conv = convergence_metrics(r2_ok, r3_ok) + self.store.write_named(run_dir, "convergence.json", conv) + self.store.write_aggregate(run_dir, { + "n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "n_failed_r1": len(r1_failed), "n_failed_r2": len(r2_failed), "n_failed_r3": len(r3_failed), + "themes": themes, + }) + for r in r3_ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DELPHI, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "run_dir": str(run_dir)} + + # --- Re-run a single subagent --- + def rerun(self, subagent: SubagentKind) -> dict: + personas = self.personas.all() + if subagent == SubagentKind.LONGITUDINAL: return 
{"longitudinal": self._post_longitudinal(personas)} + if subagent == SubagentKind.DIVERSITY: return {"diversity": self._post_diversity(personas)} + if subagent == SubagentKind.SCENARIO: return {"scenario": self._post_scenario(personas)} + if subagent == SubagentKind.DELPHI: return {"delphi": self._post_delphi(personas)} + raise ValueError(f"unknown subagent {subagent}") +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_orchestrator.py -v` +Expected: 2 passed. + +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interview_orchestrator.py backend/tests/interviews/test_orchestrator.py +git commit -m "feat(interviews): orchestrator with two-phase lifecycle, parallel fan-out, isolated failures" +``` + +--- + +### Task 13: Simulation manager lifecycle hooks + +**Files:** +- Modify: `backend/app/services/simulation_manager.py` +- Test: `backend/tests/interviews/test_simulation_hooks.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_simulation_hooks.py +from app.services.simulation_manager import SimulationManager, SimulationState + +def test_register_post_ready_hook_invoked(monkeypatch): + called = [] + mgr = SimulationManager() + mgr.register_on_ready(lambda state: called.append(("ready", state.sim_id))) + state = SimulationState(sim_id="abc", status="ready") + mgr._notify_on_ready(state) + assert called == [("ready", "abc")] + +def test_register_post_completed_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_completed(lambda state: called.append(("done", state.sim_id))) + state = SimulationState(sim_id="abc", status="completed") + mgr._notify_on_completed(state) + assert called == [("done", "abc")] +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_simulation_hooks.py -v` +Expected: AttributeError on `register_on_ready` / `register_on_completed`. 
+ +- [ ] **Step 3: Add hook registry to SimulationManager** + +In `backend/app/services/simulation_manager.py`, find the `SimulationManager` class. Add to `__init__` (preserving existing init): +```python + self._on_ready_hooks: list = [] + self._on_completed_hooks: list = [] +``` + +Add methods to the class: +```python + def register_on_ready(self, fn) -> None: + self._on_ready_hooks.append(fn) + + def register_on_completed(self, fn) -> None: + self._on_completed_hooks.append(fn) + + def _notify_on_ready(self, state) -> None: + for fn in list(self._on_ready_hooks): + try: fn(state) + except Exception as e: + from app.utils.logger import get_logger + get_logger(__name__).warning(f"on_ready hook failed: {e!r}") + + def _notify_on_completed(self, state) -> None: + for fn in list(self._on_completed_hooks): + try: fn(state) + except Exception as e: + from app.utils.logger import get_logger + get_logger(__name__).warning(f"on_completed hook failed: {e!r}") +``` + +Locate the existing code that transitions state to `ready` (after `prepare_simulation` completes) and to `completed` (after simulation finishes). Insert calls to `self._notify_on_ready(state)` and `self._notify_on_completed(state)` immediately after each transition. If `SimulationState` is not a simple dataclass with `sim_id` and `status` attributes, adjust the test fixture to match the actual class shape (read the file first). + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_simulation_hooks.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/simulation_manager.py backend/tests/interviews/test_simulation_hooks.py +git commit -m "feat(interviews): on_ready / on_completed hook registry on SimulationManager" +``` + +--- + +### Task 14: InterviewSynthesizer + +**Files:** +- Create: `backend/app/services/interview_synthesizer.py` +- Test: `backend/tests/interviews/test_synthesizer.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_synthesizer.py +import json +from pathlib import Path +from app.services.interviews.storage import InterviewStore +from app.models.interview import InterviewPhase, SubagentKind, LikertResponse +from app.services.interview_synthesizer import InterviewSynthesizer + +def _seed_minimal(tmp_path: Path) -> InterviewStore: + store = InterviewStore(root=tmp_path, sim_id="s1") + rd = store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + for i in range(3): + store.append_response(rd, LikertResponse( + agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 3}, confidence={"stk_1": 0.5, "gov_1": 0.5}, + )) + store.write_aggregate(rd, {"per_item": {}, "n_paired": 0}) + store.mark_latest(rd) + return store + +def test_synthesizer_runs_with_partial_data(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + report = synth.run() + assert "limitations" in report.lower() + assert "stub mode" in report.lower() or "n_responded" in report.lower() + +def test_synthesizer_writes_files(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + synth.run() + files = list((store.base / "synthesis").iterdir()) + names = {f.name for f in files} + assert "report.md" in names +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_synthesizer.py -v` +Expected: ImportError. 
+ +- [ ] **Step 3: Implement synthesiser** + +`backend/app/services/interview_synthesizer.py`: +```python +from __future__ import annotations +import csv +import json +from pathlib import Path +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.storage import InterviewStore + +class InterviewSynthesizer: + def __init__(self, store: InterviewStore): + self.store = store + + def _maybe(self, phase: InterviewPhase, sub: SubagentKind) -> dict | None: + run = self.store.latest_run(phase, sub) + if run is None: return None + agg = run / "aggregate.json" + if not agg.exists(): return None + return {"run_dir": str(run), "aggregate": json.loads(agg.read_text(encoding="utf-8"))} + + def _instrument_hashes(self) -> dict: + snap = self.store.base / "instruments_used.json" + if not snap.exists(): return {} + try: data = json.loads(snap.read_text(encoding="utf-8")) + except Exception: return {} + return {k: v.get("hash") for k, v in data.items()} + + def _limitations_text(self, present: dict[str, bool]) -> str: + lines = [ + "## Limitations", + "- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse " + "and the LLM jointly encode each stakeholder type, not what an actual fisher or NGO " + "staffer would say. 
The instrument measures the *model of the stakeholder*, not the stakeholder.", + "- **Memory digest is lossy.** Each agent's experience of OASIS is summarised to bounded length; " + "agents do not have full episodic recall.", + "- **LLM acquiescence and centrality bias.** Likert scales with LLM respondents skew toward 3–4 " + "of 5; check per-item distribution shape before drawing conclusions.", + "- **N is what it is.** `n_responded` and `n_failed` are printed verbatim per subagent; no smoothing.", + "- **Instrument provenance.** Hashes of frozen instruments are listed below; an identical run " + "is reproducible from these snapshots.", + ] + for k, ok in present.items(): + if not ok: + lines.append(f"- *{k}* subagent results are missing for this run.") + return "\n".join(lines) + + def run(self) -> str: + sections: list[str] = [] + sections.append("# Stakeholder Interview Synthesis\n") + + long_t0 = self._maybe(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + long_t1 = self._maybe(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + if long_t1: + agg = long_t1["aggregate"] + sections.append("## Longitudinal opinion drift (T0 → T1)") + sections.append(f"- N paired: {agg.get('n_paired', 'NA')}") + per_item = agg.get("per_item", {}) + top = sorted(per_item.items(), + key=lambda kv: abs(kv[1].get("mean_delta") or 0), reverse=True)[:5] + sections.append("- Largest mean shifts:") + for k, v in top: + sections.append(f" - `{k}`: Δ̄ = {v.get('mean_delta'):+0.2f} (n={v.get('n')})") + + diversity = self._maybe(InterviewPhase.T1, SubagentKind.DIVERSITY) + if diversity: + clusters = diversity["aggregate"].get("clusters", []) + sections.append("## Stakeholder typology") + sections.append(f"- N agents: {diversity['aggregate'].get('n', 'NA')}") + sections.append(f"- Clusters: {len(clusters)}") + for c in clusters: + sections.append(f" - cluster {c['cluster_id']}: n={c['n']}, " + f"top loadings = {list(c['top_loadings'].keys())[:5]}") + + delphi = self._maybe(InterviewPhase.T1, 
SubagentKind.DELPHI) + if delphi: + agg = delphi["aggregate"] + sections.append("## Delphi consensus") + sections.append(f"- Rounds completed: R1={agg.get('n_r1')}, R2={agg.get('n_r2')}, R3={agg.get('n_r3')}") + themes = agg.get("themes", []) + sections.append(f"- Themes: {[t.get('label') for t in themes]}") + + scenario = self._maybe(InterviewPhase.T1, SubagentKind.SCENARIO) + if scenario: + pol = scenario["aggregate"].get("polarity", {}) + sections.append("## Scenario evaluation") + for sid in sorted(pol): + v = pol[sid] + if v.get("n", 0) == 0: continue + sections.append( + f"- **{sid}**: n={v['n']}, desirability {v['mean_desirability']:.2f}, " + f"plausibility {v['mean_plausibility']:.2f}, impact {v['mean_impact']:.2f}, " + f"fairness {v['mean_fairness']:.2f}") + + sections.append("") + sections.append(self._limitations_text({ + "longitudinal": bool(long_t1), + "diversity": bool(diversity), + "delphi": bool(delphi), + "scenario": bool(scenario), + })) + sections.append("") + sections.append("### Instrument provenance") + for name, h in self._instrument_hashes().items(): + sections.append(f"- `{name}`: hash `{h}`") + + report = "\n\n".join(sections) + out_dir = self.store.base / "synthesis" + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "report.md").write_text(report, encoding="utf-8") + self._write_tidy_csv(out_dir / "exports" / "all_responses.csv") + return report + + def _write_tidy_csv(self, csv_path: Path) -> None: + csv_path.parent.mkdir(parents=True, exist_ok=True) + rows: list[dict] = [] + for phase in (InterviewPhase.T0, InterviewPhase.T1): + for sub in SubagentKind: + run = self.store.latest_run(phase, sub) + if run is None: continue + files = ["responses.jsonl", "round1_themes.jsonl", + "round2_ratings.jsonl", "round3_revisions.jsonl"] + for fname in files: + for rec in self.store.read_responses(run, fname): + flat = self._flatten(rec, phase=phase.value, subagent=sub.value) + rows.extend(flat) + if not rows: + 
csv_path.write_text("phase,subagent,agent_id,key,value\n", encoding="utf-8") + return + fieldnames = sorted({k for r in rows for k in r.keys()}) + with csv_path.open("w", encoding="utf-8", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: w.writerow(r) + + def _flatten(self, rec: dict, *, phase: str, subagent: str) -> list[dict]: + out: list[dict] = [] + aid = rec.get("agent_id") + for key, val in rec.items(): + if key == "agent_id": continue + if isinstance(val, dict): + for k2, v2 in val.items(): + if isinstance(v2, dict): + for k3, v3 in v2.items(): + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}.{k3}", "value": v3}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}", "value": v2}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": key, "value": val}) + return out +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_synthesizer.py -v` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interview_synthesizer.py backend/tests/interviews/test_synthesizer.py +git commit -m "feat(interviews): synthesiser emits cross-method report + tidy CSV + limitations section" +``` + +--- + +## Phase 5 — Adapters and API + +### Task 15: Persona + memory adapters + +**Files:** +- Create: `backend/app/services/interviews/adapters.py` +- Test: `backend/tests/interviews/test_adapters.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_adapters.py +import csv +import json +from pathlib import Path +from app.services.interviews.adapters import ( + FileSystemPersonaProvider, ZepMemoryProvider, +) + +def _write_reddit_profiles(tmp_path: Path): + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "I am a small-scale Baltic fisher.", "profession": "fisher", "bio": ""}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "I work for an environmental NGO.", "profession": "ngo_staff", "bio": ""}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + return p + +def test_file_system_persona_provider_reads_reddit_json(tmp_path): + p = _write_reddit_profiles(tmp_path) + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + personas = provider.all() + assert len(personas) == 2 + assert personas[0].name == "Fischer Müller" + assert personas[0].agent_id == 0 + +def test_zep_memory_provider_returns_empty_when_unavailable(): + class _BrokenReader: + def get_entity_with_context(self, *a, **kw): + raise RuntimeError("offline") + prov = ZepMemoryProvider(entity_reader=_BrokenReader(), graph_id="g1", + agent_to_entity={0: "uuid-zero"}) + d = prov.get_digest(0) + assert d.available is False + assert d.text != "" + +def test_zep_memory_provider_truncates_to_max_chars(): + class _R: + def get_entity_with_context(self, *a, **kw): + class _Ctx: + name = "X"; summary = "Y" 
+ related_edges = [{"fact": "very long fact " * 200}] + return _Ctx() + prov = ZepMemoryProvider(entity_reader=_R(), graph_id="g1", + agent_to_entity={5: "uuid-five"}) + d = prov.get_digest(5, max_chars=300) + assert d.available is True + assert len(d.text) <= 300 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_adapters.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement adapters** + +`backend/app/services/interviews/adapters.py`: +```python +from __future__ import annotations +import csv +import json +from pathlib import Path +from typing import Optional +from app.services.interviews.base import PersonaRecord, MemoryDigest + +class FileSystemPersonaProvider: + """Reads OASIS profiles from the simulation's `reddit_profiles.json` and/or `twitter_profiles.csv`. + + If both are present, agents from `reddit_profiles.json` take precedence; twitter-only agents are appended. + """ + def __init__(self, reddit_path: Optional[Path], twitter_path: Optional[Path]): + self.reddit_path = Path(reddit_path) if reddit_path else None + self.twitter_path = Path(twitter_path) if twitter_path else None + + def _load_reddit(self) -> list[PersonaRecord]: + if not self.reddit_path or not self.reddit_path.exists(): return [] + data = json.loads(self.reddit_path.read_text(encoding="utf-8")) + out = [] + for row in data: + out.append(PersonaRecord( + agent_id=int(row.get("user_id")), + name=str(row.get("name") or row.get("user_name") or f"agent_{row.get('user_id')}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def _load_twitter(self) -> list[PersonaRecord]: + if not self.twitter_path or not self.twitter_path.exists(): return [] + out = [] + with self.twitter_path.open("r", encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + if not row.get("user_id"): continue + out.append(PersonaRecord( + 
agent_id=int(row["user_id"]), + name=str(row.get("name") or row.get("user_name") or f"agent_{row['user_id']}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def all(self) -> list[PersonaRecord]: + reddit = self._load_reddit() + seen = {p.agent_id for p in reddit} + twitter = [p for p in self._load_twitter() if p.agent_id not in seen] + return reddit + twitter + +class ZepMemoryProvider: + """Builds a bounded memory digest per agent from Zep entity context. + + Maps `agent_id` (OASIS user_id) to a Zep entity UUID; falls back to the agent_id as a string. + """ + def __init__(self, entity_reader, graph_id: str, agent_to_entity: dict[int, str] | None = None): + self.reader = entity_reader + self.graph_id = graph_id + self.map = dict(agent_to_entity or {}) + + def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: + entity_uuid = self.map.get(agent_id) or str(agent_id) + try: + ctx = self.reader.get_entity_with_context(self.graph_id, entity_uuid) + except Exception: + return MemoryDigest(text=f"[no memory for agent {agent_id}]", available=False) + parts: list[str] = [] + name = getattr(ctx, "name", None) + summary = getattr(ctx, "summary", None) + if name: parts.append(f"Name: {name}") + if summary: parts.append(f"Summary: {summary}") + edges = getattr(ctx, "related_edges", []) or [] + for e in edges[:20]: + fact = e.get("fact") if isinstance(e, dict) else getattr(e, "fact", None) + if fact: parts.append(f"- {fact}") + text = "\n".join(parts) + if len(text) > max_chars: text = text[: max_chars - 1] + "…" + return MemoryDigest(text=text or f"[empty memory for agent {agent_id}]", available=True) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_adapters.py -v` +Expected: 3 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/services/interviews/adapters.py backend/tests/interviews/test_adapters.py +git commit -m "feat(interviews): persona + Zep memory adapters bridging existing services to interview subsystem" +``` + +--- + +### Task 16: /api/interview Flask blueprint + +**Files:** +- Create: `backend/app/api/interview.py` +- Modify: `backend/app/api/__init__.py` +- Test: `backend/tests/interviews/test_api_interview.py` + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_api_interview.py +import json +import os +from pathlib import Path +import pytest + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + # Seed a minimal reddit_profiles.json + sim_dir = tmp_path / "simulations" / "sim_test" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "p", "profession": "fisher"} for i in range(3)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + from flask import Flask + from app.api import register_blueprints + app = Flask(__name__) + register_blueprints(app) + return app.test_client() + +def test_post_pre_returns_task_id(client): + res = client.post("/api/interview/sim_test/pre") + assert res.status_code == 200 + body = res.get_json() + assert body["success"] is True + assert "task_id" in body["data"] + +def test_status_endpoint_returns_progress(client): + res = client.post("/api/interview/sim_test/pre") + task_id = res.get_json()["data"]["task_id"] + res2 = client.get(f"/api/interview/sim_test/status?task_id={task_id}") + assert res2.status_code == 200 + assert "status" in res2.get_json()["data"] + +def test_unknown_subagent_returns_400(client): + res = client.post("/api/interview/sim_test/rerun", + 
json={"subagent": "nonsense"}) + assert res.status_code == 400 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_api_interview.py -v` +Expected: ImportError / 404. + +- [ ] **Step 3: Check current `api/__init__.py`** + +Read `backend/app/api/__init__.py` and identify how `graph_bp`, `simulation_bp`, `report_bp` are registered. The test expects a `register_blueprints(app)` helper — if one doesn't exist, add it. + +- [ ] **Step 4: Modify `api/__init__.py`** + +Replace contents (preserving existing blueprint imports — adjust to match actual file): +```python +from flask import Flask +from .graph import graph_bp +from .simulation import simulation_bp +from .report import report_bp +from .interview import interview_bp + +def register_blueprints(app: Flask) -> None: + app.register_blueprint(graph_bp, url_prefix="/api/graph") + app.register_blueprint(simulation_bp, url_prefix="/api/simulation") + app.register_blueprint(report_bp, url_prefix="/api/report") + app.register_blueprint(interview_bp, url_prefix="/api/interview") +``` + +If the existing app factory in `app/__init__.py` already calls register manually, update it to call `register_blueprints(app)` instead. 
+ +- [ ] **Step 5: Implement blueprint** + +`backend/app/api/interview.py`: +```python +from __future__ import annotations +import threading +import traceback +import uuid +from pathlib import Path +from flask import Blueprint, jsonify, request, send_file +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider, ZepMemoryProvider +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.services.interviews.storage import InterviewStore +from app.utils.llm_client import LLMClient + +interview_bp = Blueprint("interview", __name__) +_TASKS: dict[str, dict] = {} +_LOCK = threading.Lock() + +INSTRUMENT_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +def _uploads_root() -> Path: + return Path(getattr(Config, "UPLOADS_DIR", "uploads")) + +def _build_orchestrator(sim_id: str) -> InterviewOrchestrator: + sim_dir = _uploads_root() / "simulations" / sim_id + reddit = sim_dir / "reddit_profiles.json" + twitter = sim_dir / "twitter_profiles.csv" + personas = FileSystemPersonaProvider(reddit_path=reddit if reddit.exists() else None, + twitter_path=twitter if twitter.exists() else None) + # Zep memory + writer: best-effort; in stub/test mode the writer no-ops on exceptions + class _NullUpdater: + def add_text_episode(self, *a, **kw): return None + try: + from app.services.zep_entity_reader import ZepEntityReader + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + graph_id = (sim_dir / "graph_id.txt").read_text().strip() if (sim_dir / "graph_id.txt").exists() else "" + reader = ZepEntityReader() + updater = ZepGraphMemoryUpdater() + memory = ZepMemoryProvider(reader, graph_id=graph_id) + zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) + except 
Exception: + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + from app.services.interviews.base import MemoryDigest + return MemoryDigest(text="[memory unavailable]", available=False) + memory = _Mem() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + llm = LLMClient(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL, + model=Config.LLM_MODEL_NAME) + return InterviewOrchestrator( + llm=llm, memory=memory, personas=personas, + instrument_dir=INSTRUMENT_DIR, store_root=_uploads_root(), sim_id=sim_id, + zep_writer=zep_writer, max_workers=Config.INTERVIEW_MAX_WORKERS, + language=Config.INTERVIEW_DEFAULT_LANGUAGE, + ) + +def _run_task(task_id: str, fn) -> None: + with _LOCK: + _TASKS[task_id] = {"status": "running", "progress": {}, "result": None, "error": None} + try: + result = fn(task_id) + with _LOCK: + _TASKS[task_id]["status"] = "completed"; _TASKS[task_id]["result"] = result + except Exception as e: + with _LOCK: + _TASKS[task_id]["status"] = "failed" + _TASKS[task_id]["error"] = repr(e) + _TASKS[task_id]["traceback"] = traceback.format_exc() + +def _start_task(fn) -> str: + task_id = uuid.uuid4().hex[:12] + with _LOCK: + _TASKS[task_id] = {"status": "queued", "progress": {}, "result": None, "error": None} + threading.Thread(target=_run_task, args=(task_id, fn), daemon=True).start() + return task_id + +def _envelope(data=None, error=None, status: int = 200): + body = {"success": error is None, "data": data or {}, "error": error} + return jsonify(body), status + +@interview_bp.route("/<sim_id>/pre", methods=["POST"]) +def post_pre(sim_id: str): + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.run_pre()) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/post", methods=["POST"]) +def post_post(sim_id: str): + orch = _build_orchestrator(sim_id) + def run(tid): + out = orch.run_post() + synth = InterviewSynthesizer(store=orch.store) + out["synthesis"] = synth.run()[:1000] # short
preview + return out + task_id = _start_task(run) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/rerun", methods=["POST"]) +def post_rerun(sim_id: str): + body = request.get_json(silent=True) or {} + sub = body.get("subagent") + try: subagent = SubagentKind(sub) + except ValueError: return _envelope(error=f"unknown subagent {sub!r}", status=400) + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.rerun(subagent)) + return _envelope({"task_id": task_id}) + +@interview_bp.route("/<sim_id>/status", methods=["GET"]) +def get_status(sim_id: str): + task_id = request.args.get("task_id") + with _LOCK: + task = _TASKS.get(task_id) + if task is None: return _envelope(error="unknown task_id", status=404) + return _envelope({"status": task["status"], "progress": task.get("progress", {}), + "result": task.get("result"), "error": task.get("error")}) + +@interview_bp.route("/<sim_id>/results/<subagent>", methods=["GET"]) +def get_results(sim_id: str, subagent: str): + try: sub = SubagentKind(subagent) + except ValueError: return _envelope(error=f"unknown subagent {subagent!r}", status=400) + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + phase = InterviewPhase.T1 if sub != SubagentKind.LONGITUDINAL else InterviewPhase.T1 + run = store.latest_run(phase, sub) + if run is None: return _envelope(error="no results yet", status=404) + agg = (run / "aggregate.json") + if not agg.exists(): return _envelope(error="aggregate missing", status=404) + import json as _j + return _envelope({"aggregate": _j.loads(agg.read_text(encoding="utf-8")), + "run_dir": str(run)}) + +@interview_bp.route("/<sim_id>/results/synthesis", methods=["GET"]) +def get_synthesis(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + report = store.base / "synthesis" / "report.md" + if not report.exists(): + synth = InterviewSynthesizer(store=store) + synth.run() + return _envelope({"report_markdown": report.read_text(encoding="utf-8")}) +
+@interview_bp.route("/<sim_id>/export.csv", methods=["GET"]) +def get_export_csv(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + csv_path = store.base / "synthesis" / "exports" / "all_responses.csv" + if not csv_path.exists(): + InterviewSynthesizer(store=store).run() + return send_file(csv_path, mimetype="text/csv", as_attachment=True, + download_name=f"{sim_id}_interviews.csv") +``` + +- [ ] **Step 6: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_api_interview.py -v` +Expected: 3 passed. + +- [ ] **Step 7: Commit** + +```bash +git add backend/app/api/__init__.py backend/app/api/interview.py backend/tests/interviews/test_api_interview.py +git commit -m "feat(interviews): Flask blueprint /api/interview with task-based async + CSV export" +``` + +--- + +## Phase 6 — Integration + +### Task 17: End-to-end pipeline test (stub LLM) + +**Files:** +- Create: `backend/tests/integration/__init__.py` +- Test: `backend/tests/integration/test_interview_pipeline.py` + +- [ ] **Step 1: Write failing test** + +Create `backend/tests/integration/__init__.py` (empty), then: + +```python +# backend/tests/integration/test_interview_pipeline.py +import json +import pytest +from pathlib import Path +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider +from app.services.interviews.base import MemoryDigest +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.utils.llm_client import LLMClient + +pytestmark = pytest.mark.integration + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _NullUpdater: + def __init__(self): self.events = [] + def add_text_episode(self, graph_id, text): self.events.append(text) + +class
_StaticMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"agent {agent_id} memory snippet", available=True) + +@pytest.fixture +def seeded_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + Config.LLM_STUB_MODE = True + sim_dir = tmp_path / "simulations" / "intg_sim" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "stakeholder p", "profession": "fisher"} for i in range(5)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + return tmp_path + +def _make_orch(tmp_path): + sim_dir = tmp_path / "simulations" / "intg_sim" + personas = FileSystemPersonaProvider( + reddit_path=sim_dir / "reddit_profiles.json", twitter_path=None, + ) + llm = LLMClient(api_key="x", base_url="x", model="x") + updater = _NullUpdater() + writer = InterviewZepWriter(memory_updater=updater, graph_id="g") + return InterviewOrchestrator( + llm=llm, memory=_StaticMem(), personas=personas, + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="intg_sim", + zep_writer=writer, max_workers=2, language="de", + ) + +def test_pipeline_runs_pre_then_post_then_synthesis(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + + pre = orch.run_pre() + assert pre["longitudinal"]["n_responded"] >= 1 + + post = orch.run_post() + assert "longitudinal" in post + assert "diversity" in post + assert "scenario" in post + assert "delphi" in post + + synth = InterviewSynthesizer(store=orch.store) + report = synth.run() + assert "Stakeholder Interview Synthesis" in report + assert "Limitations" in report + + csv_path = orch.store.base / "synthesis" / "exports" / "all_responses.csv" + assert csv_path.exists() + lines = csv_path.read_text().splitlines() + assert lines[0].startswith("agent_id,") or "agent_id" in lines[0] + +def test_idempotent_rerun_creates_new_run_id(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + orch.run_pre() + 
first = orch.run_post() + second = orch.rerun(SubagentKind.SCENARIO) + first_scn = first["scenario"]["run_dir"] + second_scn = second["scenario"]["run_dir"] + assert first_scn != second_scn +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/integration/test_interview_pipeline.py -v -m integration` +Expected: most likely ValidationError from the stub LLM's canned JSON not satisfying every subagent's strict validator (forced Q-sort distribution, scenarios, Delphi). This is the signal to enrich the stub. + +- [ ] **Step 3: Enrich `_stub_response_json` in `LLMClient` to satisfy each subagent** + +Read the current `_stub_response_json` (Task 4). Replace its body with content-aware stubs by inspecting the user message text. In `backend/app/utils/llm_client.py`, replace `_stub_response_json` with: + +```python + def _stub_response_json(self, messages: list[dict]) -> dict: + import hashlib, json as _json + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + usr_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + h = hashlib.sha256((sys_msg + "|" + usr_msg).encode("utf-8")).hexdigest() + seed = int(h[:8], 16) + rng = (seed % 5) + 1 + + # Longitudinal Likert (12 items) + if all(tok in usr_msg for tok in ("stk_1", "gov_1", "mkt_1", "clm_1")): + ids = ["stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3"] + return {"responses": {k: ((seed >> (i*3)) % 5) + 1 for i, k in enumerate(ids)}, + "confidence": {k: 0.6 for k in ids}, + "open_comment": f"stub:{h[:8]}"} + + # Diversity Q-sort: 24 statements + 6 axes, forced distribution 2,3,4,6,4,3,2 + if "st_01" in usr_msg and "ax_pres_extr" in usr_msg: + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + stmts = [f"st_{i+1:02d}" for i in range(24)] + # shuffle deterministically + order = sorted(range(24), key=lambda i: (h[i % len(h)], i)) + placements = 
{stmts[i]: buckets[order.index(i)] for i in range(24)} + return { + "placements": placements, + "likert_axes": {a: ((seed >> (j*3)) % 7) + 1 for j, a in enumerate( + ["ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg"])}, + } + + # Scenario: S1..S4 × 4 dims + if all(s in usr_msg for s in ("S1:", "S2:", "S3:", "S4:")): + return {"ratings": {sid: { + "desirability": ((seed >> (i*3)) % 7) + 1, + "plausibility": ((seed >> (i*3+1)) % 7) + 1, + "impact_on_my_group": ((seed >> (i*3+2)) % 7) + 1, + "fairness": ((seed >> (i*3+4)) % 7) + 1, + "if_woke_up_response": f"act-{sid}-{h[:4]}", + } for i, sid in enumerate(["S1","S2","S3","S4"])}} + + # Delphi R1: q1..q4 free text + if "q1" in usr_msg and "q2" in usr_msg and "Bewerten" not in usr_msg and "Sie sehen" not in usr_msg: + return {"answers": {qid: f"stub-themes-{qid}-{h[:4]}" for qid in ("q1","q2","q3","q4")}} + + # Delphi theme extraction (no in-character system prompt) + if "extract distinct thematic codes" in sys_msg: + return {"themes": [{"theme_id": f"theme_{i}", "label": f"Thema {i}"} for i in range(5)]} + + # Delphi R2 (rate) or R3 (revise) + if "Bewerten Sie jedes Thema" in usr_msg or "Sie sehen unten" in usr_msg \ + or "Rate each theme" in usr_msg or "Below are the anonymised" in usr_msg: + theme_ids = [f"theme_{i}" for i in range(5)] + out = {"ratings": {tid: {"importance": ((seed >> (i*2)) % 5) + 1, + "plausibility": ((seed >> (i*2+1)) % 5) + 1} + for i, tid in enumerate(theme_ids)}} + if "Sie sehen unten" in usr_msg or "Below are the anonymised" in usr_msg: + out["justification"] = "stub-revision" + return out + + # Fallback + return {"stub_key": h[:12], "value": rng} +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/integration/test_interview_pipeline.py -v -m integration` +Expected: 2 passed. 
+ +- [ ] **Step 5: Commit** + +```bash +git add backend/app/utils/llm_client.py backend/tests/integration/__init__.py backend/tests/integration/test_interview_pipeline.py +git commit -m "test(interviews): end-to-end pipeline test + content-aware LLM stubs for all 4 subagents" +``` + +--- + +## Phase 7 — Frontend + +Note: this project has no frontend test framework. Tasks below use the build (`npm run build`) plus a manual smoke check via `npm run dev` as the verification gate. Commit after each task once the build is green. + +### Task 18: Step4bInterviews.vue scaffold + tab shell + +**Files:** +- Create: `frontend/src/components/Step4bInterviews.vue` +- Create: `frontend/src/api/interview.js` +- Modify: `frontend/src/App.vue` (or the parent that orchestrates Step1..Step5 — locate and adjust) + +- [ ] **Step 1: Add API client module** + +`frontend/src/api/interview.js`: +```javascript +import { api } from "./index" + +export async function startPre(simId) { + const r = await api.post(`/api/interview/${simId}/pre`) + return r.data +} +export async function startPost(simId) { + const r = await api.post(`/api/interview/${simId}/post`) + return r.data +} +export async function rerun(simId, subagent) { + const r = await api.post(`/api/interview/${simId}/rerun`, { subagent }) + return r.data +} +export async function getStatus(simId, taskId) { + const r = await api.get(`/api/interview/${simId}/status`, { params: { task_id: taskId } }) + return r.data +} +export async function getResults(simId, subagent) { + const r = await api.get(`/api/interview/${simId}/results/${subagent}`) + return r.data +} +export async function getSynthesis(simId) { + const r = await api.get(`/api/interview/${simId}/results/synthesis`) + return r.data +} +export function exportCsvUrl(simId) { + return `/api/interview/${simId}/export.csv` +} +``` + +- [ ] **Step 2: Implement Step4bInterviews.vue scaffold** + +`frontend/src/components/Step4bInterviews.vue`: +```vue + + + + + +``` + +- [ ] **Step 3: 
Create placeholder panel components (to be filled in Task 19)** + +Create five empty-but-renderable Vue components so the scaffold compiles: + +`frontend/src/components/interviews/LongitudinalPanel.vue`: +```vue + + +``` + +Repeat the same pattern (changing only the inner text) for `DiversityPanel.vue`, `DelphiPanel.vue`, `ScenarioPanel.vue`, `SynthesisPanel.vue` in `frontend/src/components/interviews/`. + +- [ ] **Step 4: Wire Step4b into parent navigation** + +Read `frontend/src/App.vue` (or wherever Step1..Step5 are rendered). Locate the routing/visibility logic. Add a Step4b state between Step4 and Step5, and import `Step4bInterviews` from `./components/Step4bInterviews.vue`. Pass `:sim-id="currentSimId"` where the others receive the sim id. Add i18n keys to `locales/en.json`, `locales/de.json`, `locales/zh.json`: +```json +"interview": { + "title": "Stakeholder interviews", + "subtitle": "Four independent surveys of the simulated stakeholder population.", + "runAll": "Run all post-simulation interviews", + "downloadCsv": "Download CSV", + "tab": { + "longitudinal": "Longitudinal (Δ)", + "diversity": "Diversity", + "delphi": "Delphi", + "scenario": "Scenarios", + "synthesis": "Synthesis" + } +} +``` + +- [ ] **Step 5: Build to verify it compiles** + +Run: `cd frontend && npm run build` +Expected: build succeeds with no errors. 
+ +- [ ] **Step 6: Commit** + +```bash +git add frontend/src/api/interview.js frontend/src/components/Step4bInterviews.vue \ + frontend/src/components/interviews/*.vue frontend/src/App.vue \ + locales/*.json +git commit -m "feat(interviews): Step4b Vue scaffold with five-tab navigation, API client, i18n keys" +``` + +--- + +### Task 19: Per-tab d3 visualisations + +**Files:** +- Modify: `frontend/src/components/interviews/LongitudinalPanel.vue` +- Modify: `frontend/src/components/interviews/DiversityPanel.vue` +- Modify: `frontend/src/components/interviews/DelphiPanel.vue` +- Modify: `frontend/src/components/interviews/ScenarioPanel.vue` +- Modify: `frontend/src/components/interviews/SynthesisPanel.vue` + +For each panel, fetch the relevant aggregate via the API on mount, then render with d3. The five implementations follow the same structure; each shows the full content below. + +- [ ] **Step 1: Longitudinal panel — heatmap of Δ̄ per item** + +`frontend/src/components/interviews/LongitudinalPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 2: Diversity panel — PCA scatter coloured by cluster** + +`frontend/src/components/interviews/DiversityPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 3: Delphi panel — convergence bar chart (R2 IQR vs R3 IQR per theme)** + +`frontend/src/components/interviews/DelphiPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 4: Scenario panel — polarity quadrant (desirability × plausibility)** + +`frontend/src/components/interviews/ScenarioPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 5: Synthesis panel — render markdown report** + +`frontend/src/components/interviews/SynthesisPanel.vue`: +```vue + + + + + +``` + +- [ ] **Step 6: Build + smoke test** + +Run: `cd frontend && npm run build` +Expected: build succeeds. Then `cd .. && npm run dev` and manually visit Step4b for a completed `sim_id` — verify all five tabs render without console errors. 
+ +- [ ] **Step 7: Commit** + +```bash +git add frontend/src/components/interviews/*.vue +git commit -m "feat(interviews): d3 visualisations for longitudinal Δ, diversity PCA, Delphi, scenario polarity, synthesis" +``` + +--- + +### Task 20: Auto-trigger pre-survey on simulation `ready` + +**Files:** +- Create: `backend/app/services/interviews/lifecycle.py` +- Modify: `backend/app/__init__.py` (app factory) to install the hook + +- [ ] **Step 1: Write failing test** + +```python +# backend/tests/interviews/test_lifecycle.py +from app.services.interviews.lifecycle import install_hooks + +class _StubMgr: + def __init__(self): self.ready = []; self.completed = [] + def register_on_ready(self, fn): self.ready.append(fn) + def register_on_completed(self, fn): self.completed.append(fn) + +def test_install_hooks_registers_two_callables(): + mgr = _StubMgr() + install_hooks(mgr) + assert len(mgr.ready) == 1 + assert len(mgr.completed) == 1 + assert callable(mgr.ready[0]) + assert callable(mgr.completed[0]) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `cd backend && uv run pytest tests/interviews/test_lifecycle.py -v` +Expected: ImportError. + +- [ ] **Step 3: Implement lifecycle hook installer** + +`backend/app/services/interviews/lifecycle.py`: +```python +from __future__ import annotations +import threading +from app.utils.logger import get_logger + +logger = get_logger(__name__) + +def install_hooks(manager) -> None: + """Attach interview lifecycle callbacks to a SimulationManager. + + on_ready → spawn T0 longitudinal in a background thread + on_completed → spawn full post-sim batch in a background thread + Hooks are best-effort; failures only log. 
+ """ + def _on_ready(state) -> None: + sim_id = getattr(state, "sim_id", None) or getattr(state, "id", None) + if not sim_id: return + threading.Thread(target=_run_pre, args=(sim_id,), daemon=True).start() + + def _on_completed(state) -> None: + sim_id = getattr(state, "sim_id", None) or getattr(state, "id", None) + if not sim_id: return + threading.Thread(target=_run_post, args=(sim_id,), daemon=True).start() + + manager.register_on_ready(_on_ready) + manager.register_on_completed(_on_completed) + +def _run_pre(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + orch = _build_orchestrator(sim_id) + orch.run_pre() + except Exception as e: + logger.warning(f"auto pre-survey failed for {sim_id}: {e!r}") + +def _run_post(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + from app.services.interview_synthesizer import InterviewSynthesizer + orch = _build_orchestrator(sim_id) + orch.run_post() + InterviewSynthesizer(store=orch.store).run() + except Exception as e: + logger.warning(f"auto post-survey failed for {sim_id}: {e!r}") +``` + +- [ ] **Step 4: Wire into app factory** + +Read `backend/app/__init__.py`. Locate where `SimulationManager` (or its singleton) is instantiated. Add: +```python + from app.services.interviews.lifecycle import install_hooks + install_hooks(simulation_manager) +``` +immediately after the manager is constructed. If `simulation_manager` is module-level in `simulation_manager.py`, attach the hooks at the bottom of that module instead — the goal is "install once on app startup". + +- [ ] **Step 5: Run test to verify it passes** + +Run: `cd backend && uv run pytest tests/interviews/test_lifecycle.py -v` +Expected: 1 passed. + +- [ ] **Step 6: Full backend test suite** + +Run: `cd backend && uv run pytest -m "not integration" -q` +Expected: all unit tests pass. + +Run: `cd backend && uv run pytest -m integration -q` +Expected: integration tests pass. 
+ +- [ ] **Step 7: Commit** + +```bash +git add backend/app/services/interviews/lifecycle.py backend/app/__init__.py backend/tests/interviews/test_lifecycle.py +git commit -m "feat(interviews): auto-trigger pre and post interviews via SimulationManager lifecycle hooks" +``` + +--- + +## Final verification + +- [ ] **Run full backend test suite** + +Run: `cd backend && uv run pytest -q` +Expected: every test passes. + +- [ ] **Run frontend build** + +Run: `cd frontend && npm run build` +Expected: build succeeds with no errors. + +- [ ] **Smoke test the running app** + +Run: `npm run dev` from project root. With an existing completed simulation: +1. Navigate to Step4b in the UI +2. Click "Run all post-simulation interviews" +3. Wait for status to reach `completed` +4. Verify each of the five tabs renders without console errors +5. Click "Download CSV" and confirm the file downloads + +- [ ] **Verify spec coverage** + +Re-open `docs/superpowers/specs/2026-05-23-stakeholder-interview-subagents-design.md` and confirm every section in the spec has a corresponding task: + +- §3 architectural approach (deterministic runners) → Tasks 5–9 +- §4 file structure + lifecycle hooks → Tasks 2–14, 20 +- §5.1–5.4 four instruments → Tasks 6, 7, 8, 9 +- §5.5 in-character prompting + structured output + cost guardrails → Tasks 4, 5 +- §6.1 storage layout → Task 10 +- §6.2 Zep integration → Task 11 +- §6.3 API surface (all 7 endpoints) → Task 16 +- §6.4 parallelism + token guard → Task 12 (parallelism); token guard sits in `Config.INTERVIEW_MAX_TOKENS_PER_RUN` from Task 1 — *open: enforcement not implemented in v1; flag if you want it added* +- §6.5 frontend Step4b + per-tab viz → Tasks 18, 19 +- §7 error handling (per-agent isolation, schema retry, idempotency) → Tasks 5, 10, 12 +- §8 validation (schema, instrument, plausibility flags) → Tasks 2, 3 (schema + instrument); plausibility-flags currently sit inside synthesiser §10 — *check that flagged thresholds in §8 plausibility checks 
match what synthesiser currently emits* +- §9 testing (unit per subagent + integration + stub mode) → Tasks 4, 6–9, 12, 17 +- §10 methodological caveats in synthesis → Task 14 +- §11 defaults — already encoded in Task 1 config keys and instrument YAML + +If §6.4 token-guard enforcement is needed for v1, add a small follow-up task that computes a projected-token estimate before `run_post` and returns 400 with `confirm=true` override — but the spec keeps this as a guard, not a blocker, so it can ship in v1.1. + +--- + +**Plan complete and saved to `docs/superpowers/plans/2026-05-23-stakeholder-interview-subagents.md`. Two execution options:** + +**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration. + +**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints. + +**Which approach?** + From f63bc5542aa0bc1216bda3e1e943b4a4def89edf Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:00:09 +0200 Subject: [PATCH 03/26] chore(interviews): add deps and pytest scaffold for interview subsystem Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/pyproject.toml | 5 +++++ backend/pytest.ini | 8 ++++++++ backend/tests/__init__.py | 0 backend/tests/conftest.py | 17 +++++++++++++++++ backend/uv.lock | 10 ++++++++++ 5 files changed, 40 insertions(+) create mode 100644 backend/pytest.ini create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/conftest.py diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 4f5361d5..093f5040 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -32,6 +32,11 @@ dependencies = [ # 工具库 "python-dotenv>=1.0.0", "pydantic>=2.0.0", + "PyYAML>=6.0", + "scikit-learn>=1.4", + "scipy>=1.12", + "numpy>=1.26", + "pandas>=2.1", ] [project.optional-dependencies] diff --git a/backend/pytest.ini b/backend/pytest.ini new file mode 100644 index 00000000..60f69ff1 --- /dev/null 
+++ b/backend/pytest.ini @@ -0,0 +1,8 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = -ra --strict-markers +markers = + integration: marks integration tests (deselect with -m 'not integration') diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py new file mode 100644 index 00000000..2ba3931d --- /dev/null +++ b/backend/tests/conftest.py @@ -0,0 +1,17 @@ +import os +import sys +import pathlib +import pytest + +ROOT = pathlib.Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +os.environ.setdefault("LLM_API_KEY", "test") +os.environ.setdefault("LLM_BASE_URL", "https://example.invalid") +os.environ.setdefault("LLM_MODEL_NAME", "test-model") +os.environ.setdefault("ZEP_API_KEY", "test") + +@pytest.fixture +def tmp_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + return tmp_path diff --git a/backend/uv.lock b/backend/uv.lock index f1ce4b60..b5f8c6b4 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1248,10 +1248,15 @@ dependencies = [ { name = "charset-normalizer" }, { name = "flask" }, { name = "flask-cors" }, + { name = "numpy" }, { name = "openai" }, + { name = "pandas" }, { name = "pydantic" }, { name = "pymupdf" }, { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "scikit-learn" }, + { name = "scipy" }, { name = "zep-cloud" }, ] @@ -1276,13 +1281,18 @@ requires-dist = [ { name = "charset-normalizer", specifier = ">=3.0.0" }, { name = "flask", specifier = ">=3.0.0" }, { name = "flask-cors", specifier = ">=6.0.0" }, + { name = "numpy", specifier = ">=1.26" }, { name = "openai", specifier = ">=1.0.0" }, + { name = "pandas", specifier = ">=2.1" }, { name = "pipreqs", marker = "extra == 'dev'", specifier = ">=0.5.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pymupdf", specifier = 
">=1.24.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, + { name = "scikit-learn", specifier = ">=1.4" }, + { name = "scipy", specifier = ">=1.12" }, { name = "zep-cloud", specifier = "==3.13.0" }, ] provides-extras = ["dev"] From 071f8b5c4ca9c214d789cbbbf7b49de09a905987 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:02:33 +0200 Subject: [PATCH 04/26] feat(interviews): add interview config keys (token budget, workers, language, stub mode) Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/backend/app/config.py b/backend/app/config.py index 953dfa50..da7df8c1 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -62,6 +62,12 @@ class Config: REPORT_AGENT_MAX_TOOL_CALLS = int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) + + # Interview subsystem + INTERVIEW_MAX_TOKENS_PER_RUN = int(os.environ.get("INTERVIEW_MAX_TOKENS_PER_RUN", 15_000_000)) + INTERVIEW_MAX_WORKERS = int(os.environ.get("INTERVIEW_MAX_WORKERS", 8)) + INTERVIEW_DEFAULT_LANGUAGE = os.environ.get("INTERVIEW_DEFAULT_LANGUAGE", "de") + LLM_STUB_MODE = os.environ.get("LLM_STUB_MODE", "false").lower() == "true" @classmethod def validate(cls): From f1898b4eacf0df41531ba7a454d8a3b4e07ab108 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:04:45 +0200 Subject: [PATCH 05/26] feat(interviews): add pydantic models for instruments and responses Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/models/interview.py | 99 +++++++++++++++++++++++++ 
backend/tests/interviews/__init__.py | 0 backend/tests/interviews/test_models.py | 30 ++++++++ 3 files changed, 129 insertions(+) create mode 100644 backend/app/models/interview.py create mode 100644 backend/tests/interviews/__init__.py create mode 100644 backend/tests/interviews/test_models.py diff --git a/backend/app/models/interview.py b/backend/app/models/interview.py new file mode 100644 index 00000000..980efc82 --- /dev/null +++ b/backend/app/models/interview.py @@ -0,0 +1,99 @@ +from __future__ import annotations +from enum import Enum +from typing import Optional +from pydantic import BaseModel, Field, field_validator, model_validator + +class InterviewPhase(str, Enum): + T0 = "T0" + T1 = "T1" + +class SubagentKind(str, Enum): + LONGITUDINAL = "longitudinal" + DIVERSITY = "diversity" + DELPHI = "delphi" + SCENARIO = "scenario" + +class LikertItem(BaseModel): + item_id: str + de: str + en: str + scale: int = Field(ge=3, le=7) + family: Optional[str] = None + reverse_coded: bool = False + + @field_validator("scale") + @classmethod + def odd_scale(cls, v: int) -> int: + if v not in (3, 5, 7): + raise ValueError("scale must be 3, 5, or 7") + return v + +class LikertInstrument(BaseModel): + name: str + version: str = "1.0" + language_default: str = "de" + items: list[LikertItem] + + @model_validator(mode="after") + def unique_item_ids(self) -> "LikertInstrument": + ids = [i.item_id for i in self.items] + if len(set(ids)) != len(ids): + raise ValueError("duplicate item_id in instrument") + return self + +class LikertResponse(BaseModel): + agent_id: int + phase: InterviewPhase + responses: dict[str, int] + confidence: dict[str, float] = Field(default_factory=dict) + open_comment: Optional[str] = None + memory_available: bool = True + failed_items: list[str] = Field(default_factory=list) + + @model_validator(mode="after") + def values_in_range(self) -> "LikertResponse": + for k, v in self.responses.items(): + if not 1 <= v <= 5: + raise ValueError(f"response 
{k}={v} out of 1..5 range") + for k, v in self.confidence.items(): + if not 0.0 <= v <= 1.0: + raise ValueError(f"confidence {k}={v} out of 0..1 range") + return self + +class QSortStatement(BaseModel): + statement_id: str + de: str + en: str + +class QSortInstrument(BaseModel): + name: str + version: str = "1.0" + statements: list[QSortStatement] + distribution: list[int] # e.g. [2,3,4,6,4,3,2] for -3..+3 + +class QSortResponse(BaseModel): + agent_id: int + placements: dict[str, int] # statement_id -> bucket (-3..+3) + likert_axes: dict[str, int] # axis_id -> 1..7 + +class DelphiOpenResponse(BaseModel): + agent_id: int + round: int = 1 + answers: dict[str, str] # question_id -> free text + +class DelphiRatingResponse(BaseModel): + agent_id: int + round: int + ratings: dict[str, dict[str, int]] # theme_id -> {importance, plausibility} + justification: Optional[str] = None + +class ScenarioRating(BaseModel): + desirability: int = Field(ge=1, le=7) + plausibility: int = Field(ge=1, le=7) + impact_on_my_group: int = Field(ge=1, le=7) + fairness: int = Field(ge=1, le=7) + if_woke_up_response: str + +class ScenarioResponse(BaseModel): + agent_id: int + ratings: dict[str, ScenarioRating] # scenario_id -> rating diff --git a/backend/tests/interviews/__init__.py b/backend/tests/interviews/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/interviews/test_models.py b/backend/tests/interviews/test_models.py new file mode 100644 index 00000000..e575d118 --- /dev/null +++ b/backend/tests/interviews/test_models.py @@ -0,0 +1,30 @@ +import pytest +from pydantic import ValidationError +from app.models.interview import ( + LikertItem, LikertInstrument, LikertResponse, + InterviewPhase, SubagentKind, +) + +def test_likert_item_requires_de_and_en(): + item = LikertItem(item_id="x1", de="Frage", en="Question", scale=5) + assert item.scale == 5 + +def test_likert_item_rejects_bad_scale(): + with pytest.raises(ValidationError): + 
LikertItem(item_id="x1", de="d", en="e", scale=2) + +def test_likert_instrument_unique_item_ids(): + with pytest.raises(ValidationError): + LikertInstrument( + name="t", + items=[LikertItem(item_id="a", de="d", en="e", scale=5), + LikertItem(item_id="a", de="d", en="e", scale=5)], + ) + +def test_likert_response_validates_scale_range(): + with pytest.raises(ValidationError): + LikertResponse(agent_id=1, phase=InterviewPhase.T0, + responses={"a": 6}, confidence={"a": 0.5}) + +def test_subagent_kind_enum(): + assert SubagentKind.LONGITUDINAL.value == "longitudinal" From 29be754ff4060140361f39efc15b355c48b3a09b Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:06:52 +0200 Subject: [PATCH 06/26] feat(interviews): YAML instrument loader with pydantic validation and hash freezing Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/__init__.py | 0 .../services/interviews/instrument_loader.py | 55 +++++++++++++++++++ backend/scripts/instruments/__init__.py | 0 .../interviews/test_instrument_loader.py | 44 +++++++++++++++ 4 files changed, 99 insertions(+) create mode 100644 backend/app/services/interviews/__init__.py create mode 100644 backend/app/services/interviews/instrument_loader.py create mode 100644 backend/scripts/instruments/__init__.py create mode 100644 backend/tests/interviews/test_instrument_loader.py diff --git a/backend/app/services/interviews/__init__.py b/backend/app/services/interviews/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/app/services/interviews/instrument_loader.py b/backend/app/services/interviews/instrument_loader.py new file mode 100644 index 00000000..6d35d8a1 --- /dev/null +++ b/backend/app/services/interviews/instrument_loader.py @@ -0,0 +1,55 @@ +from __future__ import annotations +import hashlib +import json +from pathlib import Path +import yaml +from pydantic import ValidationError +from app.models.interview import ( + LikertInstrument, 
QSortInstrument, +) + +class InstrumentValidationError(ValueError): + pass + +def _parse_yaml(path: Path) -> dict: + if not path.exists(): + raise InstrumentValidationError(f"instrument file not found: {path}") + try: + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + raise InstrumentValidationError(f"YAML parse error in {path}: {e}") from e + if not isinstance(data, dict): + raise InstrumentValidationError(f"top-level YAML must be a mapping in {path}") + return data + +def load_likert_instrument(path: Path) -> LikertInstrument: + data = _parse_yaml(Path(path)) + try: + return LikertInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def load_qsort_instrument(path: Path) -> QSortInstrument: + data = _parse_yaml(Path(path)) + try: + return QSortInstrument(**data) + except ValidationError as e: + raise InstrumentValidationError(str(e)) from e + +def instrument_hash(path: Path) -> str: + data = Path(path).read_bytes() + return hashlib.sha256(data).hexdigest()[:16] + +def freeze_snapshot(instruments: dict[str, Path], out_path: Path) -> dict: + snapshot = { + name: { + "path": str(p), + "hash": instrument_hash(p), + "content": _parse_yaml(p), + } + for name, p in instruments.items() + } + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8") + return snapshot diff --git a/backend/scripts/instruments/__init__.py b/backend/scripts/instruments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/interviews/test_instrument_loader.py b/backend/tests/interviews/test_instrument_loader.py new file mode 100644 index 00000000..dfb0852e --- /dev/null +++ b/backend/tests/interviews/test_instrument_loader.py @@ -0,0 +1,44 @@ +import pytest +from app.services.interviews.instrument_loader import ( + load_likert_instrument, InstrumentValidationError, +) + +def 
_write(tmp_path, text): + p = tmp_path / "inst.yaml" + p.write_text(text, encoding="utf-8") + return p + +def test_loads_valid_likert(tmp_path): + p = _write(tmp_path, """ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + - item_id: stk_1 + de: "Der westliche Dorschbestand wird sich erholen" + en: "Western cod stock will recover" + scale: 5 + family: stocks +""") + inst = load_likert_instrument(p) + assert inst.name == "longitudinal_v1" + assert len(inst.items) == 1 + +def test_rejects_duplicate_item_id(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, en: e, scale: 5} + - {item_id: a, de: d, en: e, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) + +def test_rejects_missing_required_field(tmp_path): + p = _write(tmp_path, """ +name: x +items: + - {item_id: a, de: d, scale: 5} +""") + with pytest.raises(InstrumentValidationError): + load_likert_instrument(p) From eb3c3629c1512c03dd5b24bc3a7117df0c061c0f Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:08:29 +0200 Subject: [PATCH 07/26] feat(interviews): LLM stub mode for deterministic CI tests Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/utils/llm_client.py | 41 ++++++++++++++++++++--- backend/tests/interviews/test_llm_stub.py | 17 ++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 backend/tests/interviews/test_llm_stub.py diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 6c1a81f4..32285596 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -32,6 +32,31 @@ class LLMClient: base_url=self.base_url ) + def _stub_key(self, messages: list[dict]) -> str: + user_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + # Allow callers to embed an explicit stub_key=... 
token + for chunk in user_msg.split(): + if chunk.startswith("stub_key="): + return chunk[len("stub_key="):] + import hashlib + return hashlib.sha256((sys_msg + "|" + user_msg).encode("utf-8")).hexdigest()[:12] + + def _stub_response(self, messages: list[dict]) -> str: + import json as _json + return _json.dumps(self._stub_response_json(messages), ensure_ascii=False) + + def _stub_response_json(self, messages: list[dict]) -> dict: + key = self._stub_key(messages) + # Deterministic centered Likert + plausible open text + digit = sum(ord(c) for c in key) % 5 + 1 + return { + "stub_key": key, + "responses": {"item_001": digit, "item_002": digit, "item_003": (digit % 5) + 1}, + "confidence": {"item_001": 0.7, "item_002": 0.7, "item_003": 0.6}, + "open_comment": f"stub:{key}", + } + def chat( self, messages: List[Dict[str, str]], @@ -41,16 +66,20 @@ class LLMClient: ) -> str: """ 发送聊天请求 - + Args: messages: 消息列表 temperature: 温度参数 max_tokens: 最大token数 response_format: 响应格式(如JSON模式) - + Returns: 模型响应文本 """ + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response(messages) + kwargs = { "model": self.model, "messages": messages, @@ -75,15 +104,19 @@ class LLMClient: ) -> Dict[str, Any]: """ 发送聊天请求并返回JSON - + Args: messages: 消息列表 temperature: 温度参数 max_tokens: 最大token数 - + Returns: 解析后的JSON对象 """ + from app.config import Config + if getattr(Config, "LLM_STUB_MODE", False): + return self._stub_response_json(messages) + response = self.chat( messages=messages, temperature=temperature, diff --git a/backend/tests/interviews/test_llm_stub.py b/backend/tests/interviews/test_llm_stub.py new file mode 100644 index 00000000..6be5ed2a --- /dev/null +++ b/backend/tests/interviews/test_llm_stub.py @@ -0,0 +1,17 @@ +import json +from app.utils.llm_client import LLMClient + + +def test_stub_mode_returns_deterministic_canned_json(monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + from app.config import Config + Config.LLM_STUB_MODE 
= True + client = LLMClient(api_key="x", base_url="x", model="x") + messages = [ + {"role": "system", "content": "You are persona_42. Return JSON."}, + {"role": "user", "content": "stub_key=longitudinal:item_001"}, + ] + out1 = client.chat_json(messages=messages, temperature=0.0) + out2 = client.chat_json(messages=messages, temperature=0.0) + assert out1 == out2 + assert isinstance(out1, dict) From 289a0cff569025549ea853855477f0959b60375d Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:10:01 +0200 Subject: [PATCH 08/26] feat(interviews): StakeholderInterviewer base with in-character prompting and schema retry Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/base.py | 72 +++++++++++++++++++ .../tests/interviews/test_base_interviewer.py | 47 ++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 backend/app/services/interviews/base.py create mode 100644 backend/tests/interviews/test_base_interviewer.py diff --git a/backend/app/services/interviews/base.py b/backend/app/services/interviews/base.py new file mode 100644 index 00000000..bb318db9 --- /dev/null +++ b/backend/app/services/interviews/base.py @@ -0,0 +1,72 @@ +from __future__ import annotations +from dataclasses import dataclass, field +from typing import Any, Callable, Optional, Protocol + + +@dataclass +class PersonaRecord: + agent_id: int + name: str + persona: str + profession: Optional[str] = None + bio: Optional[str] = None + + +@dataclass +class MemoryDigest: + text: str + available: bool = True + + +class MemoryProvider(Protocol): + def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ... 
+ + +class StakeholderInterviewer: + def __init__(self, llm, memory: MemoryProvider, language: str = "de"): + self.llm = llm + self.memory = memory + self.language = language + + def _system_prompt(self, persona: PersonaRecord, digest: MemoryDigest, schema_hint: str) -> str: + memory_block = digest.text if digest.available else "[no simulation memory available]" + lang_note = "Antworte ausschließlich auf Deutsch." if self.language == "de" else "Answer in English." + return ( + f"You are {persona.name}. {persona.persona}\n\n" + "You are answering a survey about the future of German fisheries. " + "Answer strictly in character based on your background, values, and what you experienced " + "during the simulated social media discourse summarised below.\n\n" + f"--- simulation memory digest ---\n{memory_block}\n--- end ---\n\n" + f"{lang_note} Return JSON ONLY matching this schema:\n{schema_hint}" + ) + + def ask_in_character( + self, + persona: PersonaRecord, + user_prompt: str, + schema_hint: str, + *, + temperature: float = 0.3, + max_tokens: Optional[int] = None, + validate: Optional[Callable[[dict], Optional[dict]]] = None, + ) -> dict: + digest = self.memory.get_digest(persona.agent_id) + messages = [ + {"role": "system", "content": self._system_prompt(persona, digest, schema_hint)}, + {"role": "user", "content": user_prompt}, + ] + out = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens) + if validate is not None: + validated = validate(out) + if validated is not None: + return validated + messages.append({"role": "assistant", "content": str(out)}) + messages.append({"role": "user", "content": + "Your previous response did not match the required schema. 
" + f"Return ONLY valid JSON matching: {schema_hint}"}) + out = self.llm.chat_json(messages=messages, temperature=0.0, max_tokens=max_tokens) + validated = validate(out) + if validated is None: + raise ValueError(f"agent {persona.agent_id}: schema violation after retry") + return validated + return out diff --git a/backend/tests/interviews/test_base_interviewer.py b/backend/tests/interviews/test_base_interviewer.py new file mode 100644 index 00000000..2c8962ef --- /dev/null +++ b/backend/tests/interviews/test_base_interviewer.py @@ -0,0 +1,47 @@ +import json +import pytest +from app.services.interviews.base import StakeholderInterviewer, MemoryDigest, PersonaRecord + +class _FakeLLM: + def __init__(self, responses): + self.responses = list(responses) + self.calls = [] + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.calls.append(messages) + return self.responses.pop(0) + +class _FakeMemory: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"digest-for-{agent_id}", available=True) + +def test_in_character_prompt_includes_persona_and_memory(): + llm = _FakeLLM([{"x": 1}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=7, name="A", persona="I am a small-scale Baltic fisher.") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="{...}") + assert out == {"x": 1} + sys_msg = llm.calls[0][0]["content"] + assert "small-scale Baltic fisher" in sys_msg + assert "digest-for-7" in sys_msg + +def test_schema_retry_on_first_failure(): + bad_then_good = [{}, {"responses": {"a": 3}}] + llm = _FakeLLM(bad_then_good) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + def validator(d): + return d if "responses" in d else None + persona = PersonaRecord(agent_id=1, name="A", persona="p") + out = interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", validate=validator) + assert out == 
{"responses": {"a": 3}} + assert len(llm.calls) == 2 + +def test_two_failures_raise(): + llm = _FakeLLM([{}, {}]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + with pytest.raises(ValueError): + interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", + validate=lambda d: d if "responses" in d else None) From 0fcb815cde4dfc0e7f3cc86b38a5487265f5e73e Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:12:46 +0200 Subject: [PATCH 09/26] feat(interviews): longitudinal subagent + 12-item Likert instrument Co-Authored-By: Claude Opus 4.7 (1M context) --- .../app/services/interviews/longitudinal.py | 109 ++++++++++++++++++ .../scripts/instruments/longitudinal_v1.yaml | 47 ++++++++ backend/tests/interviews/test_longitudinal.py | 57 +++++++++ 3 files changed, 213 insertions(+) create mode 100644 backend/app/services/interviews/longitudinal.py create mode 100644 backend/scripts/instruments/longitudinal_v1.yaml create mode 100644 backend/tests/interviews/test_longitudinal.py diff --git a/backend/app/services/interviews/longitudinal.py b/backend/app/services/interviews/longitudinal.py new file mode 100644 index 00000000..4f13ec23 --- /dev/null +++ b/backend/app/services/interviews/longitudinal.py @@ -0,0 +1,109 @@ +from __future__ import annotations +import json +import math +from pathlib import Path +from typing import Optional +from app.models.interview import ( + LikertInstrument, LikertResponse, InterviewPhase, +) +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.instrument_loader import load_likert_instrument + + +class LongitudinalSubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + self.instrument: LikertInstrument = load_likert_instrument(Path(instrument_path)) + self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, 
language=language) + self.language = language + + def _schema_hint(self) -> str: + ids = [i.item_id for i in self.instrument.items] + return json.dumps({ + "responses": {k: "" for k in ids}, + "confidence": {k: "" for k in ids}, + "open_comment": "", + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + lines = [ + "Bitte bewerten Sie die folgenden Aussagen auf einer Skala von 1 (lehne stark ab) bis 5 (stimme stark zu)." + if self.language == "de" + else "Please rate the following statements on a scale from 1 (strongly disagree) to 5 (strongly agree)." + ] + for it in self.instrument.items: + txt = it.de if self.language == "de" else it.en + lines.append(f"- [{it.item_id}] {txt}") + return "\n".join(lines) + + def _validator(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): + return None + resp = raw.get("responses") + if not isinstance(resp, dict): + return None + required = {it.item_id for it in self.instrument.items} + if not required.issubset(resp.keys()): + return None + for k, v in resp.items(): + if not isinstance(v, int) or not 1 <= v <= 5: + return None + return raw + + def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse: + raw = self.interviewer.ask_in_character( + persona, + user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), + validate=self._validator, + ) + return LikertResponse( + agent_id=persona.agent_id, + phase=phase, + responses={k: int(v) for k, v in raw["responses"].items()}, + confidence={k: float(v) for k, v in raw.get("confidence", {}).items()}, + open_comment=raw.get("open_comment"), + ) + + +def run_aggregate(t0: list[LikertResponse], t1: list[LikertResponse]) -> dict: + by_t0 = {r.agent_id: r for r in t0} + by_t1 = {r.agent_id: r for r in t1} + paired = sorted(set(by_t0) & set(by_t1)) + items: set[str] = set() + for r in t0 + t1: + items.update(r.responses.keys()) + per_item: dict[str, dict] = {} + for it in sorted(items): + deltas = [] + for aid in paired: + v0 = 
by_t0[aid].responses.get(it) + v1 = by_t1[aid].responses.get(it) + if v0 is None or v1 is None: + continue + deltas.append(v1 - v0) + if not deltas: + per_item[it] = {"mean_delta": None, "n": 0} + continue + m = sum(deltas) / len(deltas) + var = sum((d - m) ** 2 for d in deltas) / max(len(deltas) - 1, 1) + per_item[it] = { + "mean_delta": m, + "sd_delta": math.sqrt(var), + "n": len(deltas), + "n_positive": sum(1 for d in deltas if d > 0), + "n_negative": sum(1 for d in deltas if d < 0), + } + per_agent: dict[int, dict] = {} + for aid in paired: + r0 = by_t0[aid].responses + r1 = by_t1[aid].responses + common = set(r0) & set(r1) + total = sum(abs(r1[k] - r0[k]) for k in common) + per_agent[aid] = {"total_abs_drift": total, "n_items": len(common)} + return { + "n_paired": len(paired), + "n_t0_only": len(set(by_t0) - set(by_t1)), + "n_t1_only": len(set(by_t1) - set(by_t0)), + "per_item": per_item, + "per_agent": per_agent, + } diff --git a/backend/scripts/instruments/longitudinal_v1.yaml b/backend/scripts/instruments/longitudinal_v1.yaml new file mode 100644 index 00000000..7a37d18c --- /dev/null +++ b/backend/scripts/instruments/longitudinal_v1.yaml @@ -0,0 +1,47 @@ +name: longitudinal_v1 +version: "1.0" +language_default: de +items: + # Stock status & recovery + - {item_id: stk_1, family: stocks, scale: 5, + de: "Der westliche Dorschbestand wird sich bis 2035 erholen.", + en: "The Western Baltic cod stock will recover by 2035."} + - {item_id: stk_2, family: stocks, scale: 5, + de: "Der Heringsbestand in der westlichen Ostsee ist nicht mehr zu retten.", + en: "The Western Baltic herring stock can no longer be saved.", + reverse_coded: true} + - {item_id: stk_3, family: stocks, scale: 5, + de: "Wissenschaftliche Bestandsschätzungen sind generell zuverlässig.", + en: "Scientific stock assessments are generally reliable."} + # Governance & CFP + - {item_id: gov_1, family: governance, scale: 5, + de: "Die Gemeinsame Fischereipolitik der EU scheitert beim Schutz der 
Ostseefische.", + en: "The EU Common Fisheries Policy fails to protect Baltic fish.", + reverse_coded: true} + - {item_id: gov_2, family: governance, scale: 5, + de: "Entscheidungen über Fangquoten sollten stärker lokal getroffen werden.", + en: "Decisions on catch quotas should be taken more locally."} + - {item_id: gov_3, family: governance, scale: 5, + de: "Die deutsche Bundesregierung handelt entschlossen bei Fischereifragen.", + en: "The German federal government acts decisively on fisheries issues."} + # Market & MSC + - {item_id: mkt_1, family: market, scale: 5, + de: "Nur MSC-zertifizierter Fisch sollte verkauft werden dürfen.", + en: "Only MSC-certified fish should be allowed for sale."} + - {item_id: mkt_2, family: market, scale: 5, + de: "Importierter Fisch verdrängt die deutsche Kleinfischerei.", + en: "Imported fish displaces German small-scale fisheries."} + - {item_id: mkt_3, family: market, scale: 5, + de: "Verbraucher zahlen gerne mehr für nachhaltigen Ostseefisch.", + en: "Consumers gladly pay more for sustainable Baltic fish."} + # Climate & adaptation + - {item_id: clm_1, family: climate, scale: 5, + de: "Der Klimawandel macht traditionelle Ostseefischerei unmöglich.", + en: "Climate change makes traditional Baltic fisheries impossible.", + reverse_coded: true} + - {item_id: clm_2, family: climate, scale: 5, + de: "Aquakultur ist die Zukunft der deutschen Fischwirtschaft.", + en: "Aquaculture is the future of the German fishing industry."} + - {item_id: clm_3, family: climate, scale: 5, + de: "Die Fischerei muss sich grundlegend an neue Arten anpassen.", + en: "Fisheries must fundamentally adapt to new species."} diff --git a/backend/tests/interviews/test_longitudinal.py b/backend/tests/interviews/test_longitudinal.py new file mode 100644 index 00000000..823e1552 --- /dev/null +++ b/backend/tests/interviews/test_longitudinal.py @@ -0,0 +1,57 @@ +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase +from 
app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate + + +class _FakeMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + +class _CannedLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + return { + "responses": { + "stk_1": 4, "stk_2": 3, "stk_3": 5, + "gov_1": 3, "gov_2": 4, "gov_3": 2, + "mkt_1": 5, "mkt_2": 3, "mkt_3": 4, + "clm_1": 2, "clm_2": 4, "clm_3": 5, + }, + "confidence": { + "stk_1": 0.8, "stk_2": 0.7, "stk_3": 0.9, + "gov_1": 0.6, "gov_2": 0.7, "gov_3": 0.5, + "mkt_1": 0.7, "mkt_2": 0.6, "mkt_3": 0.8, + "clm_1": 0.5, "clm_2": 0.7, "clm_3": 0.6, + }, + "open_comment": "test", + } + + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + + +def test_longitudinal_administer_one_agent(): + sub = LongitudinalSubagent(llm=_CannedLLM(), memory=_FakeMem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 3 + assert resp.phase == InterviewPhase.T0 + assert set(resp.responses.keys()) >= {"stk_1", "gov_1", "mkt_1", "clm_1"} + + +def test_longitudinal_aggregate_delta(): + from app.models.interview import LikertResponse + t0 = [LikertResponse(agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + t1 = [LikertResponse(agent_id=i, phase=InterviewPhase.T1, + responses={"stk_1": 4, "gov_1": 4}, + confidence={"stk_1": 0.8, "gov_1": 0.8}) for i in range(5)] + agg = run_aggregate(t0, t1) + assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 + assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 + assert agg["n_paired"] == 5 From 75762ccc186d67f2f37e4e31756ba97fd42d8535 Mon Sep 17 00:00:00 2001 
From: Christian Moellmann Date: Sat, 23 May 2026 12:16:21 +0200 Subject: [PATCH 10/26] feat(interviews): diversity subagent with Q-sort + 6 Likert axes + PCA/k-means typology Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/diversity.py | 133 ++++++++++++++++++ backend/scripts/instruments/diversity_v1.yaml | 36 +++++ backend/tests/interviews/test_diversity.py | 48 +++++++ 3 files changed, 217 insertions(+) create mode 100644 backend/app/services/interviews/diversity.py create mode 100644 backend/scripts/instruments/diversity_v1.yaml create mode 100644 backend/tests/interviews/test_diversity.py diff --git a/backend/app/services/interviews/diversity.py b/backend/app/services/interviews/diversity.py new file mode 100644 index 00000000..96febcf5 --- /dev/null +++ b/backend/app/services/interviews/diversity.py @@ -0,0 +1,133 @@ +from __future__ import annotations +import json +from pathlib import Path +from typing import Optional +import numpy as np +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +import yaml +from app.models.interview import QSortResponse +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.instrument_loader import InstrumentValidationError + + +class DiversitySubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + self.instrument = self._load(Path(instrument_path)) + self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language) + self.language = language + + def _load(self, path: Path) -> dict: + with path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + if not isinstance(data, dict) or "statements" not in data or "distribution" not in data: + raise InstrumentValidationError(f"invalid diversity instrument: {path}") + if sum(data["distribution"]) != len(data["statements"]): + raise InstrumentValidationError("distribution sum must equal number of statements") + return 
data + + def _schema_hint(self) -> str: + return json.dumps({ + "placements": {s["statement_id"]: "" for s in self.instrument["statements"]}, + "likert_axes": {a["axis_id"]: "" for a in self.instrument["likert_axes"]}, + }, ensure_ascii=False) + + def _user_prompt(self) -> str: + dist = self.instrument["distribution"] + buckets = list(range(-3, 4)) + bucket_desc = ", ".join(f"{b}:{n}" for b, n in zip(buckets, dist)) + lines = [ + ("Ordnen Sie jede Aussage genau einer Box von -3 (lehne stark ab) bis +3 (stimme stark zu) zu. " + f"Die Verteilung ist erzwungen: {bucket_desc}.") if self.language == "de" else + ("Place every statement into exactly one box from -3 (strongly disagree) to +3 (strongly agree). " + f"The distribution is forced: {bucket_desc}."), + "", + "Statements:", + ] + for s in self.instrument["statements"]: + txt = s["de"] if self.language == "de" else s["en"] + lines.append(f"- [{s['statement_id']}] {txt}") + lines += ["", "Then rate each axis from 1 to 7:"] + for a in self.instrument["likert_axes"]: + txt = a["de"] if self.language == "de" else a["en"] + lines.append(f"- [{a['axis_id']}] {txt}") + return "\n".join(lines) + + def _validator(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): + return None + placements = raw.get("placements", {}) + axes = raw.get("likert_axes", {}) + statements = {s["statement_id"] for s in self.instrument["statements"]} + if set(placements.keys()) != statements: + return None + dist = self.instrument["distribution"] + target = {b: n for b, n in zip(range(-3, 4), dist)} + got: dict[int, int] = {} + for v in placements.values(): + if not isinstance(v, int) or not -3 <= v <= 3: + return None + got[v] = got.get(v, 0) + 1 + if got != target: + return None + for a in self.instrument["likert_axes"]: + v = axes.get(a["axis_id"]) + if not isinstance(v, int) or not 1 <= v <= 7: + return None + return raw + + def administer(self, persona: PersonaRecord) -> QSortResponse: + raw = self.interviewer.ask_in_character( 
+ persona, + user_prompt=self._user_prompt(), + schema_hint=self._schema_hint(), + validate=self._validator, + ) + return QSortResponse( + agent_id=persona.agent_id, + placements={k: int(v) for k, v in raw["placements"].items()}, + likert_axes={k: int(v) for k, v in raw["likert_axes"].items()}, + ) + + +def _vectorize(r: QSortResponse, statements: list[str], axes: list[str]) -> np.ndarray: + return np.array( + [r.placements.get(s, 0) for s in statements] + + [r.likert_axes.get(a, 4) for a in axes], + dtype=float, + ) + + +def run_typology(responses: list[QSortResponse], n_clusters: int = 4) -> dict: + if not responses: + return {"n": 0, "clusters": [], "pca": {"components": [], "explained_variance": []}} + statements = sorted({k for r in responses for k in r.placements}) + axes = sorted({k for r in responses for k in r.likert_axes}) + X = np.vstack([_vectorize(r, statements, axes) for r in responses]) + n_clusters = min(n_clusters, len(responses)) + pca = PCA(n_components=min(5, X.shape[1], X.shape[0])) + pcs = pca.fit_transform(X) + km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0) + labels = km.fit_predict(X) + clusters = [] + for c in range(n_clusters): + members = [responses[i].agent_id for i in range(len(responses)) if labels[i] == c] + centroid = km.cluster_centers_[c] + clusters.append({ + "cluster_id": int(c), + "n": len(members), + "agent_ids": members, + "top_loadings": { + statements[i] if i < len(statements) else axes[i - len(statements)]: float(centroid[i]) + for i in np.argsort(np.abs(centroid))[::-1][:8].tolist() + }, + }) + return { + "n": len(responses), + "clusters": clusters, + "pca": { + "components": pcs.tolist(), + "explained_variance": pca.explained_variance_ratio_.tolist(), + "agent_ids": [r.agent_id for r in responses], + }, + } diff --git a/backend/scripts/instruments/diversity_v1.yaml b/backend/scripts/instruments/diversity_v1.yaml new file mode 100644 index 00000000..7c47cd96 --- /dev/null +++ 
b/backend/scripts/instruments/diversity_v1.yaml @@ -0,0 +1,36 @@ +name: diversity_v1 +version: "1.0" +language_default: de +distribution: [2, 3, 4, 6, 4, 3, 2] # buckets from -3 to +3, total 24 +statements: + - {statement_id: st_01, de: "Die Ostsee gehört den Fischern, die hier seit Generationen leben.", en: "The Baltic belongs to fishers who have lived here for generations."} + - {statement_id: st_02, de: "MSC-Zertifizierung schützt vor allem große Konzerne.", en: "MSC certification mainly protects large corporations."} + - {statement_id: st_03, de: "Wissenschaftliche Quoten sind die einzige Grundlage für Politik.", en: "Scientific quotas are the only legitimate basis for policy."} + - {statement_id: st_04, de: "Aquakultur kann Ostseefischerei ersetzen.", en: "Aquaculture can replace Baltic fisheries."} + - {statement_id: st_05, de: "Sportfischer schaden den Beständen mehr als die Berufsfischer.", en: "Recreational anglers harm stocks more than commercial fishers."} + - {statement_id: st_06, de: "Die EU-Fischereipolitik kennt die Ostsee nicht.", en: "EU fisheries policy doesn't understand the Baltic."} + - {statement_id: st_07, de: "Großtechnische Fischerei ist effizienter und damit nachhaltiger.", en: "Industrial fisheries are more efficient and therefore more sustainable."} + - {statement_id: st_08, de: "Wer Fisch isst, sollte mehr dafür bezahlen.", en: "Those who eat fish should pay more for it."} + - {statement_id: st_09, de: "Die Kleinfischerei muss subventioniert werden.", en: "Small-scale fisheries must be subsidised."} + - {statement_id: st_10, de: "Marine Schutzgebiete sind reine Symbolpolitik.", en: "Marine protected areas are mere symbolism."} + - {statement_id: st_11, de: "Russlands Krieg ändert alles in der Ostsee.", en: "Russia's war changes everything in the Baltic."} + - {statement_id: st_12, de: "Nur drastische Reduktion der Fangmengen rettet die Bestände.", en: "Only drastic catch reductions will save the stocks."} + - {statement_id: st_13, de: 
"NGOs übertreiben die Krise systematisch.", en: "NGOs systematically exaggerate the crisis."} + - {statement_id: st_14, de: "Klimawandel ist das eigentliche Problem, nicht die Fischerei.", en: "Climate change is the real problem, not fisheries."} + - {statement_id: st_15, de: "Tradition zählt mehr als kurzfristige Bestandszahlen.", en: "Tradition matters more than short-term stock numbers."} + - {statement_id: st_16, de: "Verbraucher entscheiden über die Zukunft des Fisches.", en: "Consumers decide the future of fish."} + - {statement_id: st_17, de: "Ohne Generalstreik der Fischer ändert sich nichts.", en: "Without a fishers' general strike, nothing will change."} + - {statement_id: st_18, de: "Die Bundesregierung sollte Kutter aufkaufen und stilllegen.", en: "The federal government should buy out and decommission boats."} + - {statement_id: st_19, de: "Die Dorschkrise ist Folge gescheiterter Politik.", en: "The cod crisis is the result of policy failure."} + - {statement_id: st_20, de: "Ostsee-Aquakultur ist ökologisch problematisch.", en: "Baltic aquaculture is ecologically problematic."} + - {statement_id: st_21, de: "Junge Menschen werden keinen Fischereibetrieb mehr übernehmen.", en: "Young people will no longer take over fishing businesses."} + - {statement_id: st_22, de: "Markt regelt sich selbst, auch beim Fisch.", en: "The market regulates itself, also for fish."} + - {statement_id: st_23, de: "Lokale Genossenschaften sind die Lösung.", en: "Local cooperatives are the solution."} + - {statement_id: st_24, de: "In 20 Jahren gibt es keine deutsche Ostseefischerei mehr.", en: "In 20 years there will be no German Baltic fisheries left."} +likert_axes: + - {axis_id: ax_pres_extr, scale: 7, de: "Bewahrung (1) vs. Nutzung (7)", en: "Preservation (1) vs. Extraction (7)"} + - {axis_id: ax_loc_eu, scale: 7, de: "Lokal (1) vs. EU-zentral (7)", en: "Local (1) vs. EU-central (7)"} + - {axis_id: ax_sci_trad, scale: 7, de: "Wissenschaft (1) vs. 
Tradition (7)", en: "Science-led (1) vs. Tradition-led (7)"} + - {axis_id: ax_ind_col, scale: 7, de: "Individuum (1) vs. Kollektiv (7)", en: "Individual (1) vs. Collective (7)"} + - {axis_id: ax_short_long,scale: 7, de: "Kurzfristig (1) vs. Langfristig (7)", en: "Short-term (1) vs. Long-term (7)"} + - {axis_id: ax_mkt_reg, scale: 7, de: "Markt (1) vs. Regulierung (7)", en: "Market (1) vs. Regulation (7)"} diff --git a/backend/tests/interviews/test_diversity.py b/backend/tests/interviews/test_diversity.py new file mode 100644 index 00000000..7650fac2 --- /dev/null +++ b/backend/tests/interviews/test_diversity.py @@ -0,0 +1,48 @@ +from pathlib import Path +import numpy as np +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.diversity import ( + DiversitySubagent, run_typology, +) + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _CannedLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + # Place all 24 statements into legal buckets per the forced distribution + placements = {} + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + for i in range(24): + placements[f"st_{i+1:02d}"] = buckets[i] + return { + "placements": placements, + "likert_axes": {"ax_pres_extr": 5, "ax_loc_eu": 3, "ax_sci_trad": 4, + "ax_ind_col": 4, "ax_short_long": 5, "ax_mkt_reg": 3}, + } + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + +def test_diversity_administer(): + sub = DiversitySubagent(llm=_CannedLLM(), memory=_Mem(), instrument_path=INSTRUMENT) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + resp = sub.administer(persona) + assert len(resp.placements) == 24 + assert set(resp.likert_axes.keys()) == { + "ax_pres_extr","ax_loc_eu","ax_sci_trad","ax_ind_col","ax_short_long","ax_mkt_reg" + } + +def test_typology_runs_pca_kmeans(): + from app.models.interview 
import QSortResponse + rng = np.random.default_rng(42) + responses = [] + for aid in range(20): + placements = {f"st_{i+1:02d}": int(rng.integers(-3, 4)) for i in range(24)} + axes = {f"ax_{j}": int(rng.integers(1, 8)) for j in range(6)} + responses.append(QSortResponse(agent_id=aid, placements=placements, likert_axes=axes)) + result = run_typology(responses, n_clusters=3) + assert "clusters" in result + assert len(result["clusters"]) == 3 + assert "pca" in result + assert len(result["pca"]["components"]) >= 2 From 5d7111b54e941aa5e1704ecfad406522b51d6471 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:19:07 +0200 Subject: [PATCH 11/26] feat(interviews): Delphi subagent (3 rounds: open, rate, revise) + convergence metrics Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/delphi.py | 195 +++++++++++++++++++++ backend/scripts/instruments/delphi_v1.yaml | 9 + backend/tests/interviews/test_delphi.py | 58 ++++++ 3 files changed, 262 insertions(+) create mode 100644 backend/app/services/interviews/delphi.py create mode 100644 backend/scripts/instruments/delphi_v1.yaml create mode 100644 backend/tests/interviews/test_delphi.py diff --git a/backend/app/services/interviews/delphi.py b/backend/app/services/interviews/delphi.py new file mode 100644 index 00000000..be455ae9 --- /dev/null +++ b/backend/app/services/interviews/delphi.py @@ -0,0 +1,195 @@ +from __future__ import annotations +import json +import statistics +from pathlib import Path +from typing import Optional +import yaml +from app.models.interview import ( + DelphiOpenResponse, DelphiRatingResponse, +) +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord + + +class DelphiSubagent: + def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): + with Path(instrument_path).open("r", encoding="utf-8") as f: + self.instrument = yaml.safe_load(f) + self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, 
language=language) + self.llm = llm + self.language = language + + # --- Round 1: open questions --- + def _r1_schema(self) -> str: + return json.dumps({ + "answers": {q["question_id"]: "" for q in self.instrument["questions"]} + }, ensure_ascii=False) + + def _r1_prompt(self) -> str: + lines = ["Bitte beantworten Sie offen:" if self.language == "de" else "Please answer openly:"] + for q in self.instrument["questions"]: + txt = q["de"] if self.language == "de" else q["en"] + lines.append(f"[{q['question_id']}] {txt}") + return "\n".join(lines) + + def _r1_validate(self, raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + ans = raw.get("answers") + if not isinstance(ans, dict): return None + required = {q["question_id"] for q in self.instrument["questions"]} + if not required.issubset(ans.keys()): return None + return raw + + def administer_round1(self, persona: PersonaRecord) -> DelphiOpenResponse: + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._r1_prompt(), + schema_hint=self._r1_schema(), validate=self._r1_validate, + ) + return DelphiOpenResponse(agent_id=persona.agent_id, round=1, + answers={k: str(v) for k, v in raw["answers"].items()}) + + # --- Round 2: rate themes --- + def _r2_schema(self, theme_ids: list[str]) -> str: + return json.dumps({ + "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids} + }, ensure_ascii=False) + + def _r2_prompt(self, themes: list[dict]) -> str: + head = "Bewerten Sie jedes Thema nach Wichtigkeit (1-5) und Plausibilität (1-5):" if self.language == "de" \ + else "Rate each theme on importance (1-5) and plausibility (1-5):" + body = [f"- [{t['theme_id']}] {t['label']}" for t in themes] + return head + "\n" + "\n".join(body) + + def _r2_validate(self, theme_ids: list[str]): + def v(raw: dict) -> Optional[dict]: + if not isinstance(raw, dict): return None + ratings = raw.get("ratings", {}) + if set(ratings.keys()) != set(theme_ids): return None + for tid, r in 
ratings.items(): + if not isinstance(r, dict): return None + for key in ("importance", "plausibility"): + if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + return raw + return v + + def administer_round2(self, persona: PersonaRecord, themes: list[dict]) -> DelphiRatingResponse: + theme_ids = [t["theme_id"] for t in themes] + raw = self.interviewer.ask_in_character( + persona, user_prompt=self._r2_prompt(themes), + schema_hint=self._r2_schema(theme_ids), validate=self._r2_validate(theme_ids), + ) + return DelphiRatingResponse(agent_id=persona.agent_id, round=2, + ratings={k: dict(v) for k, v in raw["ratings"].items()}) + + # --- Round 3: revise after seeing group stats --- + def administer_round3( + self, persona: PersonaRecord, themes: list[dict], group_stats: dict, own_r2: DelphiRatingResponse + ) -> DelphiRatingResponse: + theme_ids = [t["theme_id"] for t in themes] + head = ("Sie sehen unten die anonymisierten Gruppenwerte (Median, IQR). " + "Bitte überarbeiten Sie Ihre Bewertungen, wenn Sie möchten, und begründen Sie kurz.") \ + if self.language == "de" else \ + ("Below are the anonymised group values (median, IQR). " + "Please revise your ratings if you wish and add a short justification.") + ctx_lines = [] + for t in themes: + tid = t["theme_id"] + gs = group_stats.get(tid, {}) + own = own_r2.ratings.get(tid, {}) + ctx_lines.append( + f"[{tid}] {t['label']} — group importance median={gs.get('imp_median')}, " + f"IQR={gs.get('imp_iqr')}; plausibility median={gs.get('plaus_median')}, " + f"IQR={gs.get('plaus_iqr')}. Your R2: imp={own.get('importance')}, plaus={own.get('plausibility')}." 
+ ) + prompt = head + "\n\n" + "\n".join(ctx_lines) + schema = json.dumps({ + "ratings": {tid: {"importance": "", "plausibility": ""} for tid in theme_ids}, + "justification": "", + }, ensure_ascii=False) + + def validate(raw): + if not isinstance(raw, dict): return None + ratings = raw.get("ratings", {}) + if set(ratings.keys()) != set(theme_ids): return None + for r in ratings.values(): + if not isinstance(r, dict): return None + for key in ("importance", "plausibility"): + if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + return raw + + raw = self.interviewer.ask_in_character(persona, user_prompt=prompt, + schema_hint=schema, validate=validate) + return DelphiRatingResponse( + agent_id=persona.agent_id, round=3, + ratings={k: dict(v) for k, v in raw["ratings"].items()}, + justification=raw.get("justification"), + ) + + +def extract_themes(round1: list[DelphiOpenResponse], llm) -> list[dict]: + text_blocks = [] + for r in round1: + for qid, ans in r.answers.items(): + text_blocks.append(f"[agent {r.agent_id} {qid}] {ans}") + schema = json.dumps({"themes": [{"theme_id": "", "label": ""}]}, ensure_ascii=False) + messages = [ + {"role": "system", "content": + "You extract distinct thematic codes from open-ended German fisheries survey responses. " + f"Return JSON ONLY matching: {schema}. 
Use stable theme_ids of form theme_0, theme_1, …"}, + {"role": "user", "content": "Responses:\n" + "\n".join(text_blocks) + "\n\nReturn up to 12 distinct themes."}, + ] + raw = llm.chat_json(messages=messages, temperature=0.0) + themes = raw.get("themes", []) if isinstance(raw, dict) else [] + out = [] + for i, t in enumerate(themes): + if isinstance(t, dict) and "label" in t: + out.append({"theme_id": t.get("theme_id") or f"theme_{i}", "label": str(t["label"])}) + return out + + +def _iqr(xs: list[float]) -> float: + if not xs: return 0.0 + xs = sorted(xs) + q1 = statistics.quantiles(xs, n=4)[0] if len(xs) >= 4 else xs[0] + q3 = statistics.quantiles(xs, n=4)[2] if len(xs) >= 4 else xs[-1] + return q3 - q1 + + +def convergence_metrics(r2: list[DelphiRatingResponse], r3: list[DelphiRatingResponse]) -> dict: + by_r2 = {r.agent_id: r for r in r2} + by_r3 = {r.agent_id: r for r in r3} + themes: set[str] = set() + for r in r2 + r3: + themes.update(r.ratings.keys()) + out: dict[str, dict] = {} + for t in sorted(themes): + imp_r2 = [by_r2[a].ratings[t]["importance"] for a in by_r2 if t in by_r2[a].ratings] + imp_r3 = [by_r3[a].ratings[t]["importance"] for a in by_r3 if t in by_r3[a].ratings] + plaus_r2 = [by_r2[a].ratings[t]["plausibility"] for a in by_r2 if t in by_r2[a].ratings] + plaus_r3 = [by_r3[a].ratings[t]["plausibility"] for a in by_r3 if t in by_r3[a].ratings] + out[t] = { + "imp_median_r2": statistics.median(imp_r2) if imp_r2 else None, + "imp_median_r3": statistics.median(imp_r3) if imp_r3 else None, + "imp_iqr_r2": _iqr(imp_r2), + "imp_iqr_r3": _iqr(imp_r3), + "delta_iqr_importance": _iqr(imp_r3) - _iqr(imp_r2), + "plaus_iqr_r2": _iqr(plaus_r2), + "plaus_iqr_r3": _iqr(plaus_r3), + "delta_iqr_plausibility": _iqr(plaus_r3) - _iqr(plaus_r2), + } + return out + + +def group_stats_from_r2(r2: list[DelphiRatingResponse]) -> dict: + themes: set[str] = set() + for r in r2: themes.update(r.ratings.keys()) + stats: dict[str, dict] = {} + for t in themes: + imps = 
[r.ratings[t]["importance"] for r in r2 if t in r.ratings] + plauss = [r.ratings[t]["plausibility"] for r in r2 if t in r.ratings] + stats[t] = { + "imp_median": statistics.median(imps) if imps else None, + "imp_iqr": _iqr(imps), + "plaus_median": statistics.median(plauss) if plauss else None, + "plaus_iqr": _iqr(plauss), + } + return stats diff --git a/backend/scripts/instruments/delphi_v1.yaml b/backend/scripts/instruments/delphi_v1.yaml new file mode 100644 index 00000000..bb7650dc --- /dev/null +++ b/backend/scripts/instruments/delphi_v1.yaml @@ -0,0 +1,9 @@ +name: delphi_v1 +version: "1.0" +language_default: de +rounds: 3 +questions: + - {question_id: q1, de: "Welche drei Faktoren werden die deutsche Fischerei bis 2040 am stärksten prägen?", en: "Which three factors will most shape German fisheries by 2040?"} + - {question_id: q2, de: "Welche Akteurinnen und Akteure sind heute entscheidend, werden aber unterschätzt?", en: "Which actors are decisive today but underestimated?"} + - {question_id: q3, de: "Was sollte sich in den nächsten fünf Jahren ändern, damit die Fischerei eine Zukunft hat?", en: "What should change in the next five years for fisheries to have a future?"} + - {question_id: q4, de: "Welcher Trend macht Ihnen am meisten Hoffnung – und welcher am meisten Sorge?", en: "Which trend gives you most hope — and which most concern?"} diff --git a/backend/tests/interviews/test_delphi.py b/backend/tests/interviews/test_delphi.py new file mode 100644 index 00000000..c01ecfb8 --- /dev/null +++ b/backend/tests/interviews/test_delphi.py @@ -0,0 +1,58 @@ +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, +) + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + 
class _R1LLM:
    """Canned LLM that returns one open answer for each of the four questions."""

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        return {"answers": {
            "q1": "Klimawandel, Quoten, Generationswechsel",
            "q2": "MSC, Aquakultur",
            "q3": "Russland, EU-Politik",
            "q4": "Verbraucherpreise",
        }}


class _R2LLM:
    """Canned LLM that rates theme_0..theme_4 with importance 4 / plausibility 3."""

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        return {"ratings": {f"theme_{i}": {"importance": 4, "plausibility": 3} for i in range(5)}}


class _ExtractLLM:
    """Canned LLM that returns five fixed themes for theme extraction."""

    def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
        return {"themes": [
            {"theme_id": "theme_0", "label": "Klimawandel"},
            {"theme_id": "theme_1", "label": "Quoten"},
            {"theme_id": "theme_2", "label": "MSC"},
            {"theme_id": "theme_3", "label": "EU-Politik"},
            {"theme_id": "theme_4", "label": "Generationswechsel"},
        ]}


def test_delphi_round1_open():
    sub = DelphiSubagent(llm=_R1LLM(), memory=_Mem(), instrument_path=INSTRUMENT)
    persona = PersonaRecord(agent_id=2, name="A", persona="p")
    resp = sub.administer_round1(persona)
    assert resp.round == 1
    assert len(resp.answers) == 4


def test_delphi_round2_rating():
    # Exercises the rating round with the canned _R2LLM, which was previously unused.
    sub = DelphiSubagent(llm=_R2LLM(), memory=_Mem(), instrument_path=INSTRUMENT)
    persona = PersonaRecord(agent_id=2, name="A", persona="p")
    themes = [{"theme_id": f"theme_{i}", "label": f"Thema {i}"} for i in range(5)]
    resp = sub.administer_round2(persona, themes)
    assert resp.round == 2
    assert set(resp.ratings) == {t["theme_id"] for t in themes}
    assert resp.ratings["theme_0"]["importance"] == 4
    assert resp.ratings["theme_0"]["plausibility"] == 3


def test_extract_themes_aggregates():
    from app.models.interview import DelphiOpenResponse
    r1 = [DelphiOpenResponse(agent_id=i, answers={"q1": "Klimawandel", "q2": "MSC"}) for i in range(3)]
    themes = extract_themes(r1, llm=_ExtractLLM())
    assert len(themes) == 5
    assert all("theme_id" in t for t in themes)


def test_convergence_metrics():
    from app.models.interview import DelphiRatingResponse
    r2 = [DelphiRatingResponse(agent_id=i, round=2,
                               ratings={"t1": {"importance": 3, "plausibility": 3}}) for i in range(5)]
    r3 = [DelphiRatingResponse(agent_id=i, round=3,
                               ratings={"t1": {"importance": 4, "plausibility": 4}}) for i in range(5)]
    conv = convergence_metrics(r2, r3)
    assert "t1" in conv
    # Every agent gave the same rating within each round, so both IQRs are 0
    # and only the median moves.
    assert conv["t1"]["imp_median_r2"] == 3
    assert conv["t1"]["imp_median_r3"] == 4
    assert conv["t1"]["delta_iqr_importance"] == 0
    assert conv["t1"]["delta_iqr_plausibility"] == 0
class ScenarioSubagent:
    """Ask one persona to rate every scenario of the instrument in a single call."""

    # Numeric dimensions each scenario is rated on, on an integer 1-7 scale
    # (the scale is also spelled out in the prompt below).
    _DIMENSIONS = ("desirability", "plausibility", "impact_on_my_group", "fairness")
    _SCALE_MIN = 1
    _SCALE_MAX = 7

    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
        """Load the YAML instrument and build the in-character interviewer.

        Args:
            llm: LLM client forwarded to ``StakeholderInterviewer``.
            memory: Memory provider forwarded to ``StakeholderInterviewer``.
            instrument_path: Path to the scenario instrument YAML file.
            language: ``"de"`` selects the German texts; any other value
                selects the English ones.
        """
        with Path(instrument_path).open("r", encoding="utf-8") as f:
            self.instrument = yaml.safe_load(f)
        self.interviewer = StakeholderInterviewer(llm=llm, memory=memory, language=language)
        self.language = language

    def _schema_hint(self) -> str:
        """Return the JSON skeleton: per scenario, four empty ratings and a free-text field."""
        sids = [s["scenario_id"] for s in self.instrument["scenarios"]]
        return json.dumps({
            "ratings": {sid: {
                "desirability": "",
                "plausibility": "",
                "impact_on_my_group": "",
                "fairness": "",
                "if_woke_up_response": "",
            } for sid in sids}
        }, ensure_ascii=False)

    def _user_prompt(self) -> str:
        """Return the localised instruction followed by one block per scenario."""
        head = ("Bewerten Sie jedes der folgenden Szenarien auf vier Dimensionen (1-7) "
                "und beantworten Sie kurz, was Sie tun würden, wenn Sie in dieser Welt aufwachten.") \
            if self.language == "de" else \
               ("Rate each of the following scenarios on four dimensions (1-7) "
                "and briefly answer what you would do if you woke up in this world.")
        blocks = []
        for s in self.instrument["scenarios"]:
            label = s["label_de"] if self.language == "de" else s["label_en"]
            desc = s["description_de"] if self.language == "de" else s["description_en"]
            blocks.append(f"--- {s['scenario_id']}: {label} ---\n{desc}")
        return head + "\n\n" + "\n\n".join(blocks)

    def _validate(self, raw: dict) -> Optional[dict]:
        """Return *raw* when every scenario carries four in-range integer ratings, else ``None``.

        The free-text ``if_woke_up_response`` may be absent but must be a
        string when present. Malformed payloads are rejected, never raised on.
        """
        if not isinstance(raw, dict):
            return None
        sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
        ratings = raw.get("ratings")
        # A list/str/None here used to crash on ``.keys()``.
        if not isinstance(ratings, dict) or set(ratings) != sids:
            return None
        for v in ratings.values():
            if not isinstance(v, dict):
                return None
            for k in self._DIMENSIONS:
                score = v.get(k)
                # bool is a subclass of int; True must not count as a rating of 1.
                if not isinstance(score, int) or isinstance(score, bool):
                    return None
                if not self._SCALE_MIN <= score <= self._SCALE_MAX:
                    return None
            if not isinstance(v.get("if_woke_up_response", ""), str):
                return None
        return raw

    def administer(self, persona: PersonaRecord) -> ScenarioResponse:
        """Run the scenario questionnaire for *persona* and wrap the validated ratings."""
        raw = self.interviewer.ask_in_character(
            persona, user_prompt=self._user_prompt(),
            schema_hint=self._schema_hint(), validate=self._validate,
        )
        ratings = {sid: ScenarioRating(**v) for sid, v in raw["ratings"].items()}
        return ScenarioResponse(agent_id=persona.agent_id, ratings=ratings)


def polarity_matrix(responses: list[ScenarioResponse]) -> dict:
    """Aggregate scenario ratings across agents.

    Returns a dict keyed by scenario id (sorted). Each entry holds ``n`` (the
    number of agents who rated the scenario), the mean of each of the four
    dimensions, and the population standard deviation of each dimension. A
    single rating yields a standard deviation of 0.0. Only scenarios that
    appear in at least one response are listed.
    """
    sids: set[str] = set()
    for r in responses:
        sids.update(r.ratings.keys())
    matrix: dict[str, dict] = {}
    for sid in sorted(sids):
        # Never empty: sids is built from these same responses.
        vals = [r.ratings[sid] for r in responses if sid in r.ratings]
        desirability = [v.desirability for v in vals]
        plausibility = [v.plausibility for v in vals]
        impact = [v.impact_on_my_group for v in vals]
        fairness = [v.fairness for v in vals]
        matrix[sid] = {
            "n": len(vals),
            "mean_desirability": statistics.mean(desirability),
            "mean_plausibility": statistics.mean(plausibility),
            "mean_impact": statistics.mean(impact),
            "mean_fairness": statistics.mean(fairness),
            "sd_desirability": statistics.pstdev(desirability),
            "sd_plausibility": statistics.pstdev(plausibility),
            "sd_impact": statistics.pstdev(impact),
            "sd_fairness": statistics.pstdev(fairness),
        }
    return matrix
+ description_en: | + By 2040, the EU pursues a protectionist policy with high import tariffs, + MPAs cover 30% of the Baltic, recreational fishing is strongly curtailed. + - scenario_id: S4 + label_de: "Privatisierung 2040" + label_en: "Privatisation 2040" + description_de: | + Bis 2040 sind Fangrechte als handelbare Quoten (ITQs) etabliert. Die Branche + hat sich konsolidiert; nur große, kapitalstarke Unternehmen sind übrig. + description_en: | + By 2040, fishing rights are tradable quotas (ITQs). The industry has + consolidated; only large, well-capitalised firms remain. +dimensions: + - {dimension_id: desirability, scale: 7, + de: "Wie wünschenswert ist dieses Szenario?", en: "How desirable is this scenario?"} + - {dimension_id: plausibility, scale: 7, + de: "Wie plausibel ist dieses Szenario?", en: "How plausible is this scenario?"} + - {dimension_id: impact_on_my_group, scale: 7, + de: "Wie stark trifft es Ihre Gruppe?", en: "How strongly does it affect your group?"} + - {dimension_id: fairness, scale: 7, + de: "Wie fair ist dieses Szenario?", en: "How fair is this scenario?"} diff --git a/backend/tests/interviews/test_scenario.py b/backend/tests/interviews/test_scenario.py new file mode 100644 index 00000000..567290d1 --- /dev/null +++ b/backend/tests/interviews/test_scenario.py @@ -0,0 +1,34 @@ +from pathlib import Path +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix + +INSTRUMENT = Path(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": 4, "plausibility": 3, "impact_on_my_group": 5, "fairness": 3, + "if_woke_up_response": f"act-on-{sid}", + } for sid in ("S1", "S2", "S3", "S4")}} + 
class InterviewStore:
    """File-system store for interview runs of one simulation.

    Layout: ``<root>/simulations/<sim_id>/interviews/<phase>/<subagent>/<run_id>/``
    with ``run.json``, JSONL response/audit files and JSON aggregates inside
    each run directory, plus a ``latest.json`` pointer next to the run
    directories.
    """

    def __init__(self, root: Path, sim_id: str):
        """Create (if needed) the interviews directory for *sim_id* under *root*."""
        # NOTE(review): sim_id is joined into the path unchecked - confirm callers
        # never pass values containing path separators or "..".
        self.base = Path(root) / "simulations" / sim_id / "interviews"
        self.base.mkdir(parents=True, exist_ok=True)

    def start_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path:
        """Create a fresh run directory, write its ``run.json`` and return the path.

        The run id is a local-time timestamp plus six random hex characters.
        """
        run_id = time.strftime("%Y%m%dT%H%M%S") + "-" + uuid.uuid4().hex[:6]
        run_dir = self.base / phase.value / subagent.value / run_id
        run_dir.mkdir(parents=True, exist_ok=True)
        meta = {"run_id": run_id, "phase": phase.value, "subagent": subagent.value,
                "created_at": time.time()}
        (run_dir / "run.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
        return run_dir

    def append_response(self, run_dir: Path, model: BaseModel) -> None:
        """Append *model* as one JSON line to ``responses.jsonl``."""
        path = run_dir / "responses.jsonl"
        with path.open("a", encoding="utf-8") as f:
            f.write(model.model_dump_json() + "\n")

    def append_jsonl(self, run_dir: Path, filename: str, payload: dict | BaseModel) -> None:
        """Append *payload* (plain dict or pydantic model) as one JSON line to *filename*."""
        path = run_dir / filename
        with path.open("a", encoding="utf-8") as f:
            if isinstance(payload, BaseModel):
                f.write(payload.model_dump_json() + "\n")
            else:
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")

    def read_responses(self, run_dir: Path, filename: str = "responses.jsonl") -> list[dict]:
        """Return every non-blank JSON line of *filename*; ``[]`` if the file is missing."""
        path = run_dir / filename
        if not path.exists():
            return []
        return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()]

    def write_aggregate(self, run_dir: Path, payload: dict) -> None:
        """Write *payload* to ``aggregate.json``, replacing any previous content."""
        (run_dir / "aggregate.json").write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    def write_named(self, run_dir: Path, name: str, payload: Any) -> None:
        """Write *payload* as pretty-printed JSON to the file *name* in the run directory."""
        (run_dir / name).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    def audit(self, run_dir: Path, agent_id: int | None, event: str, detail: str = "") -> None:
        """Append a timestamped audit entry to ``audit.jsonl``."""
        entry = {"ts": time.time(), "agent_id": agent_id, "event": event, "detail": detail}
        with (run_dir / "audit.jsonl").open("a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    def mark_latest(self, run_dir: Path) -> None:
        """Point ``latest.json`` (next to the run directories) at *run_dir*.

        Raises:
            ValueError: If *run_dir* is not located under this store's base
                directory.
        """
        pointer = run_dir.parent / "latest.json"
        # Stored with forward slashes so the pointer is portable across platforms.
        payload = json.dumps({"run_dir": run_dir.relative_to(self.base).as_posix()})
        # Write to a sibling file and rename so a reader never sees a
        # half-written pointer.
        staging = pointer.with_name(pointer.name + ".tmp")
        staging.write_text(payload, encoding="utf-8")
        staging.replace(pointer)

    def latest_run(self, phase: InterviewPhase, subagent: SubagentKind) -> Path | None:
        """Return the run directory recorded by ``mark_latest``.

        Returns ``None`` when no pointer exists or the directory it names is
        gone.
        """
        pointer = self.base / phase.value / subagent.value / "latest.json"
        if not pointer.exists():
            return None
        # Read with the same encoding mark_latest writes with.
        rel = json.loads(pointer.read_text(encoding="utf-8"))["run_dir"]
        path = self.base / rel
        return path if path.exists() else None
class InterviewZepWriter:
    """Write interview results into the graph memory as short text episodes.

    The wrapped ``memory_updater`` is duck-typed: ``add_text_episode(graph_id,
    text)`` is used when available, otherwise ``add_activity`` is called with a
    ``{"graph_id": ..., "text": ...}`` dict.
    """

    def __init__(self, memory_updater, graph_id: str):
        """Remember the updater and the graph every episode is written to."""
        self.updater = memory_updater
        self.graph_id = graph_id

    def _emit(self, text: str) -> None:
        """Send *text* through whichever write method the updater offers.

        Raises:
            RuntimeError: If the updater exposes neither supported method.
        """
        if hasattr(self.updater, "add_text_episode"):
            self.updater.add_text_episode(self.graph_id, text)
        elif hasattr(self.updater, "add_activity"):
            self.updater.add_activity({"graph_id": self.graph_id, "text": text})
        else:
            raise RuntimeError("memory_updater has neither add_text_episode nor add_activity")

    def _summarize_likert(self, r: LikertResponse) -> str:
        """Summarise a Likert response as its mean plus highest/lowest rated items.

        Up to three items are listed on each side, and an item never appears on
        both. With fewer than six items the lists shrink (the upper half goes
        to "agrees", the lower half to "disagrees").
        """
        items = list(r.responses.items())
        mean_v = sum(r.responses.values()) / max(len(items), 1)
        top_n = min(3, (len(items) + 1) // 2)
        bottom_n = min(3, len(items) // 2)
        # sorted() is stable, so ties keep the questionnaire order.
        top = [k for k, _ in sorted(items, key=lambda kv: -kv[1])[:top_n]]
        bot = [k for k, _ in sorted(items, key=lambda kv: kv[1]) if k not in top][:bottom_n]
        return (f"mean={mean_v:.2f}; agrees with {top}; "
                f"disagrees with {bot}")

    def _summarize_qsort(self, r: QSortResponse) -> str:
        """List the statements placed at +2 or above and at -2 or below."""
        plus = [k for k, v in r.placements.items() if v >= 2]
        minus = [k for k, v in r.placements.items() if v <= -2]
        return f"+strongly:{plus}; -strongly:{minus}"

    def _summarize_scenario(self, r: ScenarioResponse) -> str:
        """Render desirability and plausibility per scenario, separated by ``; ``."""
        parts = [f"{sid}: des={rt.desirability} plaus={rt.plausibility}"
                 for sid, rt in r.ratings.items()]
        return "; ".join(parts)

    def write_per_agent(
        self, subagent: SubagentKind, response: Any, agent_name: str,
        phase: Optional[str] = None,
    ) -> None:
        """Write a one-line episode summarising one agent's response.

        Args:
            subagent: Which interview method produced the response.
            response: The response object; unknown types are summarised by the
                first 200 characters of ``str(response)``.
            agent_name: Display name used in the episode text.
            phase: Phase label. When omitted it is taken from the response for
                Likert responses, is ``T1/R<round>`` for Delphi ratings and
                ``T1`` for everything else.
        """
        if isinstance(response, LikertResponse):
            phase = phase or response.phase.value
            summary = self._summarize_likert(response)
        elif isinstance(response, QSortResponse):
            phase = phase or "T1"
            summary = self._summarize_qsort(response)
        elif isinstance(response, ScenarioResponse):
            phase = phase or "T1"
            summary = self._summarize_scenario(response)
        elif isinstance(response, DelphiRatingResponse):
            phase = phase or f"T1/R{response.round}"
            summary = f"round={response.round}; {len(response.ratings)} themes rated"
        else:
            phase = phase or "T1"
            summary = str(response)[:200]
        text = f"Agent {agent_name} (interview/{subagent.value}/{phase}): {summary}"
        self._emit(text)

    def write_aggregate(self, subagent: SubagentKind, summary: str) -> None:
        """Write one episode carrying the aggregate *summary* of a subagent run."""
        self._emit(f"Interview aggregate ({subagent.value}): {summary}")
++++++ 2 files changed, 281 insertions(+) create mode 100644 backend/app/services/interview_orchestrator.py create mode 100644 backend/tests/interviews/test_orchestrator.py diff --git a/backend/app/services/interview_orchestrator.py b/backend/app/services/interview_orchestrator.py new file mode 100644 index 00000000..ff0d2ad8 --- /dev/null +++ b/backend/app/services/interview_orchestrator.py @@ -0,0 +1,217 @@ +from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Protocol +from app.models.interview import ( + InterviewPhase, SubagentKind, LikertResponse, QSortResponse, + DelphiOpenResponse, DelphiRatingResponse, ScenarioResponse, +) +from app.services.interviews.base import PersonaRecord +from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate as longitudinal_aggregate +from app.services.interviews.diversity import DiversitySubagent, run_typology +from app.services.interviews.delphi import ( + DelphiSubagent, extract_themes, convergence_metrics, group_stats_from_r2, +) +from app.services.interviews.scenario import ScenarioSubagent, polarity_matrix +from app.services.interviews.storage import InterviewStore +from app.services.interviews.instrument_loader import freeze_snapshot + + +class PersonaProvider(Protocol): + def all(self) -> list[PersonaRecord]: ... 
+ + +class InterviewOrchestrator: + def __init__( + self, llm, memory, personas: PersonaProvider, + instrument_dir: Path, store_root: Path, sim_id: str, + zep_writer, max_workers: int = 8, language: str = "de", + ): + self.llm = llm + self.memory = memory + self.personas = personas + self.instrument_dir = Path(instrument_dir) + self.store = InterviewStore(root=store_root, sim_id=sim_id) + self.zep_writer = zep_writer + self.max_workers = max_workers + self.language = language + # Freeze snapshot once per orchestrator lifetime + freeze_snapshot( + instruments={ + "longitudinal": self.instrument_dir / "longitudinal_v1.yaml", + "diversity": self.instrument_dir / "diversity_v1.yaml", + "delphi": self.instrument_dir / "delphi_v1.yaml", + "scenario": self.instrument_dir / "scenario_v1.yaml", + }, + out_path=self.store.base / "instruments_used.json", + ) + + # --- Generic per-agent runner --- + def _fan_out(self, run_dir, agent_fn, personas, audit_label): + ok: list = [] + failed: list[int] = [] + with ThreadPoolExecutor(max_workers=self.max_workers) as pool: + futures = {pool.submit(agent_fn, p): p for p in personas} + for fut in as_completed(futures): + p = futures[fut] + try: + out = fut.result() + ok.append(out) + self.store.append_response(run_dir, out) + except Exception as e: + failed.append(p.agent_id) + self.store.audit(run_dir, agent_id=p.agent_id, + event="agent_failed", detail=f"{audit_label}: {e!r}") + return ok, failed + + # --- Pre-phase (T0) --- + def run_pre(self) -> dict: + sub = LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T0), + self.personas.all(), audit_label="longitudinal_T0", + ) + for r in ok: + persona = next(p for p in self.personas.all() if p.agent_id == r.agent_id) + try: 
self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"longitudinal": {"n_responded": len(ok), "n_failed": len(failed), + "run_dir": str(run_dir)}} + + # --- Post-phase (T1) --- + def run_post(self) -> dict: + personas = self.personas.all() + out: dict = {} + with ThreadPoolExecutor(max_workers=4) as pool: + futures = { + "longitudinal": pool.submit(self._post_longitudinal, personas), + "diversity": pool.submit(self._post_diversity, personas), + "scenario": pool.submit(self._post_scenario, personas), + } + for name, fut in futures.items(): + try: out[name] = fut.result() + except Exception as e: out[name] = {"error": repr(e)} + # Delphi runs sequentially (R1 → R2 → R3) and uses the LLM for theme extraction + try: out["delphi"] = self._post_delphi(personas) + except Exception as e: out["delphi"] = {"error": repr(e)} + return out + + def _post_longitudinal(self, personas) -> dict: + sub = LongitudinalSubagent(self.llm, self.memory, + self.instrument_dir / "longitudinal_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p, phase=InterviewPhase.T1), + personas, audit_label="longitudinal_T1", + ) + # Aggregate using T0 + T1 + t0_path = self.store.latest_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + t0_raw = self.store.read_responses(t0_path) if t0_path else [] + t0 = [LikertResponse(**d) for d in t0_raw] + agg = longitudinal_aggregate(t0, ok) + self.store.write_aggregate(run_dir, agg) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.LONGITUDINAL, r, persona.name) + except Exception: pass + try: self.zep_writer.write_aggregate(SubagentKind.LONGITUDINAL, + f"n_paired={agg['n_paired']}") + except Exception: pass + self.store.mark_latest(run_dir) + return 
{"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_diversity(self, personas) -> dict: + sub = DiversitySubagent(self.llm, self.memory, + self.instrument_dir / "diversity_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.DIVERSITY) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="diversity", + ) + typology = run_typology(ok) + self.store.write_named(run_dir, "typology.json", typology) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "clusters": typology["clusters"]}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DIVERSITY, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_scenario(self, personas) -> dict: + sub = ScenarioSubagent(self.llm, self.memory, + self.instrument_dir / "scenario_v1.yaml", + language=self.language) + run_dir = self.store.start_run(InterviewPhase.T1, SubagentKind.SCENARIO) + ok, failed = self._fan_out( + run_dir, lambda p: sub.administer(p), personas, audit_label="scenario", + ) + matrix = polarity_matrix(ok) + self.store.write_named(run_dir, "polarity_matrix.json", matrix) + self.store.write_aggregate(run_dir, {"n": len(ok), "n_failed": len(failed), + "polarity": matrix}) + for r in ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.SCENARIO, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_responded": len(ok), "n_failed": len(failed), "run_dir": str(run_dir)} + + def _post_delphi(self, personas) -> dict: + sub = DelphiSubagent(self.llm, self.memory, + self.instrument_dir / "delphi_v1.yaml", + language=self.language) + run_dir = 
self.store.start_run(InterviewPhase.T1, SubagentKind.DELPHI) + # Round 1 + r1_ok, r1_failed = self._fan_out( + run_dir, lambda p: sub.administer_round1(p), personas, audit_label="delphi_r1", + ) + # Move all R1 responses into a dedicated file + for r in r1_ok: self.store.append_jsonl(run_dir, "round1_themes.jsonl", r) + # Extract themes from R1 + themes = extract_themes(r1_ok, llm=self.llm) + self.store.write_named(run_dir, "themes.json", {"themes": themes}) + # Round 2 + r2_ok, r2_failed = self._fan_out( + run_dir, lambda p: sub.administer_round2(p, themes), + [p for p in personas if p.agent_id in {r.agent_id for r in r1_ok}], + audit_label="delphi_r2", + ) + for r in r2_ok: self.store.append_jsonl(run_dir, "round2_ratings.jsonl", r) + gstats = group_stats_from_r2(r2_ok) + # Round 3 + r2_by = {r.agent_id: r for r in r2_ok} + r3_personas = [p for p in personas if p.agent_id in r2_by] + def r3_call(p): return sub.administer_round3(p, themes, gstats, r2_by[p.agent_id]) + r3_ok, r3_failed = self._fan_out(run_dir, r3_call, r3_personas, audit_label="delphi_r3") + for r in r3_ok: self.store.append_jsonl(run_dir, "round3_revisions.jsonl", r) + # Convergence + conv = convergence_metrics(r2_ok, r3_ok) + self.store.write_named(run_dir, "convergence.json", conv) + self.store.write_aggregate(run_dir, { + "n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "n_failed_r1": len(r1_failed), "n_failed_r2": len(r2_failed), "n_failed_r3": len(r3_failed), + "themes": themes, + }) + for r in r3_ok: + persona = next(p for p in personas if p.agent_id == r.agent_id) + try: self.zep_writer.write_per_agent(SubagentKind.DELPHI, r, persona.name) + except Exception: pass + self.store.mark_latest(run_dir) + return {"n_r1": len(r1_ok), "n_r2": len(r2_ok), "n_r3": len(r3_ok), + "run_dir": str(run_dir)} + + # --- Re-run a single subagent --- + def rerun(self, subagent: SubagentKind) -> dict: + personas = self.personas.all() + if subagent == SubagentKind.LONGITUDINAL: return 
{"longitudinal": self._post_longitudinal(personas)} + if subagent == SubagentKind.DIVERSITY: return {"diversity": self._post_diversity(personas)} + if subagent == SubagentKind.SCENARIO: return {"scenario": self._post_scenario(personas)} + if subagent == SubagentKind.DELPHI: return {"delphi": self._post_delphi(personas)} + raise ValueError(f"unknown subagent {subagent}") diff --git a/backend/tests/interviews/test_orchestrator.py b/backend/tests/interviews/test_orchestrator.py new file mode 100644 index 00000000..323c4361 --- /dev/null +++ b/backend/tests/interviews/test_orchestrator.py @@ -0,0 +1,64 @@ +from pathlib import Path +import pytest +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.base import PersonaRecord, MemoryDigest +from app.services.interview_orchestrator import ( + InterviewOrchestrator, PersonaProvider, +) + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + +class _LLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + sys_text = next((m["content"] for m in messages if m["role"] == "system"), "") + if "longitudinal" in sys_text or "stk_" in (messages[-1].get("content") or ""): + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + return {} + +class _Personas(PersonaProvider): + def __init__(self, n=3): + self._items = [PersonaRecord(agent_id=i, name=f"A{i}", persona="p") for i in range(n)] + def all(self): return list(self._items) + +class _NoopZep: + def write_per_agent(self, *a, **kw): pass + def write_aggregate(self, *a, **kw): pass + +def test_pre_phase_runs_longitudinal_only(tmp_path): + orch = InterviewOrchestrator( + llm=_LLM(), memory=_Mem(), personas=_Personas(3), + instrument_dir=INST_DIR, 
store_root=tmp_path, sim_id="sim1", + zep_writer=_NoopZep(), max_workers=2, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] == 3 + assert "diversity" not in result # only longitudinal in pre-phase + +def test_partial_failure_does_not_kill_run(tmp_path): + class _FlakyLLM: + def __init__(self): self.n = 0 + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + self.n += 1 + if self.n % 2 == 0: + raise RuntimeError("simulated LLM 5xx") + return { + "responses": {k: 3 for k in ("stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3")}, + "confidence": {}, "open_comment": "ok", + } + orch = InterviewOrchestrator( + llm=_FlakyLLM(), memory=_Mem(), personas=_Personas(4), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim2", + zep_writer=_NoopZep(), max_workers=1, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] < 4 + assert result["longitudinal"]["n_failed"] > 0 From 3322bcb20cb2137053674c2717938441e51b807a Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:29:30 +0200 Subject: [PATCH 16/26] feat(interviews): on_ready / on_completed hook registry on SimulationManager Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/simulation_manager.py | 39 +++++++++++++++++- .../tests/interviews/test_simulation_hooks.py | 40 +++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 backend/tests/interviews/test_simulation_hooks.py diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 0d161a90..5fe962f6 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -132,9 +132,13 @@ class SimulationManager: def __init__(self): # 确保目录存在 os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - + # 内存中的模拟状态缓存 self._simulations: Dict[str, SimulationState] = {} + + # Lifecycle hook registries + 
self._on_ready_hooks: list = [] + self._on_completed_hooks: list = [] def _get_simulation_dir(self, simulation_id: str) -> str: """获取模拟数据目录""" @@ -191,6 +195,36 @@ class SimulationManager: self._simulations[simulation_id] = state return state + # ------------------------------------------------------------------ + # Lifecycle hook registration + # ------------------------------------------------------------------ + + def register_on_ready(self, fn) -> None: + """Register a callback invoked when a simulation transitions to READY.""" + self._on_ready_hooks.append(fn) + + def register_on_completed(self, fn) -> None: + """Register a callback invoked when a simulation transitions to COMPLETED.""" + self._on_completed_hooks.append(fn) + + def _notify_on_ready(self, state: "SimulationState") -> None: + """Invoke all on_ready hooks; exceptions are isolated per hook.""" + for fn in list(self._on_ready_hooks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_ready hook failed: {e!r}") + + def _notify_on_completed(self, state: "SimulationState") -> None: + """Invoke all on_completed hooks; exceptions are isolated per hook.""" + for fn in list(self._on_completed_hooks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_completed hook failed: {e!r}") + + # ------------------------------------------------------------------ + def create_simulation( self, project_id: str, @@ -441,7 +475,8 @@ class SimulationManager: # 更新状态 state.status = SimulationStatus.READY self._save_simulation_state(state) - + self._notify_on_ready(state) + logger.info(f"模拟准备完成: {simulation_id}, " f"entities={state.entities_count}, profiles={state.profiles_count}") diff --git a/backend/tests/interviews/test_simulation_hooks.py b/backend/tests/interviews/test_simulation_hooks.py new file mode 100644 index 00000000..cef304f2 --- /dev/null +++ b/backend/tests/interviews/test_simulation_hooks.py @@ -0,0 +1,40 @@ +""" +Tests for SimulationManager lifecycle hooks (on_ready / 
on_completed). + +NOTE ON SHAPE DIVERGENCE vs. original plan spec: +- SimulationState uses `simulation_id` (not `sim_id`) +- `status` is a SimulationStatus enum, not a plain string +- The COMPLETED transition lives in simulation_runner.py (SimulationRunner._monitor_simulation), + not in simulation_manager.py. The _notify_on_completed hook is registered on SimulationManager + and the production insertion point for COMPLETED is documented in DONE_WITH_CONCERNS. +""" + +from app.services.simulation_manager import SimulationManager, SimulationState, SimulationStatus + + +def test_register_post_ready_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_ready(lambda state: called.append(("ready", state.simulation_id))) + state = SimulationState( + simulation_id="abc", + project_id="proj1", + graph_id="graph1", + status=SimulationStatus.READY, + ) + mgr._notify_on_ready(state) + assert called == [("ready", "abc")] + + +def test_register_post_completed_hook_invoked(): + called = [] + mgr = SimulationManager() + mgr.register_on_completed(lambda state: called.append(("done", state.simulation_id))) + state = SimulationState( + simulation_id="abc", + project_id="proj1", + graph_id="graph1", + status=SimulationStatus.COMPLETED, + ) + mgr._notify_on_completed(state) + assert called == [("done", "abc")] From d79c81d2b723dca2e182a2e74a87db5d9d52d5c5 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:32:14 +0200 Subject: [PATCH 17/26] feat(interviews): synthesiser emits cross-method report + tidy CSV + limitations section Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interview_synthesizer.py | 160 ++++++++++++++++++ backend/tests/interviews/test_synthesizer.py | 32 ++++ 2 files changed, 192 insertions(+) create mode 100644 backend/app/services/interview_synthesizer.py create mode 100644 backend/tests/interviews/test_synthesizer.py diff --git a/backend/app/services/interview_synthesizer.py 
b/backend/app/services/interview_synthesizer.py new file mode 100644 index 00000000..a74609ae --- /dev/null +++ b/backend/app/services/interview_synthesizer.py @@ -0,0 +1,160 @@ +from __future__ import annotations +import csv +import json +from pathlib import Path +from app.models.interview import InterviewPhase, SubagentKind +from app.services.interviews.storage import InterviewStore + + +class InterviewSynthesizer: + def __init__(self, store: InterviewStore): + self.store = store + + def _maybe(self, phase: InterviewPhase, sub: SubagentKind) -> dict | None: + run = self.store.latest_run(phase, sub) + if run is None: + return None + agg = run / "aggregate.json" + if not agg.exists(): + return None + return {"run_dir": str(run), "aggregate": json.loads(agg.read_text(encoding="utf-8"))} + + def _instrument_hashes(self) -> dict: + snap = self.store.base / "instruments_used.json" + if not snap.exists(): + return {} + try: + data = json.loads(snap.read_text(encoding="utf-8")) + except Exception: + return {} + return {k: v.get("hash") for k, v in data.items()} + + def _limitations_text(self, present: dict[str, bool]) -> str: + lines = [ + "## Limitations", + "- **Simulated, not real stakeholders.** Responses reflect how the seed-document discourse " + "and the LLM jointly encode each stakeholder type, not what an actual fisher or NGO " + "staffer would say. 
The instrument measures the *model of the stakeholder*, not the stakeholder.", + "- **Memory digest is lossy.** Each agent's experience of OASIS is summarised to bounded length; " + "agents do not have full episodic recall.", + "- **LLM acquiescence and centrality bias.** Likert scales with LLM respondents skew toward 3–4 " + "of 5; check per-item distribution shape before drawing conclusions.", + "- **N is what it is.** `n_responded` and `n_failed` are printed verbatim per subagent; no smoothing.", + "- **Instrument provenance.** Hashes of frozen instruments are listed below; an identical run " + "is reproducible from these snapshots.", + ] + for k, ok in present.items(): + if not ok: + lines.append(f"- *{k}* subagent results are missing for this run.") + return "\n".join(lines) + + def run(self) -> str: + sections: list[str] = [] + sections.append("# Stakeholder Interview Synthesis\n") + + long_t0 = self._maybe(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + long_t1 = self._maybe(InterviewPhase.T1, SubagentKind.LONGITUDINAL) + if long_t1: + agg = long_t1["aggregate"] + sections.append("## Longitudinal opinion drift (T0 → T1)") + sections.append(f"- N paired: {agg.get('n_paired', 'NA')}") + per_item = agg.get("per_item", {}) + top = sorted(per_item.items(), + key=lambda kv: abs(kv[1].get("mean_delta") or 0), reverse=True)[:5] + sections.append("- Largest mean shifts:") + for k, v in top: + sections.append(f" - `{k}`: Δ̄ = {v.get('mean_delta'):+0.2f} (n={v.get('n')})") + + diversity = self._maybe(InterviewPhase.T1, SubagentKind.DIVERSITY) + if diversity: + clusters = diversity["aggregate"].get("clusters", []) + sections.append("## Stakeholder typology") + sections.append(f"- N agents: {diversity['aggregate'].get('n', 'NA')}") + sections.append(f"- Clusters: {len(clusters)}") + for c in clusters: + sections.append(f" - cluster {c['cluster_id']}: n={c['n']}, " + f"top loadings = {list(c['top_loadings'].keys())[:5]}") + + delphi = self._maybe(InterviewPhase.T1, 
SubagentKind.DELPHI) + if delphi: + agg = delphi["aggregate"] + sections.append("## Delphi consensus") + sections.append(f"- Rounds completed: R1={agg.get('n_r1')}, R2={agg.get('n_r2')}, R3={agg.get('n_r3')}") + themes = agg.get("themes", []) + sections.append(f"- Themes: {[t.get('label') for t in themes]}") + + scenario = self._maybe(InterviewPhase.T1, SubagentKind.SCENARIO) + if scenario: + pol = scenario["aggregate"].get("polarity", {}) + sections.append("## Scenario evaluation") + for sid in sorted(pol): + v = pol[sid] + if v.get("n", 0) == 0: + continue + sections.append( + f"- **{sid}**: n={v['n']}, desirability {v['mean_desirability']:.2f}, " + f"plausibility {v['mean_plausibility']:.2f}, impact {v['mean_impact']:.2f}, " + f"fairness {v['mean_fairness']:.2f}") + + sections.append("") + sections.append(self._limitations_text({ + "longitudinal": bool(long_t1), + "diversity": bool(diversity), + "delphi": bool(delphi), + "scenario": bool(scenario), + })) + sections.append("") + sections.append("### Instrument provenance") + for name, h in self._instrument_hashes().items(): + sections.append(f"- `{name}`: hash `{h}`") + + report = "\n\n".join(sections) + out_dir = self.store.base / "synthesis" + out_dir.mkdir(parents=True, exist_ok=True) + (out_dir / "report.md").write_text(report, encoding="utf-8") + self._write_tidy_csv(out_dir / "exports" / "all_responses.csv") + return report + + def _write_tidy_csv(self, csv_path: Path) -> None: + csv_path.parent.mkdir(parents=True, exist_ok=True) + rows: list[dict] = [] + for phase in (InterviewPhase.T0, InterviewPhase.T1): + for sub in SubagentKind: + run = self.store.latest_run(phase, sub) + if run is None: + continue + files = ["responses.jsonl", "round1_themes.jsonl", + "round2_ratings.jsonl", "round3_revisions.jsonl"] + for fname in files: + for rec in self.store.read_responses(run, fname): + flat = self._flatten(rec, phase=phase.value, subagent=sub.value) + rows.extend(flat) + if not rows: + 
csv_path.write_text("phase,subagent,agent_id,key,value\n", encoding="utf-8") + return + fieldnames = sorted({k for r in rows for k in r.keys()}) + with csv_path.open("w", encoding="utf-8", newline="") as f: + w = csv.DictWriter(f, fieldnames=fieldnames) + w.writeheader() + for r in rows: + w.writerow(r) + + def _flatten(self, rec: dict, *, phase: str, subagent: str) -> list[dict]: + out: list[dict] = [] + aid = rec.get("agent_id") + for key, val in rec.items(): + if key == "agent_id": + continue + if isinstance(val, dict): + for k2, v2 in val.items(): + if isinstance(v2, dict): + for k3, v3 in v2.items(): + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}.{k3}", "value": v3}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": f"{key}.{k2}", "value": v2}) + else: + out.append({"phase": phase, "subagent": subagent, "agent_id": aid, + "key": key, "value": val}) + return out diff --git a/backend/tests/interviews/test_synthesizer.py b/backend/tests/interviews/test_synthesizer.py new file mode 100644 index 00000000..2a842114 --- /dev/null +++ b/backend/tests/interviews/test_synthesizer.py @@ -0,0 +1,32 @@ +import json +from pathlib import Path +from app.services.interviews.storage import InterviewStore +from app.models.interview import InterviewPhase, SubagentKind, LikertResponse +from app.services.interview_synthesizer import InterviewSynthesizer + +def _seed_minimal(tmp_path: Path) -> InterviewStore: + store = InterviewStore(root=tmp_path, sim_id="s1") + rd = store.start_run(InterviewPhase.T0, SubagentKind.LONGITUDINAL) + for i in range(3): + store.append_response(rd, LikertResponse( + agent_id=i, phase=InterviewPhase.T0, + responses={"stk_1": 3, "gov_1": 3}, confidence={"stk_1": 0.5, "gov_1": 0.5}, + )) + store.write_aggregate(rd, {"per_item": {}, "n_paired": 0}) + store.mark_latest(rd) + return store + +def test_synthesizer_runs_with_partial_data(tmp_path): + store = _seed_minimal(tmp_path) + 
synth = InterviewSynthesizer(store=store) + report = synth.run() + assert "limitations" in report.lower() + assert "stub mode" in report.lower() or "n_responded" in report.lower() + +def test_synthesizer_writes_files(tmp_path): + store = _seed_minimal(tmp_path) + synth = InterviewSynthesizer(store=store) + synth.run() + files = list((store.base / "synthesis").iterdir()) + names = {f.name for f in files} + assert "report.md" in names From bc07170dbf3aad9a0b6b9b60adaa1f731aad11d9 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:34:03 +0200 Subject: [PATCH 18/26] feat(interviews): persona + Zep memory adapters bridging existing services to interview subsystem Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/adapters.py | 90 +++++++++++++++++++++ backend/tests/interviews/test_adapters.py | 48 +++++++++++ 2 files changed, 138 insertions(+) create mode 100644 backend/app/services/interviews/adapters.py create mode 100644 backend/tests/interviews/test_adapters.py diff --git a/backend/app/services/interviews/adapters.py b/backend/app/services/interviews/adapters.py new file mode 100644 index 00000000..94431fe9 --- /dev/null +++ b/backend/app/services/interviews/adapters.py @@ -0,0 +1,90 @@ +from __future__ import annotations +import csv +import json +from pathlib import Path +from typing import Optional +from app.services.interviews.base import PersonaRecord, MemoryDigest + + +class FileSystemPersonaProvider: + """Reads OASIS profiles from the simulation's `reddit_profiles.json` and/or `twitter_profiles.csv`. + + If both are present, agents from `reddit_profiles.json` take precedence; twitter-only agents are appended. 
+ """ + + def __init__(self, reddit_path: Optional[Path], twitter_path: Optional[Path]): + self.reddit_path = Path(reddit_path) if reddit_path else None + self.twitter_path = Path(twitter_path) if twitter_path else None + + def _load_reddit(self) -> list[PersonaRecord]: + if not self.reddit_path or not self.reddit_path.exists(): + return [] + data = json.loads(self.reddit_path.read_text(encoding="utf-8")) + out = [] + for row in data: + out.append(PersonaRecord( + agent_id=int(row.get("user_id")), + name=str(row.get("name") or row.get("user_name") or f"agent_{row.get('user_id')}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def _load_twitter(self) -> list[PersonaRecord]: + if not self.twitter_path or not self.twitter_path.exists(): + return [] + out = [] + with self.twitter_path.open("r", encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + if not row.get("user_id"): + continue + out.append(PersonaRecord( + agent_id=int(row["user_id"]), + name=str(row.get("name") or row.get("user_name") or f"agent_{row['user_id']}"), + persona=str(row.get("persona") or row.get("bio") or ""), + profession=row.get("profession"), + bio=row.get("bio"), + )) + return out + + def all(self) -> list[PersonaRecord]: + reddit = self._load_reddit() + seen = {p.agent_id for p in reddit} + twitter = [p for p in self._load_twitter() if p.agent_id not in seen] + return reddit + twitter + + +class ZepMemoryProvider: + """Builds a bounded memory digest per agent from Zep entity context. + + Maps `agent_id` (OASIS user_id) to a Zep entity UUID; falls back to the agent_id as a string. 
+ """ + + def __init__(self, entity_reader, graph_id: str, agent_to_entity: dict[int, str] | None = None): + self.reader = entity_reader + self.graph_id = graph_id + self.map = dict(agent_to_entity or {}) + + def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: + entity_uuid = self.map.get(agent_id) or str(agent_id) + try: + ctx = self.reader.get_entity_with_context(self.graph_id, entity_uuid) + except Exception: + return MemoryDigest(text=f"[no memory for agent {agent_id}]", available=False) + parts: list[str] = [] + name = getattr(ctx, "name", None) + summary = getattr(ctx, "summary", None) + if name: + parts.append(f"Name: {name}") + if summary: + parts.append(f"Summary: {summary}") + edges = getattr(ctx, "related_edges", []) or [] + for e in edges[:20]: + fact = e.get("fact") if isinstance(e, dict) else getattr(e, "fact", None) + if fact: + parts.append(f"- {fact}") + text = "\n".join(parts) + if len(text) > max_chars: + text = text[: max_chars - 1] + "…" + return MemoryDigest(text=text or f"[empty memory for agent {agent_id}]", available=True) diff --git a/backend/tests/interviews/test_adapters.py b/backend/tests/interviews/test_adapters.py new file mode 100644 index 00000000..ab7dee2e --- /dev/null +++ b/backend/tests/interviews/test_adapters.py @@ -0,0 +1,48 @@ +import csv +import json +from pathlib import Path +from app.services.interviews.adapters import ( + FileSystemPersonaProvider, ZepMemoryProvider, +) + +def _write_reddit_profiles(tmp_path: Path): + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "I am a small-scale Baltic fisher.", "profession": "fisher", "bio": ""}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "I work for an environmental NGO.", "profession": "ngo_staff", "bio": ""}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + return p + +def test_file_system_persona_provider_reads_reddit_json(tmp_path): + p 
= _write_reddit_profiles(tmp_path) + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + personas = provider.all() + assert len(personas) == 2 + assert personas[0].name == "Fischer Müller" + assert personas[0].agent_id == 0 + +def test_zep_memory_provider_returns_empty_when_unavailable(): + class _BrokenReader: + def get_entity_with_context(self, *a, **kw): + raise RuntimeError("offline") + prov = ZepMemoryProvider(entity_reader=_BrokenReader(), graph_id="g1", + agent_to_entity={0: "uuid-zero"}) + d = prov.get_digest(0) + assert d.available is False + assert d.text != "" + +def test_zep_memory_provider_truncates_to_max_chars(): + class _R: + def get_entity_with_context(self, *a, **kw): + class _Ctx: + name = "X"; summary = "Y" + related_edges = [{"fact": "very long fact " * 200}] + return _Ctx() + prov = ZepMemoryProvider(entity_reader=_R(), graph_id="g1", + agent_to_entity={5: "uuid-five"}) + d = prov.get_digest(5, max_chars=300) + assert d.available is True + assert len(d.text) <= 300 From 52bae0a3daee13a85d435c4b6abb2a0f36335a5a Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:37:04 +0200 Subject: [PATCH 19/26] feat(interviews): Flask blueprint /api/interview with task-based async + CSV export Add /api/interview blueprint with POST pre/post/rerun, GET status/results/synthesis/export.csv endpoints. Background tasks tracked by UUID in module-level dict. Add register_blueprints() helper to api/__init__.py and wire app factory through it. Add UPLOADS_DIR to Config with env-override default. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/__init__.py | 6 +- backend/app/api/__init__.py | 11 +- backend/app/api/interview.py | 161 ++++++++++++++++++ backend/app/config.py | 2 + .../tests/interviews/test_api_interview.py | 42 +++++ 5 files changed, 217 insertions(+), 5 deletions(-) create mode 100644 backend/app/api/interview.py create mode 100644 backend/tests/interviews/test_api_interview.py diff --git a/backend/app/__init__.py b/backend/app/__init__.py index aba624bb..c2a36fd2 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -63,10 +63,8 @@ def create_app(config_class=Config): return response # 注册蓝图 - from .api import graph_bp, simulation_bp, report_bp - app.register_blueprint(graph_bp, url_prefix='/api/graph') - app.register_blueprint(simulation_bp, url_prefix='/api/simulation') - app.register_blueprint(report_bp, url_prefix='/api/report') + from .api import register_blueprints + register_blueprints(app) # 健康检查 @app.route('/health') diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index ffda743a..396750f2 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -2,13 +2,22 @@ API路由模块 """ -from flask import Blueprint +from flask import Blueprint, Flask graph_bp = Blueprint('graph', __name__) simulation_bp = Blueprint('simulation', __name__) report_bp = Blueprint('report', __name__) +interview_bp = Blueprint('interview', __name__) from . import graph # noqa: E402, F401 from . import simulation # noqa: E402, F401 from . import report # noqa: E402, F401 +from . 
import interview # noqa: E402, F401 + +def register_blueprints(app: Flask) -> None: + """Register all API blueprints on *app* with their canonical URL prefixes.""" + app.register_blueprint(graph_bp, url_prefix='/api/graph') + app.register_blueprint(simulation_bp, url_prefix='/api/simulation') + app.register_blueprint(report_bp, url_prefix='/api/report') + app.register_blueprint(interview_bp, url_prefix='/api/interview') diff --git a/backend/app/api/interview.py b/backend/app/api/interview.py new file mode 100644 index 00000000..993fda17 --- /dev/null +++ b/backend/app/api/interview.py @@ -0,0 +1,161 @@ +from __future__ import annotations +import threading +import traceback +import uuid +from pathlib import Path +from flask import Blueprint, jsonify, request, send_file +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider, ZepMemoryProvider +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.services.interviews.storage import InterviewStore +from app.utils.llm_client import LLMClient + +from . 
import interview_bp + +_TASKS: dict[str, dict] = {} +_LOCK = threading.Lock() + +INSTRUMENT_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + + +def _uploads_root() -> Path: + return Path(getattr(Config, "UPLOADS_DIR", "uploads")) + + +def _build_orchestrator(sim_id: str) -> InterviewOrchestrator: + sim_dir = _uploads_root() / "simulations" / sim_id + reddit = sim_dir / "reddit_profiles.json" + twitter = sim_dir / "twitter_profiles.csv" + personas = FileSystemPersonaProvider(reddit_path=reddit if reddit.exists() else None, + twitter_path=twitter if twitter.exists() else None) + # Zep memory + writer: best-effort; in stub/test mode the writer no-ops on exceptions + class _NullUpdater: + def add_text_episode(self, *a, **kw): return None + try: + from app.services.zep_entity_reader import ZepEntityReader + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + graph_id = (sim_dir / "graph_id.txt").read_text().strip() if (sim_dir / "graph_id.txt").exists() else "" + reader = ZepEntityReader() + updater = ZepGraphMemoryUpdater() + memory = ZepMemoryProvider(reader, graph_id=graph_id) + zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) + except Exception: + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + from app.services.interviews.base import MemoryDigest + return MemoryDigest(text="[memory unavailable]", available=False) + memory = _Mem() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + llm = LLMClient(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL, + model=Config.LLM_MODEL_NAME) + return InterviewOrchestrator( + llm=llm, memory=memory, personas=personas, + instrument_dir=INSTRUMENT_DIR, store_root=_uploads_root(), sim_id=sim_id, + zep_writer=zep_writer, max_workers=Config.INTERVIEW_MAX_WORKERS, + language=Config.INTERVIEW_DEFAULT_LANGUAGE, + ) + + +def _run_task(task_id: str, fn) -> None: + with _LOCK: + _TASKS[task_id] = {"status": "running", 
"progress": {}, "result": None, "error": None} + try: + result = fn(task_id) + with _LOCK: + _TASKS[task_id]["status"] = "completed"; _TASKS[task_id]["result"] = result + except Exception as e: + with _LOCK: + _TASKS[task_id]["status"] = "failed" + _TASKS[task_id]["error"] = repr(e) + _TASKS[task_id]["traceback"] = traceback.format_exc() + + +def _start_task(fn) -> str: + task_id = uuid.uuid4().hex[:12] + with _LOCK: + _TASKS[task_id] = {"status": "queued", "progress": {}, "result": None, "error": None} + threading.Thread(target=_run_task, args=(task_id, fn), daemon=True).start() + return task_id + + +def _envelope(data=None, error=None, status: int = 200): + body = {"success": error is None, "data": data or {}, "error": error} + return jsonify(body), status + + +@interview_bp.route("/<sim_id>/pre", methods=["POST"]) +def post_pre(sim_id: str): + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.run_pre()) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("/<sim_id>/post", methods=["POST"]) +def post_post(sim_id: str): + orch = _build_orchestrator(sim_id) + def run(tid): + out = orch.run_post() + synth = InterviewSynthesizer(store=orch.store) + out["synthesis"] = synth.run()[:1000] # short preview + return out + task_id = _start_task(run) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("/<sim_id>/rerun", methods=["POST"]) +def post_rerun(sim_id: str): + body = request.get_json(silent=True) or {} + sub = body.get("subagent") + try: subagent = SubagentKind(sub) + except ValueError: return _envelope(error=f"unknown subagent {sub!r}", status=400) + orch = _build_orchestrator(sim_id) + task_id = _start_task(lambda tid: orch.rerun(subagent)) + return _envelope({"task_id": task_id}) + + +@interview_bp.route("/<sim_id>/status", methods=["GET"]) +def get_status(sim_id: str): + task_id = request.args.get("task_id") + with _LOCK: + task = _TASKS.get(task_id) + if task is None: return _envelope(error="unknown task_id", status=404) + return
_envelope({"status": task["status"], "progress": task.get("progress", {}), + "result": task.get("result"), "error": task.get("error")}) + + +@interview_bp.route("/<sim_id>/results/<subagent>", methods=["GET"]) +def get_results(sim_id: str, subagent: str): + try: sub = SubagentKind(subagent) + except ValueError: return _envelope(error=f"unknown subagent {subagent!r}", status=400) + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + phase = InterviewPhase.T1 if sub != SubagentKind.LONGITUDINAL else InterviewPhase.T1 + run = store.latest_run(phase, sub) + if run is None: return _envelope(error="no results yet", status=404) + agg = (run / "aggregate.json") + if not agg.exists(): return _envelope(error="aggregate missing", status=404) + import json as _j + return _envelope({"aggregate": _j.loads(agg.read_text(encoding="utf-8")), + "run_dir": str(run)}) + + +@interview_bp.route("/<sim_id>/results/synthesis", methods=["GET"]) +def get_synthesis(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + report = store.base / "synthesis" / "report.md" + if not report.exists(): + synth = InterviewSynthesizer(store=store) + synth.run() + return _envelope({"report_markdown": report.read_text(encoding="utf-8")}) + + +@interview_bp.route("/<sim_id>/export.csv", methods=["GET"]) +def get_export_csv(sim_id: str): + store = InterviewStore(root=_uploads_root(), sim_id=sim_id) + csv_path = store.base / "synthesis" / "exports" / "all_responses.csv" + if not csv_path.exists(): + InterviewSynthesizer(store=store).run() + return send_file(csv_path, mimetype="text/csv", as_attachment=True, + download_name=f"{sim_id}_interviews.csv") diff --git a/backend/app/config.py b/backend/app/config.py index da7df8c1..11cf568a 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -39,6 +39,8 @@ class Config: MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} + # Root directory
for simulation uploads (used by the interview subsystem) + UPLOADS_DIR = os.environ.get("UPLOADS_DIR", os.path.join(os.path.dirname(__file__), '../uploads')) # 文本处理配置 DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 diff --git a/backend/tests/interviews/test_api_interview.py b/backend/tests/interviews/test_api_interview.py new file mode 100644 index 00000000..baad634b --- /dev/null +++ b/backend/tests/interviews/test_api_interview.py @@ -0,0 +1,42 @@ +import json +import os +from pathlib import Path +import pytest + +@pytest.fixture +def client(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + # Seed a minimal reddit_profiles.json + sim_dir = tmp_path / "simulations" / "sim_test" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "p", "profession": "fisher"} for i in range(3)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + from flask import Flask + from app.api import register_blueprints + app = Flask(__name__) + register_blueprints(app) + return app.test_client() + +def test_post_pre_returns_task_id(client): + res = client.post("/api/interview/sim_test/pre") + assert res.status_code == 200 + body = res.get_json() + assert body["success"] is True + assert "task_id" in body["data"] + +def test_status_endpoint_returns_progress(client): + res = client.post("/api/interview/sim_test/pre") + task_id = res.get_json()["data"]["task_id"] + res2 = client.get(f"/api/interview/sim_test/status?task_id={task_id}") + assert res2.status_code == 200 + assert "status" in res2.get_json()["data"] + +def test_unknown_subagent_returns_400(client): + res = client.post("/api/interview/sim_test/rerun", + json={"subagent": "nonsense"}) + assert res.status_code == 400 From 61f13a806d6a01119f4bfcfddf88b36a08f9ccfa Mon Sep 17 00:00:00 
2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:40:53 +0200 Subject: [PATCH 20/26] test(interviews): end-to-end pipeline test + content-aware LLM stubs for all 4 subagents Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/utils/llm_client.py | 69 +++++++++++++--- backend/tests/integration/__init__.py | 0 .../integration/test_interview_pipeline.py | 81 +++++++++++++++++++ 3 files changed, 141 insertions(+), 9 deletions(-) create mode 100644 backend/tests/integration/__init__.py create mode 100644 backend/tests/integration/test_interview_pipeline.py diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 32285596..9b22ac02 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -47,15 +47,66 @@ class LLMClient: return _json.dumps(self._stub_response_json(messages), ensure_ascii=False) def _stub_response_json(self, messages: list[dict]) -> dict: - key = self._stub_key(messages) - # Deterministic centered Likert + plausible open text - digit = sum(ord(c) for c in key) % 5 + 1 - return { - "stub_key": key, - "responses": {"item_001": digit, "item_002": digit, "item_003": (digit % 5) + 1}, - "confidence": {"item_001": 0.7, "item_002": 0.7, "item_003": 0.6}, - "open_comment": f"stub:{key}", - } + import hashlib, json as _json + sys_msg = next((m["content"] for m in messages if m.get("role") == "system"), "") + usr_msg = next((m["content"] for m in reversed(messages) if m.get("role") == "user"), "") + h = hashlib.sha256((sys_msg + "|" + usr_msg).encode("utf-8")).hexdigest() + seed = int(h[:8], 16) + rng = (seed % 5) + 1 + + # Longitudinal Likert (12 items) + if all(tok in usr_msg for tok in ("stk_1", "gov_1", "mkt_1", "clm_1")): + ids = ["stk_1","stk_2","stk_3","gov_1","gov_2","gov_3", + "mkt_1","mkt_2","mkt_3","clm_1","clm_2","clm_3"] + return {"responses": {k: ((seed >> (i*3)) % 5) + 1 for i, k in enumerate(ids)}, + "confidence": {k: 0.6 for k in ids}, + "open_comment": f"stub:{h[:8]}"} + + # 
Diversity Q-sort: 24 statements + 6 axes, forced distribution 2,3,4,6,4,3,2 + if "st_01" in usr_msg and "ax_pres_extr" in usr_msg: + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + stmts = [f"st_{i+1:02d}" for i in range(24)] + # shuffle deterministically + order = sorted(range(24), key=lambda i: (h[i % len(h)], i)) + placements = {stmts[i]: buckets[order.index(i)] for i in range(24)} + return { + "placements": placements, + "likert_axes": {a: ((seed >> (j*3)) % 7) + 1 for j, a in enumerate( + ["ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg"])}, + } + + # Scenario: S1..S4 × 4 dims + if all(s in usr_msg for s in ("S1:", "S2:", "S3:", "S4:")): + return {"ratings": {sid: { + "desirability": ((seed >> (i*3)) % 7) + 1, + "plausibility": ((seed >> (i*3+1)) % 7) + 1, + "impact_on_my_group": ((seed >> (i*3+2)) % 7) + 1, + "fairness": ((seed >> (i*3+4)) % 7) + 1, + "if_woke_up_response": f"act-{sid}-{h[:4]}", + } for i, sid in enumerate(["S1","S2","S3","S4"])}} + + # Delphi R1: q1..q4 free text + if "q1" in usr_msg and "q2" in usr_msg and "Bewerten" not in usr_msg and "Sie sehen" not in usr_msg: + return {"answers": {qid: f"stub-themes-{qid}-{h[:4]}" for qid in ("q1","q2","q3","q4")}} + + # Delphi theme extraction (no in-character system prompt) + if "extract distinct thematic codes" in sys_msg: + return {"themes": [{"theme_id": f"theme_{i}", "label": f"Thema {i}"} for i in range(5)]} + + # Delphi R2 (rate) or R3 (revise) + if "Bewerten Sie jedes Thema" in usr_msg or "Sie sehen unten" in usr_msg \ + or "Rate each theme" in usr_msg or "Below are the anonymised" in usr_msg: + theme_ids = [f"theme_{i}" for i in range(5)] + out = {"ratings": {tid: {"importance": ((seed >> (i*2)) % 5) + 1, + "plausibility": ((seed >> (i*2+1)) % 5) + 1} + for i, tid in enumerate(theme_ids)}} + if "Sie sehen unten" in usr_msg or "Below are the anonymised" in usr_msg: + out["justification"] = "stub-revision" + return out + + # Fallback + 
return {"stub_key": h[:12], "value": rng} def chat( self, diff --git a/backend/tests/integration/__init__.py b/backend/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/integration/test_interview_pipeline.py b/backend/tests/integration/test_interview_pipeline.py new file mode 100644 index 00000000..54bb0540 --- /dev/null +++ b/backend/tests/integration/test_interview_pipeline.py @@ -0,0 +1,81 @@ +import json +import pytest +from pathlib import Path +from app.config import Config +from app.models.interview import SubagentKind, InterviewPhase +from app.services.interviews.adapters import FileSystemPersonaProvider +from app.services.interviews.base import MemoryDigest +from app.services.interviews.zep_writer import InterviewZepWriter +from app.services.interview_orchestrator import InterviewOrchestrator +from app.services.interview_synthesizer import InterviewSynthesizer +from app.utils.llm_client import LLMClient + +pytestmark = pytest.mark.integration + +INST_DIR = Path(__file__).resolve().parents[2] / "scripts" / "instruments" + +class _NullUpdater: + def __init__(self): self.events = [] + def add_text_episode(self, graph_id, text): self.events.append(text) + +class _StaticMem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text=f"agent {agent_id} memory snippet", available=True) + +@pytest.fixture +def seeded_uploads(tmp_path, monkeypatch): + monkeypatch.setenv("LLM_STUB_MODE", "true") + Config.LLM_STUB_MODE = True + sim_dir = tmp_path / "simulations" / "intg_sim" + sim_dir.mkdir(parents=True) + profiles = [{"user_id": i, "user_name": f"u{i}", "name": f"A{i}", + "persona": "stakeholder p", "profession": "fisher"} for i in range(5)] + (sim_dir / "reddit_profiles.json").write_text(json.dumps(profiles), encoding="utf-8") + return tmp_path + +def _make_orch(tmp_path): + sim_dir = tmp_path / "simulations" / "intg_sim" + personas = FileSystemPersonaProvider( + reddit_path=sim_dir / 
"reddit_profiles.json", twitter_path=None, + ) + llm = LLMClient(api_key="x", base_url="x", model="x") + updater = _NullUpdater() + writer = InterviewZepWriter(memory_updater=updater, graph_id="g") + return InterviewOrchestrator( + llm=llm, memory=_StaticMem(), personas=personas, + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="intg_sim", + zep_writer=writer, max_workers=2, language="de", + ) + +def test_pipeline_runs_pre_then_post_then_synthesis(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + + pre = orch.run_pre() + assert pre["longitudinal"]["n_responded"] >= 1 + + post = orch.run_post() + assert "longitudinal" in post + assert "diversity" in post + assert "scenario" in post + assert "delphi" in post + + synth = InterviewSynthesizer(store=orch.store) + report = synth.run() + assert "Stakeholder Interview Synthesis" in report + assert "Limitations" in report + + csv_path = orch.store.base / "synthesis" / "exports" / "all_responses.csv" + assert csv_path.exists() + lines = csv_path.read_text().splitlines() + assert lines[0].startswith("agent_id,") or "agent_id" in lines[0] + +def test_idempotent_rerun_creates_new_run_id(seeded_uploads): + tmp = seeded_uploads + orch = _make_orch(tmp) + orch.run_pre() + first = orch.run_post() + second = orch.rerun(SubagentKind.SCENARIO) + first_scn = first["scenario"]["run_dir"] + second_scn = second["scenario"]["run_dir"] + assert first_scn != second_scn From fede66cac366f85970beef55a26be3b9bf7242dc Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:44:37 +0200 Subject: [PATCH 21/26] feat(interviews): Step4b Vue scaffold with five-tab navigation, API client, i18n keys Co-Authored-By: Claude Opus 4.7 (1M context) --- frontend/src/api/interview.js | 29 +++ frontend/src/components/Step4bInterviews.vue | 79 +++++++ .../src/components/interviews/DelphiPanel.vue | 4 + .../components/interviews/DiversityPanel.vue | 4 + .../interviews/LongitudinalPanel.vue | 4 + 
.../components/interviews/ScenarioPanel.vue | 4 + .../components/interviews/SynthesisPanel.vue | 4 + frontend/src/router/index.js | 7 + frontend/src/views/InterviewView.vue | 192 ++++++++++++++++++ locales/de.json | 15 ++ locales/en.json | 13 ++ locales/zh.json | 13 ++ 12 files changed, 368 insertions(+) create mode 100644 frontend/src/api/interview.js create mode 100644 frontend/src/components/Step4bInterviews.vue create mode 100644 frontend/src/components/interviews/DelphiPanel.vue create mode 100644 frontend/src/components/interviews/DiversityPanel.vue create mode 100644 frontend/src/components/interviews/LongitudinalPanel.vue create mode 100644 frontend/src/components/interviews/ScenarioPanel.vue create mode 100644 frontend/src/components/interviews/SynthesisPanel.vue create mode 100644 frontend/src/views/InterviewView.vue create mode 100644 locales/de.json diff --git a/frontend/src/api/interview.js b/frontend/src/api/interview.js new file mode 100644 index 00000000..0f5cdbf5 --- /dev/null +++ b/frontend/src/api/interview.js @@ -0,0 +1,29 @@ +import service from './index' + +export async function startPre(simId) { + const r = await service.post(`/api/interview/${simId}/pre`) + return r +} +export async function startPost(simId) { + const r = await service.post(`/api/interview/${simId}/post`) + return r +} +export async function rerun(simId, subagent) { + const r = await service.post(`/api/interview/${simId}/rerun`, { subagent }) + return r +} +export async function getStatus(simId, taskId) { + const r = await service.get(`/api/interview/${simId}/status`, { params: { task_id: taskId } }) + return r +} +export async function getResults(simId, subagent) { + const r = await service.get(`/api/interview/${simId}/results/${subagent}`) + return r +} +export async function getSynthesis(simId) { + const r = await service.get(`/api/interview/${simId}/results/synthesis`) + return r +} +export function exportCsvUrl(simId) { + return `/api/interview/${simId}/export.csv` +} 
diff --git a/frontend/src/components/Step4bInterviews.vue b/frontend/src/components/Step4bInterviews.vue new file mode 100644 index 00000000..d2aed844 --- /dev/null +++ b/frontend/src/components/Step4bInterviews.vue @@ -0,0 +1,79 @@ + + + + + diff --git a/frontend/src/components/interviews/DelphiPanel.vue b/frontend/src/components/interviews/DelphiPanel.vue new file mode 100644 index 00000000..f8b27647 --- /dev/null +++ b/frontend/src/components/interviews/DelphiPanel.vue @@ -0,0 +1,4 @@ + + diff --git a/frontend/src/components/interviews/DiversityPanel.vue b/frontend/src/components/interviews/DiversityPanel.vue new file mode 100644 index 00000000..759114b3 --- /dev/null +++ b/frontend/src/components/interviews/DiversityPanel.vue @@ -0,0 +1,4 @@ + + diff --git a/frontend/src/components/interviews/LongitudinalPanel.vue b/frontend/src/components/interviews/LongitudinalPanel.vue new file mode 100644 index 00000000..189c2488 --- /dev/null +++ b/frontend/src/components/interviews/LongitudinalPanel.vue @@ -0,0 +1,4 @@ + + diff --git a/frontend/src/components/interviews/ScenarioPanel.vue b/frontend/src/components/interviews/ScenarioPanel.vue new file mode 100644 index 00000000..ea2686e3 --- /dev/null +++ b/frontend/src/components/interviews/ScenarioPanel.vue @@ -0,0 +1,4 @@ + + diff --git a/frontend/src/components/interviews/SynthesisPanel.vue b/frontend/src/components/interviews/SynthesisPanel.vue new file mode 100644 index 00000000..7f3f7966 --- /dev/null +++ b/frontend/src/components/interviews/SynthesisPanel.vue @@ -0,0 +1,4 @@ + + diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index 62d23201..30b072b8 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -4,6 +4,7 @@ import Process from '../views/MainView.vue' import SimulationView from '../views/SimulationView.vue' import SimulationRunView from '../views/SimulationRunView.vue' import ReportView from '../views/ReportView.vue' +import InterviewView from 
'../views/InterviewView.vue' import InteractionView from '../views/InteractionView.vue' const routes = [ @@ -36,6 +37,12 @@ const routes = [ component: ReportView, props: true }, + { + path: '/interview/:simulationId', + name: 'Interview', + component: InterviewView, + props: true + }, { path: '/interaction/:reportId', name: 'Interaction', diff --git a/frontend/src/views/InterviewView.vue b/frontend/src/views/InterviewView.vue new file mode 100644 index 00000000..767ac9b7 --- /dev/null +++ b/frontend/src/views/InterviewView.vue @@ -0,0 +1,192 @@ + + + + + diff --git a/locales/de.json b/locales/de.json new file mode 100644 index 00000000..4032d4db --- /dev/null +++ b/locales/de.json @@ -0,0 +1,15 @@ +{ + "interview": { + "title": "Stakeholder-Interviews", + "subtitle": "Vier unabhängige Befragungen der simulierten Stakeholder-Population.", + "runAll": "Alle Post-Simulations-Interviews starten", + "downloadCsv": "CSV herunterladen", + "tab": { + "longitudinal": "Längsschnitt (Δ)", + "diversity": "Diversität", + "delphi": "Delphi", + "scenario": "Szenarien", + "synthesis": "Synthese" + } + } +} diff --git a/locales/en.json b/locales/en.json index 544c68b1..d22cf64f 100644 --- a/locales/en.json +++ b/locales/en.json @@ -661,5 +661,18 @@ "llmSelectAgentFailed": "LLM agent selection failed, using default selection: {error}", "generateInterviewQuestionsFailed": "Failed to generate interview questions: {error}", "generateInterviewSummaryFailed": "Failed to generate interview summary: {error}" + }, + "interview": { + "title": "Stakeholder interviews", + "subtitle": "Four independent surveys of the simulated stakeholder population.", + "runAll": "Run all post-simulation interviews", + "downloadCsv": "Download CSV", + "tab": { + "longitudinal": "Longitudinal (Δ)", + "diversity": "Diversity", + "delphi": "Delphi", + "scenario": "Scenarios", + "synthesis": "Synthesis" + } } } diff --git a/locales/zh.json b/locales/zh.json index cd747e2f..71ed6c4b 100644 --- a/locales/zh.json 
+++ b/locales/zh.json @@ -661,5 +661,18 @@ "llmSelectAgentFailed": "LLM选择Agent失败,使用默认选择: {error}", "generateInterviewQuestionsFailed": "生成采访问题失败: {error}", "generateInterviewSummaryFailed": "生成采访摘要失败: {error}" + }, + "interview": { + "title": "利益相关者访谈", + "subtitle": "对模拟利益相关者群体进行的四项独立调查。", + "runAll": "运行所有模拟后访谈", + "downloadCsv": "下载 CSV", + "tab": { + "longitudinal": "纵向分析 (Δ)", + "diversity": "多样性", + "delphi": "德尔菲法", + "scenario": "情景分析", + "synthesis": "综合分析" + } } } From acaa06170e4d25ef860599ae68950ed2b9f5a64a Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:47:34 +0200 Subject: [PATCH 22/26] =?UTF-8?q?feat(interviews):=20d3=20visualisations?= =?UTF-8?q?=20for=20longitudinal=20=CE=94,=20diversity=20PCA,=20Delphi,=20?= =?UTF-8?q?scenario=20polarity,=20synthesis?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/components/interviews/DelphiPanel.vue | 58 +++++++++++++++- .../components/interviews/DiversityPanel.vue | 63 +++++++++++++++++- .../interviews/LongitudinalPanel.vue | 63 +++++++++++++++++- .../components/interviews/ScenarioPanel.vue | 66 ++++++++++++++++++- .../components/interviews/SynthesisPanel.vue | 34 +++++++++- 5 files changed, 274 insertions(+), 10 deletions(-) diff --git a/frontend/src/components/interviews/DelphiPanel.vue b/frontend/src/components/interviews/DelphiPanel.vue index f8b27647..c111d0d3 100644 --- a/frontend/src/components/interviews/DelphiPanel.vue +++ b/frontend/src/components/interviews/DelphiPanel.vue @@ -1,4 +1,58 @@ - + + + + diff --git a/frontend/src/components/interviews/DiversityPanel.vue b/frontend/src/components/interviews/DiversityPanel.vue index 759114b3..558d8526 100644 --- a/frontend/src/components/interviews/DiversityPanel.vue +++ b/frontend/src/components/interviews/DiversityPanel.vue @@ -1,4 +1,63 @@ - + + + + diff --git a/frontend/src/components/interviews/LongitudinalPanel.vue 
b/frontend/src/components/interviews/LongitudinalPanel.vue index 189c2488..1596e93b 100644 --- a/frontend/src/components/interviews/LongitudinalPanel.vue +++ b/frontend/src/components/interviews/LongitudinalPanel.vue @@ -1,4 +1,63 @@ - + + + + diff --git a/frontend/src/components/interviews/ScenarioPanel.vue b/frontend/src/components/interviews/ScenarioPanel.vue index ea2686e3..ddc85b2b 100644 --- a/frontend/src/components/interviews/ScenarioPanel.vue +++ b/frontend/src/components/interviews/ScenarioPanel.vue @@ -1,4 +1,66 @@ - + + + + diff --git a/frontend/src/components/interviews/SynthesisPanel.vue b/frontend/src/components/interviews/SynthesisPanel.vue index 7f3f7966..e435b4d2 100644 --- a/frontend/src/components/interviews/SynthesisPanel.vue +++ b/frontend/src/components/interviews/SynthesisPanel.vue @@ -1,4 +1,34 @@ - + + + + From 6b04ea5c271154abebb3434e8a9752d410e8f48c Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 12:51:13 +0200 Subject: [PATCH 23/26] =?UTF-8?q?feat(interviews):=20auto-trigger=20lifecy?= =?UTF-8?q?cle=20hooks=20+=20bridge=20SimulationRunner=E2=86=92Manager=20o?= =?UTF-8?q?n=20COMPLETED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add backend/app/services/interviews/lifecycle.py with install_hooks() that registers on_ready (pre-survey) and on_completed (post-survey + synthesis) daemon-thread callbacks on a SimulationManager. - Add SimulationRunner.register_on_completed() / _fire_on_completed() so external callbacks can be notified when _monitor_simulation transitions to COMPLETED (both exit-code-0 path and simulation_end event path). - Wire both in app/__init__.py: create singleton SimulationManager, install lifecycle hooks, and register its _notify_on_completed with SimulationRunner. - Add test_lifecycle.py: verifies install_hooks registers one callable for each of ready and completed. - All 40 unit tests + 2 integration tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/__init__.py | 13 ++++ backend/app/services/interviews/lifecycle.py | 72 ++++++++++++++++++++ backend/app/services/simulation_runner.py | 26 ++++++- backend/tests/interviews/test_lifecycle.py | 26 +++++++ 4 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 backend/app/services/interviews/lifecycle.py create mode 100644 backend/tests/interviews/test_lifecycle.py diff --git a/backend/app/__init__.py b/backend/app/__init__.py index c2a36fd2..d3a6d543 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -47,6 +47,19 @@ def create_app(config_class=Config): SimulationRunner.register_cleanup() if should_log_startup: logger.info("已注册模拟进程清理函数") + + # Install interview lifecycle hooks on a singleton SimulationManager. + # The singleton's _notify_on_completed is also wired into SimulationRunner + # so that the runner's monitor thread fires the completed hooks when a + # simulation process exits successfully. + from .services.simulation_manager import SimulationManager + from .services.interviews.lifecycle import install_hooks + + _simulation_manager_singleton = SimulationManager() + install_hooks(_simulation_manager_singleton) + SimulationRunner.register_on_completed(_simulation_manager_singleton._notify_on_completed) + if should_log_startup: + logger.info("已安装面试生命周期钩子") # 请求日志中间件 @app.before_request diff --git a/backend/app/services/interviews/lifecycle.py b/backend/app/services/interviews/lifecycle.py new file mode 100644 index 00000000..5e2d351d --- /dev/null +++ b/backend/app/services/interviews/lifecycle.py @@ -0,0 +1,72 @@ +""" +Interview lifecycle hook installer (Task 20). 
+ +install_hooks(manager) registers two callbacks on a SimulationManager: + - on_ready → spawn T0 longitudinal pre-survey in a background thread + - on_completed → spawn full post-sim batch + synthesis in a background thread + +Both hooks are best-effort: failures are logged but never propagate to the +calling thread. +""" + +from __future__ import annotations + +import threading + +from app.utils.logger import get_logger + +logger = get_logger(__name__) + + +def install_hooks(manager) -> None: + """Attach interview lifecycle callbacks to a SimulationManager. + + on_ready → spawn T0 longitudinal in a background thread + on_completed → spawn full post-sim batch in a background thread + Hooks are best-effort; failures only log. + """ + + def _on_ready(state) -> None: + sim_id = ( + getattr(state, "simulation_id", None) + or getattr(state, "sim_id", None) + or getattr(state, "id", None) + ) + if not sim_id: + return + threading.Thread(target=_run_pre, args=(sim_id,), daemon=True).start() + + def _on_completed(state) -> None: + sim_id = ( + getattr(state, "simulation_id", None) + or getattr(state, "sim_id", None) + or getattr(state, "id", None) + ) + if not sim_id: + return + threading.Thread(target=_run_post, args=(sim_id,), daemon=True).start() + + manager.register_on_ready(_on_ready) + manager.register_on_completed(_on_completed) + + +def _run_pre(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + + orch = _build_orchestrator(sim_id) + orch.run_pre() + except Exception as e: + logger.warning(f"auto pre-survey failed for {sim_id}: {e!r}") + + +def _run_post(sim_id: str) -> None: + try: + from app.api.interview import _build_orchestrator + from app.services.interview_synthesizer import InterviewSynthesizer + + orch = _build_orchestrator(sim_id) + orch.run_post() + InterviewSynthesizer(store=orch.store).run() + except Exception as e: + logger.warning(f"auto post-survey failed for {sim_id}: {e!r}") diff --git 
a/backend/app/services/simulation_runner.py b/backend/app/services/simulation_runner.py index e86021f8..942f522f 100644 --- a/backend/app/services/simulation_runner.py +++ b/backend/app/services/simulation_runner.py @@ -226,7 +226,29 @@ class SimulationRunner: # 图谱记忆更新配置 _graph_memory_enabled: Dict[str, bool] = {} # simulation_id -> enabled - + + # Completion callbacks registered from outside (e.g. SimulationManager lifecycle hooks). + # Each callable receives the SimulationRunState that just transitioned to COMPLETED. + _on_completed_callbacks: list = [] + + @classmethod + def register_on_completed(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to COMPLETED. + + The callback receives the SimulationRunState instance. It is called from + the monitor daemon thread, so keep it short or hand off to another thread. + """ + cls._on_completed_callbacks.append(fn) + + @classmethod + def _fire_on_completed(cls, state: SimulationRunState) -> None: + """Invoke all registered on_completed callbacks; exceptions are isolated.""" + for fn in list(cls._on_completed_callbacks): + try: + fn(state) + except Exception as e: + logger.warning(f"on_completed callback failed: {e!r}") + @classmethod def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: """获取运行状态""" @@ -528,6 +550,7 @@ class SimulationRunner: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() logger.info(f"模拟完成: {simulation_id}") + cls._fire_on_completed(state) else: state.runner_status = RunnerStatus.FAILED # 从主日志文件读取错误信息 @@ -638,6 +661,7 @@ class SimulationRunner: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() logger.info(f"所有平台模拟已完成: {state.simulation_id}") + cls._fire_on_completed(state) # 更新轮次信息(从 round_end 事件) elif event_type == "round_end": diff --git a/backend/tests/interviews/test_lifecycle.py b/backend/tests/interviews/test_lifecycle.py new file mode 100644 index 
00000000..f8d2c952 --- /dev/null +++ b/backend/tests/interviews/test_lifecycle.py @@ -0,0 +1,26 @@ +""" +Tests for interview lifecycle hook installer (Task 20). +""" + +from app.services.interviews.lifecycle import install_hooks + + +class _StubMgr: + def __init__(self): + self.ready = [] + self.completed = [] + + def register_on_ready(self, fn): + self.ready.append(fn) + + def register_on_completed(self, fn): + self.completed.append(fn) + + +def test_install_hooks_registers_two_callables(): + mgr = _StubMgr() + install_hooks(mgr) + assert len(mgr.ready) == 1 + assert len(mgr.completed) == 1 + assert callable(mgr.ready[0]) + assert callable(mgr.completed[0]) From 6e1489fe08c99ed5c49c151cf44ac66e7b4f05c6 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 13:27:47 +0200 Subject: [PATCH 24/26] fix(interviews): wire Zep updater/memory/hooks correctly for production runs (C1-C5) Five tightly-coupled fixes for defects that were causing the interview subsystem to silently degrade in production: - C1+C2: `_build_orchestrator` now resolves `graph_id` from `SimulationManager().get_simulation(sim_id).graph_id` (the real persisted state) instead of a `graph_id.txt` that nothing in the codebase writes. `ZepGraphMemoryUpdater(graph_id=...)` is now called with the required `graph_id` argument; the bare `try/except Exception` that was swallowing the TypeError is replaced with a narrow fallback that logs explicitly. - C3: `SimulationManager._on_ready_hooks` / `_on_completed_hooks` are now class-level (mirroring `SimulationRunner._on_completed_callbacks`). Hooks registered at app startup now survive across the per-request `SimulationManager()` instances created by the Flask API, so the T0 longitudinal auto-survey actually fires. - C4: `ZepGraphMemoryUpdater` gains an explicit `add_text_episode(graph_id, text)` method for synchronous text writes. 
`InterviewZepWriter._emit` no longer silently falls back to a dict-shaped `add_activity` call that the real implementation rejects (its `add_activity` requires an `AgentActivity` dataclass). - C5: `FileSystemPersonaProvider.agent_to_entity()` builds an `{agent_id: zep_entity_uuid}` map from the persisted profile files; the map is now passed to `ZepMemoryProvider` so `get_entity_with_context` is called with real Zep UUIDs instead of `str(agent_id)`. To make this work, `OasisProfileGenerator._save_reddit_json` and `_save_twitter_csv` now persist `source_entity_uuid` (Reddit JSON: optional field; Twitter CSV: appended column). Tests: 51 unit + 2 integration pass (was 40 + 2). New tests lock in each fix: - `test_hooks_survive_across_instances` (C3) - `test_build_orchestrator_reads_graph_id_from_state` (C1+C2+C5) - `test_build_orchestrator_falls_back_when_state_missing` (C1+C2) - `test_emit_uses_add_text_episode_with_graph_id`, `test_emit_raises_when_updater_lacks_add_text_episode`, `test_real_updater_exposes_add_text_episode` (C4) - `test_agent_to_entity_from_reddit_json`, `test_agent_to_entity_empty_when_no_field`, `test_agent_to_entity_falls_back_to_twitter_csv`, `test_agent_to_entity_reddit_takes_precedence` (C5) Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/__init__.py | 15 +-- backend/app/api/interview.py | 102 +++++++++++++--- backend/app/services/interviews/adapters.py | 43 +++++++ backend/app/services/interviews/zep_writer.py | 15 ++- .../app/services/oasis_profile_generator.py | 29 +++-- backend/app/services/simulation_manager.py | 47 +++++--- .../app/services/zep_graph_memory_updater.py | 38 ++++++ backend/tests/interviews/test_adapters.py | 75 ++++++++++++ .../tests/interviews/test_api_interview.py | 113 ++++++++++++++++++ .../tests/interviews/test_simulation_hooks.py | 56 +++++++++ backend/tests/interviews/test_zep_writer.py | 55 ++++++++- 11 files changed, 526 insertions(+), 62 deletions(-) diff --git a/backend/app/__init__.py 
b/backend/app/__init__.py index d3a6d543..fdc49112 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -48,16 +48,17 @@ def create_app(config_class=Config): if should_log_startup: logger.info("已注册模拟进程清理函数") - # Install interview lifecycle hooks on a singleton SimulationManager. - # The singleton's _notify_on_completed is also wired into SimulationRunner - # so that the runner's monitor thread fires the completed hooks when a - # simulation process exits successfully. + # Install interview lifecycle hooks on the SimulationManager class. + # Hooks are stored on the class itself (not on a particular instance), so + # any fresh `SimulationManager()` constructed later (e.g. per request in + # the Flask API) will see them. We still bridge `_notify_on_completed` + # into SimulationRunner via a transient instance so the runner's monitor + # thread fires the completed hooks when a simulation process exits. from .services.simulation_manager import SimulationManager from .services.interviews.lifecycle import install_hooks - _simulation_manager_singleton = SimulationManager() - install_hooks(_simulation_manager_singleton) - SimulationRunner.register_on_completed(_simulation_manager_singleton._notify_on_completed) + install_hooks(SimulationManager) + SimulationRunner.register_on_completed(SimulationManager()._notify_on_completed) if should_log_startup: logger.info("已安装面试生命周期钩子") diff --git a/backend/app/api/interview.py b/backend/app/api/interview.py index 993fda17..e638aaab 100644 --- a/backend/app/api/interview.py +++ b/backend/app/api/interview.py @@ -12,9 +12,31 @@ from app.services.interview_orchestrator import InterviewOrchestrator from app.services.interview_synthesizer import InterviewSynthesizer from app.services.interviews.storage import InterviewStore from app.utils.llm_client import LLMClient +from app.utils.logger import get_logger from . 
import interview_bp +logger = get_logger(__name__) + + +class _NullUpdater: + """No-op stand-in for ``ZepGraphMemoryUpdater`` used when Zep is unavailable. + + Exposes ``add_text_episode`` so ``InterviewZepWriter._emit`` succeeds silently — + the interview pipeline still produces local artefacts; Zep just isn't updated. + """ + + def add_text_episode(self, graph_id, text): # noqa: ARG002 - matches real API + return None + + +class _NullMemory: + """Fallback memory provider that always reports unavailable digests.""" + + def get_digest(self, agent_id, max_chars=2000): # noqa: ARG002 - matches Protocol + from app.services.interviews.base import MemoryDigest + return MemoryDigest(text="[memory unavailable]", available=False) + _TASKS: dict[str, dict] = {} _LOCK = threading.Lock() @@ -25,30 +47,72 @@ def _uploads_root() -> Path: return Path(getattr(Config, "UPLOADS_DIR", "uploads")) +def _load_graph_id(sim_id: str) -> str: + """Read the Zep ``graph_id`` for a simulation from its persisted state. + + The graph_id is written by ``SimulationManager`` into + ``uploads/simulations/{sim_id}/state.json``. Returns ``""`` if the state + file is missing or unreadable — callers should treat empty graph_id as + "Zep unavailable" and fall back to the null memory/writer path. 
+ """ + try: + from app.services.simulation_manager import SimulationManager + state = SimulationManager().get_simulation(sim_id) + if state and state.graph_id: + return state.graph_id + except Exception as e: # pragma: no cover - defensive + logger.warning(f"_load_graph_id({sim_id}) failed: {e!r}") + return "" + + def _build_orchestrator(sim_id: str) -> InterviewOrchestrator: sim_dir = _uploads_root() / "simulations" / sim_id reddit = sim_dir / "reddit_profiles.json" twitter = sim_dir / "twitter_profiles.csv" - personas = FileSystemPersonaProvider(reddit_path=reddit if reddit.exists() else None, - twitter_path=twitter if twitter.exists() else None) - # Zep memory + writer: best-effort; in stub/test mode the writer no-ops on exceptions - class _NullUpdater: - def add_text_episode(self, *a, **kw): return None - try: - from app.services.zep_entity_reader import ZepEntityReader - from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater - graph_id = (sim_dir / "graph_id.txt").read_text().strip() if (sim_dir / "graph_id.txt").exists() else "" - reader = ZepEntityReader() - updater = ZepGraphMemoryUpdater() - memory = ZepMemoryProvider(reader, graph_id=graph_id) - zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) - except Exception: - class _Mem: - def get_digest(self, agent_id, max_chars=2000): - from app.services.interviews.base import MemoryDigest - return MemoryDigest(text="[memory unavailable]", available=False) - memory = _Mem() + personas = FileSystemPersonaProvider( + reddit_path=reddit if reddit.exists() else None, + twitter_path=twitter if twitter.exists() else None, + ) + # Build agent_id -> Zep entity uuid map from the persisted profile files. + agent_to_entity = personas.agent_to_entity() + + # Resolve the graph_id from the simulation's persisted state — NOT from a + # ``graph_id.txt`` (nothing in the codebase writes such a file). 
+ graph_id = _load_graph_id(sim_id) + + memory: object + zep_writer: InterviewZepWriter + if not graph_id: + logger.warning( + f"interview: no graph_id for sim {sim_id} — Zep memory/writer disabled " + "(simulation state missing or graph_id empty)" + ) + memory = _NullMemory() zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") + else: + try: + from app.services.zep_entity_reader import ZepEntityReader + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + + reader = ZepEntityReader() + updater = ZepGraphMemoryUpdater(graph_id=graph_id) + memory = ZepMemoryProvider( + reader, graph_id=graph_id, agent_to_entity=agent_to_entity + ) + zep_writer = InterviewZepWriter(memory_updater=updater, graph_id=graph_id) + if not agent_to_entity: + logger.warning( + f"interview: empty agent_to_entity map for sim {sim_id} — " + "memory digests will be unavailable. Check that profile files " + "include `source_entity_uuid`." + ) + except Exception as e: + logger.warning( + f"interview: Zep init failed for sim {sim_id} ({e!r}); " + "falling back to null memory/writer" + ) + memory = _NullMemory() + zep_writer = InterviewZepWriter(memory_updater=_NullUpdater(), graph_id="") llm = LLMClient(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL, model=Config.LLM_MODEL_NAME) return InterviewOrchestrator( diff --git a/backend/app/services/interviews/adapters.py b/backend/app/services/interviews/adapters.py index 94431fe9..06d05e94 100644 --- a/backend/app/services/interviews/adapters.py +++ b/backend/app/services/interviews/adapters.py @@ -54,6 +54,49 @@ class FileSystemPersonaProvider: twitter = [p for p in self._load_twitter() if p.agent_id not in seen] return reddit + twitter + def agent_to_entity(self) -> dict[int, str]: + """Build the ``{agent_id: zep_entity_uuid}`` map from the persisted profile files. + + Both writers (``oasis_profile_generator._save_reddit_json`` and + ``_save_twitter_csv``) emit ``source_entity_uuid`` per agent. 
Reddit takes + precedence; rows with a missing/blank uuid are skipped. + Returns an empty dict if neither file is present or no row has the field. + """ + mapping: dict[int, str] = {} + + # Reddit JSON + if self.reddit_path and self.reddit_path.exists(): + try: + rows = json.loads(self.reddit_path.read_text(encoding="utf-8")) + for row in rows: + uid = row.get("user_id") + uuid_ = row.get("source_entity_uuid") + if uid is None or not uuid_: + continue + mapping[int(uid)] = str(uuid_) + except (json.JSONDecodeError, ValueError, TypeError): + pass + + # Twitter CSV (only fills agents not already mapped) + if self.twitter_path and self.twitter_path.exists(): + try: + with self.twitter_path.open("r", encoding="utf-8", newline="") as f: + for row in csv.DictReader(f): + uid = row.get("user_id") + uuid_ = row.get("source_entity_uuid") + if not uid or not uuid_: + continue + try: + uid_int = int(uid) + except (TypeError, ValueError): + continue + if uid_int not in mapping: + mapping[uid_int] = str(uuid_) + except OSError: + pass + + return mapping + class ZepMemoryProvider: """Builds a bounded memory digest per agent from Zep entity context. diff --git a/backend/app/services/interviews/zep_writer.py b/backend/app/services/interviews/zep_writer.py index c4b6e971..fdd9f185 100644 --- a/backend/app/services/interviews/zep_writer.py +++ b/backend/app/services/interviews/zep_writer.py @@ -5,10 +5,12 @@ from app.models.interview import ( ) class InterviewZepWriter: - """Mirrors `ZepGraphMemoryUpdater.add_activity` usage but for interview episodes. + """Writes interview episodes (per-agent responses, aggregates) to a Zep graph. - The real `ZepGraphMemoryUpdater` may expose `add_activity` (preferred) or a lower-level - text-episode method; this writer adapts to either via duck typing. 
+ Expects ``memory_updater`` to expose ``add_text_episode(graph_id, text)`` — that + is the method the real ``ZepGraphMemoryUpdater`` provides for synchronous text + writes outside the agent-activity batch pipeline. A no-op shim with the same + method is acceptable for tests and stub mode. """ def __init__(self, memory_updater, graph_id: str): self.updater = memory_updater @@ -17,10 +19,11 @@ class InterviewZepWriter: def _emit(self, text: str) -> None: if hasattr(self.updater, "add_text_episode"): self.updater.add_text_episode(self.graph_id, text) - elif hasattr(self.updater, "add_activity"): - self.updater.add_activity({"graph_id": self.graph_id, "text": text}) else: - raise RuntimeError("memory_updater has neither add_text_episode nor add_activity") + raise RuntimeError( + "memory_updater is missing add_text_episode(graph_id, text); " + "InterviewZepWriter requires the explicit text-episode API." + ) def _summarize_likert(self, r: LikertResponse) -> str: mean_v = sum(r.responses.values()) / max(len(r.responses), 1) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 7704a627..9360e18c 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -1090,11 +1090,13 @@ class OasisProfileGenerator: with open(file_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) - - # 写入OASIS要求的表头 - headers = ['user_id', 'name', 'username', 'user_char', 'description'] + + # 写入表头:OASIS要求的5列 + 额外的source_entity_uuid列(反向链接到Zep实体)。 + # OASIS按列名读取,额外的列不会影响其行为,但允许下游(面试子系统等) + # 重建 agent_id -> Zep entity uuid 的映射。 + headers = ['user_id', 'name', 'username', 'user_char', 'description', 'source_entity_uuid'] writer.writerow(headers) - + # 写入数据行 for idx, profile in enumerate(profiles): # user_char: 完整人设(bio + persona),用于LLM系统提示 @@ -1103,16 +1105,17 @@ class OasisProfileGenerator: user_char = f"{profile.bio} {profile.persona}" # 处理换行符(CSV中用空格替代) user_char 
= user_char.replace('\n', ' ').replace('\r', ' ') - + # description: 简短简介,用于外部显示 description = profile.bio.replace('\n', ' ').replace('\r', ' ') - + row = [ idx, # user_id: 从0开始的顺序ID profile.name, # name: 真实姓名 profile.user_name, # username: 用户名 user_char, # user_char: 完整人设(内部LLM使用) - description # description: 简短简介(外部显示) + description, # description: 简短简介(外部显示) + profile.source_entity_uuid or "", # source_entity_uuid: Zep实体UUID ] writer.writerow(row) @@ -1184,12 +1187,18 @@ class OasisProfileGenerator: item["profession"] = profile.profession if profile.interested_topics: item["interested_topics"] = profile.interested_topics - + # source_entity_uuid: 反向链接到Zep实体,下游(面试子系统等)需要此映射以 + # 在Zep图谱中查找Agent的上下文。仅在存在时写入。 + if profile.source_entity_uuid: + item["source_entity_uuid"] = profile.source_entity_uuid + if profile.source_entity_type: + item["source_entity_type"] = profile.source_entity_type + data.append(item) - + with open(file_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - + logger.info(f"已保存 {len(profiles)} 个Reddit Profile到 {file_path} (JSON格式,包含user_id字段)") # 保留旧方法名作为别名,保持向后兼容 diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 5fe962f6..50b7890a 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -115,30 +115,33 @@ class SimulationState: class SimulationManager: """ 模拟管理器 - + 核心功能: 1. 从Zep图谱读取实体并过滤 2. 生成OASIS Agent Profile 3. 使用LLM智能生成模拟配置参数 4. 准备预设脚本所需的所有文件 """ - + # 模拟数据存储目录 SIMULATION_DATA_DIR = os.path.join( - os.path.dirname(__file__), + os.path.dirname(__file__), '../../uploads/simulations' ) - + + # Class-level hook registries so callbacks survive across instances. + # The Flask API endpoints construct fresh `SimulationManager()` instances per request, + # while lifecycle hooks are registered once at app startup — storing the lists on the + # instance would silently drop those hooks on every request. 
+ _on_ready_hooks: list = [] + _on_completed_hooks: list = [] + def __init__(self): # 确保目录存在 os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) # 内存中的模拟状态缓存 self._simulations: Dict[str, SimulationState] = {} - - # Lifecycle hook registries - self._on_ready_hooks: list = [] - self._on_completed_hooks: list = [] def _get_simulation_dir(self, simulation_id: str) -> str: """获取模拟数据目录""" @@ -196,20 +199,30 @@ class SimulationManager: return state # ------------------------------------------------------------------ - # Lifecycle hook registration + # Lifecycle hook registration (class-level — see the comment above `_on_ready_hooks`) # ------------------------------------------------------------------ - def register_on_ready(self, fn) -> None: - """Register a callback invoked when a simulation transitions to READY.""" - self._on_ready_hooks.append(fn) + @classmethod + def register_on_ready(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to READY. - def register_on_completed(self, fn) -> None: - """Register a callback invoked when a simulation transitions to COMPLETED.""" - self._on_completed_hooks.append(fn) + Class-level so hooks registered at app startup remain visible to every + SimulationManager() instance constructed later (e.g. per-request in Flask). + """ + cls._on_ready_hooks.append(fn) + + @classmethod + def register_on_completed(cls, fn) -> None: + """Register a callback invoked when a simulation transitions to COMPLETED. + + Class-level so hooks registered at app startup remain visible to every + SimulationManager() instance constructed later (e.g. per-request in Flask). 
+ """ + cls._on_completed_hooks.append(fn) def _notify_on_ready(self, state: "SimulationState") -> None: """Invoke all on_ready hooks; exceptions are isolated per hook.""" - for fn in list(self._on_ready_hooks): + for fn in list(type(self)._on_ready_hooks): try: fn(state) except Exception as e: @@ -217,7 +230,7 @@ class SimulationManager: def _notify_on_completed(self, state: "SimulationState") -> None: """Invoke all on_completed hooks; exceptions are isolated per hook.""" - for fn in list(self._on_completed_hooks): + for fn in list(type(self)._on_completed_hooks): try: fn(state) except Exception as e: diff --git a/backend/app/services/zep_graph_memory_updater.py b/backend/app/services/zep_graph_memory_updater.py index e034fee2..86a4e1e2 100644 --- a/backend/app/services/zep_graph_memory_updater.py +++ b/backend/app/services/zep_graph_memory_updater.py @@ -337,6 +337,44 @@ class ZepGraphMemoryUpdater: self._total_activities += 1 logger.debug(f"添加活动到Zep队列: {activity.agent_name} - {activity.action_type}") + def add_text_episode(self, graph_id: str, text: str) -> None: + """ + 直接将一段文本写入Zep图谱(同步发送,不经过批量队列) + + 用于面试子系统(InterviewZepWriter)等需要立即写入、不属于 + agent活动流水线的场景。绕过 _send_batch_activities 的批量逻辑, + 但仍带重试。 + + Args: + graph_id: 目标图谱ID(允许覆盖 self.graph_id,便于多图场景) + text: 要发送的文本内容 + """ + if not text: + return + target_graph_id = graph_id or self.graph_id + if not target_graph_id: + logger.warning("add_text_episode 调用时未指定graph_id,跳过") + return + + for attempt in range(self.MAX_RETRIES): + try: + self.client.graph.add( + graph_id=target_graph_id, + type="text", + data=text, + ) + self._total_sent += 1 + self._total_items_sent += 1 + logger.debug(f"add_text_episode 发送成功 (graph={target_graph_id}, len={len(text)})") + return + except Exception as e: + if attempt < self.MAX_RETRIES - 1: + logger.warning(f"add_text_episode 失败 (尝试 {attempt + 1}/{self.MAX_RETRIES}): {e}") + time.sleep(self.RETRY_DELAY * (attempt + 1)) + else: + logger.error(f"add_text_episode 
失败,已重试{self.MAX_RETRIES}次: {e}") + self._failed_count += 1 + def add_activity_from_dict(self, data: Dict[str, Any], platform: str): """ 从字典数据添加活动 diff --git a/backend/tests/interviews/test_adapters.py b/backend/tests/interviews/test_adapters.py index ab7dee2e..977d5997 100644 --- a/backend/tests/interviews/test_adapters.py +++ b/backend/tests/interviews/test_adapters.py @@ -46,3 +46,78 @@ def test_zep_memory_provider_truncates_to_max_chars(): d = prov.get_digest(5, max_chars=300) assert d.available is True assert len(d.text) <= 300 + + +def test_agent_to_entity_from_reddit_json(tmp_path): + """C5: ``FileSystemPersonaProvider.agent_to_entity()`` must reconstruct the + ``{agent_id: zep_entity_uuid}`` map from a reddit_profiles.json that + includes ``source_entity_uuid``. + """ + data = [ + {"user_id": 0, "user_name": "fischer1", "name": "Fischer Müller", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-zero"}, + {"user_id": 1, "user_name": "ngo1", "name": "Ines NGO", + "persona": "p", "profession": "ngo_staff", + "source_entity_uuid": "uuid-one"}, + # Row with no uuid must be skipped. + {"user_id": 2, "user_name": "gov1", "name": "Gov Agent", + "persona": "p", "profession": "official"}, + ] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + mapping = provider.agent_to_entity() + + assert mapping == {0: "uuid-zero", 1: "uuid-one"} + # Map values are strings, keys are ints. 
+ for k, v in mapping.items(): + assert isinstance(k, int) + assert isinstance(v, str) + + +def test_agent_to_entity_empty_when_no_field(tmp_path): + """C5: if no row has ``source_entity_uuid``, return an empty dict — not + a crash, not partial garbage.""" + data = [{"user_id": 0, "user_name": "u", "name": "A", "persona": "p"}] + p = tmp_path / "reddit_profiles.json" + p.write_text(json.dumps(data), encoding="utf-8") + provider = FileSystemPersonaProvider(reddit_path=p, twitter_path=None) + assert provider.agent_to_entity() == {} + + +def test_agent_to_entity_falls_back_to_twitter_csv(tmp_path): + """C5: when only twitter_profiles.csv exists, the helper must still + extract uuids from the CSV's ``source_entity_uuid`` column. + """ + p = tmp_path / "twitter_profiles.csv" + with p.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["user_id", "name", "username", "user_char", "description", "source_entity_uuid"]) + writer.writerow([0, "A0", "u0", "char", "desc", "uuid-zero"]) + writer.writerow([1, "A1", "u1", "char", "desc", ""]) # skipped (blank uuid) + writer.writerow([2, "A2", "u2", "char", "desc", "uuid-two"]) + + provider = FileSystemPersonaProvider(reddit_path=None, twitter_path=p) + assert provider.agent_to_entity() == {0: "uuid-zero", 2: "uuid-two"} + + +def test_agent_to_entity_reddit_takes_precedence(tmp_path): + """C5: when both files exist, Reddit JSON wins; Twitter CSV only fills + agents not already mapped.""" + reddit = tmp_path / "reddit_profiles.json" + reddit.write_text(json.dumps([ + {"user_id": 0, "user_name": "u0", "name": "A0", "persona": "p", + "source_entity_uuid": "reddit-zero"}, + ]), encoding="utf-8") + + twitter = tmp_path / "twitter_profiles.csv" + with twitter.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["user_id", "name", "username", "user_char", "description", "source_entity_uuid"]) + writer.writerow([0, "A0", "u0", "char", "desc", "twitter-zero"]) # 
ignored + writer.writerow([1, "A1", "u1", "char", "desc", "twitter-one"]) # used + + provider = FileSystemPersonaProvider(reddit_path=reddit, twitter_path=twitter) + assert provider.agent_to_entity() == {0: "reddit-zero", 1: "twitter-one"} diff --git a/backend/tests/interviews/test_api_interview.py b/backend/tests/interviews/test_api_interview.py index baad634b..7e55d627 100644 --- a/backend/tests/interviews/test_api_interview.py +++ b/backend/tests/interviews/test_api_interview.py @@ -40,3 +40,116 @@ def test_unknown_subagent_returns_400(client): res = client.post("/api/interview/sim_test/rerun", json={"subagent": "nonsense"}) assert res.status_code == 400 + + +def test_build_orchestrator_reads_graph_id_from_state(tmp_path, monkeypatch): + """C1+C2: ``_build_orchestrator`` must resolve the Zep graph_id from + ``state.json`` (written by ``SimulationManager``), not from the + nonexistent ``graph_id.txt``. The graph_id then must reach the + ``InterviewZepWriter`` instead of being silently swallowed. + """ + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + monkeypatch.setenv("ZEP_API_KEY", "test-fake-key") + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + Config.ZEP_API_KEY = "test-fake-key" + + # SimulationManager's data dir is class-level — point it at tmp_path. + from app.services.simulation_manager import SimulationManager + sim_root = tmp_path / "simulations" + sim_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(SimulationManager, "SIMULATION_DATA_DIR", str(sim_root)) + + sim_id = "sim_graphid" + sim_dir = sim_root / sim_id + sim_dir.mkdir(parents=True) + # Seed a profile file so FileSystemPersonaProvider can work. 
+ (sim_dir / "reddit_profiles.json").write_text( + json.dumps([ + {"user_id": 0, "user_name": "u0", "name": "A0", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-zero"}, + {"user_id": 1, "user_name": "u1", "name": "A1", + "persona": "p", "profession": "fisher", + "source_entity_uuid": "uuid-one"}, + ]), + encoding="utf-8", + ) + # Seed state.json with the graph_id. + state_doc = { + "simulation_id": sim_id, + "project_id": "p", + "graph_id": "graph-from-state", + "status": "ready", + "enable_twitter": False, + "enable_reddit": True, + } + (sim_dir / "state.json").write_text(json.dumps(state_doc), encoding="utf-8") + + # Patch ZepGraphMemoryUpdater + ZepEntityReader so we don't hit the network. + import app.services.zep_graph_memory_updater as zgmu + import app.services.zep_entity_reader as zer + + class _FakeUpdater: + def __init__(self, graph_id, api_key=None): + self.graph_id = graph_id + + def add_text_episode(self, graph_id, text): + return None + + class _FakeReader: + def __init__(self, api_key=None): + pass + + def get_entity_with_context(self, graph_id, entity_uuid): + return None + + monkeypatch.setattr(zgmu, "ZepGraphMemoryUpdater", _FakeUpdater) + monkeypatch.setattr(zer, "ZepEntityReader", _FakeReader) + + from app.api.interview import _build_orchestrator + + orch = _build_orchestrator(sim_id) + assert orch.zep_writer.graph_id == "graph-from-state" + # Updater on the writer must be the real (or fake) ZepGraphMemoryUpdater path, + # NOT the null updater — i.e. its graph_id must match. + assert getattr(orch.zep_writer.updater, "graph_id", None) == "graph-from-state" + + # ZepMemoryProvider must have received the agent_to_entity map (C5). 
+ assert hasattr(orch.memory, "map") + assert orch.memory.map == {0: "uuid-zero", 1: "uuid-one"} + + +def test_build_orchestrator_falls_back_when_state_missing(tmp_path, monkeypatch): + """C1+C2: when ``state.json`` is missing, the orchestrator must still be + constructed with the null updater/memory path (not crash, not silently + pass a bare ``ZepGraphMemoryUpdater()`` that would error out). + """ + monkeypatch.setenv("LLM_STUB_MODE", "true") + monkeypatch.setenv("UPLOADS_DIR", str(tmp_path)) + from app.config import Config + Config.LLM_STUB_MODE = True + Config.UPLOADS_DIR = str(tmp_path) + + from app.services.simulation_manager import SimulationManager + sim_root = tmp_path / "simulations" + sim_root.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(SimulationManager, "SIMULATION_DATA_DIR", str(sim_root)) + + sim_id = "sim_no_state" + sim_dir = sim_root / sim_id + sim_dir.mkdir(parents=True) + (sim_dir / "reddit_profiles.json").write_text( + json.dumps([{"user_id": 0, "user_name": "u0", "name": "A0", + "persona": "p", "profession": "fisher"}]), + encoding="utf-8", + ) + + from app.api.interview import _build_orchestrator + + orch = _build_orchestrator(sim_id) + assert orch.zep_writer.graph_id == "" + # Null updater path: writer must still respond to _emit without raising. + orch.zep_writer._emit("hello") diff --git a/backend/tests/interviews/test_simulation_hooks.py b/backend/tests/interviews/test_simulation_hooks.py index cef304f2..52852d28 100644 --- a/backend/tests/interviews/test_simulation_hooks.py +++ b/backend/tests/interviews/test_simulation_hooks.py @@ -7,11 +7,27 @@ NOTE ON SHAPE DIVERGENCE vs. original plan spec: - The COMPLETED transition lives in simulation_runner.py (SimulationRunner._monitor_simulation), not in simulation_manager.py. The _notify_on_completed hook is registered on SimulationManager and the production insertion point for COMPLETED is documented in DONE_WITH_CONCERNS. 
+ +Hooks are stored on the class (C3 fix), so each test snapshots/restores the +registries via the autouse fixture to keep test isolation. """ +import pytest + from app.services.simulation_manager import SimulationManager, SimulationState, SimulationStatus +@pytest.fixture(autouse=True) +def _isolate_class_hooks(): + saved_ready = list(SimulationManager._on_ready_hooks) + saved_completed = list(SimulationManager._on_completed_hooks) + try: + yield + finally: + SimulationManager._on_ready_hooks[:] = saved_ready + SimulationManager._on_completed_hooks[:] = saved_completed + + def test_register_post_ready_hook_invoked(): called = [] mgr = SimulationManager() @@ -38,3 +54,43 @@ def test_register_post_completed_hook_invoked(): ) mgr._notify_on_completed(state) assert called == [("done", "abc")] + + +def test_hooks_survive_across_instances(): + """C3: hook registries are class-level, so callbacks registered through the + classmethod must still fire on a freshly constructed instance. This is + what makes the Flask per-request ``SimulationManager()`` pattern work + after ``install_hooks(SimulationManager)`` runs at app startup. + """ + called: list[str] = [] + + # Register via the class — the production install_hooks(cls) path. + SimulationManager.register_on_ready(lambda s: called.append(f"ready:{s.simulation_id}")) + SimulationManager.register_on_completed(lambda s: called.append(f"done:{s.simulation_id}")) + + # New, independently-constructed instance must still see the hooks. 
+ fresh = SimulationManager() + state = SimulationState( + simulation_id="cross_instance", + project_id="p", + graph_id="g", + status=SimulationStatus.READY, + ) + fresh._notify_on_ready(state) + state.status = SimulationStatus.COMPLETED + fresh._notify_on_completed(state) + + assert "ready:cross_instance" in called + assert "done:cross_instance" in called + + +def test_register_via_instance_also_lands_on_class(): + """Registering through an instance must populate the class registry too — + backward-compatibility with code that calls ``manager.register_on_*``. + """ + mgr1 = SimulationManager() + mgr1.register_on_ready(lambda s: None) + # A second, unrelated instance must see the hook. + mgr2 = SimulationManager() + assert len(SimulationManager._on_ready_hooks) >= 1 + assert SimulationManager._on_ready_hooks is mgr2.__class__._on_ready_hooks diff --git a/backend/tests/interviews/test_zep_writer.py b/backend/tests/interviews/test_zep_writer.py index 661ef44b..6eaed454 100644 --- a/backend/tests/interviews/test_zep_writer.py +++ b/backend/tests/interviews/test_zep_writer.py @@ -1,16 +1,26 @@ +import pytest + from app.models.interview import ( LikertResponse, InterviewPhase, SubagentKind, ) from app.services.interviews.zep_writer import InterviewZepWriter + class _FakeMemoryUpdater: + """Fake mirroring the real ZepGraphMemoryUpdater contract. + + Post-C4 the writer only uses ``add_text_episode(graph_id, text)`` — + ``add_activity`` is deliberately omitted to lock in the new behaviour and + catch any regression that re-introduces the broken dict-based fallback. 
+ """ + def __init__(self): - self.events = [] - def add_activity(self, activity): - self.events.append(activity) + self.events: list[dict] = [] + def add_text_episode(self, graph_id, text): self.events.append({"graph_id": graph_id, "text": text}) + def test_per_agent_episode_text(): upd = _FakeMemoryUpdater() w = InterviewZepWriter(memory_updater=upd, graph_id="g1") @@ -20,9 +30,48 @@ def test_per_agent_episode_text(): w.write_per_agent(SubagentKind.LONGITUDINAL, r, agent_name="Fischer Müller") assert any("Fischer Müller" in str(e) for e in upd.events) assert any("longitudinal/T1" in str(e) for e in upd.events) + # Each event must carry the configured graph_id. + assert all(e["graph_id"] == "g1" for e in upd.events) + def test_aggregate_episode(): upd = _FakeMemoryUpdater() w = InterviewZepWriter(memory_updater=upd, graph_id="g1") w.write_aggregate(SubagentKind.SCENARIO, summary="S1 mean desirability 5.2; S2 mean 2.1") assert any("S1 mean" in str(e) for e in upd.events) + + +def test_emit_uses_add_text_episode_with_graph_id(): + """C4: ``_emit`` must call ``updater.add_text_episode(graph_id, text)`` + with the constructor's graph_id and the raw text — no dict shape, no + ``add_activity`` fallback (the real ``add_activity`` rejects dicts). + """ + upd = _FakeMemoryUpdater() + w = InterviewZepWriter(memory_updater=upd, graph_id="g_xyz") + w._emit("hello world") + assert upd.events == [{"graph_id": "g_xyz", "text": "hello world"}] + + +def test_emit_raises_when_updater_lacks_add_text_episode(): + """C4: a memory_updater without ``add_text_episode`` must surface a + RuntimeError rather than silently no-op via a broken ``add_activity`` + fallback. 
+ """ + + class _Broken: + def add_activity(self, activity): # pragma: no cover - kept for clarity + raise AssertionError("must not be called") + + w = InterviewZepWriter(memory_updater=_Broken(), graph_id="g1") + with pytest.raises(RuntimeError, match="add_text_episode"): + w._emit("x") + + +def test_real_updater_exposes_add_text_episode(): + """C4 sanity check: ZepGraphMemoryUpdater (the real class) must expose + ``add_text_episode`` so the production wiring works without falling + through to the broken ``add_activity(dict)`` path. + """ + from app.services.zep_graph_memory_updater import ZepGraphMemoryUpdater + + assert hasattr(ZepGraphMemoryUpdater, "add_text_episode") From 6a53c110b7e7ddd73392ba565478451adfd9be31 Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 13:40:43 +0200 Subject: [PATCH 25/26] feat(interviews): capture raw LLM output on schema-validation failures Adds SchemaValidationFailure exception carrying both retry attempts' raw output, so audit.jsonl preserves what the model actually said when an agent's response can't be coerced into the instrument schema. Lets us diagnose persona-vs-format failures without re-running. Two new tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../app/services/interview_orchestrator.py | 7 ++++- backend/app/services/interviews/base.py | 27 +++++++++++----- backend/app/services/interviews/storage.py | 10 ++++-- .../tests/interviews/test_base_interviewer.py | 23 +++++++++++++- backend/tests/interviews/test_orchestrator.py | 31 +++++++++++++++++++ 5 files changed, 87 insertions(+), 11 deletions(-) diff --git a/backend/app/services/interview_orchestrator.py b/backend/app/services/interview_orchestrator.py index ff0d2ad8..d87e90ea 100644 --- a/backend/app/services/interview_orchestrator.py +++ b/backend/app/services/interview_orchestrator.py @@ -6,7 +6,7 @@ from app.models.interview import ( InterviewPhase, SubagentKind, LikertResponse, QSortResponse, DelphiOpenResponse, DelphiRatingResponse, ScenarioResponse, ) -from app.services.interviews.base import PersonaRecord +from app.services.interviews.base import PersonaRecord, SchemaValidationFailure from app.services.interviews.longitudinal import LongitudinalSubagent, run_aggregate as longitudinal_aggregate from app.services.interviews.diversity import DiversitySubagent, run_typology from app.services.interviews.delphi import ( @@ -58,6 +58,11 @@ class InterviewOrchestrator: out = fut.result() ok.append(out) self.store.append_response(run_dir, out) + except SchemaValidationFailure as e: + failed.append(p.agent_id) + self.store.audit(run_dir, agent_id=p.agent_id, + event="schema_validation_failure", + detail={"label": audit_label, "attempts": e.attempts}) except Exception as e: failed.append(p.agent_id) self.store.audit(run_dir, agent_id=p.agent_id, diff --git a/backend/app/services/interviews/base.py b/backend/app/services/interviews/base.py index bb318db9..87d9a1f5 100644 --- a/backend/app/services/interviews/base.py +++ b/backend/app/services/interviews/base.py @@ -22,6 +22,13 @@ class MemoryProvider(Protocol): def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ... 
+class SchemaValidationFailure(ValueError): + def __init__(self, agent_id: int, attempts: list[dict]): + super().__init__(f"agent {agent_id}: schema violation after retry") + self.agent_id = agent_id + self.attempts = attempts + + class StakeholderInterviewer: def __init__(self, llm, memory: MemoryProvider, language: str = "de"): self.llm = llm @@ -55,18 +62,24 @@ class StakeholderInterviewer: {"role": "system", "content": self._system_prompt(persona, digest, schema_hint)}, {"role": "user", "content": user_prompt}, ] - out = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens) + first = self.llm.chat_json(messages=messages, temperature=temperature, max_tokens=max_tokens) if validate is not None: - validated = validate(out) + validated = validate(first) if validated is not None: return validated - messages.append({"role": "assistant", "content": str(out)}) + messages.append({"role": "assistant", "content": str(first)}) messages.append({"role": "user", "content": "Your previous response did not match the required schema. 
" f"Return ONLY valid JSON matching: {schema_hint}"}) - out = self.llm.chat_json(messages=messages, temperature=0.0, max_tokens=max_tokens) - validated = validate(out) + second = self.llm.chat_json(messages=messages, temperature=0.0, max_tokens=max_tokens) + validated = validate(second) if validated is None: - raise ValueError(f"agent {persona.agent_id}: schema violation after retry") + raise SchemaValidationFailure( + persona.agent_id, + attempts=[ + {"attempt": 1, "raw": first, "schema_hint": schema_hint}, + {"attempt": 2, "raw": second, "schema_hint": schema_hint}, + ], + ) return validated - return out + return first diff --git a/backend/app/services/interviews/storage.py b/backend/app/services/interviews/storage.py index 50579830..9ba23d49 100644 --- a/backend/app/services/interviews/storage.py +++ b/backend/app/services/interviews/storage.py @@ -49,10 +49,16 @@ class InterviewStore: (run_dir / name).write_text( json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") - def audit(self, run_dir: Path, agent_id: int | None, event: str, detail: str = "") -> None: + def audit( + self, + run_dir: Path, + agent_id: int | None, + event: str, + detail: str | dict = "", + ) -> None: entry = {"ts": time.time(), "agent_id": agent_id, "event": event, "detail": detail} with (run_dir / "audit.jsonl").open("a", encoding="utf-8") as f: - f.write(json.dumps(entry, ensure_ascii=False) + "\n") + f.write(json.dumps(entry, ensure_ascii=False, default=str) + "\n") def mark_latest(self, run_dir: Path) -> None: pointer = run_dir.parent / "latest.json" diff --git a/backend/tests/interviews/test_base_interviewer.py b/backend/tests/interviews/test_base_interviewer.py index 2c8962ef..822dee45 100644 --- a/backend/tests/interviews/test_base_interviewer.py +++ b/backend/tests/interviews/test_base_interviewer.py @@ -1,6 +1,8 @@ import json import pytest -from app.services.interviews.base import StakeholderInterviewer, MemoryDigest, PersonaRecord +from 
app.services.interviews.base import ( + StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure, +) class _FakeLLM: def __init__(self, responses): @@ -45,3 +47,22 @@ def test_two_failures_raise(): with pytest.raises(ValueError): interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", validate=lambda d: d if "responses" in d else None) + + +def test_schema_failure_captures_both_raw_attempts(): + bad1 = {"oops": "no responses key"} + bad2 = {"still": "wrong shape"} + llm = _FakeLLM([bad1, bad2]) + mem = _FakeMemory() + interviewer = StakeholderInterviewer(llm=llm, memory=mem) + persona = PersonaRecord(agent_id=42, name="A", persona="p") + with pytest.raises(SchemaValidationFailure) as exc_info: + interviewer.ask_in_character(persona, user_prompt="Q?", schema_hint="x", + validate=lambda d: d if "responses" in d else None) + err = exc_info.value + assert err.agent_id == 42 + assert len(err.attempts) == 2 + assert err.attempts[0]["raw"] == bad1 + assert err.attempts[1]["raw"] == bad2 + assert err.attempts[0]["attempt"] == 1 + assert err.attempts[1]["attempt"] == 2 diff --git a/backend/tests/interviews/test_orchestrator.py b/backend/tests/interviews/test_orchestrator.py index 323c4361..8d380eaf 100644 --- a/backend/tests/interviews/test_orchestrator.py +++ b/backend/tests/interviews/test_orchestrator.py @@ -62,3 +62,34 @@ def test_partial_failure_does_not_kill_run(tmp_path): result = orch.run_pre() assert result["longitudinal"]["n_responded"] < 4 assert result["longitudinal"]["n_failed"] > 0 + + +def test_schema_failure_audit_captures_raw_llm_output(tmp_path): + """When an agent's LLM output fails the schema validator twice, the audit log + should preserve both raw outputs so we can debug what the model actually said.""" + bad_response = {"wrong": "shape, no responses key"} + class _BadLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return bad_response # always fails Longitudinal validator + orch = 
InterviewOrchestrator( + llm=_BadLLM(), memory=_Mem(), personas=_Personas(1), + instrument_dir=INST_DIR, store_root=tmp_path, sim_id="sim3", + zep_writer=_NoopZep(), max_workers=1, + ) + result = orch.run_pre() + assert result["longitudinal"]["n_responded"] == 0 + assert result["longitudinal"]["n_failed"] == 1 + + import json as _j + run_dir = Path(result["longitudinal"]["run_dir"]) + audit_path = run_dir / "audit.jsonl" + lines = audit_path.read_text(encoding="utf-8").splitlines() + assert lines, "audit.jsonl should not be empty" + entry = _j.loads(lines[0]) + assert entry["event"] == "schema_validation_failure" + assert entry["agent_id"] == 0 + detail = entry["detail"] + assert detail["label"] == "longitudinal_T0" + assert len(detail["attempts"]) == 2 + assert detail["attempts"][0]["raw"] == bad_response + assert detail["attempts"][1]["raw"] == bad_response From 895a5fbaee7198fe73a7c5c177a4a6c242c014fd Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 14:01:42 +0200 Subject: [PATCH 26/26] fix(interviews): accept stringified ints in all 4 subagent validators Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run) sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent validators rejected this with isinstance(v, int), losing ~30% of agents at N=23. Added a shared coerce_int helper in base.py that accepts ints and numeric strings, rejects bools/floats/garbage, and is now used by: - Longitudinal: response values 1-5 - Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7 - Delphi: R2 and R3 importance/plausibility 1-5 - Scenario: 4 dimensions 1-7 Validators now coerce in place so downstream code sees ints regardless of the wire format. Added 8 tests (4 unit on coerce_int + 4 per-subagent contract tests showing stringified values are accepted). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/base.py | 22 ++++++++++++ backend/app/services/interviews/delphi.py | 16 ++++++--- backend/app/services/interviews/diversity.py | 19 +++++++---- .../app/services/interviews/longitudinal.py | 8 +++-- backend/app/services/interviews/scenario.py | 8 +++-- .../tests/interviews/test_base_interviewer.py | 28 +++++++++++++++ backend/tests/interviews/test_delphi.py | 26 ++++++++++++++ backend/tests/interviews/test_diversity.py | 30 ++++++++++++++++ backend/tests/interviews/test_longitudinal.py | 34 +++++++++++++++++++ backend/tests/interviews/test_scenario.py | 26 ++++++++++++++ 10 files changed, 202 insertions(+), 15 deletions(-) diff --git a/backend/app/services/interviews/base.py b/backend/app/services/interviews/base.py index 87d9a1f5..0eb2f821 100644 --- a/backend/app/services/interviews/base.py +++ b/backend/app/services/interviews/base.py @@ -22,6 +22,28 @@ class MemoryProvider(Protocol): def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ... +def coerce_int(value: Any) -> Optional[int]: + """Coerce LLM-returned Likert values into ints. + + Real LLMs frequently return numeric Likert responses as JSON strings + (e.g. "3" instead of 3). Returns the int if value is an int or a string + that round-trips through int(); otherwise None. Bools are rejected so + True/False aren't accepted as 1/0. 
+ """ + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + if isinstance(value, str): + s = value.strip() + if s and s.lstrip("-").isdigit(): + try: + return int(s) + except ValueError: + return None + return None + + class SchemaValidationFailure(ValueError): def __init__(self, agent_id: int, attempts: list[dict]): super().__init__(f"agent {agent_id}: schema violation after retry") diff --git a/backend/app/services/interviews/delphi.py b/backend/app/services/interviews/delphi.py index be455ae9..198da793 100644 --- a/backend/app/services/interviews/delphi.py +++ b/backend/app/services/interviews/delphi.py @@ -7,7 +7,7 @@ import yaml from app.models.interview import ( DelphiOpenResponse, DelphiRatingResponse, ) -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int class DelphiSubagent: @@ -66,8 +66,12 @@ class DelphiSubagent: if set(ratings.keys()) != set(theme_ids): return None for tid, r in ratings.items(): if not isinstance(r, dict): return None + coerced: dict[str, int] = {} for key in ("importance", "plausibility"): - if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + iv = coerce_int(r.get(key)) + if iv is None or not 1 <= iv <= 5: return None + coerced[key] = iv + ratings[tid] = coerced return raw return v @@ -110,10 +114,14 @@ class DelphiSubagent: if not isinstance(raw, dict): return None ratings = raw.get("ratings", {}) if set(ratings.keys()) != set(theme_ids): return None - for r in ratings.values(): + for tid, r in ratings.items(): if not isinstance(r, dict): return None + coerced: dict[str, int] = {} for key in ("importance", "plausibility"): - if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + iv = coerce_int(r.get(key)) + if iv is None or not 1 <= iv <= 5: return None + coerced[key] = iv + ratings[tid] = coerced return raw raw = 
self.interviewer.ask_in_character(persona, user_prompt=prompt, diff --git a/backend/app/services/interviews/diversity.py b/backend/app/services/interviews/diversity.py index 96febcf5..2c129828 100644 --- a/backend/app/services/interviews/diversity.py +++ b/backend/app/services/interviews/diversity.py @@ -7,7 +7,7 @@ from sklearn.decomposition import PCA from sklearn.cluster import KMeans import yaml from app.models.interview import QSortResponse -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int from app.services.interviews.instrument_loader import InstrumentValidationError @@ -64,16 +64,23 @@ class DiversitySubagent: dist = self.instrument["distribution"] target = {b: n for b, n in zip(range(-3, 4), dist)} got: dict[int, int] = {} - for v in placements.values(): - if not isinstance(v, int) or not -3 <= v <= 3: + coerced_p: dict[str, int] = {} + for k, v in placements.items(): + iv = coerce_int(v) + if iv is None or not -3 <= iv <= 3: return None - got[v] = got.get(v, 0) + 1 + coerced_p[k] = iv + got[iv] = got.get(iv, 0) + 1 if got != target: return None + coerced_a: dict[str, int] = {} for a in self.instrument["likert_axes"]: - v = axes.get(a["axis_id"]) - if not isinstance(v, int) or not 1 <= v <= 7: + iv = coerce_int(axes.get(a["axis_id"])) + if iv is None or not 1 <= iv <= 7: return None + coerced_a[a["axis_id"]] = iv + raw["placements"] = coerced_p + raw["likert_axes"] = coerced_a return raw def administer(self, persona: PersonaRecord) -> QSortResponse: diff --git a/backend/app/services/interviews/longitudinal.py b/backend/app/services/interviews/longitudinal.py index 4f13ec23..6ef7b811 100644 --- a/backend/app/services/interviews/longitudinal.py +++ b/backend/app/services/interviews/longitudinal.py @@ -6,7 +6,7 @@ from typing import Optional from app.models.interview import ( LikertInstrument, LikertResponse, InterviewPhase, ) -from 
app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int from app.services.interviews.instrument_loader import load_likert_instrument @@ -44,9 +44,13 @@ class LongitudinalSubagent: required = {it.item_id for it in self.instrument.items} if not required.issubset(resp.keys()): return None + coerced: dict[str, int] = {} for k, v in resp.items(): - if not isinstance(v, int) or not 1 <= v <= 5: + iv = coerce_int(v) + if iv is None or not 1 <= iv <= 5: return None + coerced[k] = iv + raw["responses"] = coerced return raw def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse: diff --git a/backend/app/services/interviews/scenario.py b/backend/app/services/interviews/scenario.py index f78239fb..1b1e8468 100644 --- a/backend/app/services/interviews/scenario.py +++ b/backend/app/services/interviews/scenario.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Optional import yaml from app.models.interview import ScenarioRating, ScenarioResponse -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int class ScenarioSubagent: def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): @@ -44,10 +44,12 @@ class ScenarioSubagent: sids = {s["scenario_id"] for s in self.instrument["scenarios"]} ratings = raw.get("ratings", {}) if set(ratings.keys()) != sids: return None - for v in ratings.values(): + for sid, v in ratings.items(): if not isinstance(v, dict): return None for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"): - if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None + iv = coerce_int(v.get(k)) + if iv is None or not 1 <= iv <= 7: return None + v[k] = iv if not isinstance(v.get("if_woke_up_response", ""), str): return None return raw diff --git 
a/backend/tests/interviews/test_base_interviewer.py b/backend/tests/interviews/test_base_interviewer.py index 822dee45..03295867 100644 --- a/backend/tests/interviews/test_base_interviewer.py +++ b/backend/tests/interviews/test_base_interviewer.py @@ -2,8 +2,36 @@ import json import pytest from app.services.interviews.base import ( StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure, + coerce_int, ) + +def test_coerce_int_accepts_real_int(): + assert coerce_int(3) == 3 + assert coerce_int(-2) == -2 + assert coerce_int(0) == 0 + + +def test_coerce_int_accepts_numeric_strings(): + assert coerce_int("3") == 3 + assert coerce_int(" 4 ") == 4 + assert coerce_int("-2") == -2 + + +def test_coerce_int_rejects_non_numeric(): + assert coerce_int("3.5") is None + assert coerce_int("abc") is None + assert coerce_int(None) is None + assert coerce_int([3]) is None + assert coerce_int(3.5) is None + + +def test_coerce_int_rejects_bool(): + """True/False should NOT silently coerce to 1/0 even though Python says they're ints.""" + assert coerce_int(True) is None + assert coerce_int(False) is None + + class _FakeLLM: def __init__(self, responses): self.responses = list(responses) diff --git a/backend/tests/interviews/test_delphi.py b/backend/tests/interviews/test_delphi.py index c01ecfb8..e55cab7a 100644 --- a/backend/tests/interviews/test_delphi.py +++ b/backend/tests/interviews/test_delphi.py @@ -56,3 +56,29 @@ def test_convergence_metrics(): conv = convergence_metrics(r2, r3) assert "t1" in conv assert conv["t1"]["delta_iqr_importance"] is not None + + +def test_delphi_r2_accepts_string_ratings(): + """Delphi R2/R3 ratings should accept stringified importance/plausibility ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.delphi import DelphiSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + 
+ class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": { + "t1": {"importance": "4", "plausibility": "3"}, + "t2": {"importance": "5", "plausibility": "2"}, + }} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + sub = DelphiSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + themes = [{"theme_id": "t1", "label": "T1"}, {"theme_id": "t2", "label": "T2"}] + resp = sub.administer_round2(persona, themes) + assert resp.ratings["t1"]["importance"] == 4 + assert isinstance(resp.ratings["t1"]["importance"], int) diff --git a/backend/tests/interviews/test_diversity.py b/backend/tests/interviews/test_diversity.py index 7650fac2..d8eb45d3 100644 --- a/backend/tests/interviews/test_diversity.py +++ b/backend/tests/interviews/test_diversity.py @@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans(): assert len(result["clusters"]) == 3 assert "pca" in result assert len(result["pca"]["components"]) >= 2 + + +def test_diversity_accepts_string_likert_values(): + """Diversity placements + axes should accept stringified ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.diversity import DiversitySubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "placements": {f"st_{i+1:02d}": str(buckets[i]) for i in range(24)}, + "likert_axes": {a: "4" for a in ( + "ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg")}, + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + sub = 
DiversitySubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=7, name="A", persona="p") + resp = sub.administer(persona) + assert isinstance(resp.placements["st_01"], int) + assert isinstance(resp.likert_axes["ax_pres_extr"], int) + assert resp.likert_axes["ax_pres_extr"] == 4 diff --git a/backend/tests/interviews/test_longitudinal.py b/backend/tests/interviews/test_longitudinal.py index 823e1552..006c293a 100644 --- a/backend/tests/interviews/test_longitudinal.py +++ b/backend/tests/interviews/test_longitudinal.py @@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta(): assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 assert agg["n_paired"] == 5 + + +def test_longitudinal_accepts_string_likert_values(): + """Real LLMs sometimes return Likert values as JSON strings ('3' not 3). + The validator should coerce them rather than fail the agent.""" + from app.models.interview import InterviewPhase + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.longitudinal import LongitudinalSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "responses": { # all strings, not ints + "stk_1": "4", "stk_2": "3", "stk_3": "5", + "gov_1": "3", "gov_2": "4", "gov_3": "2", + "mkt_1": "5", "mkt_2": "3", "mkt_3": "4", + "clm_1": "2", "clm_2": "4", "clm_3": "5", + }, + "confidence": {}, + "open_comment": "stringified", + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + sub = LongitudinalSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=99, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert 
resp.agent_id == 99 + assert resp.responses["stk_1"] == 4 + assert isinstance(resp.responses["stk_1"], int) diff --git a/backend/tests/interviews/test_scenario.py b/backend/tests/interviews/test_scenario.py index 567290d1..61787211 100644 --- a/backend/tests/interviews/test_scenario.py +++ b/backend/tests/interviews/test_scenario.py @@ -32,3 +32,29 @@ def test_polarity_matrix(): assert "S1" in m assert m["S1"]["mean_desirability"] == 5 assert m["S1"]["n"] == 3 + + +def test_scenario_accepts_string_likert_values(): + """Scenario ratings should accept stringified ints across all 4 dimensions.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.scenario import ScenarioSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": "4", "plausibility": "3", + "impact_on_my_group": "5", "fairness": "3", + "if_woke_up_response": f"act-{sid}", + } for sid in ("S1","S2","S3","S4")}} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + sub = ScenarioSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona) + assert resp.ratings["S1"].desirability == 4 + assert isinstance(resp.ratings["S1"].desirability, int)