From 895a5fbaee7198fe73a7c5c177a4a6c242c014fd Mon Sep 17 00:00:00 2001 From: Christian Moellmann Date: Sat, 23 May 2026 14:01:42 +0200 Subject: [PATCH] fix(interviews): accept stringified ints in all 4 subagent validators Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run) sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent validators rejected this with isinstance(v, int), losing ~30% of agents at N=23. Added a shared coerce_int helper in base.py that accepts ints and numeric strings, rejects bools/floats/garbage, and is now used by: - Longitudinal: response values 1-5 - Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7 - Delphi: R2 and R3 importance/plausibility 1-5 - Scenario: 4 dimensions 1-7 Validators now coerce in place so downstream code sees ints regardless of the wire format. Added 8 tests (4 unit on coerce_int + 4 per-subagent contract tests showing stringified values are accepted). Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/app/services/interviews/base.py | 22 ++++++++++++ backend/app/services/interviews/delphi.py | 16 ++++++--- backend/app/services/interviews/diversity.py | 19 +++++++---- .../app/services/interviews/longitudinal.py | 8 +++-- backend/app/services/interviews/scenario.py | 8 +++-- .../tests/interviews/test_base_interviewer.py | 28 +++++++++++++++ backend/tests/interviews/test_delphi.py | 26 ++++++++++++++ backend/tests/interviews/test_diversity.py | 30 ++++++++++++++++ backend/tests/interviews/test_longitudinal.py | 34 +++++++++++++++++++ backend/tests/interviews/test_scenario.py | 26 ++++++++++++++ 10 files changed, 202 insertions(+), 15 deletions(-) diff --git a/backend/app/services/interviews/base.py b/backend/app/services/interviews/base.py index 87d9a1f5..0eb2f821 100644 --- a/backend/app/services/interviews/base.py +++ b/backend/app/services/interviews/base.py @@ -22,6 +22,28 @@ class MemoryProvider(Protocol): def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ... +def coerce_int(value: Any) -> Optional[int]: + """Coerce LLM-returned Likert values into ints. + + Real LLMs frequently return numeric Likert responses as JSON strings + (e.g. "3" instead of 3). Returns the int if value is an int or a string + that round-trips through int(); otherwise None. Bools are rejected so + True/False aren't accepted as 1/0. + """ + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + if isinstance(value, str): + s = value.strip() + if s and s.lstrip("-").isdigit(): + try: + return int(s) + except ValueError: + return None + return None + + class SchemaValidationFailure(ValueError): def __init__(self, agent_id: int, attempts: list[dict]): super().__init__(f"agent {agent_id}: schema violation after retry") diff --git a/backend/app/services/interviews/delphi.py b/backend/app/services/interviews/delphi.py index be455ae9..198da793 100644 --- a/backend/app/services/interviews/delphi.py +++ b/backend/app/services/interviews/delphi.py @@ -7,7 +7,7 @@ import yaml from app.models.interview import ( DelphiOpenResponse, DelphiRatingResponse, ) -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int class DelphiSubagent: @@ -66,8 +66,12 @@ class DelphiSubagent: if set(ratings.keys()) != set(theme_ids): return None for tid, r in ratings.items(): if not isinstance(r, dict): return None + coerced: dict[str, int] = {} for key in ("importance", "plausibility"): - if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + iv = coerce_int(r.get(key)) + if iv is None or not 1 <= iv <= 5: return None + coerced[key] = iv + ratings[tid] = coerced return raw return v @@ -110,10 +114,14 @@ class DelphiSubagent: if not isinstance(raw, dict): return None ratings = raw.get("ratings", {}) if set(ratings.keys()) != set(theme_ids): return None - for r in ratings.values(): + for tid, r in ratings.items(): if not isinstance(r, dict): return None + coerced: dict[str, int] = {} for key in ("importance", "plausibility"): - if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None + iv = coerce_int(r.get(key)) + if iv is None or not 1 <= iv <= 5: return None + coerced[key] = iv + ratings[tid] = coerced return raw raw = self.interviewer.ask_in_character(persona, user_prompt=prompt, diff --git a/backend/app/services/interviews/diversity.py b/backend/app/services/interviews/diversity.py index 96febcf5..2c129828 100644 --- a/backend/app/services/interviews/diversity.py +++ b/backend/app/services/interviews/diversity.py @@ -7,7 +7,7 @@ from sklearn.decomposition import PCA from sklearn.cluster import KMeans import yaml from app.models.interview import QSortResponse -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int from app.services.interviews.instrument_loader import InstrumentValidationError @@ -64,16 +64,23 @@ class DiversitySubagent: dist = self.instrument["distribution"] target = {b: n for b, n in zip(range(-3, 4), dist)} got: dict[int, int] = {} - for v in placements.values(): - if not isinstance(v, int) or not -3 <= v <= 3: + coerced_p: dict[str, int] = {} + for k, v in placements.items(): + iv = coerce_int(v) + if iv is None or not -3 <= iv <= 3: return None - got[v] = got.get(v, 0) + 1 + coerced_p[k] = iv + got[iv] = got.get(iv, 0) + 1 if got != target: return None + coerced_a: dict[str, int] = {} for a in self.instrument["likert_axes"]: - v = axes.get(a["axis_id"]) - if not isinstance(v, int) or not 1 <= v <= 7: + iv = coerce_int(axes.get(a["axis_id"])) + if iv is None or not 1 <= iv <= 7: return None + coerced_a[a["axis_id"]] = iv + raw["placements"] = coerced_p + raw["likert_axes"] = coerced_a return raw def administer(self, persona: PersonaRecord) -> QSortResponse: diff --git a/backend/app/services/interviews/longitudinal.py b/backend/app/services/interviews/longitudinal.py index 4f13ec23..6ef7b811 100644 --- a/backend/app/services/interviews/longitudinal.py +++ b/backend/app/services/interviews/longitudinal.py @@ -6,7 +6,7 @@ from typing import Optional from app.models.interview import ( LikertInstrument, LikertResponse, InterviewPhase, ) -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int from app.services.interviews.instrument_loader import load_likert_instrument @@ -44,9 +44,13 @@ class LongitudinalSubagent: required = {it.item_id for it in self.instrument.items} if not required.issubset(resp.keys()): return None + coerced: dict[str, int] = {} for k, v in resp.items(): - if not isinstance(v, int) or not 1 <= v <= 5: + iv = coerce_int(v) + if iv is None or not 1 <= iv <= 5: return None + coerced[k] = iv + raw["responses"] = coerced return raw def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse: diff --git a/backend/app/services/interviews/scenario.py b/backend/app/services/interviews/scenario.py index f78239fb..1b1e8468 100644 --- a/backend/app/services/interviews/scenario.py +++ b/backend/app/services/interviews/scenario.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Optional import yaml from app.models.interview import ScenarioRating, ScenarioResponse -from app.services.interviews.base import StakeholderInterviewer, PersonaRecord +from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int class ScenarioSubagent: def __init__(self, llm, memory, instrument_path: Path, language: str = "de"): @@ -44,10 +44,12 @@ class ScenarioSubagent: sids = {s["scenario_id"] for s in self.instrument["scenarios"]} ratings = raw.get("ratings", {}) if set(ratings.keys()) != sids: return None - for v in ratings.values(): + for sid, v in ratings.items(): if not isinstance(v, dict): return None for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"): - if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None + iv = coerce_int(v.get(k)) + if iv is None or not 1 <= iv <= 7: return None + v[k] = iv if not isinstance(v.get("if_woke_up_response", ""), str): return None return raw diff --git a/backend/tests/interviews/test_base_interviewer.py b/backend/tests/interviews/test_base_interviewer.py index 822dee45..03295867 100644 --- a/backend/tests/interviews/test_base_interviewer.py +++ b/backend/tests/interviews/test_base_interviewer.py @@ -2,8 +2,36 @@ import json import pytest from app.services.interviews.base import ( StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure, + coerce_int, ) + +def test_coerce_int_accepts_real_int(): + assert coerce_int(3) == 3 + assert coerce_int(-2) == -2 + assert coerce_int(0) == 0 + + +def test_coerce_int_accepts_numeric_strings(): + assert coerce_int("3") == 3 + assert coerce_int(" 4 ") == 4 + assert coerce_int("-2") == -2 + + +def test_coerce_int_rejects_non_numeric(): + assert coerce_int("3.5") is None + assert coerce_int("abc") is None + assert coerce_int(None) is None + assert coerce_int([3]) is None + assert coerce_int(3.5) is None + + +def test_coerce_int_rejects_bool(): + """True/False should NOT silently coerce to 1/0 even though Python says they're ints.""" + assert coerce_int(True) is None + assert coerce_int(False) is None + + class _FakeLLM: def __init__(self, responses): self.responses = list(responses) diff --git a/backend/tests/interviews/test_delphi.py b/backend/tests/interviews/test_delphi.py index c01ecfb8..e55cab7a 100644 --- a/backend/tests/interviews/test_delphi.py +++ b/backend/tests/interviews/test_delphi.py @@ -56,3 +56,29 @@ def test_convergence_metrics(): conv = convergence_metrics(r2, r3) assert "t1" in conv assert conv["t1"]["delta_iqr_importance"] is not None + + +def test_delphi_r2_accepts_string_ratings(): + """Delphi R2/R3 ratings should accept stringified importance/plausibility ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.delphi import DelphiSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": { + "t1": {"importance": "4", "plausibility": "3"}, + "t2": {"importance": "5", "plausibility": "2"}, + }} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "delphi_v1.yaml" + sub = DelphiSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=1, name="A", persona="p") + themes = [{"theme_id": "t1", "label": "T1"}, {"theme_id": "t2", "label": "T2"}] + resp = sub.administer_round2(persona, themes) + assert resp.ratings["t1"]["importance"] == 4 + assert isinstance(resp.ratings["t1"]["importance"], int) diff --git a/backend/tests/interviews/test_diversity.py b/backend/tests/interviews/test_diversity.py index 7650fac2..d8eb45d3 100644 --- a/backend/tests/interviews/test_diversity.py +++ b/backend/tests/interviews/test_diversity.py @@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans(): assert len(result["clusters"]) == 3 assert "pca" in result assert len(result["pca"]["components"]) >= 2 + + +def test_diversity_accepts_string_likert_values(): + """Diversity placements + axes should accept stringified ints.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.diversity import DiversitySubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2 + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "placements": {f"st_{i+1:02d}": str(buckets[i]) for i in range(24)}, + "likert_axes": {a: "4" for a in ( + "ax_pres_extr","ax_loc_eu","ax_sci_trad", + "ax_ind_col","ax_short_long","ax_mkt_reg")}, + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml" + sub = DiversitySubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=7, name="A", persona="p") + resp = sub.administer(persona) + assert isinstance(resp.placements["st_01"], int) + assert isinstance(resp.likert_axes["ax_pres_extr"], int) + assert resp.likert_axes["ax_pres_extr"] == 4 diff --git a/backend/tests/interviews/test_longitudinal.py b/backend/tests/interviews/test_longitudinal.py index 823e1552..006c293a 100644 --- a/backend/tests/interviews/test_longitudinal.py +++ b/backend/tests/interviews/test_longitudinal.py @@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta(): assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0 assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0 assert agg["n_paired"] == 5 + + +def test_longitudinal_accepts_string_likert_values(): + """Real LLMs sometimes return Likert values as JSON strings ('3' not 3). + The validator should coerce them rather than fail the agent.""" + from app.models.interview import InterviewPhase + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.longitudinal import LongitudinalSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return { + "responses": { # all strings, not ints + "stk_1": "4", "stk_2": "3", "stk_3": "5", + "gov_1": "3", "gov_2": "4", "gov_3": "2", + "mkt_1": "5", "mkt_2": "3", "mkt_3": "4", + "clm_1": "2", "clm_2": "4", "clm_3": "5", + }, + "confidence": {}, + "open_comment": "stringified", + } + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml" + sub = LongitudinalSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=99, name="A", persona="p") + resp = sub.administer(persona, phase=InterviewPhase.T0) + assert resp.agent_id == 99 + assert resp.responses["stk_1"] == 4 + assert isinstance(resp.responses["stk_1"], int) diff --git a/backend/tests/interviews/test_scenario.py b/backend/tests/interviews/test_scenario.py index 567290d1..61787211 100644 --- a/backend/tests/interviews/test_scenario.py +++ b/backend/tests/interviews/test_scenario.py @@ -32,3 +32,29 @@ def test_polarity_matrix(): assert "S1" in m assert m["S1"]["mean_desirability"] == 5 assert m["S1"]["n"] == 3 + + +def test_scenario_accepts_string_likert_values(): + """Scenario ratings should accept stringified ints across all 4 dimensions.""" + from app.services.interviews.base import PersonaRecord, MemoryDigest + from app.services.interviews.scenario import ScenarioSubagent + from pathlib import Path as _P + + class _Mem: + def get_digest(self, agent_id, max_chars=2000): + return MemoryDigest(text="x", available=True) + + class _StringLLM: + def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw): + return {"ratings": {sid: { + "desirability": "4", "plausibility": "3", + "impact_on_my_group": "5", "fairness": "3", + "if_woke_up_response": f"act-{sid}", + } for sid in ("S1","S2","S3","S4")}} + + inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml" + sub = ScenarioSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst) + persona = PersonaRecord(agent_id=3, name="A", persona="p") + resp = sub.administer(persona) + assert resp.ratings["S1"].desirability == 4 + assert isinstance(resp.ratings["S1"].desirability, int)