fix(interviews): accept stringified ints in all 4 subagent validators

Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run)
sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent
validators rejected this with isinstance(v, int), losing ~30% of agents at
N=23. Added a shared coerce_int helper in base.py that accepts ints and
numeric strings, rejects bools/floats/garbage, and is now used by:

- Longitudinal: response values 1-5
- Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7
- Delphi: R2 and R3 importance/plausibility 1-5
- Scenario: 4 dimensions 1-7

Validators now coerce in place so downstream code sees ints regardless of
the wire format. Added 8 tests (4 unit tests on coerce_int + 4 per-subagent
contract tests showing stringified values are accepted).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Christian Moellmann 2026-05-23 14:01:42 +02:00
parent 6a53c110b7
commit 895a5fbaee
10 changed files with 202 additions and 15 deletions

View File

@ -22,6 +22,28 @@ class MemoryProvider(Protocol):
def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...
def coerce_int(value: Any) -> Optional[int]:
    """Coerce an LLM-returned Likert value into an int.

    Real LLMs frequently return numeric Likert responses as JSON strings
    (e.g. "3" instead of 3), and answers on a signed scale may carry an
    explicit plus sign (e.g. "+2").

    Args:
        value: Raw value taken from the parsed LLM response.

    Returns:
        The int if value is an int, or a string made of one optional leading
        "+" or "-" followed only by digits (surrounding whitespace is
        ignored); otherwise None. Bools are rejected so True/False aren't
        accepted as 1/0. Floats and decimal strings such as "3.5" are
        rejected rather than truncated.
    """
    if isinstance(value, bool):
        # bool is a subclass of int, so it has to be ruled out first.
        return None
    if isinstance(value, int):
        return value
    if not isinstance(value, str):
        return None
    text = value.strip()
    # Validate the digits separately from the sign so "+3" is accepted the
    # same way "-3" is, while "--3" or "+-3" are still refused.
    digits = text[1:] if text[:1] in ("+", "-") else text
    if not digits.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # str.isdigit() is broader than what int() parses (e.g. "²").
        return None
class SchemaValidationFailure(ValueError):
def __init__(self, agent_id: int, attempts: list[dict]):
super().__init__(f"agent {agent_id}: schema violation after retry")

View File

@ -7,7 +7,7 @@ import yaml
from app.models.interview import (
DelphiOpenResponse, DelphiRatingResponse,
)
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
class DelphiSubagent:
@ -66,8 +66,12 @@ class DelphiSubagent:
if set(ratings.keys()) != set(theme_ids): return None
for tid, r in ratings.items():
if not isinstance(r, dict): return None
coerced: dict[str, int] = {}
for key in ("importance", "plausibility"):
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
iv = coerce_int(r.get(key))
if iv is None or not 1 <= iv <= 5: return None
coerced[key] = iv
ratings[tid] = coerced
return raw
return v
@ -110,10 +114,14 @@ class DelphiSubagent:
if not isinstance(raw, dict): return None
ratings = raw.get("ratings", {})
if set(ratings.keys()) != set(theme_ids): return None
for r in ratings.values():
for tid, r in ratings.items():
if not isinstance(r, dict): return None
coerced: dict[str, int] = {}
for key in ("importance", "plausibility"):
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
iv = coerce_int(r.get(key))
if iv is None or not 1 <= iv <= 5: return None
coerced[key] = iv
ratings[tid] = coerced
return raw
raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,

View File

@ -7,7 +7,7 @@ from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import yaml
from app.models.interview import QSortResponse
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
from app.services.interviews.instrument_loader import InstrumentValidationError
@ -64,16 +64,23 @@ class DiversitySubagent:
dist = self.instrument["distribution"]
target = {b: n for b, n in zip(range(-3, 4), dist)}
got: dict[int, int] = {}
for v in placements.values():
if not isinstance(v, int) or not -3 <= v <= 3:
coerced_p: dict[str, int] = {}
for k, v in placements.items():
iv = coerce_int(v)
if iv is None or not -3 <= iv <= 3:
return None
got[v] = got.get(v, 0) + 1
coerced_p[k] = iv
got[iv] = got.get(iv, 0) + 1
if got != target:
return None
coerced_a: dict[str, int] = {}
for a in self.instrument["likert_axes"]:
v = axes.get(a["axis_id"])
if not isinstance(v, int) or not 1 <= v <= 7:
iv = coerce_int(axes.get(a["axis_id"]))
if iv is None or not 1 <= iv <= 7:
return None
coerced_a[a["axis_id"]] = iv
raw["placements"] = coerced_p
raw["likert_axes"] = coerced_a
return raw
def administer(self, persona: PersonaRecord) -> QSortResponse:

View File

@ -6,7 +6,7 @@ from typing import Optional
from app.models.interview import (
LikertInstrument, LikertResponse, InterviewPhase,
)
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
from app.services.interviews.instrument_loader import load_likert_instrument
@ -44,9 +44,13 @@ class LongitudinalSubagent:
required = {it.item_id for it in self.instrument.items}
if not required.issubset(resp.keys()):
return None
coerced: dict[str, int] = {}
for k, v in resp.items():
if not isinstance(v, int) or not 1 <= v <= 5:
iv = coerce_int(v)
if iv is None or not 1 <= iv <= 5:
return None
coerced[k] = iv
raw["responses"] = coerced
return raw
def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse:

View File

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Optional
import yaml
from app.models.interview import ScenarioRating, ScenarioResponse
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
class ScenarioSubagent:
def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
@ -44,10 +44,12 @@ class ScenarioSubagent:
sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
ratings = raw.get("ratings", {})
if set(ratings.keys()) != sids: return None
for v in ratings.values():
for sid, v in ratings.items():
if not isinstance(v, dict): return None
for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"):
if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None
iv = coerce_int(v.get(k))
if iv is None or not 1 <= iv <= 7: return None
v[k] = iv
if not isinstance(v.get("if_woke_up_response", ""), str): return None
return raw

View File

@ -2,8 +2,36 @@ import json
import pytest
from app.services.interviews.base import (
StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure,
coerce_int,
)
def test_coerce_int_accepts_real_int():
    """Genuine ints (positive, negative and zero) come back unchanged."""
    for number in (3, -2, 0):
        assert coerce_int(number) == number
def test_coerce_int_accepts_numeric_strings():
    """Digit strings, optionally padded or negative, coerce to their int value."""
    expected_by_text = {"3": 3, " 4 ": 4, "-2": -2}
    for text, expected in expected_by_text.items():
        assert coerce_int(text) == expected
def test_coerce_int_rejects_non_numeric():
    """Decimal strings, words, None, lists and floats all yield None."""
    for junk in ("3.5", "abc", None, [3], 3.5):
        assert coerce_int(junk) is None
def test_coerce_int_rejects_bool():
    """Bools must not be coerced to 1/0 even though bool is an int subclass."""
    for flag in (True, False):
        assert coerce_int(flag) is None
class _FakeLLM:
def __init__(self, responses):
self.responses = list(responses)

View File

@ -56,3 +56,29 @@ def test_convergence_metrics():
conv = convergence_metrics(r2, r3)
assert "t1" in conv
assert conv["t1"]["delta_iqr_importance"] is not None
def test_delphi_r2_accepts_string_ratings():
    """administer_round2 takes importance/plausibility sent as digit strings
    and exposes them as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.delphi import DelphiSubagent
    from pathlib import Path as _P

    class _Mem:
        # Memory stub: always hands back the same small digest.
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        # LLM stub whose ratings are all JSON strings rather than ints.
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            ratings = {
                "t1": {"importance": "4", "plausibility": "3"},
                "t2": {"importance": "5", "plausibility": "2"},
            }
            return {"ratings": ratings}

    base_dir = _P(__file__).resolve().parents[2]
    instrument_file = base_dir.joinpath("scripts", "instruments", "delphi_v1.yaml")
    subagent = DelphiSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file,
    )
    respondent = PersonaRecord(agent_id=1, name="A", persona="p")
    theme_list = [
        {"theme_id": "t1", "label": "T1"},
        {"theme_id": "t2", "label": "T2"},
    ]

    outcome = subagent.administer_round2(respondent, theme_list)

    assert outcome.ratings["t1"]["importance"] == 4
    assert isinstance(outcome.ratings["t1"]["importance"], int)

View File

@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans():
assert len(result["clusters"]) == 3
assert "pca" in result
assert len(result["pca"]["components"]) >= 2
def test_diversity_accepts_string_likert_values():
    """Q-sort placements and Likert axes sent as digit strings come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.diversity import DiversitySubagent
    from pathlib import Path as _P

    class _Mem:
        # Memory stub: always hands back the same small digest.
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    # 24 bucket values: 2, 3, 4, 6, 4, 3, 2 statements for buckets -3..+3.
    per_bucket = (2, 3, 4, 6, 4, 3, 2)
    buckets = []
    for score, count in zip(range(-3, 4), per_bucket):
        buckets.extend([score] * count)

    axis_ids = (
        "ax_pres_extr", "ax_loc_eu", "ax_sci_trad",
        "ax_ind_col", "ax_short_long", "ax_mkt_reg",
    )

    class _StringLLM:
        # LLM stub that stringifies every placement and axis value.
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            placements = {}
            for i in range(24):
                placements[f"st_{i+1:02d}"] = str(buckets[i])
            return {
                "placements": placements,
                "likert_axes": dict.fromkeys(axis_ids, "4"),
            }

    base_dir = _P(__file__).resolve().parents[2]
    instrument_file = base_dir.joinpath("scripts", "instruments", "diversity_v1.yaml")
    subagent = DiversitySubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file,
    )
    respondent = PersonaRecord(agent_id=7, name="A", persona="p")

    outcome = subagent.administer(respondent)

    assert isinstance(outcome.placements["st_01"], int)
    assert isinstance(outcome.likert_axes["ax_pres_extr"], int)
    assert outcome.likert_axes["ax_pres_extr"] == 4

View File

@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta():
assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0
assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0
assert agg["n_paired"] == 5
def test_longitudinal_accepts_string_likert_values():
    """Likert answers delivered as JSON strings ('3' not 3) are coerced.

    Real LLMs sometimes stringify Likert values; the validator should turn
    them into ints rather than fail the agent.
    """
    from app.models.interview import InterviewPhase
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.longitudinal import LongitudinalSubagent
    from pathlib import Path as _P

    class _Mem:
        # Memory stub: always hands back the same small digest.
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        # LLM stub whose responses are all strings, not ints.
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            answers = {
                "stk_1": "4", "stk_2": "3", "stk_3": "5",
                "gov_1": "3", "gov_2": "4", "gov_3": "2",
                "mkt_1": "5", "mkt_2": "3", "mkt_3": "4",
                "clm_1": "2", "clm_2": "4", "clm_3": "5",
            }
            return {
                "responses": answers,
                "confidence": {},
                "open_comment": "stringified",
            }

    base_dir = _P(__file__).resolve().parents[2]
    instrument_file = base_dir.joinpath("scripts", "instruments", "longitudinal_v1.yaml")
    subagent = LongitudinalSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file,
    )
    respondent = PersonaRecord(agent_id=99, name="A", persona="p")

    outcome = subagent.administer(respondent, phase=InterviewPhase.T0)

    assert outcome.agent_id == 99
    assert outcome.responses["stk_1"] == 4
    assert isinstance(outcome.responses["stk_1"], int)

View File

@ -32,3 +32,29 @@ def test_polarity_matrix():
assert "S1" in m
assert m["S1"]["mean_desirability"] == 5
assert m["S1"]["n"] == 3
def test_scenario_accepts_string_likert_values():
    """Scenario ratings sent as digit strings are accepted on all 4 dimensions."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.scenario import ScenarioSubagent
    from pathlib import Path as _P

    class _Mem:
        # Memory stub: always hands back the same small digest.
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        # LLM stub that rates every scenario with stringified numbers.
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            ratings = {}
            for sid in ("S1", "S2", "S3", "S4"):
                ratings[sid] = {
                    "desirability": "4",
                    "plausibility": "3",
                    "impact_on_my_group": "5",
                    "fairness": "3",
                    "if_woke_up_response": f"act-{sid}",
                }
            return {"ratings": ratings}

    base_dir = _P(__file__).resolve().parents[2]
    instrument_file = base_dir.joinpath("scripts", "instruments", "scenario_v1.yaml")
    subagent = ScenarioSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file,
    )
    respondent = PersonaRecord(agent_id=3, name="A", persona="p")

    outcome = subagent.administer(respondent)

    assert outcome.ratings["S1"].desirability == 4
    assert isinstance(outcome.ratings["S1"].desirability, int)