fix(interviews): accept stringified ints in all 4 subagent validators

Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run) sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent validators rejected this with isinstance(v, int), losing ~30% of agents at N=23. Added a shared coerce_int helper in base.py that accepts ints and numeric strings, rejects bools/floats/garbage, and is now used by: - Longitudinal: response values 1-5 - Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7 - Delphi: R2 and R3 importance/plausibility 1-5 - Scenario: 4 dimensions 1-7 Validators now coerce in place so downstream code sees ints regardless of the wire format. Added 8 tests (4 unit on coerce_int + 4 per-subagent contract tests showing stringified values are accepted). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 14:01:42 +02:00 · 2026-05-23 14:01:42 +02:00 · 895a5fbaee
parent 6a53c110b7
commit 895a5fbaee
10 changed files with 202 additions and 15 deletions
--- a/backend/app/services/interviews/base.py
+++ b/backend/app/services/interviews/base.py
@ -22,6 +22,28 @@ class MemoryProvider(Protocol):
    def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...


+def coerce_int(value: Any) -> Optional[int]:
+    """Coerce an LLM-returned Likert value into an int, or None if it is not one.
+
+    Accepts real ints and strings holding one optionally signed integer
+    ("3", " 4 ", "-2", "+3"). Bools are rejected so True/False aren't taken
+    as 1/0; floats, decimal strings ("3.5") and anything else give None.
+    """
+    if isinstance(value, bool) or not isinstance(value, (int, str)):
+        return None
+    if isinstance(value, int):
+        return value
+    s = value.strip()
+    digits = s[1:] if s[:1] in ("+", "-") else s  # peel one optional sign
+    if not digits.isdigit():
+        return None
+    try:  # isdigit() also passes e.g. "²", which int() refuses
+        return int(s)
+    except ValueError:
+        return None
+    # Unreachable: every path above has already returned.
+
+
 class SchemaValidationFailure(ValueError):
    def __init__(self, agent_id: int, attempts: list[dict]):
        super().__init__(f"agent {agent_id}: schema violation after retry")
--- a/backend/app/services/interviews/delphi.py
+++ b/backend/app/services/interviews/delphi.py
@ -7,7 +7,7 @@ import yaml
 from app.models.interview import (
    DelphiOpenResponse, DelphiRatingResponse,
 )
-from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
+from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int


 class DelphiSubagent:
@ -66,8 +66,12 @@ class DelphiSubagent:
            if set(ratings.keys()) != set(theme_ids): return None
            for tid, r in ratings.items():
                if not isinstance(r, dict): return None
+                coerced: dict[str, int] = {}
                for key in ("importance", "plausibility"):
-                    if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
+                    iv = coerce_int(r.get(key))
+                    if iv is None or not 1 <= iv <= 5: return None
+                    coerced[key] = iv
+                ratings[tid] = coerced
            return raw
        return v

@ -110,10 +114,14 @@ class DelphiSubagent:
            if not isinstance(raw, dict): return None
            ratings = raw.get("ratings", {})
            if set(ratings.keys()) != set(theme_ids): return None
-            for r in ratings.values():
+            for tid, r in ratings.items():
                if not isinstance(r, dict): return None
+                coerced: dict[str, int] = {}
                for key in ("importance", "plausibility"):
-                    if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
+                    iv = coerce_int(r.get(key))
+                    if iv is None or not 1 <= iv <= 5: return None
+                    coerced[key] = iv
+                ratings[tid] = coerced
            return raw

        raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
--- a/backend/app/services/interviews/diversity.py
+++ b/backend/app/services/interviews/diversity.py
@ -7,7 +7,7 @@ from sklearn.decomposition import PCA
 from sklearn.cluster import KMeans
 import yaml
 from app.models.interview import QSortResponse
-from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
+from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
 from app.services.interviews.instrument_loader import InstrumentValidationError


@ -64,16 +64,23 @@ class DiversitySubagent:
        dist = self.instrument["distribution"]
        target = {b: n for b, n in zip(range(-3, 4), dist)}
        got: dict[int, int] = {}
-        for v in placements.values():
-            if not isinstance(v, int) or not -3 <= v <= 3:
+        coerced_p: dict[str, int] = {}
+        for k, v in placements.items():
+            iv = coerce_int(v)
+            if iv is None or not -3 <= iv <= 3:
                return None
-            got[v] = got.get(v, 0) + 1
+            coerced_p[k] = iv
+            got[iv] = got.get(iv, 0) + 1
        if got != target:
            return None
+        coerced_a: dict[str, int] = {}
        for a in self.instrument["likert_axes"]:
-            v = axes.get(a["axis_id"])
-            if not isinstance(v, int) or not 1 <= v <= 7:
+            iv = coerce_int(axes.get(a["axis_id"]))
+            if iv is None or not 1 <= iv <= 7:
                return None
+            coerced_a[a["axis_id"]] = iv
+        raw["placements"] = coerced_p
+        raw["likert_axes"] = coerced_a
        return raw

    def administer(self, persona: PersonaRecord) -> QSortResponse:
--- a/backend/app/services/interviews/longitudinal.py
+++ b/backend/app/services/interviews/longitudinal.py
@ -6,7 +6,7 @@ from typing import Optional
 from app.models.interview import (
    LikertInstrument, LikertResponse, InterviewPhase,
 )
-from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
+from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
 from app.services.interviews.instrument_loader import load_likert_instrument


@ -44,9 +44,13 @@ class LongitudinalSubagent:
        required = {it.item_id for it in self.instrument.items}
        if not required.issubset(resp.keys()):
            return None
+        coerced: dict[str, int] = {}
        for k, v in resp.items():
-            if not isinstance(v, int) or not 1 <= v <= 5:
+            iv = coerce_int(v)
+            if iv is None or not 1 <= iv <= 5:
                return None
+            coerced[k] = iv
+        raw["responses"] = coerced
        return raw

    def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse:
--- a/backend/app/services/interviews/scenario.py
+++ b/backend/app/services/interviews/scenario.py
@ -5,7 +5,7 @@ from pathlib import Path
 from typing import Optional
 import yaml
 from app.models.interview import ScenarioRating, ScenarioResponse
-from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
+from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int

 class ScenarioSubagent:
    def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
@ -44,10 +44,12 @@ class ScenarioSubagent:
        sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
        ratings = raw.get("ratings", {})
        if set(ratings.keys()) != sids: return None
-        for v in ratings.values():
+        for sid, v in ratings.items():
            if not isinstance(v, dict): return None
            for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"):
-                if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None
+                iv = coerce_int(v.get(k))
+                if iv is None or not 1 <= iv <= 7: return None
+                v[k] = iv
            if not isinstance(v.get("if_woke_up_response", ""), str): return None
        return raw

--- a/backend/tests/interviews/test_base_interviewer.py
+++ b/backend/tests/interviews/test_base_interviewer.py
@ -2,8 +2,36 @@ import json
 import pytest
 from app.services.interviews.base import (
    StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure,
+    coerce_int,
 )

+
+def test_coerce_int_accepts_real_int():
+    """Genuine ints, including zero and negatives, pass through unchanged."""
+    for n in (3, -2, 0):
+        assert coerce_int(n) == n
+
+
+def test_coerce_int_accepts_numeric_strings():
+    """Digit strings, with optional padding or a minus sign, become ints."""
+    for text, expected in (("3", 3), (" 4 ", 4), ("-2", -2)):
+        assert coerce_int(text) == expected
+
+
+def test_coerce_int_rejects_non_numeric():
+    """Decimal strings, words, None, lists and floats all come back as None."""
+    # One representative per rejected category.
+    rejected = ("3.5", "abc", None, [3], 3.5)
+    for candidate in rejected:
+        assert coerce_int(candidate) is None
+
+
+def test_coerce_int_rejects_bool():
+    """Bools are ints to Python, but must not be read as the ratings 1/0."""
+    for flag in (True, False):
+        assert coerce_int(flag) is None
+
+
 class _FakeLLM:
    def __init__(self, responses):
        self.responses = list(responses)
--- a/backend/tests/interviews/test_delphi.py
+++ b/backend/tests/interviews/test_delphi.py
@ -56,3 +56,29 @@ def test_convergence_metrics():
    conv = convergence_metrics(r2, r3)
    assert "t1" in conv
    assert conv["t1"]["delta_iqr_importance"] is not None
+
+
+def test_delphi_r2_accepts_string_ratings():
+    """Round-2 Delphi ratings sent as digit strings come back as real ints."""
+    from pathlib import Path as _P
+    from app.services.interviews.base import PersonaRecord, MemoryDigest
+    from app.services.interviews.delphi import DelphiSubagent
+
+    class _Mem:
+        def get_digest(self, agent_id, max_chars=2000):
+            return MemoryDigest(text="x", available=True)
+
+    class _StringLLM:
+        # Builds a fresh reply per call; every rating is a string, not an int.
+        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
+            t1 = {"importance": "4", "plausibility": "3"}
+            t2 = {"importance": "5", "plausibility": "2"}
+            return {"ratings": {"t1": t1, "t2": t2}}
+
+    themes = [{"theme_id": "t1", "label": "T1"}, {"theme_id": "t2", "label": "T2"}]
+    instruments = _P(__file__).resolve().parents[2] / "scripts" / "instruments"
+    agent = DelphiSubagent(llm=_StringLLM(), memory=_Mem(),
+                           instrument_path=instruments / "delphi_v1.yaml")
+    persona = PersonaRecord(agent_id=1, name="A", persona="p")
+    importance = agent.administer_round2(persona, themes).ratings["t1"]["importance"]
+    assert importance == 4 and isinstance(importance, int)
--- a/backend/tests/interviews/test_diversity.py
+++ b/backend/tests/interviews/test_diversity.py
@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans():
    assert len(result["clusters"]) == 3
    assert "pca" in result
    assert len(result["pca"]["components"]) >= 2
+
+
+def test_diversity_accepts_string_likert_values():
+    """Stringified Q-sort placements and Likert axes are returned as ints."""
+    from pathlib import Path as _P
+    from app.services.interviews.base import PersonaRecord, MemoryDigest
+    from app.services.interviews.diversity import DiversitySubagent
+
+    class _Mem:
+        def get_digest(self, agent_id, max_chars=2000):
+            return MemoryDigest(text="x", available=True)
+
+    # 24 placements over the buckets -3..+3 with counts 2-3-4-6-4-3-2.
+    counts = (2, 3, 4, 6, 4, 3, 2)
+    buckets = [b for b, n in zip(range(-3, 4), counts) for _ in range(n)]
+    axis_ids = ("ax_pres_extr", "ax_loc_eu", "ax_sci_trad",
+                "ax_ind_col", "ax_short_long", "ax_mkt_reg")
+
+    class _StringLLM:
+        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
+            placements = {f"st_{i+1:02d}": str(b) for i, b in enumerate(buckets)}
+            return {"placements": placements,
+                    "likert_axes": dict.fromkeys(axis_ids, "4")}
+
+    inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "diversity_v1.yaml"
+    agent = DiversitySubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst)
+    resp = agent.administer(PersonaRecord(agent_id=7, name="A", persona="p"))
+    assert isinstance(resp.placements["st_01"], int)
+    axis_value = resp.likert_axes["ax_pres_extr"]
+    assert isinstance(axis_value, int) and axis_value == 4
--- a/backend/tests/interviews/test_longitudinal.py
+++ b/backend/tests/interviews/test_longitudinal.py
@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta():
    assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0
    assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0
    assert agg["n_paired"] == 5
+
+
+def test_longitudinal_accepts_string_likert_values():
+    """Likert answers delivered as JSON strings ('3' rather than 3) must be
+    coerced to ints by the validator instead of failing the agent."""
+    from pathlib import Path as _P
+    from app.models.interview import InterviewPhase
+    from app.services.interviews.base import PersonaRecord, MemoryDigest
+    from app.services.interviews.longitudinal import LongitudinalSubagent
+
+    class _Mem:
+        def get_digest(self, agent_id, max_chars=2000):
+            return MemoryDigest(text="x", available=True)
+
+    # Every answer below is a string, not an int.
+    answers = {
+        "stk_1": "4", "stk_2": "3", "stk_3": "5",
+        "gov_1": "3", "gov_2": "4", "gov_3": "2",
+        "mkt_1": "5", "mkt_2": "3", "mkt_3": "4",
+        "clm_1": "2", "clm_2": "4", "clm_3": "5",
+    }
+
+    class _StringLLM:
+        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
+            # Copy so each call hands out a fresh, unshared payload.
+            return {"responses": dict(answers), "confidence": {},
+                    "open_comment": "stringified"}
+
+    inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "longitudinal_v1.yaml"
+    agent = LongitudinalSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst)
+    resp = agent.administer(PersonaRecord(agent_id=99, name="A", persona="p"),
+                            phase=InterviewPhase.T0)
+    assert resp.agent_id == 99
+    assert resp.responses["stk_1"] == 4 and isinstance(resp.responses["stk_1"], int)
--- a/backend/tests/interviews/test_scenario.py
+++ b/backend/tests/interviews/test_scenario.py
@ -32,3 +32,29 @@ def test_polarity_matrix():
    assert "S1" in m
    assert m["S1"]["mean_desirability"] == 5
    assert m["S1"]["n"] == 3
+
+
+def test_scenario_accepts_string_likert_values():
+    """All four scenario rating dimensions may arrive as digit strings."""
+    from pathlib import Path as _P
+    from app.services.interviews.base import PersonaRecord, MemoryDigest
+    from app.services.interviews.scenario import ScenarioSubagent
+
+    class _Mem:
+        def get_digest(self, agent_id, max_chars=2000):
+            return MemoryDigest(text="x", available=True)
+
+    class _StringLLM:
+        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
+            ratings = {}
+            for sid in ("S1","S2","S3","S4"):
+                ratings[sid] = {"desirability": "4", "plausibility": "3",
+                                "impact_on_my_group": "5", "fairness": "3",
+                                "if_woke_up_response": f"act-{sid}"}
+            return {"ratings": ratings}
+
+    inst = _P(__file__).resolve().parents[2] / "scripts" / "instruments" / "scenario_v1.yaml"
+    agent = ScenarioSubagent(llm=_StringLLM(), memory=_Mem(), instrument_path=inst)
+    resp = agent.administer(PersonaRecord(agent_id=3, name="A", persona="p"))
+    desirability = resp.ratings["S1"].desirability
+    assert desirability == 4 and isinstance(desirability, int)