fix(interviews): accept stringified ints in all 4 subagent validators
Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run)
sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent
validators rejected this with isinstance(v, int), losing ~30% of agents at
N=23. Added a shared coerce_int helper in base.py that accepts ints and
numeric strings, rejects bools/floats/garbage, and is now used by:
- Longitudinal: response values 1-5
- Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7
- Delphi: R2 and R3 importance/plausibility 1-5
- Scenario: 4 dimensions 1-7
Validators now coerce in place so downstream code sees ints regardless of
the wire format. Added 8 tests (4 unit on coerce_int + 4 per-subagent
contract tests showing stringified values are accepted).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6a53c110b7
commit
895a5fbaee
|
|
@ -22,6 +22,28 @@ class MemoryProvider(Protocol):
|
||||||
def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...
|
def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...
|
||||||
|
|
||||||
|
|
||||||
|
def coerce_int(value: Any) -> Optional[int]:
    """Coerce an LLM-returned Likert value into an int.

    Real LLMs frequently return numeric Likert responses as JSON strings
    (e.g. "3" instead of 3). On signed scales they may also spell out the
    sign of a positive value ("+2"), which is accepted as well.

    Args:
        value: Raw value taken from a parsed LLM response.

    Returns:
        The int if ``value`` is an int, or a string consisting of an
        optional single leading sign followed by digits (surrounding
        whitespace is ignored); otherwise ``None``. Bools are rejected so
        True/False aren't accepted as 1/0, and floats or decimal strings
        are rejected rather than truncated.
    """
    # bool is a subclass of int, so it has to be ruled out before the int check.
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if not isinstance(value, str):
        return None

    text = value.strip()
    # Permit exactly one leading sign; "--3" or "+-3" must not slip through.
    digits = text[1:] if text[:1] in ("+", "-") else text
    if not digits.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # str.isdigit() is true for some characters int() cannot parse
        # (e.g. superscript digits), so the conversion can still fail.
        return None
|
||||||
|
|
||||||
|
|
||||||
class SchemaValidationFailure(ValueError):
|
class SchemaValidationFailure(ValueError):
|
||||||
def __init__(self, agent_id: int, attempts: list[dict]):
|
def __init__(self, agent_id: int, attempts: list[dict]):
|
||||||
super().__init__(f"agent {agent_id}: schema violation after retry")
|
super().__init__(f"agent {agent_id}: schema violation after retry")
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import yaml
|
||||||
from app.models.interview import (
|
from app.models.interview import (
|
||||||
DelphiOpenResponse, DelphiRatingResponse,
|
DelphiOpenResponse, DelphiRatingResponse,
|
||||||
)
|
)
|
||||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||||
|
|
||||||
|
|
||||||
class DelphiSubagent:
|
class DelphiSubagent:
|
||||||
|
|
@ -66,8 +66,12 @@ class DelphiSubagent:
|
||||||
if set(ratings.keys()) != set(theme_ids): return None
|
if set(ratings.keys()) != set(theme_ids): return None
|
||||||
for tid, r in ratings.items():
|
for tid, r in ratings.items():
|
||||||
if not isinstance(r, dict): return None
|
if not isinstance(r, dict): return None
|
||||||
|
coerced: dict[str, int] = {}
|
||||||
for key in ("importance", "plausibility"):
|
for key in ("importance", "plausibility"):
|
||||||
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
|
iv = coerce_int(r.get(key))
|
||||||
|
if iv is None or not 1 <= iv <= 5: return None
|
||||||
|
coerced[key] = iv
|
||||||
|
ratings[tid] = coerced
|
||||||
return raw
|
return raw
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
|
@ -110,10 +114,14 @@ class DelphiSubagent:
|
||||||
if not isinstance(raw, dict): return None
|
if not isinstance(raw, dict): return None
|
||||||
ratings = raw.get("ratings", {})
|
ratings = raw.get("ratings", {})
|
||||||
if set(ratings.keys()) != set(theme_ids): return None
|
if set(ratings.keys()) != set(theme_ids): return None
|
||||||
for r in ratings.values():
|
for tid, r in ratings.items():
|
||||||
if not isinstance(r, dict): return None
|
if not isinstance(r, dict): return None
|
||||||
|
coerced: dict[str, int] = {}
|
||||||
for key in ("importance", "plausibility"):
|
for key in ("importance", "plausibility"):
|
||||||
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
|
iv = coerce_int(r.get(key))
|
||||||
|
if iv is None or not 1 <= iv <= 5: return None
|
||||||
|
coerced[key] = iv
|
||||||
|
ratings[tid] = coerced
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
|
raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from sklearn.decomposition import PCA
|
||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
import yaml
|
import yaml
|
||||||
from app.models.interview import QSortResponse
|
from app.models.interview import QSortResponse
|
||||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||||
from app.services.interviews.instrument_loader import InstrumentValidationError
|
from app.services.interviews.instrument_loader import InstrumentValidationError
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -64,16 +64,23 @@ class DiversitySubagent:
|
||||||
dist = self.instrument["distribution"]
|
dist = self.instrument["distribution"]
|
||||||
target = {b: n for b, n in zip(range(-3, 4), dist)}
|
target = {b: n for b, n in zip(range(-3, 4), dist)}
|
||||||
got: dict[int, int] = {}
|
got: dict[int, int] = {}
|
||||||
for v in placements.values():
|
coerced_p: dict[str, int] = {}
|
||||||
if not isinstance(v, int) or not -3 <= v <= 3:
|
for k, v in placements.items():
|
||||||
|
iv = coerce_int(v)
|
||||||
|
if iv is None or not -3 <= iv <= 3:
|
||||||
return None
|
return None
|
||||||
got[v] = got.get(v, 0) + 1
|
coerced_p[k] = iv
|
||||||
|
got[iv] = got.get(iv, 0) + 1
|
||||||
if got != target:
|
if got != target:
|
||||||
return None
|
return None
|
||||||
|
coerced_a: dict[str, int] = {}
|
||||||
for a in self.instrument["likert_axes"]:
|
for a in self.instrument["likert_axes"]:
|
||||||
v = axes.get(a["axis_id"])
|
iv = coerce_int(axes.get(a["axis_id"]))
|
||||||
if not isinstance(v, int) or not 1 <= v <= 7:
|
if iv is None or not 1 <= iv <= 7:
|
||||||
return None
|
return None
|
||||||
|
coerced_a[a["axis_id"]] = iv
|
||||||
|
raw["placements"] = coerced_p
|
||||||
|
raw["likert_axes"] = coerced_a
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
def administer(self, persona: PersonaRecord) -> QSortResponse:
|
def administer(self, persona: PersonaRecord) -> QSortResponse:
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
||||||
from app.models.interview import (
|
from app.models.interview import (
|
||||||
LikertInstrument, LikertResponse, InterviewPhase,
|
LikertInstrument, LikertResponse, InterviewPhase,
|
||||||
)
|
)
|
||||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||||
from app.services.interviews.instrument_loader import load_likert_instrument
|
from app.services.interviews.instrument_loader import load_likert_instrument
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -44,9 +44,13 @@ class LongitudinalSubagent:
|
||||||
required = {it.item_id for it in self.instrument.items}
|
required = {it.item_id for it in self.instrument.items}
|
||||||
if not required.issubset(resp.keys()):
|
if not required.issubset(resp.keys()):
|
||||||
return None
|
return None
|
||||||
|
coerced: dict[str, int] = {}
|
||||||
for k, v in resp.items():
|
for k, v in resp.items():
|
||||||
if not isinstance(v, int) or not 1 <= v <= 5:
|
iv = coerce_int(v)
|
||||||
|
if iv is None or not 1 <= iv <= 5:
|
||||||
return None
|
return None
|
||||||
|
coerced[k] = iv
|
||||||
|
raw["responses"] = coerced
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse:
|
def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse:
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import yaml
|
import yaml
|
||||||
from app.models.interview import ScenarioRating, ScenarioResponse
|
from app.models.interview import ScenarioRating, ScenarioResponse
|
||||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||||
|
|
||||||
class ScenarioSubagent:
|
class ScenarioSubagent:
|
||||||
def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
|
def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
|
||||||
|
|
@ -44,10 +44,12 @@ class ScenarioSubagent:
|
||||||
sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
|
sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
|
||||||
ratings = raw.get("ratings", {})
|
ratings = raw.get("ratings", {})
|
||||||
if set(ratings.keys()) != sids: return None
|
if set(ratings.keys()) != sids: return None
|
||||||
for v in ratings.values():
|
for sid, v in ratings.items():
|
||||||
if not isinstance(v, dict): return None
|
if not isinstance(v, dict): return None
|
||||||
for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"):
|
for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"):
|
||||||
if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None
|
iv = coerce_int(v.get(k))
|
||||||
|
if iv is None or not 1 <= iv <= 7: return None
|
||||||
|
v[k] = iv
|
||||||
if not isinstance(v.get("if_woke_up_response", ""), str): return None
|
if not isinstance(v.get("if_woke_up_response", ""), str): return None
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,36 @@ import json
|
||||||
import pytest
|
import pytest
|
||||||
from app.services.interviews.base import (
|
from app.services.interviews.base import (
|
||||||
StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure,
|
StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure,
|
||||||
|
coerce_int,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_coerce_int_accepts_real_int():
    """Positive, negative and zero ints are returned unchanged."""
    for number in (3, -2, 0):
        assert coerce_int(number) == number
|
||||||
|
|
||||||
|
|
||||||
|
def test_coerce_int_accepts_numeric_strings():
    """Digit strings, including padded and negative ones, become ints."""
    expected_by_text = {"3": 3, " 4 ": 4, "-2": -2}
    for text, expected in expected_by_text.items():
        assert coerce_int(text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_coerce_int_rejects_non_numeric():
    """Decimal strings, words, None, lists and floats all yield None."""
    rejected = ("3.5", "abc", None, [3], 3.5)
    for candidate in rejected:
        assert coerce_int(candidate) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_coerce_int_rejects_bool():
    """Bools must not be coerced to 1/0, even though bool subclasses int."""
    for flag in (True, False):
        assert coerce_int(flag) is None
|
||||||
|
|
||||||
|
|
||||||
class _FakeLLM:
|
class _FakeLLM:
|
||||||
def __init__(self, responses):
|
def __init__(self, responses):
|
||||||
self.responses = list(responses)
|
self.responses = list(responses)
|
||||||
|
|
|
||||||
|
|
@ -56,3 +56,29 @@ def test_convergence_metrics():
|
||||||
conv = convergence_metrics(r2, r3)
|
conv = convergence_metrics(r2, r3)
|
||||||
assert "t1" in conv
|
assert "t1" in conv
|
||||||
assert conv["t1"]["delta_iqr_importance"] is not None
|
assert conv["t1"]["delta_iqr_importance"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_delphi_r2_accepts_string_ratings():
    """Round-2 Delphi ratings supplied as digit strings come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.delphi import DelphiSubagent
    from pathlib import Path as _P

    class _Mem:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            # Every rating is a JSON string rather than a number.
            stringified = {
                "t1": {"importance": "4", "plausibility": "3"},
                "t2": {"importance": "5", "plausibility": "2"},
            }
            return {"ratings": stringified}

    repo_root = _P(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "delphi_v1.yaml"
    subagent = DelphiSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=1, name="A", persona="p")
    theme_list = [
        {"theme_id": "t1", "label": "T1"},
        {"theme_id": "t2", "label": "T2"},
    ]

    result = subagent.administer_round2(respondent, theme_list)

    importance = result.ratings["t1"]["importance"]
    assert importance == 4
    assert isinstance(importance, int)
|
||||||
|
|
|
||||||
|
|
@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans():
|
||||||
assert len(result["clusters"]) == 3
|
assert len(result["clusters"]) == 3
|
||||||
assert "pca" in result
|
assert "pca" in result
|
||||||
assert len(result["pca"]["components"]) >= 2
|
assert len(result["pca"]["components"]) >= 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_diversity_accepts_string_likert_values():
    """Q-sort placements and Likert axes sent as digit strings come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.diversity import DiversitySubagent
    from pathlib import Path as _P

    class _Mem:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    # 24 bucket values in total: 2 + 3 + 4 + 6 + 4 + 3 + 2.
    buckets = [-3]*2 + [-2]*3 + [-1]*4 + [0]*6 + [1]*4 + [2]*3 + [3]*2

    class _StringLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            axis_ids = (
                "ax_pres_extr", "ax_loc_eu", "ax_sci_trad",
                "ax_ind_col", "ax_short_long", "ax_mkt_reg",
            )
            placements = {}
            for position in range(24):
                placements[f"st_{position + 1:02d}"] = str(buckets[position])
            return {
                "placements": placements,
                "likert_axes": dict.fromkeys(axis_ids, "4"),
            }

    repo_root = _P(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "diversity_v1.yaml"
    subagent = DiversitySubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=7, name="A", persona="p")

    result = subagent.administer(respondent)

    assert isinstance(result.placements["st_01"], int)
    axis_value = result.likert_axes["ax_pres_extr"]
    assert isinstance(axis_value, int)
    assert axis_value == 4
|
||||||
|
|
|
||||||
|
|
@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta():
|
||||||
assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0
|
assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0
|
||||||
assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0
|
assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0
|
||||||
assert agg["n_paired"] == 5
|
assert agg["n_paired"] == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_longitudinal_accepts_string_likert_values():
    """Likert answers arriving as JSON strings ('3' rather than 3) are coerced.

    The validator is expected to convert them instead of failing the agent.
    """
    from app.models.interview import InterviewPhase
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.longitudinal import LongitudinalSubagent
    from pathlib import Path as _P

    class _Mem:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            # Every answer is a string, never an int.
            answers = {
                "stk_1": "4", "stk_2": "3", "stk_3": "5",
                "gov_1": "3", "gov_2": "4", "gov_3": "2",
                "mkt_1": "5", "mkt_2": "3", "mkt_3": "4",
                "clm_1": "2", "clm_2": "4", "clm_3": "5",
            }
            return {
                "responses": answers,
                "confidence": {},
                "open_comment": "stringified",
            }

    repo_root = _P(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "longitudinal_v1.yaml"
    subagent = LongitudinalSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=99, name="A", persona="p")

    result = subagent.administer(respondent, phase=InterviewPhase.T0)

    assert result.agent_id == 99
    first_answer = result.responses["stk_1"]
    assert first_answer == 4
    assert isinstance(first_answer, int)
|
||||||
|
|
|
||||||
|
|
@ -32,3 +32,29 @@ def test_polarity_matrix():
|
||||||
assert "S1" in m
|
assert "S1" in m
|
||||||
assert m["S1"]["mean_desirability"] == 5
|
assert m["S1"]["mean_desirability"] == 5
|
||||||
assert m["S1"]["n"] == 3
|
assert m["S1"]["n"] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_scenario_accepts_string_likert_values():
    """Scenario ratings given as digit strings on all 4 dimensions come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.scenario import ScenarioSubagent
    from pathlib import Path as _P

    class _Mem:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            per_scenario = {}
            for sid in ("S1", "S2", "S3", "S4"):
                per_scenario[sid] = {
                    "desirability": "4",
                    "plausibility": "3",
                    "impact_on_my_group": "5",
                    "fairness": "3",
                    "if_woke_up_response": f"act-{sid}",
                }
            return {"ratings": per_scenario}

    repo_root = _P(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "scenario_v1.yaml"
    subagent = ScenarioSubagent(
        llm=_StringLLM(), memory=_Mem(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=3, name="A", persona="p")

    result = subagent.administer(respondent)

    desirability = result.ratings["S1"].desirability
    assert desirability == 4
    assert isinstance(desirability, int)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue