fix(interviews): accept stringified ints in all 4 subagent validators
Real LLMs (observed with anthropic/claude-haiku-4-5 on a 23-agent run)
sometimes return Likert values as JSON strings ('3' not 3). The 4 subagent
validators rejected this with isinstance(v, int), losing ~30% of agents at
N=23. Added a shared coerce_int helper in base.py that accepts ints and
numeric strings, rejects bools/floats/garbage, and is now used by:
- Longitudinal: response values 1-5
- Diversity: Q-sort placements -3..+3 and 6 Likert axes 1-7
- Delphi: R2 and R3 importance/plausibility 1-5
- Scenario: 4 dimensions 1-7
Validators now coerce in place so downstream code sees ints regardless of
the wire format. Added 8 tests (4 unit on coerce_int + 4 per-subagent
contract tests showing stringified values are accepted).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6a53c110b7
commit
895a5fbaee
|
|
@ -22,6 +22,28 @@ class MemoryProvider(Protocol):
|
|||
def get_digest(self, agent_id: int, max_chars: int = 2000) -> MemoryDigest: ...
|
||||
|
||||
|
||||
def coerce_int(value: Any) -> Optional[int]:
    """Coerce an LLM-returned Likert value into an int.

    Real LLMs frequently return numeric Likert responses as JSON strings
    (e.g. "3" instead of 3), and on signed scales such as -3..+3 they may
    spell out the sign (e.g. "+3").

    Args:
        value: The raw value taken from the parsed LLM response.

    Returns:
        The integer when ``value`` is an ``int`` (but not a ``bool``) or a
        string consisting of an optional single leading ``+``/``-`` followed
        by digits, ignoring surrounding whitespace. ``None`` for everything
        else: bools, floats, ``None``, containers, empty strings, decimal
        strings such as "3.5", and malformed signs such as "--3".
    """
    # bool is a subclass of int; reject it first so True/False are never
    # accepted as 1/0.
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if not isinstance(value, str):
        return None

    s = value.strip()
    # Allow exactly one leading sign; everything after it must be digits.
    digits = s[1:] if s[:1] in ("+", "-") else s
    if not digits.isdigit():
        return None
    try:
        return int(s)
    except ValueError:
        # str.isdigit() is true for some characters int() cannot parse
        # (e.g. superscript digits), so the conversion can still fail.
        return None
|
||||
|
||||
|
||||
class SchemaValidationFailure(ValueError):
|
||||
def __init__(self, agent_id: int, attempts: list[dict]):
|
||||
super().__init__(f"agent {agent_id}: schema violation after retry")
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import yaml
|
|||
from app.models.interview import (
|
||||
DelphiOpenResponse, DelphiRatingResponse,
|
||||
)
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||
|
||||
|
||||
class DelphiSubagent:
|
||||
|
|
@ -66,8 +66,12 @@ class DelphiSubagent:
|
|||
if set(ratings.keys()) != set(theme_ids): return None
|
||||
for tid, r in ratings.items():
|
||||
if not isinstance(r, dict): return None
|
||||
coerced: dict[str, int] = {}
|
||||
for key in ("importance", "plausibility"):
|
||||
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
|
||||
iv = coerce_int(r.get(key))
|
||||
if iv is None or not 1 <= iv <= 5: return None
|
||||
coerced[key] = iv
|
||||
ratings[tid] = coerced
|
||||
return raw
|
||||
return v
|
||||
|
||||
|
|
@ -110,10 +114,14 @@ class DelphiSubagent:
|
|||
if not isinstance(raw, dict): return None
|
||||
ratings = raw.get("ratings", {})
|
||||
if set(ratings.keys()) != set(theme_ids): return None
|
||||
for r in ratings.values():
|
||||
for tid, r in ratings.items():
|
||||
if not isinstance(r, dict): return None
|
||||
coerced: dict[str, int] = {}
|
||||
for key in ("importance", "plausibility"):
|
||||
if not isinstance(r.get(key), int) or not 1 <= r[key] <= 5: return None
|
||||
iv = coerce_int(r.get(key))
|
||||
if iv is None or not 1 <= iv <= 5: return None
|
||||
coerced[key] = iv
|
||||
ratings[tid] = coerced
|
||||
return raw
|
||||
|
||||
raw = self.interviewer.ask_in_character(persona, user_prompt=prompt,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from sklearn.decomposition import PCA
|
|||
from sklearn.cluster import KMeans
|
||||
import yaml
|
||||
from app.models.interview import QSortResponse
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||
from app.services.interviews.instrument_loader import InstrumentValidationError
|
||||
|
||||
|
||||
|
|
@ -64,16 +64,23 @@ class DiversitySubagent:
|
|||
dist = self.instrument["distribution"]
|
||||
target = {b: n for b, n in zip(range(-3, 4), dist)}
|
||||
got: dict[int, int] = {}
|
||||
for v in placements.values():
|
||||
if not isinstance(v, int) or not -3 <= v <= 3:
|
||||
coerced_p: dict[str, int] = {}
|
||||
for k, v in placements.items():
|
||||
iv = coerce_int(v)
|
||||
if iv is None or not -3 <= iv <= 3:
|
||||
return None
|
||||
got[v] = got.get(v, 0) + 1
|
||||
coerced_p[k] = iv
|
||||
got[iv] = got.get(iv, 0) + 1
|
||||
if got != target:
|
||||
return None
|
||||
coerced_a: dict[str, int] = {}
|
||||
for a in self.instrument["likert_axes"]:
|
||||
v = axes.get(a["axis_id"])
|
||||
if not isinstance(v, int) or not 1 <= v <= 7:
|
||||
iv = coerce_int(axes.get(a["axis_id"]))
|
||||
if iv is None or not 1 <= iv <= 7:
|
||||
return None
|
||||
coerced_a[a["axis_id"]] = iv
|
||||
raw["placements"] = coerced_p
|
||||
raw["likert_axes"] = coerced_a
|
||||
return raw
|
||||
|
||||
def administer(self, persona: PersonaRecord) -> QSortResponse:
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
|||
from app.models.interview import (
|
||||
LikertInstrument, LikertResponse, InterviewPhase,
|
||||
)
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||
from app.services.interviews.instrument_loader import load_likert_instrument
|
||||
|
||||
|
||||
|
|
@ -44,9 +44,13 @@ class LongitudinalSubagent:
|
|||
required = {it.item_id for it in self.instrument.items}
|
||||
if not required.issubset(resp.keys()):
|
||||
return None
|
||||
coerced: dict[str, int] = {}
|
||||
for k, v in resp.items():
|
||||
if not isinstance(v, int) or not 1 <= v <= 5:
|
||||
iv = coerce_int(v)
|
||||
if iv is None or not 1 <= iv <= 5:
|
||||
return None
|
||||
coerced[k] = iv
|
||||
raw["responses"] = coerced
|
||||
return raw
|
||||
|
||||
def administer(self, persona: PersonaRecord, phase: InterviewPhase) -> LikertResponse:
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
from typing import Optional
|
||||
import yaml
|
||||
from app.models.interview import ScenarioRating, ScenarioResponse
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord
|
||||
from app.services.interviews.base import StakeholderInterviewer, PersonaRecord, coerce_int
|
||||
|
||||
class ScenarioSubagent:
|
||||
def __init__(self, llm, memory, instrument_path: Path, language: str = "de"):
|
||||
|
|
@ -44,10 +44,12 @@ class ScenarioSubagent:
|
|||
sids = {s["scenario_id"] for s in self.instrument["scenarios"]}
|
||||
ratings = raw.get("ratings", {})
|
||||
if set(ratings.keys()) != sids: return None
|
||||
for v in ratings.values():
|
||||
for sid, v in ratings.items():
|
||||
if not isinstance(v, dict): return None
|
||||
for k in ("desirability", "plausibility", "impact_on_my_group", "fairness"):
|
||||
if not isinstance(v.get(k), int) or not 1 <= v[k] <= 7: return None
|
||||
iv = coerce_int(v.get(k))
|
||||
if iv is None or not 1 <= iv <= 7: return None
|
||||
v[k] = iv
|
||||
if not isinstance(v.get("if_woke_up_response", ""), str): return None
|
||||
return raw
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,36 @@ import json
|
|||
import pytest
|
||||
from app.services.interviews.base import (
|
||||
StakeholderInterviewer, MemoryDigest, PersonaRecord, SchemaValidationFailure,
|
||||
coerce_int,
|
||||
)
|
||||
|
||||
|
||||
def test_coerce_int_accepts_real_int():
    """Genuine ints (positive, negative and zero) come back unchanged."""
    for given in (3, -2, 0):
        assert coerce_int(given) == given
|
||||
|
||||
|
||||
def test_coerce_int_accepts_numeric_strings():
    """Digit strings are parsed, including padded and negative ones."""
    expectations = {"3": 3, " 4 ": 4, "-2": -2}
    for text, number in expectations.items():
        assert coerce_int(text) == number
|
||||
|
||||
|
||||
def test_coerce_int_rejects_non_numeric():
    """Decimal strings, words, None, lists and floats all yield None."""
    for junk in ("3.5", "abc", None, [3], 3.5):
        assert coerce_int(junk) is None
|
||||
|
||||
|
||||
def test_coerce_int_rejects_bool():
    """Booleans must not pass as 1/0 even though bool is a subclass of int."""
    for flag in (True, False):
        assert coerce_int(flag) is None
|
||||
|
||||
|
||||
class _FakeLLM:
|
||||
def __init__(self, responses):
|
||||
self.responses = list(responses)
|
||||
|
|
|
|||
|
|
@ -56,3 +56,29 @@ def test_convergence_metrics():
|
|||
conv = convergence_metrics(r2, r3)
|
||||
assert "t1" in conv
|
||||
assert conv["t1"]["delta_iqr_importance"] is not None
|
||||
|
||||
|
||||
def test_delphi_r2_accepts_string_ratings():
    """Round-2 Delphi ratings given as digit strings come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.delphi import DelphiSubagent
    from pathlib import Path

    class _StubMemory:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringyLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            # Build a fresh payload on every call: nothing is shared
            # between calls, exactly as with a literal returned inline.
            wire = (("t1", "4", "3"), ("t2", "5", "2"))
            return {
                "ratings": {
                    tid: {"importance": imp, "plausibility": pla}
                    for tid, imp, pla in wire
                }
            }

    repo_root = Path(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "delphi_v1.yaml"
    subagent = DelphiSubagent(
        llm=_StringyLLM(), memory=_StubMemory(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=1, name="A", persona="p")
    theme_list = [
        {"theme_id": tid, "label": label}
        for tid, label in (("t1", "T1"), ("t2", "T2"))
    ]

    outcome = subagent.administer_round2(respondent, theme_list)

    assert outcome.ratings["t1"]["importance"] == 4
    assert isinstance(outcome.ratings["t1"]["importance"], int)
|
||||
|
|
|
|||
|
|
@ -46,3 +46,33 @@ def test_typology_runs_pca_kmeans():
|
|||
assert len(result["clusters"]) == 3
|
||||
assert "pca" in result
|
||||
assert len(result["pca"]["components"]) >= 2
|
||||
|
||||
|
||||
def test_diversity_accepts_string_likert_values():
    """Q-sort placements and Likert axes sent as digit strings come back as ints."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.diversity import DiversitySubagent
    from pathlib import Path

    class _StubMemory:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    # 24 bucket values in ascending order: two -3s, three -2s, four -1s,
    # six 0s, four 1s, three 2s, two 3s.
    per_bucket = (2, 3, 4, 6, 4, 3, 2)
    buckets = [
        bucket
        for bucket, count in zip(range(-3, 4), per_bucket)
        for _ in range(count)
    ]
    axis_ids = (
        "ax_pres_extr", "ax_loc_eu", "ax_sci_trad",
        "ax_ind_col", "ax_short_long", "ax_mkt_reg",
    )

    class _StringyLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            placements = {
                f"st_{pos:02d}": str(bucket)
                for pos, bucket in enumerate(buckets, start=1)
            }
            return {
                "placements": placements,
                "likert_axes": dict.fromkeys(axis_ids, "4"),
            }

    repo_root = Path(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "diversity_v1.yaml"
    subagent = DiversitySubagent(
        llm=_StringyLLM(), memory=_StubMemory(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=7, name="A", persona="p")

    outcome = subagent.administer(respondent)

    assert isinstance(outcome.placements["st_01"], int)
    assert isinstance(outcome.likert_axes["ax_pres_extr"], int)
    assert outcome.likert_axes["ax_pres_extr"] == 4
|
||||
|
|
|
|||
|
|
@ -55,3 +55,37 @@ def test_longitudinal_aggregate_delta():
|
|||
assert agg["per_item"]["stk_1"]["mean_delta"] == 1.0
|
||||
assert agg["per_item"]["gov_1"]["mean_delta"] == 0.0
|
||||
assert agg["n_paired"] == 5
|
||||
|
||||
|
||||
def test_longitudinal_accepts_string_likert_values():
    """Likert answers sent as JSON strings ('3' rather than 3) are coerced.

    The stub LLM answers every item with a digit string; the test checks
    that the administered response exposes them as ints.
    """
    from app.models.interview import InterviewPhase
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.longitudinal import LongitudinalSubagent
    from pathlib import Path

    class _StubMemory:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    # One digit per item, three items per prefix, all kept as strings.
    answers_by_prefix = (
        ("stk", "435"),
        ("gov", "342"),
        ("mkt", "534"),
        ("clm", "245"),
    )

    class _StringyLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            answers = {
                f"{prefix}_{item_no}": digit
                for prefix, digits in answers_by_prefix
                for item_no, digit in enumerate(digits, start=1)
            }
            return {
                "responses": answers,
                "confidence": {},
                "open_comment": "stringified",
            }

    repo_root = Path(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "longitudinal_v1.yaml"
    subagent = LongitudinalSubagent(
        llm=_StringyLLM(), memory=_StubMemory(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=99, name="A", persona="p")

    outcome = subagent.administer(respondent, phase=InterviewPhase.T0)

    assert outcome.agent_id == 99
    assert outcome.responses["stk_1"] == 4
    assert isinstance(outcome.responses["stk_1"], int)
|
||||
|
|
|
|||
|
|
@ -32,3 +32,29 @@ def test_polarity_matrix():
|
|||
assert "S1" in m
|
||||
assert m["S1"]["mean_desirability"] == 5
|
||||
assert m["S1"]["n"] == 3
|
||||
|
||||
|
||||
def test_scenario_accepts_string_likert_values():
    """Scenario ratings sent as digit strings on all 4 dimensions are coerced."""
    from app.services.interviews.base import PersonaRecord, MemoryDigest
    from app.services.interviews.scenario import ScenarioSubagent
    from pathlib import Path

    scenario_ids = ("S1", "S2", "S3", "S4")

    class _StubMemory:
        def get_digest(self, agent_id, max_chars=2000):
            return MemoryDigest(text="x", available=True)

    class _StringyLLM:
        def chat_json(self, messages, temperature=0.0, max_tokens=None, **kw):
            ratings = {}
            for sid in scenario_ids:
                ratings[sid] = {
                    "desirability": "4",
                    "plausibility": "3",
                    "impact_on_my_group": "5",
                    "fairness": "3",
                    "if_woke_up_response": f"act-{sid}",
                }
            return {"ratings": ratings}

    repo_root = Path(__file__).resolve().parents[2]
    instrument_file = repo_root / "scripts" / "instruments" / "scenario_v1.yaml"
    subagent = ScenarioSubagent(
        llm=_StringyLLM(), memory=_StubMemory(), instrument_path=instrument_file
    )
    respondent = PersonaRecord(agent_id=3, name="A", persona="p")

    outcome = subagent.administer(respondent)

    assert outcome.ratings["S1"].desirability == 4
    assert isinstance(outcome.ratings["S1"].desirability, int)
|
||||
|
|
|
|||
Loading…
Reference in New Issue