MicroFish/.kiro/specs/i18n-e2e-english-verification/audit/scripts/render_report.py

#!/usr/bin/env python3
"""Render the gap report and the issue-#10 comment body.

Inputs (from <sha-dir>):
  classified.csv          - per-match classification rows.
  parity.txt              - en/zh catalogue parity output.
  cjk-grep-bucketed.txt   - human-readable bucketed grep output.

Inputs (from repo):
  .ticket/10.md           - snapshot of issue #10's body (used to mirror its checklist).

Outputs (to <sha-dir>):
  gap-report.md           - full structured report (seven sections).
  comment-body.md         - markdown comment to be posted on issue #10.
  PENDING-followups/01..04-*.md - one body per gap category (placeholders allowed).

Usage:
    python3 render_report.py <sha-dir> <commit-sha>
"""
from __future__ import annotations

import csv
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List

# Number of the GitHub issue this verification run reports against; it is
# interpolated into the "Linked from" section of the follow-up bodies.
ISSUE_NUMBER = 10
# "owner/repo" slug of the audited repository.
# NOTE(review): appears unused in this file - confirm before removing.
REPO_SLUG = "salestech-group/MiroFish"


def load_rows(csv_path: Path) -> list[dict]:
    """Read the classification CSV and return one dict per data row.

    The first line of the file supplies the keys (``csv.DictReader``
    semantics); every following line becomes one dict in the result.
    """
    # newline="" hands line-ending handling over to the csv module.
    with csv_path.open(encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        return [record for record in reader]


def load_ticket_body(ticket_path: Path) -> str:
    """Strip the YAML frontmatter and return the markdown body.

    Frontmatter is recognised only when the file starts with a ``---`` line
    and a later line consisting solely of ``---`` closes it.  When either
    fence is missing the text is returned unchanged.

    Args:
        ticket_path: Path to the ticket snapshot (UTF-8 markdown).

    Returns:
        Everything after the closing fence line, or the whole text when no
        complete frontmatter block is present.
    """
    text = ticket_path.read_text(encoding="utf-8")
    if not text.startswith("---\n"):
        return text

    # Start the search at index 3 (the newline that ends the opening fence)
    # so an *empty* frontmatter block ("---\n---\n") is recognised.  Starting
    # at 4 would skip that closing fence and latch onto the first "---"
    # horizontal rule in the body instead, silently dropping body content.
    fence = "\n---\n"
    end = text.find(fence, 3)
    if end == -1:
        return text
    return text[end + len(fence):]


# A markdown task-list line: optional indent, "- [ ]" / "- [x]" / "- [X]",
# then the item text.  Ticked boxes are accepted as well so that items already
# checked off in the ticket snapshot are still mapped to a status instead of
# being dropped.  Group 1 is the indent, group 2 the item text.
CHECKBOX_RE = re.compile(r"^(\s*)- \[[ xX]\] (.+)$")
# An indented plain bullet (at least one leading whitespace character).
# Group 1 is the indent, group 2 the bullet text.
SUBBULLET_RE = re.compile(r"^(\s+)- (.+)$")


def evidence_for_step(rows: list[dict], step: str) -> list[dict]:
    """Collect the ``gap``-class rows recorded against one pipeline step.

    Rows keep their input order; rows of any other class are ignored without
    looking at their ``pipeline_step``.
    """
    selected: list[dict] = []
    for row in rows:
        if row["class"] != "gap":
            continue
        if row["pipeline_step"] == step:
            selected.append(row)
    return selected


def render_section_5(ticket_body: str, rows: list[dict]) -> str:
    """Render the checklist-mapping section from the ticket's task list.

    Every line of ``ticket_body`` matched by ``CHECKBOX_RE`` is re-emitted as
    a task-list item carrying a status; once a checkbox has been seen,
    indented bullets matched by ``SUBBULLET_RE`` are re-emitted as
    ``STATUS: text`` lines.  A line starting with ``##`` or ``---`` ends the
    current checklist.  All other lines are dropped.
    """
    # Group the gap rows by pipeline step once; the status helper looks
    # them up per checklist line.
    gaps_by_step = defaultdict(list)
    for entry in (r for r in rows if r["class"] == "gap"):
        gaps_by_step[entry["pipeline_step"]].append(entry)

    rendered: list[str] = [
        "## Section 5 - Issue #10 checklist mapping\n",
        "Each line below is taken from the ticket body, with an explicit status.\n",
    ]

    inside_checklist = False
    for raw in ticket_body.splitlines():
        checkbox = CHECKBOX_RE.match(raw)
        if checkbox is not None:
            inside_checklist = True
            lead = checkbox.group(1)
            label = checkbox.group(2)
            status, note = status_for_checklist_item(label, gaps_by_step)
            # Only a "pass" status ticks the box.
            tick = "x" if status == "pass" else " "
            rendered.append(f"{lead}- [{tick}] **{status.upper()}** - {label}{note}")
        else:
            bullet = SUBBULLET_RE.match(raw)
            if inside_checklist and bullet is not None:
                lead = bullet.group(1)
                label = bullet.group(2)
                status, note = status_for_checklist_item(label, gaps_by_step)
                rendered.append(f"{lead}- {status.upper()}: {label}{note}")
            elif raw.startswith(("##", "---")):
                # A heading or horizontal rule closes the checklist.
                inside_checklist = False

    return "\n".join(rendered) + "\n"


def status_for_checklist_item(text: str, gaps_by_step: Dict[str, list]) -> tuple[str, str]:
    """Return ``(status, suffix-note)`` for one checklist line.

    The line is matched case-insensitively against keyword groups to find
    the pipeline steps it refers to.  Rules are applied in this order:

    1. Frontend-UI wording: ``gap`` if any ``"UI"`` gaps exist, otherwise
       ``manual-pending``.
    2. Locale-propagation wording: ``gap`` if any ``"Logs"`` gaps exist,
       otherwise ``manual-pending``.
    3. Any gap recorded against a referenced pipeline step: ``gap`` with
       the combined count.
    4. Otherwise ``manual-pending``, with a note that depends on whether
       the line looks like a UI/live-walkthrough item.

    Args:
        text: Checklist item text (without the bullet/checkbox prefix).
        gaps_by_step: Gap rows grouped by ``pipeline_step``; only read.

    Returns:
        A ``(status, note)`` pair; ``note`` is either empty-free text
        starting with ``" - "`` ready to be appended to the rendered line.
    """
    lower = text.lower()

    def mentions(*phrases: str) -> bool:
        """True when any of the phrases occurs in the lowered text."""
        return any(phrase in lower for phrase in phrases)

    # Short keywords are matched as whole words.  A bare substring test
    # would make "log" fire on "ontology" / "catalogue" / "dialog" and "ui"
    # fire on "build" / "require", attributing gaps to unrelated items.
    mentions_logs = re.search(r"\blog(?:s|ged|ging|ger|gers)?\b", lower) is not None
    mentions_ui = re.search(r"\bui\b", lower) is not None

    candidates: list[str] = []
    if mentions("graph build", "ontology"):
        candidates.append("Graph Build")
    if mentions("env setup", "agent profile", "profession"):
        candidates.append("Env Setup")
    if mentions("simulation", "tweet", "reddit", "sim config"):
        candidates.append("Simulation")
    if mentions("report"):
        candidates.append("Report")
    if mentions("interaction", "interview", "chat repl"):
        candidates.append("Interaction")
    if mentions_logs:
        candidates.append("Logs")

    relevant_gaps: list = []
    for step in candidates:
        relevant_gaps.extend(gaps_by_step.get(step, []))

    # Frontend-wide items are judged on the "UI" bucket alone and take
    # precedence over the per-step gap count.
    if mentions("frontend ui", "no chinese strings on screen", "every label"):
        ui_gaps = gaps_by_step.get("UI", [])
        if ui_gaps:
            return ("gap", f" - {len(ui_gaps)} hard-coded CJK literal(s) in `frontend/src/views|components/`")
        return ("manual-pending", " - live UI walkthrough not run in this sandbox")

    # Locale-propagation items are judged on the "Logs" bucket alone.
    if mentions("locale propagation", "set_locale"):
        prop = gaps_by_step.get("Logs", [])
        if prop:
            return ("gap", f" - {len(prop)} CJK log strings on EN code path")
        return ("manual-pending", " - locale-propagation runtime check not run in this sandbox")

    if relevant_gaps:
        return ("gap", f" - {len(relevant_gaps)} gap(s) classified, see Section 1/3")

    if mentions_ui or mentions("screenshot", "chat", "modal", "tooltip", "render", "trace", "thinking"):
        return ("manual-pending", " - requires live walkthrough")

    return ("manual-pending", " - not verifiable statically; awaiting live run")


def render_gap_report(rows: list[dict], ticket_body: str, parity_text: str, sha: str) -> str:
    """Build the full seven-section gap report as one markdown string.

    Args:
        rows: Classified audit rows; the keys read here are ``class``,
            ``category``, ``pipeline_step``, ``file``, ``line`` and ``match``.
        ticket_body: Markdown body of the ticket, used for Section 5.
        parity_text: Captured catalogue-parity output, embedded verbatim
            (stripped) in Section 2.
        sha: Commit identifier interpolated into headings and paths.

    Returns:
        The report text, terminated by a single trailing newline.
    """
    classes = Counter(r["class"] for r in rows)
    gap_rows = [r for r in rows if r["class"] == "gap"]
    gap_categories = Counter(r["category"] for r in gap_rows)
    gap_steps = Counter(r["pipeline_step"] for r in gap_rows)

    out: list[str] = []
    out.append("# Verification gap report - i18n-e2e-english-verification\n")
    out.append(f"**Commit:** `{sha}`\n")
    out.append("")
    out.append("## Overview\n")
    out.append(f"- Total CJK matches audited: **{len(rows)}**")
    out.append(f"- Class distribution: {format_counter(classes)}")
    out.append(f"- Gap categories: {format_counter(gap_categories)}")
    out.append(f"- Gap pipeline steps: {format_counter(gap_steps)}")
    out.append("")

    # --- Section 1: raw audit command plus the files with the most gaps.
    out.append("## Section 1 - Static CJK audit\n")
    out.append("Canonical command (PCRE):\n")
    out.append("```")
    out.append('git grep -nIP "[\\x{4e00}-\\x{9fff}]" -- backend/app frontend/src locales/en.json')
    out.append("```")
    out.append("")
    out.append(f"Raw output captured at `audit/{sha}/cjk-grep.txt` and bucketed at `audit/{sha}/cjk-grep-bucketed.txt`.")
    out.append("")
    # Counted over *all* rows, not just gaps: any match in en.json matters.
    en_catalogue_hits = sum(1 for r in rows if r["file"] == "locales/en.json")
    out.append(f"`locales/en.json` CJK matches: **{en_catalogue_hits}** (acceptance: zero).")
    out.append("")
    out.append("Top files by gap count:")
    out.append("")
    out.append("| File | Gap count |")
    out.append("|------|-----------|")
    by_file = Counter(r["file"] for r in gap_rows)
    for file, count in by_file.most_common(15):
        out.append(f"| `{file}` | {count} |")
    out.append("")

    # --- Section 2: parity output is embedded as captured.
    out.append("## Section 2 - Locale catalogue parity\n")
    out.append("```")
    out.append(parity_text.strip())
    out.append("```")
    out.append("")

    # --- Section 3: CJK labels inside LLM-prompt assembly.
    out.append("## Section 3 - LLM-prompt locale verification\n")
    prompt_gaps = [r for r in gap_rows if r["category"] == "backend-prompt-label"]
    out.append(f"Backend prompt-label gaps (CJK string literals inside services that compose LLM prompts): **{len(prompt_gaps)}**")
    out.append("")
    if prompt_gaps:
        out.append("First 10 examples (file:line - match):")
        out.append("")
        for row in prompt_gaps[:10]:
            out.append(f"- `{row['file']}:{row['line']}` - {row['match']}")
        if len(prompt_gaps) > 10:
            out.append(f"- ... and {len(prompt_gaps) - 10} more (see `classified.csv`)")
        out.append("")
    out.append(
        "These prompts feed the LLM verbatim; CJK labels bias the model toward Chinese output even when "
        "the requested locale is English."
    )
    out.append("")

    # --- Section 4: only the logger row is data-driven; the other
    # boundaries are fixed manual-pending entries.
    out.append("## Section 4 - Locale propagation surface\n")
    log_gaps = [r for r in gap_rows if r["category"] == "backend-log"]
    out.append("| Boundary | Status | Evidence |")
    out.append("|----------|--------|----------|")
    out.append(
        "| HTTP -> Flask handler | manual-pending | runtime not exercised in sandbox; static review showed no per-request locale carrier |"
    )
    out.append(
        "| Flask handler -> Task worker | manual-pending | thread-local `set_locale` referenced in CLAUDE.md but not statically verified end-to-end |"
    )
    out.append(
        "| Task worker -> OASIS subprocess | manual-pending | subprocess boundary requires live run |"
    )
    out.append(
        f"| Backend logger | {'gap' if log_gaps else 'pass'} | {len(log_gaps)} hard-coded CJK log line(s) on EN code path |"
    )
    out.append("")
    if log_gaps:
        out.append("First 10 backend-log gap examples:")
        out.append("")
        for row in log_gaps[:10]:
            out.append(f"- `{row['file']}:{row['line']}` - {row['match']}")
        out.append("")

    # --- Section 5: checklist mapping (already newline-terminated).
    out.append(render_section_5(ticket_body, rows))

    # --- Section 6: parity figures are NOT restated here.  They come from
    # the captured parity output in Section 2; hard-coding key counts would
    # report stale numbers whenever the audit is re-run on another commit.
    out.append("## Section 6 - ZH regression check\n")
    out.append(
        "- Locale catalogue key parity for this commit is reported in Section 2 "
        "(EN/ZH key counts and symmetric difference as captured in `parity.txt`).\n"
        "- No ZH-specific regression detected in static review. Live ZH walkthrough is `manual-pending`.\n"
    )

    # --- Section 7: per-category counts come straight from the gap-only
    # category counter computed above (missing categories count as 0).
    out.append("## Section 7 - Follow-up plan\n")
    out.append("Per R7.2, gaps are grouped into the following follow-up issues (placeholder bodies in `PENDING-followups/`):")
    out.append("")
    out.append(
        f"1. **Frontend hard-coded UI strings** ({gap_categories['frontend-ui-string']} matches + "
        f"{gap_categories['frontend-regex-parser']} regex parsers depending on CJK backend output)."
    )
    out.append(f"2. **Backend log strings** ({gap_categories['backend-log']} matches).")
    out.append(f"3. **Backend LLM-prompt context labels** ({gap_categories['backend-prompt-label']} matches).")
    out.append("4. **Permanent CI guard** (preventative - re-run this audit on every PR).")
    out.append("")
    out.append(
        "Backend docstring/comment matches (the bulk of `deliberate` rows) are covered by the existing issue #7 and are not re-filed here."
    )

    return "\n".join(out) + "\n"


def by_category(rows: list[dict], category: str) -> list[dict]:
    """Return, in input order, the ``gap``-class rows of one category."""
    picked: list[dict] = []
    for row in rows:
        # Category is compared first; the class is only read on a match.
        if row["category"] != category:
            continue
        if row["class"] == "gap":
            picked.append(row)
    return picked


def format_counter(c: Counter) -> str:
    """Format a counter as ``key=count`` pairs, most common first.

    Pairs are separated by ``", "``; an empty counter yields ``""``.
    """
    pairs = []
    for key, count in c.most_common():
        pairs.append(f"{key}={count}")
    return ", ".join(pairs)


def render_comment_body(rows: list[dict], ticket_body: str, sha: str) -> str:
    """Compose the markdown comment summarising this audit run.

    The comment holds the per-class totals, the gap-category breakdown, the
    checklist mapping produced by ``render_section_5`` and re-run
    instructions.  The result ends with a single trailing newline.
    """
    class_counts = Counter(row["class"] for row in rows)
    category_counts = Counter(row["category"] for row in rows if row["class"] == "gap")

    summary = [
        f"### Verification report - run on commit `{sha}`\n",
        "This run was produced by `.kiro/specs/i18n-e2e-english-verification/audit/scripts/run_audit.sh`.",
        "Captured artefacts live under `.kiro/specs/i18n-e2e-english-verification/audit/<commit-sha>/`.\n",
        "",
        f"**Audit summary:** {sum(class_counts.values())} CJK matches across the auditable paths.",
        f"- {class_counts.get('gap', 0)} `gap` (actionable, see follow-ups)",
        f"- {class_counts.get('review-needed', 0)} `review-needed` (soft signal; needs human eyeball)",
        f"- {class_counts.get('deliberate', 0)} `deliberate` (mostly backend docstrings/comments - covered by issue #7)",
        f"- {class_counts.get('non-applicable', 0)} `non-applicable` (binary file false positives - excluded)",
        "",
        f"**Gap-category breakdown:** {format_counter(category_counts)}",
        "",
    ]

    checklist = [
        "---",
        "",
        "#### Issue checklist mapping",
        "",
        # Already newline-terminated, so the join leaves a blank line after it.
        render_section_5(ticket_body, rows),
    ]

    rerun = [
        "---",
        "",
        "#### How to re-run",
        "",
        "```bash",
        "# from the repository root, on any commit:",
        "bash .kiro/specs/i18n-e2e-english-verification/audit/scripts/run_audit.sh",
        "# artefacts at .kiro/specs/i18n-e2e-english-verification/audit/<HEAD-sha>/",
        "```",
        "",
        (
            "If `gh` is not authenticated when re-running, the comment body and follow-up bodies are written to "
            "`PENDING-issue-10-comment.md` / `PENDING-followups/` for a human to post."
        ),
        "",
        "Out of scope for this run (per R5.3 / R7.3): live UI walkthrough, full Docker-Compose pipeline run, and any inline gap fixes.",
    ]

    return "\n".join(summary + checklist + rerun) + "\n"


def render_followup_bodies(rows: list[dict], sha_dir: Path, sha: str) -> None:
    """Write one follow-up issue body per gap category.

    Files are created under ``<sha_dir>/PENDING-followups/`` (the directory
    is created when missing).  A category without gaps gets an empty
    placeholder file, except the ``04-`` CI-guard follow-up, which is always
    written in full with a "no gaps" evidence line.
    """
    pending_dir = sha_dir / "PENDING-followups"
    pending_dir.mkdir(parents=True, exist_ok=True)

    frontend_rows = by_category(rows, "frontend-ui-string") + by_category(rows, "frontend-regex-parser")
    logger_rows = by_category(rows, "backend-log")
    prompt_rows = by_category(rows, "backend-prompt-label")

    # (file name, issue title, evidence rows, summary paragraph, labels)
    followups = [
        (
            "01-frontend-ui-strings.md",
            "i18n: replace hard-coded chinese ui strings in process and step components with i18n keys",
            frontend_rows,
            (
                "Several `.vue` templates in `frontend/src/views/` and `frontend/src/components/` still emit "
                "Chinese strings directly instead of routing them through `vue-i18n` keys. Some `Step4Report.vue` "
                "regex parsers also rely on Chinese tokens emitted by the backend (so they will silently break "
                "once the backend prompts are translated)."
            ),
            ["i18n", "bug"],
        ),
        (
            "02-backend-log-strings.md",
            "i18n: externalise remaining chinese log strings in flask api and utils",
            logger_rows,
            (
                "After issue #6 externalised most backend log messages, a handful of `logger.info` / "
                "`logger.error` call sites in `backend/app/api/graph.py` and `backend/app/utils/retry.py` "
                "still hard-code Chinese strings, so backend logs leak Chinese under EN locale."
            ),
            ["i18n"],
        ),
        (
            "03-backend-prompt-labels.md",
            "i18n: translate chinese context labels inside llm-prompt assembly in backend services",
            prompt_rows,
            (
                "Several `services/*_generator.py` files compose LLM prompts that still embed Chinese "
                "context labels (e.g. `\"事实信息:\"`, `\"相关实体:\"`) into the prompt string verbatim. These "
                "labels bias the LLM toward Chinese output even when the requested locale is English."
            ),
            ["i18n"],
        ),
        (
            "04-permanent-ci-guard.md",
            "i18n: add a permanent ci guard that runs the e2e cjk audit on every pr",
            [],
            (
                "Promote the audit pipeline at `.kiro/specs/i18n-e2e-english-verification/audit/scripts/` to "
                "a permanent CI check. The guard should fail when `locales/en.json` contains any CJK character "
                "and when the gap count regresses against a committed baseline."
            ),
            ["i18n", "enhancement"],
        ),
    ]

    for filename, title, evidence, summary, labels in followups:
        target = pending_dir / filename
        preventative = filename.startswith("04-")
        if not (evidence or preventative):
            # Nothing to report for this category: leave an empty placeholder.
            target.write_text("", encoding="utf-8")
            continue

        if evidence:
            # At most 50 rows are listed; the remainder is summarised.
            evidence_lines = [
                f"- `{row['file']}:{row['line']}` - {row['match']}" for row in evidence[:50]
            ]
            overflow = len(evidence) - 50
            if overflow > 0:
                evidence_lines.append(f"- ... and {overflow} more (see `classified.csv` in the spec dir)")
        else:
            evidence_lines = ["- (No gaps in this run; this is a preventative follow-up only.)"]

        document = [
            f"# {title}",
            "",
            "## Summary",
            "",
            summary,
            "",
            "## Linked from",
            "",
            f"- Issue #{ISSUE_NUMBER} (verification report comment).",
            f"- Spec: `.kiro/specs/i18n-e2e-english-verification/` at commit `{sha}`.",
            "",
            "## Evidence",
            "",
            *evidence_lines,
            "",
            "## Acceptance",
            "",
            "- [ ] Each `file:line` above is fixed (or explicitly classified as `deliberate`).",
            "- [ ] Re-running `bash .kiro/specs/i18n-e2e-english-verification/audit/scripts/run_audit.sh` shows zero gaps in this category.",
            "",
            # Labels travel in an HTML comment at the end of the body.
            f"<!-- labels: {','.join(labels)} -->",
            "",
        ]
        target.write_text("\n".join(document), encoding="utf-8")


def main(argv: list[str]) -> int:
    """CLI entry point: render every report artefact for one audited commit.

    Reads ``classified.csv`` and ``parity.txt`` from ``<sha-dir>`` and the
    ticket snapshot from ``.ticket/<ISSUE_NUMBER>.md`` (resolved against the
    current working directory), then writes ``gap-report.md``,
    ``comment-body.md`` and the ``PENDING-followups/`` bodies into
    ``<sha-dir>``.

    Args:
        argv: Full argument vector, ``[prog, <sha-dir>, <commit-sha>]``.

    Returns:
        Exit status: ``0`` on success, ``64`` for wrong usage, ``66`` when a
        required input file is missing.
    """
    if len(argv) != 3:
        prog = argv[0] if argv else "render_report.py"
        print(f"usage: {prog} <sha-dir> <commit-sha>", file=sys.stderr)
        return 64

    sha_dir = Path(argv[1])
    sha = argv[2]

    # Validate inputs up front so a missing artefact produces a one-line
    # diagnostic and a distinct exit code instead of a traceback.
    classified_path = sha_dir / "classified.csv"
    parity_path = sha_dir / "parity.txt"
    missing = [path for path in (classified_path, parity_path) if not path.is_file()]
    if missing:
        print("missing input file(s): " + ", ".join(str(path) for path in missing), file=sys.stderr)
        return 66

    # Use the shared issue-number constant rather than a second hard-coded 10.
    ticket_path = Path(".ticket") / f"{ISSUE_NUMBER}.md"
    if not ticket_path.is_file():
        print(f"missing input file(s): {ticket_path}", file=sys.stderr)
        return 66

    rows = load_rows(classified_path)
    parity_text = parity_path.read_text(encoding="utf-8")
    ticket_body = load_ticket_body(ticket_path)

    gap_report = render_gap_report(rows, ticket_body, parity_text, sha)
    (sha_dir / "gap-report.md").write_text(gap_report, encoding="utf-8")

    comment_body = render_comment_body(rows, ticket_body, sha)
    (sha_dir / "comment-body.md").write_text(comment_body, encoding="utf-8")

    render_followup_bodies(rows, sha_dir, sha)

    print(f"  gap-report.md, comment-body.md, PENDING-followups/ written under {sha_dir}")
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main(sys.argv))