MicroFish/.kiro/specs/i18n-e2e-english-verification/audit/scripts/classify.py

#!/usr/bin/env python3
"""Classify each CJK match into a 4-class label and a category tag.

Inputs (read from <sha-dir>):
  cjk-grep.txt   - raw `git grep -nP` output, one match per line.
  parity.txt     - output of check_parity.py (used to harvest cjk-in-en gaps).

Output (written to <sha-dir>/classified.csv):
  CSV columns: file, line, match, class, category, pipeline_step

Classes are a closed set: deliberate / gap / non-applicable / review-needed.
Categories are likewise a closed set - see classify_match; pipeline-step tags
are a closed set too - see pipeline_step_for.

Run from the repository root.
"""
from __future__ import annotations

import csv
import re
import sys
from pathlib import Path
from typing import Iterable, Tuple

# Matches a single character in the range U+4E00..U+9FFF.
CJK_RANGE = re.compile(r"[一-鿿]")

# Backend files whose quoted CJK matches classify_match tags as
# "backend-prompt-label" (compared against the grep path verbatim).
PROMPT_FILES = (
    "backend/app/services/ontology_generator.py",
    "backend/app/services/oasis_profile_generator.py",
    "backend/app/services/simulation_config_generator.py",
    "backend/app/services/report_agent.py",
    "backend/app/services/zep_graph_memory_updater.py",
)

# Substrings that mark a Python line as a logging / print call.
LOG_HINTS = (
    "logger.",
    "log.",
    "print(",
    "build_logger.",
    "logging.",
)

# File extensions (matched against the lower-cased path) whose grep hits are
# treated as binary false positives.
BINARY_EXTS = (
    # images and documents
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".pdf",
    # fonts and icons
    ".woff",
    ".woff2",
    ".ttf",
    ".eot",
    ".ico",
)


def classify_match(file: str, raw_line: str) -> Tuple[str, str, str]:
    """Return (class, category, pipeline_step) for one grep match line.

    Rules are evaluated top to bottom and the first hit wins:

    1. Paths ending in one of BINARY_EXTS (case-insensitive) are
       ``non-applicable``.
    2. ``locales/en.json`` is always a catalogue-parity ``gap``.
    3. ``.vue``, ``.py`` and ``.js``/``.ts`` files each get their own
       heuristics (see the inline comments).
    4. Anything else is ``review-needed`` / ``uncategorised``.

    Args:
        file: Path of the file the match came from, as printed by grep.
        raw_line: Text of the matching source line.

    Returns:
        A ``(class, category, pipeline_step)`` tuple of tag strings.
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    if file.lower().endswith(BINARY_EXTS):
        return ("non-applicable", "binary-false-positive", "n/a")

    if file == "locales/en.json":
        return ("gap", "catalogue-parity", "UI")

    stripped = raw_line.lstrip()
    pipeline_step = pipeline_step_for(file)

    if file.endswith(".vue"):
        # A `.match(/.../)` call: CJK is part of a regex used to parse text.
        if re.search(r"\.match\s*\(\s*/", raw_line):
            return ("gap", "frontend-regex-parser", pipeline_step)
        # CJK between two quote characters: a user-visible string literal.
        # This test runs before the comment test, so a comment line that
        # quotes CJK text is reported as a gap as well.
        if re.search(r"['\"`].*[一-鿿].*['\"`]", raw_line):
            return ("gap", "frontend-ui-string", pipeline_step)
        if stripped.startswith(("//", "/*", "*")):
            return ("deliberate", "frontend-comment", pipeline_step)
        return ("review-needed", "frontend-other", pipeline_step)

    if file.endswith(".py"):
        if stripped.startswith("#"):
            return ("deliberate", "backend-comment", pipeline_step)
        if stripped.startswith(('"""', "'''")):
            return ("deliberate", "backend-docstring", pipeline_step)
        if not re.search(r"['\"]", raw_line):
            # bare CJK on a non-string line: most likely an unterminated docstring
            # body. Treat as a docstring continuation.
            return ("deliberate", "backend-docstring", pipeline_step)
        if any(hint in raw_line for hint in LOG_HINTS):
            return ("gap", "backend-log", "Logs")
        if file in PROMPT_FILES:
            return ("gap", "backend-prompt-label", pipeline_step)
        return ("review-needed", "backend-string", pipeline_step)

    if file.endswith((".js", ".ts")):
        # Recognise the block-comment opener `/*` too, matching the .vue
        # branch; otherwise `/* ... */` lines fall through to review-needed.
        if stripped.startswith(("//", "/*", "*")):
            return ("deliberate", "frontend-comment", pipeline_step)
        return ("review-needed", "frontend-other", pipeline_step)

    return ("review-needed", "uncategorised", pipeline_step)


def pipeline_step_for(file: str) -> str:
    """Resolve the pipeline-step tag for a path.

    Substring rules are tried in order and the first one that hits decides
    the tag. Paths that hit none of them are tagged ``UI`` when they live
    under the frontend views/components directories, otherwise ``n/a``.
    """
    folded = file.lower()
    # (tag, text searched, substrings - any single one is enough).
    # Only the "interaction"/"interview" rule is case-insensitive.
    substring_rules = (
        ("Graph Build", file, ("ontology_generator", "graph_builder", "graph.py")),
        ("Env Setup", file, ("oasis_profile_generator", "Step2")),
        ("Simulation", file, ("simulation_config_generator", "simulation", "Step3")),
        ("Report", file, ("report_agent", "Step4")),
        ("Interaction", file, ("Step5",)),
        ("Interaction", folded, ("interaction", "interview")),
        ("Logs", file, ("logger", "retry")),
    )
    for tag, haystack, needles in substring_rules:
        if any(needle in haystack for needle in needles):
            return tag

    if file.startswith(("frontend/src/views/", "frontend/src/components/")):
        return "UI"
    return "n/a"


def parse_grep_line(line: str) -> Tuple[str, str, str]:
    """Break one `git grep -n` line into (file, line-number, match-text).

    Only the first two colons act as separators, so colons inside the match
    text are kept. A line with fewer than two colons comes back as
    ``("", "", line)``.
    """
    try:
        path, lineno, text = line.split(":", 2)
    except ValueError:
        # Fewer than three fields: not a `file:line:text` record.
        return ("", "", line)
    return (path, lineno, text)


def parity_to_rows(parity_path: Path) -> Iterable[Tuple[str, str, str, str, str, str]]:
    """Promote `[cjk-in-en]` block entries from parity.txt into classified rows.

    The file is scanned line by line. A line starting with ``[`` is a section
    header; only lines that follow the ``[cjk-in-en]`` header (up to the next
    header) are considered. Blank or whitespace-only lines and lines starting
    with ``#`` are skipped.

    Args:
        parity_path: Location of parity.txt. A missing file yields nothing.

    Yields:
        One ``(file, line, match, class, category, pipeline_step)`` tuple per
        entry, always attributed to ``locales/en.json`` line ``"0"`` and
        tagged ``gap`` / ``catalogue-parity`` / ``UI``. The entry text is
        passed through unchanged as ``match``.
    """
    if not parity_path.exists():
        return
    in_block = False
    for raw in parity_path.read_text(encoding="utf-8").splitlines():
        if raw.startswith("["):
            # Any header switches the block off unless it is the one we want.
            in_block = raw.strip() == "[cjk-in-en]"
            continue
        if not in_block:
            continue
        # Strip before the emptiness test so a line holding only spaces or
        # tabs is not reported as a phantom gap row.
        if not raw.strip() or raw.startswith("#"):
            continue
        yield (
            "locales/en.json",
            "0",
            raw,
            "gap",
            "catalogue-parity",
            "UI",
        )


def main(argv: list[str]) -> int:
    """Classify every match in <sha-dir>/cjk-grep.txt and write classified.csv.

    Args:
        argv: Command line; ``argv[1]`` is the directory holding the inputs.

    Returns:
        ``64`` on a usage error, ``1`` when cjk-grep.txt is missing or the
        number of classified grep rows differs from the number of non-blank
        input lines, ``0`` after the CSV has been written.
    """
    if len(argv) != 2:
        print(f"usage: {argv[0]} <sha-dir>", file=sys.stderr)
        return 64

    base_dir = Path(argv[1])
    grep_path = base_dir / "cjk-grep.txt"
    if not grep_path.exists():
        print(f"missing input: {grep_path}", file=sys.stderr)
        return 1

    grep_lines = grep_path.read_text(encoding="utf-8").splitlines()

    # Classify every parseable grep record; unparseable ones are dropped here
    # and surface through the drift check below.
    rows: list[Tuple[str, str, str, str, str, str]] = []
    for path, lineno, text in (parse_grep_line(entry) for entry in grep_lines if entry):
        if not path:
            continue
        rows.append((path, lineno, text.strip(), *classify_match(path, text)))

    rows.extend(parity_to_rows(base_dir / "parity.txt"))

    # Parity rows are the ones attributed to locales/en.json line "0"; leave
    # them out when comparing against the grep input size.
    expected = len([entry for entry in grep_lines if entry.strip()])
    classified = sum(
        1 for row in rows if not (row[0] == "locales/en.json" and row[1] == "0")
    )
    if classified != expected:
        print(
            f"row-count drift: input={expected}, classified={classified}",
            file=sys.stderr,
        )
        return 1

    with (base_dir / "classified.csv").open("w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["file", "line", "match", "class", "category", "pipeline_step"])
        out.writerows(rows)

    # Per-class tally for the one-line summary.
    tally: dict[str, int] = {}
    for _file, _line, _match, label, _category, _step in rows:
        tally[label] = tally.get(label, 0) + 1
    summary_str = ", ".join(f"{cls}={n}" for cls, n in sorted(tally.items()))
    print(f"  classified.csv: {len(rows)} rows ({summary_str})")
    return 0


if __name__ == "__main__":
    # Hand main's return value to the interpreter as the process exit status.
    raise SystemExit(main(sys.argv))