394 lines
12 KiB
Python
Executable File
394 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""i18n CJK guard for pull-request CI.

Run from the repository root::

    python scripts/ci/i18n_cjk_guard.py
    python scripts/ci/i18n_cjk_guard.py --update-baseline

Two checks always run (no short-circuit):

* ``locales/en.json`` must contain zero CJK characters
  (range ``U+4E00..U+9FFF``).
* CJK match counts under ``backend/app/`` and ``frontend/src/`` must not
  exceed the committed per-path baseline at
  ``.kiro/specs/i18n-ci-guard/baseline.txt``.

The per-path check relies on the canonical scan
``git grep -nIP '[\\x{4e00}-\\x{9fff}]' -- <scoped_path>`` so the guard
stays bytewise-aligned with the broader audit pipeline. The catalogue
check applies the same character range to the string values of the
parsed JSON document.

Stdlib only. Exit code is 0 on success and 1 on any failure or hard
error.
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Python-side matcher for the CJK range named in the module docstring
# (U+4E00..U+9FFF); applied to catalogue string values.
CJK_RE: re.Pattern[str] = re.compile(r"[一-鿿]")

# PCRE spelling of the same range, passed verbatim to ``git grep -P``.
CJK_PATTERN: str = r"[\x{4e00}-\x{9fff}]"

# Repo-relative locations the guard reads.
EN_JSON_REL_PATH: str = "locales/en.json"
DEFAULT_BASELINE_REL_PATH: str = ".kiro/specs/i18n-ci-guard/baseline.txt"
SCOPED_PATHS: tuple[str, ...] = ("backend/app", "frontend/src")

# Longest snippet shown for a single catalogue finding.
SNIPPET_MAX_LEN: int = 80

# Command advertised to contributors when the baseline must be regenerated.
REFRESH_COMMAND: str = "python scripts/ci/i18n_cjk_guard.py --update-baseline"
REFRESH_HINT: str = "# refresh via: " + REFRESH_COMMAND

# One catalogue finding: (dotted_key, line_number, snippet).
LocaleFinding = tuple[str, int, str]
|
|
|
|
|
|
class BaselineError(Exception):
    """Signal that the baseline file is absent or cannot be parsed."""
|
|
|
|
|
|
def _truncate(text: str, limit: int = SNIPPET_MAX_LEN) -> str:
    """Shorten ``text`` to at most ``limit`` characters.

    Args:
        text: The string to shorten.
        limit: Maximum length of the returned string.

    Returns:
        ``text`` unchanged when it already fits. Otherwise a prefix of
        ``text`` followed by ``"..."`` such that the total length equals
        ``limit``. When ``limit`` leaves no room for any text, only as many
        dots as fit are returned (an empty string for ``limit <= 0``).
    """
    if len(text) <= limit:
        return text
    marker = "..."
    if limit <= len(marker):
        # A negative slice bound would count from the end of ``text`` and
        # yield a result longer than ``limit``; emit just the marker instead.
        return marker[: max(limit, 0)]
    return text[: limit - len(marker)] + marker
|
|
|
|
|
|
def _flatten(prefix: str, value: object, out: list[tuple[str, object]]) -> None:
|
|
if isinstance(value, dict):
|
|
for key, child in value.items():
|
|
child_prefix = f"{prefix}.{key}" if prefix else str(key)
|
|
_flatten(child_prefix, child, out)
|
|
else:
|
|
out.append((prefix, value))
|
|
|
|
|
|
def _value_line_number(text_lines: list[str], value: str) -> int:
|
|
"""Best-effort line number for ``value`` in the original JSON text.
|
|
|
|
Tries the raw value first (matches when the JSON file was written with
|
|
``ensure_ascii=False``), then the JSON-escaped form, then falls back to
|
|
line 1 so callers always have a usable integer.
|
|
"""
|
|
candidates: list[str] = [value]
|
|
escaped = json.dumps(value)[1:-1]
|
|
if escaped not in candidates:
|
|
candidates.append(escaped)
|
|
for candidate in candidates:
|
|
if not candidate:
|
|
continue
|
|
for index, line in enumerate(text_lines, start=1):
|
|
if candidate in line:
|
|
return index
|
|
return 1
|
|
|
|
|
|
def scan_locale_cjk(en_json_path: Path) -> list[LocaleFinding]:
    """Collect every catalogue leaf whose string value contains CJK.

    Args:
        en_json_path: Path to ``locales/en.json``.

    Returns:
        ``(dotted_key, line_number, snippet)`` tuples in document order; an
        empty list when the catalogue is CJK-clean. Leaves that are not
        strings, or are empty strings, are ignored.

    Raises:
        FileNotFoundError: If ``en_json_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source_text = en_json_path.read_text(encoding="utf-8")
    leaves: list[tuple[str, object]] = []
    _flatten("", json.loads(source_text), leaves)
    source_lines = source_text.splitlines()
    return [
        (dotted_key, _value_line_number(source_lines, leaf), _truncate(leaf))
        for dotted_key, leaf in leaves
        if isinstance(leaf, str) and leaf and CJK_RE.search(leaf)
    ]
|
|
|
|
|
|
def count_path_cjk(repo_root: Path, scoped_path: str) -> int:
    """Count CJK match lines under ``scoped_path`` via ``git grep -nIP``.

    Args:
        repo_root: Working-tree root used as ``git`` CWD.
        scoped_path: Repo-relative path to scan (e.g. ``backend/app``).

    Returns:
        The number of match lines printed by ``git grep``. ``-I`` excludes
        binary files. Exit code 1 ("no matches") yields ``0``.

    Raises:
        RuntimeError: If ``git`` cannot be started, or ``git grep`` exits
            with a status other than 0 or 1.
    """
    cmd = ["git", "grep", "-nIP", CJK_PATTERN, "--", scoped_path]
    try:
        # Capture raw bytes: matched lines are echoed in whatever encoding the
        # source files use, so decoding them could fail, and only the number
        # of output records matters here.
        proc = subprocess.run(
            cmd,
            cwd=repo_root,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as exc:
        raise RuntimeError(
            f"git grep could not be started for {scoped_path}: {exc}"
        ) from exc
    if proc.returncode not in (0, 1):
        detail = proc.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"git grep failed (exit {proc.returncode}) for {scoped_path}: "
            f"{detail}"
        )
    # Split on "\n" only: str.splitlines() would also break on form feeds,
    # lone carriage returns, U+2028, etc. inside a matched line and overcount.
    return sum(1 for record in proc.stdout.split(b"\n") if record)
|
|
|
|
|
|
def read_baseline(baseline_path: Path) -> dict[str, int]:
    """Parse the baseline file and return ``{scoped_path: count}``.

    Blank lines and lines starting with ``#`` are ignored; every other line
    must be ``<path>\\t<count>`` with a non-empty path and an ASCII-decimal
    count.

    Args:
        baseline_path: Absolute path to the baseline file.

    Returns:
        A dict keyed by scoped path with non-negative integer counts.

    Raises:
        BaselineError: If the file is missing, unreadable, not valid UTF-8,
            or contains a malformed line.
    """
    try:
        text = baseline_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise BaselineError(
            f"{baseline_path}: missing or malformed; "
            f"refresh via: {REFRESH_COMMAND}"
        ) from exc
    counts: dict[str, int] = {}
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        if not line or line.startswith("#"):
            continue
        path, tab, count_str = line.partition("\t")
        # isascii() guards against characters such as superscript digits that
        # satisfy str.isdigit() but make int() raise ValueError.
        well_formed = (
            bool(tab)
            and bool(path)
            and count_str.isascii()
            and count_str.isdigit()
        )
        if not well_formed:
            raise BaselineError(
                f"{baseline_path}: malformed line {raw_line!r}; "
                f"expected '<path>\\t<count>'"
            )
        counts[path] = int(count_str)
    return counts
|
|
|
|
|
|
def write_baseline(baseline_path: Path, counts: dict[str, int]) -> None:
    """Atomically write the baseline file with sorted entries.

    The contents are written to a sibling ``.tmp`` file and then moved over
    ``baseline_path`` with ``os.replace``. Line endings are always ``\\n`` so
    the committed file is byte-identical regardless of the platform that
    regenerated it.

    Args:
        baseline_path: Target file path; parent directories are created.
        counts: Per-path baseline counts; keys are written in lexicographic
            order, one ``<path>\\t<count>`` line each.

    Raises:
        OSError: If the file cannot be written or moved into place. The
            temporary file is removed before the error propagates.
    """
    header = (
        "# Per-path CJK baseline for the i18n CI guard.\n"
        "# Format: <path>\\t<count>. Sorted lexicographically.\n"
        f"# Refresh via: {REFRESH_COMMAND}\n"
    )
    # One terminated line per entry, so an empty mapping adds no blank line.
    body = "".join(f"{path}\t{counts[path]}\n" for path in sorted(counts))
    baseline_path.parent.mkdir(parents=True, exist_ok=True)
    tmp = baseline_path.with_suffix(baseline_path.suffix + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8", newline="\n") as handle:
            handle.write(header + body)
        os.replace(tmp, baseline_path)
    except BaseException:
        # Best-effort cleanup so a failed refresh leaves no stray temp file;
        # the original error is what the caller needs to see.
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def _format_locale_finding(key: str, line_no: int, snippet: str) -> str:
    """Render one catalogue finding as ``<file>:<line>: cjk-in-en: <key> = <snippet>``."""
    location = f"{EN_JSON_REL_PATH}:{line_no}"
    return f"{location}: cjk-in-en: {key} = {snippet}"
|
|
|
|
|
|
def _format_regression_line(path: str, baseline: int, current: int) -> str:
|
|
delta = current - baseline
|
|
sign = "+" if delta > 0 else ""
|
|
return (
|
|
f"{path}: cjk-regression: baseline={baseline} "
|
|
f"current={current} delta={sign}{delta}"
|
|
)
|
|
|
|
|
|
def _check_locale_catalogue(repo_root: Path) -> bool:
    """Report CJK findings in the English catalogue; return True when clean."""
    en_json_path = repo_root / EN_JSON_REL_PATH
    if not en_json_path.exists():
        print(f"{EN_JSON_REL_PATH}: missing catalogue file", file=sys.stderr)
        return False
    try:
        findings = scan_locale_cjk(en_json_path)
    except json.JSONDecodeError as exc:
        print(
            f"{EN_JSON_REL_PATH}: invalid JSON: {exc.msg}",
            file=sys.stderr,
        )
        return False
    except (OSError, UnicodeDecodeError) as exc:
        # An unreadable or non-UTF-8 catalogue is a failed check, not a crash:
        # the baseline check must still get its turn.
        print(
            f"{EN_JSON_REL_PATH}: unreadable catalogue file: {exc}",
            file=sys.stderr,
        )
        return False
    if not findings:
        return True
    for key, line_no, snippet in findings:
        print(_format_locale_finding(key, line_no, snippet), file=sys.stderr)
    print(f"{len(findings)} issues", file=sys.stderr)
    return False


def _check_baseline_counts(repo_root: Path, baseline_path: Path) -> str | None:
    """Compare per-path counts to the baseline.

    Returns the success summary line, or ``None`` after reporting a failure
    (bad baseline file, ``git grep`` error, or a regression) on stderr.
    """
    try:
        baseline = read_baseline(baseline_path)
    except BaselineError as exc:
        print(str(exc), file=sys.stderr)
        return None

    try:
        current_counts = {
            path: count_path_cjk(repo_root, path) for path in SCOPED_PATHS
        }
    except RuntimeError as exc:
        print(f"git grep failed: {exc}", file=sys.stderr)
        return None

    # A scoped path absent from the baseline is held to a baseline of zero.
    regressions = [
        _format_regression_line(path, baseline.get(path, 0), current_counts[path])
        for path in SCOPED_PATHS
        if current_counts[path] > baseline.get(path, 0)
    ]
    if regressions:
        for line in regressions:
            print(line, file=sys.stderr)
        print(REFRESH_HINT, file=sys.stderr)
        return None

    per_path = ", ".join(
        f"{path}={current_counts[path]}<={baseline.get(path, 0)}"
        for path in SCOPED_PATHS
    )
    return f"OK per-path counts within baseline ({per_path})"


def run_check(repo_root: Path, baseline_path: Path) -> int:
    """Run both guard checks and return the script exit code.

    The catalogue check runs first and the baseline check always runs after
    it, whatever the first check's outcome. Failures are reported on stderr;
    the ``OK`` summary lines are printed to stdout only when both checks
    pass.

    Args:
        repo_root: Working-tree root passed to ``git grep``.
        baseline_path: Path to the baseline file.

    Returns:
        ``0`` when both checks pass, ``1`` otherwise.
    """
    locale_ok = _check_locale_catalogue(repo_root)
    baseline_summary = _check_baseline_counts(repo_root, baseline_path)
    if not locale_ok or baseline_summary is None:
        return 1
    print("OK locales/en.json is CJK-clean")
    print(baseline_summary)
    return 0
|
|
|
|
|
|
def update_baseline(repo_root: Path, baseline_path: Path) -> int:
    """Refresh ``baseline_path`` with current per-path counts.

    Args:
        repo_root: Working-tree root passed to ``git grep``.
        baseline_path: Target baseline file path; created if missing.

    Returns:
        ``0`` on success. ``1`` when the scan or the write fails; the cause
        is reported on stderr and the existing baseline is left untouched by
        a failed scan.
    """
    try:
        counts = {
            path: count_path_cjk(repo_root, path) for path in SCOPED_PATHS
        }
    except RuntimeError as exc:
        print(f"git grep failed: {exc}", file=sys.stderr)
        return 1
    try:
        write_baseline(baseline_path, counts)
    except OSError as exc:
        print(
            f"{baseline_path}: unable to write baseline: {exc}",
            file=sys.stderr,
        )
        return 1
    print(f"baseline updated: {baseline_path}")
    for path in sorted(counts):
        print(f"  {path}\t{counts[path]}")
    return 0
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the guard script.

    Options: ``--update-baseline`` (flag), ``--baseline`` and ``--repo-root``
    (both optional paths defaulting to ``None``).
    """
    summary = (
        "PR-time guard: fail when locales/en.json contains CJK or when "
        "backend/app + frontend/src CJK match counts exceed the "
        "committed baseline."
    )
    cli = argparse.ArgumentParser(prog="i18n_cjk_guard", description=summary)
    cli.add_argument(
        "--update-baseline",
        action="store_true",
        help="overwrite the baseline file with current counts and exit 0",
    )
    # The two path-valued options share every setting except their help text.
    path_options = {
        "--baseline": (
            f"path to the baseline file (default: {DEFAULT_BASELINE_REL_PATH})"
        ),
        "--repo-root": (
            "repository root (default: detected via "
            "`git rev-parse --show-toplevel`)"
        ),
    }
    for flag, help_text in path_options.items():
        cli.add_argument(flag, type=Path, default=None, help=help_text)
    return cli
|
|
|
|
|
|
def _detect_repo_root(explicit: Path | None) -> Path:
|
|
if explicit is not None:
|
|
return explicit.resolve()
|
|
proc = subprocess.run(
|
|
["git", "rev-parse", "--show-toplevel"],
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.PIPE,
|
|
text=True,
|
|
)
|
|
if proc.returncode != 0:
|
|
raise RuntimeError(
|
|
f"unable to detect repository root: {proc.stderr.strip()}"
|
|
)
|
|
return Path(proc.stdout.strip())
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, dispatch, and return the exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``1`` when the repository root cannot be determined; otherwise the
        result of the baseline refresh (``--update-baseline``) or of the
        guard checks.
    """
    options = _build_parser().parse_args(argv)
    try:
        root = _detect_repo_root(options.repo_root)
    except RuntimeError as exc:
        print(str(exc), file=sys.stderr)
        return 1
    if options.baseline is None:
        chosen = root / DEFAULT_BASELINE_REL_PATH
    else:
        chosen = options.baseline
    baseline_file = chosen.resolve()
    handler = update_baseline if options.update_baseline else run_check
    return handler(root, baseline_file)
|
|
|
|
|
|
# Script entry: exit the interpreter with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|