# MicroFish/scripts/ci/tests/test_i18n_cjk_guard.py
# (file listing metadata: 711 lines, 25 KiB, Python)

"""Unit and integration tests for ``scripts/ci/i18n_cjk_guard.py``.

Stdlib-only tests using ``unittest``. Run from the repository root with::

    python -m unittest scripts/ci/tests/test_i18n_cjk_guard.py

or as a script::

    python scripts/ci/tests/test_i18n_cjk_guard.py
"""
from __future__ import annotations
import json
import os
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
# Directory containing this test file.
_HERE = Path(__file__).resolve().parent
# Parent of the tests directory; ``i18n_cjk_guard.py`` is loaded from here
# (see the script path built in ``CliSmokeTests``).
_GUARD_DIR = _HERE.parent
# Put the guard's directory first on the import path so the plain
# ``import i18n_cjk_guard`` below resolves without a package install.
sys.path.insert(0, str(_GUARD_DIR))
# Must come after the ``sys.path`` tweak, hence the E402 suppression.
import i18n_cjk_guard as guard  # noqa: E402
def _git(repo: Path, *args: str) -> subprocess.CompletedProcess[str]:
    """Execute ``git <args>`` with ``repo`` as the working directory.

    Both output streams are captured as text. Because ``check=True`` is
    passed, a non-zero exit status raises ``subprocess.CalledProcessError``;
    otherwise the ``CompletedProcess`` is handed back to the caller.
    """
    command = ["git", *args]
    return subprocess.run(
        command,
        cwd=repo,
        capture_output=True,  # same as stdout=PIPE plus stderr=PIPE
        text=True,
        check=True,
    )
def _make_repo(tmp: Path) -> Path:
    """Turn ``tmp`` into a fresh git repository and return ``tmp``.

    The repository starts on branch ``main`` and receives a repo-local
    ``user.email`` / ``user.name`` so that commits have an author identity.
    """
    setup_commands = (
        ("init", "-q", "-b", "main"),
        ("config", "user.email", "test@example.com"),
        ("config", "user.name", "Test"),
    )
    for command in setup_commands:
        _git(tmp, *command)
    return tmp
def _commit_file(repo: Path, rel: str, content: str | bytes) -> None:
    """Create ``repo/rel`` with ``content`` and record it in its own commit.

    Missing parent directories are created. ``str`` content is written as
    UTF-8 text; anything else is written verbatim as bytes. The commit
    message is ``add <rel>``.
    """
    destination = repo / rel
    destination.parent.mkdir(parents=True, exist_ok=True)
    if not isinstance(content, str):
        destination.write_bytes(content)
    else:
        destination.write_text(content, encoding="utf-8")
    # ``--`` keeps git from interpreting the path as an option or revision.
    _git(repo, "add", "--", rel)
    _git(repo, "commit", "-q", "-m", f"add {rel}")
class ScanLocaleCjkTests(unittest.TestCase):
    """Exercise ``scan_locale_cjk`` against small on-disk catalogues.

    Each test serialises a dict to a temporary ``en.json`` and inspects the
    findings the guard reports for it (one per CJK leaf string).
    """

    @staticmethod
    def _write_en_json(directory: str, payload: dict, **dump_kwargs) -> Path:
        """Dump ``payload`` to ``<directory>/en.json`` and return the path.

        ``dump_kwargs`` are forwarded to ``json.dumps`` so each test keeps
        control over indentation and ASCII escaping.
        """
        catalogue = Path(directory) / "en.json"
        catalogue.write_text(
            json.dumps(payload, **dump_kwargs), encoding="utf-8"
        )
        return catalogue

    def test_clean_catalogue_returns_empty_list(self) -> None:
        """An all-ASCII catalogue produces no findings."""
        payload = {"common": {"confirm": "Confirm", "cancel": "Cancel"}}
        with tempfile.TemporaryDirectory() as workdir:
            catalogue = self._write_en_json(workdir, payload, indent=2)
            self.assertEqual(guard.scan_locale_cjk(catalogue), [])

    def test_planted_cjk_returns_one_finding(self) -> None:
        """A single CJK value is reported with its dotted key and snippet."""
        payload = {"common": {"confirm": "Confirm", "cancel": "取消"}}
        with tempfile.TemporaryDirectory() as workdir:
            # ensure_ascii=False keeps the raw CJK characters in the file
            # instead of \uXXXX escapes.
            catalogue = self._write_en_json(
                workdir, payload, indent=2, ensure_ascii=False
            )
            findings = guard.scan_locale_cjk(catalogue)
            self.assertEqual(len(findings), 1)
            key, line_no, snippet = findings[0]
            self.assertEqual(key, "common.cancel")
            self.assertGreaterEqual(line_no, 1)
            self.assertIn("取消", snippet)

    def test_long_value_is_truncated(self) -> None:
        """The reported snippet never exceeds ``SNIPPET_MAX_LEN``."""
        oversized = "前置" + ("x" * 200)
        with tempfile.TemporaryDirectory() as workdir:
            catalogue = self._write_en_json(
                workdir, {"k": oversized}, ensure_ascii=False
            )
            findings = guard.scan_locale_cjk(catalogue)
            self.assertEqual(len(findings), 1)
            snippet = findings[0][2]
            self.assertLessEqual(len(snippet), guard.SNIPPET_MAX_LEN)
class CountPathCjkTests(unittest.TestCase):
    """``count_path_cjk`` shells out to ``git grep -nIP``.

    Every test builds a throwaway git repository, commits (or deliberately
    does not commit) a fixture under ``src/`` and checks the count reported
    for that path.
    """

    def test_returns_zero_for_empty_match(self) -> None:
        """A tracked file without CJK contributes nothing to the count."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            _commit_file(repo, "src/a.txt", "hello world\n")
            self.assertEqual(guard.count_path_cjk(repo, "src"), 0)

    def test_counts_planted_cjk_lines(self) -> None:
        """Each line containing CJK is counted once, however many chars."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            _commit_file(
                repo,
                "src/a.py",
                # The last line must carry a CJK character; with an empty
                # string literal only two lines would match.
                "# 一\nprint('hi')\n# 二三\nx = '四'\n",
            )
            # Three lines contain CJK: # 一 ; # 二三 ; x = '四'.
            self.assertEqual(guard.count_path_cjk(repo, "src"), 3)

    def test_skips_binary_files(self) -> None:
        """Binary blobs are ignored even when they embed CJK byte sequences."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            # A "binary" blob containing CJK bytes; -I should exclude it.
            # ``\xe4\xb8\x80`` is the UTF-8 encoding of 一.
            _commit_file(
                repo,
                "src/blob.bin",
                b"\x00\x01\x02\xe4\xb8\x80\x00\xff",
            )
            self.assertEqual(guard.count_path_cjk(repo, "src"), 0)

    def test_skips_untracked_files(self) -> None:
        """Files that are not tracked by git are not counted."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            _commit_file(repo, "src/.gitkeep", "")
            # The untracked file has to contain CJK, otherwise the zero
            # count below would hold even if untracked files were scanned.
            (repo / "src" / "untracked.py").write_text(
                "x = '四'\n", encoding="utf-8"
            )
            self.assertEqual(guard.count_path_cjk(repo, "src"), 0)
class BaselineRoundTripTests(unittest.TestCase):
    """``read_baseline`` and ``write_baseline`` round-trip cleanly."""

    def test_round_trip(self) -> None:
        """Counts written to disk are read back unchanged."""
        with tempfile.TemporaryDirectory() as tmp:
            path = Path(tmp) / "baseline.txt"
            counts = {"backend/app": 2792, "frontend/src": 902}
            guard.write_baseline(path, counts)
            # Read explicitly as UTF-8, like every other read in this
            # module, so the check does not depend on the locale encoding.
            self.assertTrue(path.read_text(encoding="utf-8").endswith("\n"))
            self.assertEqual(guard.read_baseline(path), counts)

    def test_sorted_lexicographically_and_single_trailing_newline(self) -> None:
        """Data lines are sorted by path and the file ends with one newline."""
        with tempfile.TemporaryDirectory() as tmp:
            path = Path(tmp) / "baseline.txt"
            # Deliberately passed out of order to exercise the sorting.
            guard.write_baseline(path, {"frontend/src": 1, "backend/app": 2})
            text = path.read_text(encoding="utf-8")
            # Lines starting with ``#`` are header/comment lines, not data.
            data_lines = [
                line for line in text.splitlines() if not line.startswith("#")
            ]
            self.assertEqual(
                data_lines,
                ["backend/app\t2", "frontend/src\t1"],
            )
            self.assertTrue(text.endswith("\n"))
            self.assertFalse(text.endswith("\n\n"))

    def test_missing_file_raises_baseline_error(self) -> None:
        """Reading a non-existent baseline raises ``BaselineError``."""
        with tempfile.TemporaryDirectory() as tmp:
            path = Path(tmp) / "missing.txt"
            with self.assertRaises(guard.BaselineError):
                guard.read_baseline(path)

    def test_malformed_line_raises_baseline_error(self) -> None:
        """A data line that is not tab-separated is rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            path = Path(tmp) / "baseline.txt"
            # Path and count are separated by a space here, whereas
            # ``write_baseline`` emits a tab between them.
            path.write_text(
                "# header\nbackend/app 100\n", encoding="utf-8"
            )
            with self.assertRaises(guard.BaselineError):
                guard.read_baseline(path)
class RunCheckEndToEndTests(unittest.TestCase):
    """End-to-end test of ``run_check`` against a synthetic repo."""

    def _make_full_repo(
        self,
        tmp: Path,
        *,
        en_json: dict,
        backend_lines: int,
        frontend_lines: int,
        zh_json: dict | None = None,
    ) -> tuple[Path, Path]:
        """Build a committed fixture repository under ``tmp``.

        Args:
            tmp: Directory to initialise as the git repository.
            en_json: Payload committed as ``locales/en.json``.
            backend_lines: Number of CJK-bearing comment lines committed to
                ``backend/app/x.py``; ``0`` commits an empty ``.gitkeep``
                instead.
            frontend_lines: Same, for ``frontend/src/x.js``.
            zh_json: Payload for ``locales/zh.json``; defaults to
                ``en_json`` when ``None``.

        Returns:
            ``(repo, baseline_path)``. The baseline file itself is NOT
            written; each test decides whether and what to write.
        """
        repo = _make_repo(tmp)
        _commit_file(
            repo,
            "locales/en.json",
            json.dumps(en_json, indent=2, ensure_ascii=False),
        )
        zh_payload = zh_json if zh_json is not None else en_json
        _commit_file(
            repo,
            "locales/zh.json",
            json.dumps(zh_payload, indent=2, ensure_ascii=False),
        )
        if backend_lines:
            content = "\n".join(f"# 中{i}" for i in range(backend_lines)) + "\n"
            _commit_file(repo, "backend/app/x.py", content)
        else:
            _commit_file(repo, "backend/app/.gitkeep", "")
        if frontend_lines:
            content = "\n".join(f"// 中{i}" for i in range(frontend_lines)) + "\n"
            _commit_file(repo, "frontend/src/x.js", content)
        else:
            _commit_file(repo, "frontend/src/.gitkeep", "")
        baseline_path = repo / "baseline.txt"
        return repo, baseline_path

    def test_pass_within_baseline(self) -> None:
        """Counts at or below the baseline make ``run_check`` return 0."""
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=3,
                frontend_lines=2,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 5, "frontend/src": 5},
            )
            rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 0)

    def test_fail_on_locale_cjk(self) -> None:
        """CJK inside ``en.json`` makes ``run_check`` return 1."""
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "中文"},
                backend_lines=0,
                frontend_lines=0,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)

    def test_fail_on_regression_with_refresh_hint(self) -> None:
        """Exceeding the baseline fails and prints the refresh command."""
        import contextlib
        import io

        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=10,
                frontend_lines=0,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 5, "frontend/src": 0},
            )
            # Capture stderr; the context manager restores the real stream
            # even if ``run_check`` raises.
            captured_err = io.StringIO()
            with contextlib.redirect_stderr(captured_err):
                rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)
            err_text = captured_err.getvalue()
            self.assertIn("cjk-regression", err_text)
            self.assertIn(
                "python scripts/ci/i18n_cjk_guard.py --update-baseline",
                err_text,
            )

    def test_missing_en_json_fails(self) -> None:
        """A repository without ``locales/en.json`` fails the check."""
        with tempfile.TemporaryDirectory() as tmp:
            # Built by hand (not via ``_make_full_repo``) so that no locale
            # catalogue is committed at all.
            repo = _make_repo(Path(tmp))
            _commit_file(repo, "backend/app/.gitkeep", "")
            _commit_file(repo, "frontend/src/.gitkeep", "")
            baseline_path = repo / "baseline.txt"
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)

    def test_missing_baseline_fails(self) -> None:
        """An absent baseline file fails the check."""
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=0,
                frontend_lines=0,
            )
            # Do not write the baseline.
            self.assertFalse(baseline_path.exists())
            rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)
class UpdateBaselineTests(unittest.TestCase):
    """Verify that ``update_baseline`` records the current per-path counts."""

    def test_update_then_check_passes(self) -> None:
        """A freshly written baseline lets the subsequent check pass."""
        catalogue_json = json.dumps({"k": "Confirm"}, indent=2)
        with tempfile.TemporaryDirectory() as workdir:
            repo = _make_repo(Path(workdir))
            _commit_file(repo, "locales/en.json", catalogue_json)
            _commit_file(repo, "locales/zh.json", catalogue_json)
            # Two CJK comment lines in the backend, none in the frontend.
            _commit_file(repo, "backend/app/x.py", "# 一\n# 二\n")
            _commit_file(repo, "frontend/src/.gitkeep", "")
            baseline_path = repo / "baseline.txt"

            exit_code = guard.update_baseline(repo, baseline_path)
            self.assertEqual(exit_code, 0)

            recorded = guard.read_baseline(baseline_path)
            self.assertEqual(recorded["backend/app"], 2)
            self.assertEqual(recorded["frontend/src"], 0)

            check_code = guard.run_check(repo, baseline_path)
            self.assertEqual(check_code, 0)
class CliSmokeTests(unittest.TestCase):
    """Smoke-test the guard's command line by running it as a subprocess."""

    @staticmethod
    def _run_guard(*cli_args: str) -> subprocess.CompletedProcess[str]:
        """Run the guard script with ``cli_args`` and capture its output.

        No ``check=True`` here: the tests inspect the return code themselves.
        """
        script = _GUARD_DIR / "i18n_cjk_guard.py"
        return subprocess.run(
            [sys.executable, str(script), *cli_args],
            capture_output=True,
            text=True,
        )

    def test_help_flag_exits_zero(self) -> None:
        """``--help`` succeeds and mentions every documented flag."""
        completed = self._run_guard("--help")
        self.assertEqual(completed.returncode, 0)
        documented_flags = ("--update-baseline", "--baseline", "--repo-root")
        for flag in documented_flags:
            self.assertIn(flag, completed.stdout)

    def test_unknown_flag_exits_nonzero(self) -> None:
        """An unrecognised flag yields a non-zero exit status."""
        completed = self._run_guard("--no-such-flag")
        self.assertNotEqual(completed.returncode, 0)
class FlattenKeysTests(unittest.TestCase):
    """Check the dotted-key set ``_flatten_keys`` derives from a catalogue."""

    def test_empty_dict_returns_empty_set(self) -> None:
        """An empty catalogue flattens to an empty set."""
        flattened = guard._flatten_keys({})
        self.assertEqual(flattened, set())

    def test_flat_dict_returns_top_level_keys(self) -> None:
        """Top-level leaves are reported under their own names."""
        flattened = guard._flatten_keys({"a": "v", "b": "w"})
        self.assertEqual(flattened, {"a", "b"})

    def test_nested_dict_uses_dot_separator(self) -> None:
        """Nested levels are joined with ``.``."""
        flattened = guard._flatten_keys({"a": {"b": {"c": "v"}}})
        self.assertEqual(flattened, {"a.b.c"})

    def test_scalar_leaves_count_as_keys(self) -> None:
        """Non-string scalars are leaves just like strings."""
        # Requirement 1.5: scalar leaves (number, bool, null) and string
        # leaves are treated identically for parity purposes.
        catalogue = {
            "n": 42,
            "b": True,
            "s": "x",
            "z": None,
            "f": 3.14,
        }
        flattened = guard._flatten_keys(catalogue)
        self.assertEqual(flattened, {"n", "b", "s", "z", "f"})

    def test_dict_leaf_does_not_become_a_key(self) -> None:
        """A parent with children is not itself emitted as a key."""
        # Only non-dict leaves emit keys; the parent path is NOT itself
        # emitted when it has children.
        flattened = guard._flatten_keys({"parent": {"child": "v"}})
        self.assertNotIn("parent", flattened)
        self.assertIn("parent.child", flattened)
class LocateKeyLineTests(unittest.TestCase):
    """Check the 1-based line number ``_locate_key_line`` reports for a key."""

    def test_returns_line_number_of_quoted_leaf_segment(self) -> None:
        """The line holding the quoted last key segment is returned."""
        lines = [
            "{",
            ' "a": {',
            ' "missingKey": "v"',
            " }",
            "}",
        ]
        located = guard._locate_key_line(lines, "a.missingKey")
        self.assertEqual(located, 3)

    def test_first_match_wins(self) -> None:
        """With duplicate keys the earliest occurrence is reported."""
        lines = [
            "{",
            ' "k": "first"',
            ' "k": "second"',
            "}",
        ]
        located = guard._locate_key_line(lines, "k")
        self.assertEqual(located, 2)

    def test_missing_key_falls_back_to_line_one(self) -> None:
        """A key that appears nowhere resolves to line 1."""
        lines = ["{", ' "other": "v"', "}"]
        located = guard._locate_key_line(lines, "absent")
        self.assertEqual(located, 1)
class FormatParityFindingTests(unittest.TestCase):
    """Pin the exact text layout produced by ``_format_parity_finding``."""

    def test_en_only_layout(self) -> None:
        """An ``en-only`` finding renders as ``path:line: parity-en-only: key``."""
        expected = "locales/en.json:17: parity-en-only: common.foo"
        rendered = guard._format_parity_finding(
            "locales/en.json", 17, "common.foo", "en-only"
        )
        self.assertEqual(rendered, expected)

    def test_zh_only_layout(self) -> None:
        """A ``zh-only`` finding renders as ``path:line: parity-zh-only: key``."""
        expected = "locales/zh.json:5: parity-zh-only: log.api.bar"
        rendered = guard._format_parity_finding(
            "locales/zh.json", 5, "log.api.bar", "zh-only"
        )
        self.assertEqual(rendered, expected)
class RunParityCheckTests(unittest.TestCase):
    """Exercise ``run_parity_check`` on plain directories with two catalogues.

    The fixtures are ordinary directories (no git repository); the check is
    given the directory and the test inspects the returned result object's
    ``passed``, ``failure_lines`` and ``success_summary`` attributes.
    """

    def _write_catalogues(
        self,
        repo: Path,
        en_payload: dict,
        zh_payload: dict,
    ) -> None:
        """Write ``locales/en.json`` and ``locales/zh.json`` under ``repo``."""
        locales_dir = repo / "locales"
        locales_dir.mkdir(parents=True, exist_ok=True)
        catalogues = (("en.json", en_payload), ("zh.json", zh_payload))
        for filename, payload in catalogues:
            serialised = json.dumps(payload, indent=2, ensure_ascii=False)
            (locales_dir / filename).write_text(
                serialised + "\n", encoding="utf-8"
            )

    def test_passes_when_keys_match(self) -> None:
        """Identical key sets pass and report the per-side key count."""
        shared = {"common": {"a": "A", "b": "B"}, "k": "v"}
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            self._write_catalogues(repo, shared, shared)
            result = guard.run_parity_check(repo)
            self.assertTrue(result.passed)
            self.assertEqual(result.failure_lines, [])
            self.assertIsNotNone(result.success_summary)
            self.assertIn(
                "OK locale-parity:", result.success_summary or ""
            )
            # Three flattened keys: common.a, common.b, k.
            self.assertIn("3 keys per side", result.success_summary or "")

    def test_fails_on_en_only_key(self) -> None:
        """A key present only in ``en.json`` is reported as ``en-only``."""
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            self._write_catalogues(
                repo,
                {"k": "v", "extra": "only-en"},
                {"k": "v"},
            )
            result = guard.run_parity_check(repo)
            self.assertFalse(result.passed)
            mentions_extra = any(
                "parity-en-only: extra" in line
                for line in result.failure_lines
            )
            self.assertTrue(mentions_extra, result.failure_lines)
            summary_line = result.failure_lines[-1]
            self.assertEqual(summary_line, "parity: en-only=1, zh-only=0")
            self.assertIsNone(result.success_summary)

    def test_fails_on_zh_only_key(self) -> None:
        """A key present only in ``zh.json`` is reported as ``zh-only``."""
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            self._write_catalogues(
                repo,
                {"k": "v"},
                {"k": "v", "extra": "only-zh"},
            )
            result = guard.run_parity_check(repo)
            self.assertFalse(result.passed)
            mentions_extra = any(
                "parity-zh-only: extra" in line
                for line in result.failure_lines
            )
            self.assertTrue(mentions_extra, result.failure_lines)
            summary_line = result.failure_lines[-1]
            self.assertEqual(summary_line, "parity: en-only=0, zh-only=1")

    def test_fails_on_two_sided_divergence_with_en_first(self) -> None:
        """With divergence on both sides, en-only lines precede zh-only."""

        def categorise(line: str) -> str:
            """Classify a failure line by the parity token it carries."""
            if "parity-en-only" in line:
                return "en-only"
            if "parity-zh-only" in line:
                return "zh-only"
            return "summary"

        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            self._write_catalogues(
                repo,
                {"a": "v", "z": "v", "shared": "v"},
                {"b": "v", "y": "v", "shared": "v"},
            )
            result = guard.run_parity_check(repo)
            self.assertFalse(result.passed)
            categories = [categorise(line) for line in result.failure_lines]
            # All en-only lines come before all zh-only lines, and the
            # summary is last.
            expected_order = [
                "en-only", "en-only",
                "zh-only", "zh-only",
                "summary",
            ]
            self.assertEqual(
                categories, expected_order, result.failure_lines
            )
            # Within each side keys appear lexicographically.
            en_only_lines = [
                line for line in result.failure_lines
                if "parity-en-only" in line
            ]
            zh_only_lines = [
                line for line in result.failure_lines
                if "parity-zh-only" in line
            ]
            self.assertTrue(en_only_lines[0].endswith(": a"))
            self.assertTrue(en_only_lines[1].endswith(": z"))
            self.assertTrue(zh_only_lines[0].endswith(": b"))
            self.assertTrue(zh_only_lines[1].endswith(": y"))
            summary_line = result.failure_lines[-1]
            self.assertEqual(summary_line, "parity: en-only=2, zh-only=2")

    def test_passes_with_scalar_leaves_at_same_path(self) -> None:
        """Differing scalar values at matching paths are not a divergence."""
        # Requirement 1.5: scalar leaves at the same dotted path on both
        # sides do not count as a parity divergence.
        en_payload = {"flag": True, "count": 42, "label": "x", "missing": None}
        zh_payload = {"flag": False, "count": 7, "label": "y", "missing": None}
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            self._write_catalogues(repo, en_payload, zh_payload)
            result = guard.run_parity_check(repo)
            self.assertTrue(result.passed)

    def test_missing_zh_catalogue_fails(self) -> None:
        """An absent ``zh.json`` surfaces as a ``parity-error`` line."""
        with tempfile.TemporaryDirectory() as workdir:
            repo = Path(workdir)
            locales_dir = repo / "locales"
            locales_dir.mkdir(parents=True)
            (locales_dir / "en.json").write_text(
                '{"k": "v"}\n', encoding="utf-8"
            )
            # zh.json deliberately not written.
            result = guard.run_parity_check(repo)
            self.assertFalse(result.passed)
            reports_missing_zh = any(
                "locales/zh.json" in line and "parity-error" in line
                for line in result.failure_lines
            )
            self.assertTrue(reports_missing_zh, result.failure_lines)
class RunCheckParityCompositionTests(unittest.TestCase):
    """End-to-end: ``run_check`` composes CJK, ratchet, and parity."""

    def _make_repo(
        self,
        tmp: Path,
        *,
        en_json: dict,
        zh_json: dict | None = None,
        backend_lines: int = 0,
    ) -> tuple[Path, Path]:
        """Build a committed fixture repository under ``tmp``.

        Args:
            tmp: Directory to initialise as the git repository.
            en_json: Payload committed as ``locales/en.json``.
            zh_json: Payload for ``locales/zh.json``; defaults to
                ``en_json`` when ``None``.
            backend_lines: Number of CJK-bearing comment lines committed to
                ``backend/app/x.py``; ``0`` commits an empty ``.gitkeep``.

        Returns:
            ``(repo, baseline_path)``. The baseline file is not written.
        """
        # NOTE: inside a method body the bare name ``_make_repo`` resolves
        # to the module-level helper, not to this method (class-scope names
        # are only reachable through ``self``), so this is not recursion.
        repo = _make_repo(tmp)
        _commit_file(
            repo,
            "locales/en.json",
            json.dumps(en_json, indent=2, ensure_ascii=False),
        )
        zh_payload = zh_json if zh_json is not None else en_json
        _commit_file(
            repo,
            "locales/zh.json",
            json.dumps(zh_payload, indent=2, ensure_ascii=False),
        )
        if backend_lines:
            content = (
                "\n".join(f"# 中{i}" for i in range(backend_lines)) + "\n"
            )
            _commit_file(repo, "backend/app/x.py", content)
        else:
            _commit_file(repo, "backend/app/.gitkeep", "")
        _commit_file(repo, "frontend/src/.gitkeep", "")
        baseline_path = repo / "baseline.txt"
        return repo, baseline_path

    def test_clean_repo_emits_three_success_summaries(self) -> None:
        """A clean repository prints one OK line per check on stdout."""
        import contextlib
        import io

        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            # The context manager restores the real stdout even if
            # ``run_check`` raises.
            captured_out = io.StringIO()
            with contextlib.redirect_stdout(captured_out):
                rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 0)
            stdout = captured_out.getvalue()
            self.assertIn("OK locales/en.json is CJK-clean", stdout)
            self.assertIn("OK per-path counts within baseline", stdout)
            self.assertIn("OK locale-parity:", stdout)

    def test_no_short_circuit_on_combined_failures(self) -> None:
        """A CJK failure does not prevent the parity failure from printing."""
        import contextlib
        import io

        # Plant CJK in en.json AND a parity divergence so that BOTH
        # the existing CJK-clean check and the new parity check fail
        # in the same run. The orchestrator must run both blocks
        # without short-circuiting; both failure tokens must surface
        # in stderr together.
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_repo(
                Path(tmp),
                en_json={"k": "Confirm", "extra": "中文"},
                zh_json={"k": "Confirm"},
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            captured_err = io.StringIO()
            with contextlib.redirect_stderr(captured_err):
                rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)
            err = captured_err.getvalue()
            # Both check categories must surface.
            self.assertIn("cjk-in-en", err)
            self.assertIn("parity-en-only: extra", err)
            self.assertIn("parity: en-only=1, zh-only=0", err)
# Allow running this file directly as a script, in addition to
# ``python -m unittest``.
if __name__ == "__main__":
    unittest.main()