MicroFish/.kiro/specs/i18n-e2e-english-verification/audit/scripts/check_parity.py

129 lines
4.0 KiB
Python
Executable File

#!/usr/bin/env python3
"""Diff locales/en.json against locales/zh.json and emit parity.txt.
Three labelled blocks are written:
* `[missing-keys]` - keys present on one side but not the other.
* `[cjk-in-en]` - EN catalogue values that contain CJK characters.
* `[identical-values]` - keys whose EN and ZH value are identical AND the
value is non-empty AND has more than two ASCII words.
These are review-needed signals, not gaps.
Run from the repository root.
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
from typing import Dict, Iterator, Tuple
CJK_RANGE = re.compile(r"[一-鿿]")
def flatten(d: Dict[str, object], prefix: str = "") -> Iterator[Tuple[str, object]]:
"""Recursively yield (dotted-key, value) pairs from a nested dict."""
for key, value in d.items():
path = f"{prefix}.{key}" if prefix else key
if isinstance(value, dict):
yield from flatten(value, path)
else:
yield path, value
def is_non_trivial_english_prose(value: object) -> bool:
"""Heuristic for the identical-value 'review-needed' signal.
True when:
* value is a string,
* value is non-empty after strip,
* value contains more than two whitespace-separated tokens,
* value contains no CJK characters (otherwise it's just an untranslated
ZH original which is not a review-needed signal here).
"""
if not isinstance(value, str):
return False
text = value.strip()
if not text:
return False
if CJK_RANGE.search(text):
return False
return len(text.split()) > 2
def main(argv: list[str]) -> int:
if len(argv) != 2:
print(f"usage: {argv[0]} <sha-dir>", file=sys.stderr)
return 64
sha_dir = Path(argv[1])
sha_dir.mkdir(parents=True, exist_ok=True)
out_path = sha_dir / "parity.txt"
en_path = Path("locales/en.json")
zh_path = Path("locales/zh.json")
if not en_path.exists() or not zh_path.exists():
print(f"missing locale files: {en_path}, {zh_path}", file=sys.stderr)
return 1
en = json.loads(en_path.read_text(encoding="utf-8"))
zh = json.loads(zh_path.read_text(encoding="utf-8"))
en_flat = dict(flatten(en))
zh_flat = dict(flatten(zh))
en_only = sorted(set(en_flat) - set(zh_flat))
zh_only = sorted(set(zh_flat) - set(en_flat))
cjk_in_en = []
for key, value in sorted(en_flat.items()):
if isinstance(value, str) and CJK_RANGE.search(value):
cjk_in_en.append((key, value))
identical = []
for key in sorted(set(en_flat) & set(zh_flat)):
en_val = en_flat[key]
zh_val = zh_flat[key]
if en_val == zh_val and is_non_trivial_english_prose(en_val):
identical.append((key, en_val))
lines: list[str] = []
lines.append(f"# Locale parity for HEAD")
lines.append(f"# en keys: {len(en_flat)}")
lines.append(f"# zh keys: {len(zh_flat)}")
lines.append("")
lines.append("[missing-keys]")
if not en_only and not zh_only:
lines.append("# (none)")
for key in en_only:
lines.append(f"en-only: {key}")
for key in zh_only:
lines.append(f"zh-only: {key}")
lines.append("")
lines.append("[cjk-in-en]")
if not cjk_in_en:
lines.append("# (none)")
for key, value in cjk_in_en:
snippet = value if len(value) <= 80 else value[:77] + "..."
lines.append(f"{key}: {snippet}")
lines.append("")
lines.append("[identical-values]")
if not identical:
lines.append("# (none)")
for key, value in identical:
snippet = value if len(value) <= 80 else value[:77] + "..."
lines.append(f"{key}: {snippet}")
lines.append("")
out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
print(
f" parity.txt written: missing={len(en_only) + len(zh_only)}, "
f"cjk-in-en={len(cjk_in_en)}, identical-values={len(identical)}"
)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv))