From 081de636f1e9a29455ab7f2ec0eb7cbf8dfb1da6 Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Fri, 8 May 2026 00:39:34 +0000 Subject: [PATCH] ci(i18n): add cjk regression guard for every pull request Adds a stdlib-only Python script and a new GitHub Actions workflow that fail any pull request which reintroduces CJK characters into locales/en.json or which raises the total CJK match count under backend/app or frontend/src above a committed per-path baseline. The guard captures the two highest-signal checks of the larger i18n-e2e-english-verification audit so it can run on every PR with a sub-minute budget and without depending on that pipeline being on main. The committed baseline lets the codebase ratchet down toward English-only without blocking unrelated PRs on pre-existing CJK content; refresh it intentionally via the documented flag. Closes #26 --- .github/workflows/i18n-cjk-guard.yml | 26 ++ .kiro/specs/i18n-ci-guard/baseline.txt | 5 + .kiro/specs/i18n-ci-guard/design.md | 544 ++++++++++++++++++++++ .kiro/specs/i18n-ci-guard/gap-analysis.md | 169 +++++++ .kiro/specs/i18n-ci-guard/requirements.md | 189 ++++++++ .kiro/specs/i18n-ci-guard/research.md | 175 +++++++ .kiro/specs/i18n-ci-guard/spec.json | 24 + .kiro/specs/i18n-ci-guard/tasks.md | 157 +++++++ scripts/ci/i18n_cjk_guard.py | 393 ++++++++++++++++ scripts/ci/tests/test_i18n_cjk_guard.py | 358 ++++++++++++++ 10 files changed, 2040 insertions(+) create mode 100644 .github/workflows/i18n-cjk-guard.yml create mode 100644 .kiro/specs/i18n-ci-guard/baseline.txt create mode 100644 .kiro/specs/i18n-ci-guard/design.md create mode 100644 .kiro/specs/i18n-ci-guard/gap-analysis.md create mode 100644 .kiro/specs/i18n-ci-guard/requirements.md create mode 100644 .kiro/specs/i18n-ci-guard/research.md create mode 100644 .kiro/specs/i18n-ci-guard/spec.json create mode 100644 .kiro/specs/i18n-ci-guard/tasks.md create mode 100755 scripts/ci/i18n_cjk_guard.py create mode 100644 scripts/ci/tests/test_i18n_cjk_guard.py 
diff --git a/.github/workflows/i18n-cjk-guard.yml b/.github/workflows/i18n-cjk-guard.yml new file mode 100644 index 00000000..067d06b5 --- /dev/null +++ b/.github/workflows/i18n-cjk-guard.yml @@ -0,0 +1,26 @@ +name: i18n CJK Guard + +on: + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + guard: + runs-on: ubuntu-latest + timeout-minutes: 1 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Run i18n CJK guard + run: python scripts/ci/i18n_cjk_guard.py diff --git a/.kiro/specs/i18n-ci-guard/baseline.txt b/.kiro/specs/i18n-ci-guard/baseline.txt new file mode 100644 index 00000000..e92f1a6e --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/baseline.txt @@ -0,0 +1,5 @@ +# Per-path CJK baseline for the i18n CI guard. +# Format: \t. Sorted lexicographically. +# Refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline +backend/app 2792 +frontend/src 902 diff --git a/.kiro/specs/i18n-ci-guard/design.md b/.kiro/specs/i18n-ci-guard/design.md new file mode 100644 index 00000000..d694e1f6 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/design.md @@ -0,0 +1,544 @@ +# Design — i18n-ci-guard + +## Overview + +This feature installs a permanent, PR-time CI guard that blocks +regressions of the project's English-by-default state. It performs two +checks: `locales/en.json` must contain zero CJK characters, and the +total CJK match count under `backend/app/` and `frontend/src/` must not +exceed a committed per-path baseline. The guard is a single Python +script invoked by a single GitHub Actions workflow. + +**Purpose**: This feature delivers an automatic regression gate to the +i18n initiative so reviewers do not have to spot CJK reintroductions +by eye. +**Users**: Project maintainers and PR authors. 
Maintainers gain a +hard regression gate; PR authors gain a script they can run locally to +catch regressions before pushing. +**Impact**: Adds the project's first `pull_request`-triggered CI +workflow. No production source under `backend/app/`, `frontend/src/`, +or `locales/` is modified by this spec — only new files are added. + +### Goals + +- Fail any PR that introduces a CJK character into `locales/en.json`. +- Fail any PR whose CJK match count under `backend/app/` or + `frontend/src/` exceeds the committed baseline. +- Print a single actionable failure message that includes the exact + command a contributor must run if the regression is intentional. +- Run end-to-end under sixty seconds on `ubuntu-latest`. +- Be reproducible verbatim on a developer machine with Python ≥3.11 + and `git`. + +### Non-Goals + +- Re-implementing the full classification pipeline from + `.kiro/specs/i18n-e2e-english-verification/` (that work belongs to + PR #27). +- Auto-updating the baseline on `main`. +- Translating any production source to satisfy a higher baseline. The + initial baseline is recorded against `main` and only ratchets down + over time. +- Gating commits at pre-commit time. The guard is CI-only; a future + spec may wrap it in a hook. + +## Boundary Commitments + +### This Spec Owns + +- The guard script `scripts/ci/i18n_cjk_guard.py` and its CLI + contract. +- The workflow `.github/workflows/i18n-cjk-guard.yml` and its + trigger configuration. +- The baseline file `.kiro/specs/i18n-ci-guard/baseline.txt` and its + format. +- The pass/fail semantics of both checks. + +### Out of Boundary + +- Any change to files under `backend/app/`, `frontend/src/`, or + `locales/` — except `locales/en.json` if it is found to contain CJK + during initial baseline calibration (a remediation translation would + be a separate spec/PR). +- The classification heuristics in PR #27's `classify.py`. 
+- Pre-commit hooks; IDE integrations; alternative scoped paths beyond + `backend/app/` and `frontend/src/`. + +### Allowed Dependencies + +- Python ≥3.11 standard library. +- `git` (for `git grep -nIP` invocation). +- `actions/checkout@v4` and `actions/setup-python@v5` from the + GitHub Actions Marketplace. + +### Revalidation Triggers + +- Adding a third scoped path → baseline file format changes; consumers + (none today) re-check. +- Changing the regex range → audit pipeline alignment must be + re-confirmed. +- Switching from `pull_request` to `merge_group` or other event → + required-status-check rules in branch protection must be re-checked. + +## Architecture + +### Existing Architecture Analysis + +- **Repo layout**: monorepo split by runtime (`backend/`, `frontend/`) + with shared `locales/` at root. The guard scopes its scan to + `backend/app/`, `frontend/src/`, and `locales/en.json`, matching the + audit pipeline's canonical scope. +- **Existing scripts pattern**: `scripts/.py` for developer + tools. The new `scripts/ci/` subdirectory introduces a clear, + CI-only home without disturbing the existing developer scripts. +- **Existing CI**: `.github/workflows/docker-image.yml` is tag-only. + No `pull_request` workflow exists. The new workflow is additive and + does not affect the docker-image workflow. 
+ +### Architecture Pattern & Boundary Map + +```mermaid +flowchart LR + PR[Pull Request to main] -->|trigger| WF[.github/workflows/i18n-cjk-guard.yml] + WF -->|setup-python + checkout| RUN[python scripts/ci/i18n_cjk_guard.py] + RUN -->|read| EN[locales/en.json] + RUN -->|git grep -nIP| BAPP[backend/app/] + RUN -->|git grep -nIP| FSRC[frontend/src/] + RUN -->|read| BL[.kiro/specs/i18n-ci-guard/baseline.txt] + RUN -->|exit 0 or 1| WF + WF -->|status| PR + + DEV[Developer terminal] -->|python scripts/ci/i18n_cjk_guard.py| RUN + DEV -->|--update-baseline| RUN + RUN -.->|writes| BL +``` + +**Architecture Integration**: + +- **Selected pattern**: single-purpose script + thin workflow. + Matches the project's existing `scripts/.py` convention. +- **Domain boundaries**: the guard is a pure verification tool with no + side effects on production code. Its only writeable surface is the + baseline file, and only when explicitly invoked with + `--update-baseline`. +- **Existing patterns preserved**: stdlib-only Python tooling + (precedent: `scripts/check_i18n_logs.py`); single-file workflows in + `.github/workflows/`. +- **New components rationale**: a new file rather than an extension of + an existing script — the existing script is scoped to a fixed + module list and is not a regression gate. +- **Steering compliance**: respects layer-based structure (script + lives at repo root in `scripts/ci/`, not under `backend/` or + `frontend/`), no new heavy dependencies, no `os.getenv` calls + outside `backend/app/config.py`. 
+ +### Technology Stack + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Frontend / CLI | Python 3.11 stdlib (`argparse`, `json`, `re`, `subprocess`, `pathlib`, `sys`) | Guard CLI | Stdlib only — Req 5.5 | +| Backend / Services | n/a | — | Guard does not touch backend services | +| Data / Storage | Plain-text baseline file under `.kiro/specs/` | Per-path count store | One line per path, `\t` | +| Messaging / Events | n/a | — | — | +| Infrastructure / Runtime | GitHub Actions `ubuntu-latest`, `actions/checkout@v4`, `actions/setup-python@v5` | PR-time runner | `fetch-depth: 1` is sufficient | + +## File Structure Plan + +### Directory Structure + +``` +scripts/ +└── ci/ + └── i18n_cjk_guard.py # Guard CLI (new) + +.github/ +└── workflows/ + └── i18n-cjk-guard.yml # PR-time workflow (new) + +.kiro/specs/i18n-ci-guard/ +├── spec.json # (existing, updated) +├── requirements.md # (existing) +├── gap-analysis.md # (existing) +├── research.md # (existing) +├── design.md # (this file) +├── tasks.md # (created in next phase) +└── baseline.txt # Per-path CJK match counts (new) +``` + +### Modified Files + +- `.kiro/specs/i18n-ci-guard/spec.json` — phase / approval fields + updated by Kiro flow only. +- No production source files are modified by this spec. 
+ +## System Flows + +### Guard execution (default mode) + +```mermaid +sequenceDiagram + participant CI as GitHub Actions + participant Script as i18n_cjk_guard.py + participant Repo as Working tree + participant BL as baseline.txt + + CI->>Script: python scripts/ci/i18n_cjk_guard.py + Script->>Repo: read locales/en.json + Script->>Script: scan for CJK chars + alt en.json has CJK + Script-->>CI: exit 1 + per-key findings + else en.json clean + Script->>Repo: git grep -nIP backend/app/ + Script->>Repo: git grep -nIP frontend/src/ + Script->>BL: read baseline counts + alt any current count > baseline + Script-->>CI: exit 1 + per-path delta + refresh hint + else within baseline + Script-->>CI: exit 0 + summary + end + end +``` + +### Baseline refresh + +```mermaid +sequenceDiagram + participant Dev as Developer + participant Script as i18n_cjk_guard.py + participant Repo as Working tree + participant BL as baseline.txt + + Dev->>Script: python scripts/ci/i18n_cjk_guard.py --update-baseline + Script->>Repo: git grep -nIP backend/app/ + Script->>Repo: git grep -nIP frontend/src/ + Script->>BL: write per-path counts (sorted) + Script-->>Dev: exit 0 + new counts +``` + +The two checks run in fixed order: en.json first (cheap, decisive), +then per-path counts. Both always run: the diagram's `alt` branches are a +simplification, and the script does not short-circuit after an en.json +failure, so the contributor sees the complete diagnostic in one CI log. 
+ +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1 | Scan en.json for CJK | `i18n_cjk_guard.py` | CLI default mode | Guard execution | +| 1.2 | Fail with key:line per offender | `i18n_cjk_guard.py` | CLI stderr output | Guard execution | +| 1.3 | Report clean state | `i18n_cjk_guard.py` | CLI stdout summary | Guard execution | +| 1.4 | Hard error if file missing | `i18n_cjk_guard.py` | CLI stderr + exit 1 | Guard execution | +| 2.1 | Count CJK matches per scoped path | `i18n_cjk_guard.py` | `git grep -nIP` invocation | Guard execution | +| 2.2 | Read baseline counts | `i18n_cjk_guard.py`, `baseline.txt` | File read | Guard execution | +| 2.3 | Fail on regression | `i18n_cjk_guard.py` | Exit 1 | Guard execution | +| 2.4 | Pass when within baseline | `i18n_cjk_guard.py` | Exit 0 | Guard execution | +| 2.5 | Skip binary files | `git grep -I` | — | Guard execution | +| 2.6 | Tracked-only scope | `git grep` default | — | Guard execution | +| 3.1 | Per-key locale failure detail | `i18n_cjk_guard.py` | CLI stderr lines | Guard execution | +| 3.2 | Per-path regression detail | `i18n_cjk_guard.py` | CLI stderr lines | Guard execution | +| 3.3 | Print refresh command | `i18n_cjk_guard.py` | CLI stderr footer | Guard execution | +| 3.4 | Success summary lines | `i18n_cjk_guard.py` | CLI stdout | Guard execution | +| 4.1 | Baseline under spec dir | `baseline.txt` | File path | — | +| 4.2 | Diff-friendly text format | `baseline.txt` | File format | — | +| 4.3 | Refresh via flag | `i18n_cjk_guard.py` | `--update-baseline` | Baseline refresh | +| 4.4 | No implicit baseline writes | `i18n_cjk_guard.py` | CLI default mode | Guard execution | +| 4.5 | Hard error if baseline missing | `i18n_cjk_guard.py` | Exit 1 + message | Guard execution | +| 5.1 | PR-only trigger to main | `i18n-cjk-guard.yml` | `on.pull_request.branches` | — | +| 5.2 | Checkout PR head | 
`i18n-cjk-guard.yml` | `actions/checkout@v4` | — | +| 5.3 | Surface output on failure | `i18n-cjk-guard.yml` | Default GH log | — | +| 5.4 | Pass on exit 0 | `i18n-cjk-guard.yml` | Default | — | +| 5.5 | Stdlib-only, no third-party | `i18n_cjk_guard.py`, `i18n-cjk-guard.yml` | — | — | +| 5.6 | ≤60s runtime | `i18n-cjk-guard.yml` | `timeout-minutes: 1` | — | +| 6.1 | Same result locally | `i18n_cjk_guard.py` | CLI | — | +| 6.2 | Single stable entry point | `scripts/ci/i18n_cjk_guard.py` | Path | — | +| 6.3 | No env vars / secrets | `i18n_cjk_guard.py` | CLI | — | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies | Contracts | +|-----------|--------------|--------|--------------|------------------|-----------| +| `i18n_cjk_guard.py` | CI script | Two-check guard CLI | 1.1–6.3 | `git`, Python stdlib | Service (CLI) | +| `i18n-cjk-guard.yml` | CI workflow | Run guard on every PR to main | 5.1–5.6 | `actions/checkout@v4`, `actions/setup-python@v5` | Batch / Job | +| `baseline.txt` | Data | Per-path baseline counts | 4.1, 4.2, 2.2 | — | State (file) | + +### CI Script + +#### `i18n_cjk_guard.py` + +| Field | Detail | +|-------|--------| +| Intent | Run two CJK-regression checks; optionally refresh the baseline | +| Requirements | 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 3.1, 3.2, 3.3, 3.4, 4.1, 4.3, 4.4, 4.5, 5.5, 6.1, 6.2, 6.3 | +| Owner / Reviewers | i18n maintainers | + +**Responsibilities & Constraints** + +- Owns the canonical guard semantics: which paths are scoped, which + regex is canonical, what counts as a regression. +- Runs in pure Python 3.11 stdlib + a single `git` subprocess per + scoped path. +- Never modifies any file other than the baseline file, and only when + invoked with `--update-baseline`. +- Always runs both checks (does not short-circuit), so a single CI log + shows every failure mode at once. 
+ +**Dependencies** + +- Inbound: `i18n-cjk-guard.yml` workflow; developers running locally. +- Outbound: `git` subprocess (`git grep`, `git rev-parse`). +- External: none. + +**Contracts**: Service [x] / API [ ] / Event [ ] / Batch [ ] / State [x] + +##### Service Interface (CLI) + +```text +i18n_cjk_guard.py [--update-baseline] [--baseline PATH] [--repo-root PATH] +``` + +Type-annotated module signature (Python type hints, public functions +only): + +```python +def main(argv: list[str]) -> int: ... + +def run_check(repo_root: pathlib.Path, baseline_path: pathlib.Path) -> int: + """Run both checks; return 0 on success, 1 on any failure.""" + +def update_baseline(repo_root: pathlib.Path, baseline_path: pathlib.Path) -> int: + """Refresh the baseline file with current per-path counts; return 0.""" + +def scan_locale_cjk(en_json_path: pathlib.Path) -> list[LocaleFinding]: + """Return a list of (key, line_number, snippet) tuples for every + CJK occurrence in locales/en.json. Empty list when clean.""" + +def count_path_cjk(repo_root: pathlib.Path, scoped_path: str) -> int: + """Return the number of CJK match lines under scoped_path, + using `git grep -nIP '[\\x{4e00}-\\x{9fff}]' -- `.""" + +def read_baseline(baseline_path: pathlib.Path) -> dict[str, int]: + """Parse the baseline file. Each non-empty, non-comment line is + '\\t'. Raise BaselineError on any malformed input + or missing file.""" + +def write_baseline(baseline_path: pathlib.Path, counts: dict[str, int]) -> None: + """Atomically overwrite the baseline file with sorted entries + and a single trailing newline.""" +``` + +Where: + +```python +LocaleFinding = tuple[str, int, str] # (dotted_key, line_number, snippet) +SCOPED_PATHS: tuple[str, ...] 
= ("backend/app", "frontend/src") +EN_JSON_REL_PATH: str = "locales/en.json" +CJK_PATTERN: str = "[\\x{4e00}-\\x{9fff}]" # passed to git grep -P +CJK_RE: re.Pattern[str] = re.compile(r"[一-鿿]") +SNIPPET_MAX_LEN: int = 80 +``` + +- **Preconditions**: invoked with CWD at the repo root or + `--repo-root` set; `git` is on `$PATH`; the working tree is the + intended scan target. +- **Postconditions** (default mode): exit 0 iff both checks pass; + exit 1 otherwise. Stdout receives the success summary; stderr + receives findings on failure. The baseline file is unchanged. +- **Postconditions** (`--update-baseline`): the baseline file is + rewritten to current per-path counts and exit 0 is returned. +- **Invariants**: regex range and scoped paths are constants; the baseline + path changes only via `--baseline` — no env-var override. + +##### State Management + +- **State model**: a dict `{<scoped_path>: <count>}` parsed from + the baseline file. +- **Persistence**: plain-text file at + `.kiro/specs/i18n-ci-guard/baseline.txt`. Atomic write via + `tmp + os.replace`. +- **Concurrency**: single-writer (developer running + `--update-baseline`); CI workers only read. + +**Implementation Notes** + +- Output format mirrors `scripts/check_i18n_logs.py`: + `<path>:<line>: <code>: <message>` on stderr, summary on stdout, + trailing `OK` or `N issues`. +- The exact refresh command printed on regression failure is: + `python scripts/ci/i18n_cjk_guard.py --update-baseline`. +- `count_path_cjk` invokes `git grep` via `subprocess.run` with + `check=False`; `git grep` exits 1 when there are zero matches, so + the function treats exit codes 0 and 1 as success and any other + code as a hard error. +- Localised key extraction for `en.json` walks the parsed JSON dict; + line numbers are obtained by re-reading the file as text and + matching the value's first textual occurrence. +- Risks: see `research.md` § Risks & Mitigations. 
+ +### CI Workflow + +#### `i18n-cjk-guard.yml` + +| Field | Detail | +|-------|--------| +| Intent | Run the guard on every PR to `main` | +| Requirements | 5.1, 5.2, 5.3, 5.4, 5.5, 5.6 | +| Owner / Reviewers | i18n maintainers | + +**Contracts**: Batch / Job [x] + +##### Batch / Job Contract + +- **Trigger**: `on: pull_request: branches: [main]`. +- **Input / validation**: PR head ref checkout via + `actions/checkout@v4` with `fetch-depth: 1`. Python set up via + `actions/setup-python@v5` with `python-version: '3.11'`. +- **Output / destination**: pass/fail status surfaced as a GitHub + Actions check on the PR. Script stdout/stderr appears in the + workflow log. +- **Idempotency & recovery**: re-running the workflow re-evaluates the + same working tree; no persistent side effects on the runner. + +##### Workflow shape (sketch) + +```yaml +name: i18n CJK Guard +on: + pull_request: + branches: [main] +jobs: + guard: + runs-on: ubuntu-latest + timeout-minutes: 1 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - run: python scripts/ci/i18n_cjk_guard.py +``` + +### Baseline Data File + +#### `baseline.txt` + +| Field | Detail | +|-------|--------| +| Intent | Persist the per-path CJK match-count baseline | +| Requirements | 2.2, 4.1, 4.2 | + +**Contracts**: State [x] + +##### Format + +```text +# Per-path CJK baseline for the i18n CI guard. +# Format: <path>\t<count>. Sorted lexicographically. +# Refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline +backend/app	<count> +frontend/src	<count> +``` + +- One header block of `#`-prefixed comments (parser ignores). +- Blank lines ignored. +- Lines must match `^(?P<path>[^\t\n]+)\t(?P<count>\d+)$`. +- Trailing newline mandatory. + +## Data Models + +### Domain Model + +- `LocaleFinding` — value object + `(dotted_key: str, line_number: int, snippet: str)`. +- `PathCount` — pair `(scoped_path: str, count: int)`. 
The full + baseline is a `dict[str, int]` keyed by scoped path. + +Invariants: + +- `count` is a non-negative integer. +- `scoped_path` is one of `SCOPED_PATHS`. +- `LocaleFinding.snippet` is at most `SNIPPET_MAX_LEN` characters, + truncated with an ellipsis when needed. + +## Error Handling + +### Error Strategy + +- All non-zero exits are accompanied by a stderr message identifying + the failing check, the offending file or path, and (for regressions) + the refresh command. The script never raises uncaught exceptions + past `main()` in normal flow; unexpected I/O errors propagate as + `OSError` with a full traceback so CI logs surface them. + +### Error Categories and Responses + +- **Locale failure** (Req 1.2): one stderr line per offending key + (`locales/en.json:<line>: cjk-in-en: <key> = <snippet>`), then a + trailing `N issues` summary. +- **Regression failure** (Req 3.2): one stderr line per regressed + path (`<path>: cjk-regression: baseline=<N> current=<M> delta=+<D>`) + followed by a one-line refresh hint: + `# refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline`. +- **Missing en.json** (Req 1.4): stderr `locales/en.json: missing + catalogue file`, exit 1. +- **Missing or malformed baseline** (Req 4.5): stderr + `<baseline_path>: missing or malformed; refresh via …`, exit 1. +- **`git grep` unavailable / non-PCRE**: stderr + `git grep failed: <stderr>`, exit 1. + +### Monitoring + +- The guard is a single short-lived script. All observability is + delegated to GitHub Actions logs (stdout/stderr, run duration). + No external telemetry. + +## Testing Strategy + +### Unit Tests (Python) + +Place tests under `scripts/ci/tests/test_i18n_cjk_guard.py` (or invoke +the script directly via subprocess in a tmp git repo). The project's +test runner is `pytest` (already used by `backend/`), but the new +tests must be runnable with `python -m pytest` from the repo root +without backend dependencies. Tests are scoped to: + +1. 
`scan_locale_cjk` — clean catalogue returns empty list; planted CJK + value returns a single `LocaleFinding` with the correct key and + line number. +2. `count_path_cjk` — given a tmp git repo with N planted CJK lines, + returns N; binary file matches are excluded; untracked file + matches are excluded. +3. `read_baseline` / `write_baseline` round-trip — write counts, + re-read, equal. +4. `read_baseline` malformed input — non-tab line → `BaselineError`. +5. `run_check` end-to-end — passing baseline → exit 0; regressed + baseline → exit 1 and stderr contains the refresh command. + +### Integration Tests + +1. Workflow shape — `actionlint` (optional, if installed locally) on + `i18n-cjk-guard.yml`. At minimum, `python -c "import yaml; + yaml.safe_load(open('.github/workflows/i18n-cjk-guard.yml'))"` for + YAML validity (needs PyYAML, which is not stdlib; local check only). +2. Local end-to-end — run + `python scripts/ci/i18n_cjk_guard.py` from the repo root with the + committed baseline; expect exit 0 on a clean checkout of `main`. +3. Refresh end-to-end — run with `--update-baseline`; verify + baseline file is rewritten and a second default run is exit 0. + +### Performance / Load + +- Single-pass `git grep` over the scoped paths runs in <2 s on the + current repo. The workflow's `timeout-minutes: 1` is a hard ceiling + per Req 5.6. + +## Optional Sections + +### Security Considerations + +- The guard reads only tracked text files; no secrets are accessed. +- The workflow uses `GITHUB_TOKEN` only implicitly via + `actions/checkout`; no additional permissions are requested + (the workflow's top-level `permissions:` block pins the token to + `contents: read`, which is sufficient). 
diff --git a/.kiro/specs/i18n-ci-guard/gap-analysis.md b/.kiro/specs/i18n-ci-guard/gap-analysis.md new file mode 100644 index 00000000..15bc37de --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/gap-analysis.md @@ -0,0 +1,169 @@ +# Gap Analysis — i18n-ci-guard + +Comparison of the approved requirements against the current MiroFish +codebase, focused on what already exists, what is missing, and what +options the design phase should choose between. + +## 1. Current State Investigation + +### Domain assets already in the repo + +- **`scripts/check_i18n_logs.py`** — Python-stdlib-only, exit-code-based + i18n verification script. Uses the same canonical CJK regex + `[一-鿿]` (`U+4E00..U+9FFF`) the new guard needs, prints findings as + `:: : `, and was written for ticket #6. + Strong precedent for the new guard's CLI surface and output format. +- **`scripts/_apply_translations.py`, `scripts/_codemod_i18n.py`, + `scripts/_merge_locale_keys.py`** — i18n tooling sibling scripts. + Convention is to keep auxiliary i18n scripts under `scripts/` at the + repo root. +- **`.github/workflows/docker-image.yml`** — only existing GH Actions + workflow; triggers on tag pushes and `workflow_dispatch`. No PR-time + workflow exists yet, so the new guard introduces the project's first + PR-blocking CI check. +- **PR #27 / branch `chore/i18n-10-e2e-english-verification`** — defines + the audit methodology referenced by the ticket. Its `audit_cjk.sh` + uses `git grep -nIP '[\x{4e00}-\x{9fff}]' -- backend/app frontend/src + locales/en.json` — the canonical scoped scan command. PR #27 is open; + the new guard must work with or without it merged. +- **`.kiro/specs//`** — established home for spec artefacts. + `i18n-externalize-backend-logs/` is the closest precedent for an + i18n-flavoured spec. +- **`locales/en.json`, `locales/zh.json`, `locales/languages.json`** — + shared i18n source consumed by both runtimes. 
+ +### Conventions extracted + +- Auxiliary scripts: `scripts/.py`, Python ≥3.11 stdlib only, + shebang `#!/usr/bin/env python3`, double-quoted strings, snake_case, + Google-style docstrings on the module and public functions. +- Output format: `:: : `, summary line + `OK` or `N issues`, exit `0`/`1`. +- Reuse the canonical regex `[一-鿿]` rather than re-deriving range + literals. +- 4-space indent, ≤120 cols, no trailing whitespace, single trailing + newline (`.claude/rules/dev-guidelines.md`). + +### Integration surfaces + +- **CI**: GitHub Actions, `.github/workflows/`. `ubuntu-latest` runner, + Python 3.11+ via `actions/setup-python@v5` (use the same version + pin already present in the docker-image workflow ecosystem if any). +- **Repo layout boundaries** scoped by the audit: `backend/app/`, + `frontend/src/`, `locales/en.json` — all live at repo root or two + levels deep. +- **Git working tree**: the guard relies on `git grep -I` for tracked, + text-only matches; this binds the guard to a runner that has `git` + available (true on `ubuntu-latest` and on developer machines). + +## 2. Requirement-to-Asset Map + +| Req | Need | Existing asset | Gap | +| --- | --------------------------------- | ----------------------------------------------------------------------------------------------- | ----------- | +| 1 | CJK scan of `locales/en.json` | `scripts/check_i18n_logs.py` already loads `locales/*.json` and runs the canonical regex. | Missing — new guard must scan en.json specifically and emit `key:line` per offender. | +| 2 | CJK count under `backend/app/` and `frontend/src/` against baseline | Audit `audit_cjk.sh` (PR #27) demonstrates `git grep -nIP` is the canonical scan; no baseline file exists yet on main. | Missing — no per-path counter, no baseline file. | +| 3 | Actionable failure messaging | `check_i18n_logs.py` output format reusable. | Missing — need refresh-baseline command in failure text. | +| 4 | Baseline file lifecycle | None. 
| Missing — file format and refresh subcommand to design. | +| 5 | GH Actions PR integration | `.github/workflows/` directory exists; one tag-only workflow. | Missing — new `pull_request` workflow. | +| 6 | Local reproducibility | Existing scripts run locally with stdlib; same pattern reusable. | None — covered by following the existing pattern. | + +## 3. Implementation Approach Options + +### Option A — Extend `scripts/check_i18n_logs.py` + +Add a new `--cjk-guard` mode (catalogue scan + per-path baseline diff) +to the existing script, then call it from the new workflow. + +- ✅ One file to maintain; reuses the regex constant and CLI. +- ❌ The existing script is tightly scoped to the in-scope backend + modules and the parity check. Mixing a PR-gating regression check into + it dilutes its intent and grows it past the SRP line that the + surrounding scripts respect. +- ❌ The existing script targets a fixed list of backend modules; the + new guard scans whole subtrees. The two scopes don't fit one CLI. + +### Option B — New, focused script `scripts/ci/i18n_cjk_guard.py` + new workflow (recommended) + +A new directory `scripts/ci/` holds CI-only scripts; the guard is a +single file that performs both checks and supports a `--update-baseline` +flag. New workflow `.github/workflows/i18n-cjk-guard.yml` runs it on +every PR to `main`. + +- ✅ Clean separation: production-i18n script (`check_i18n_logs.py`) + and CI-gating script (`i18n_cjk_guard.py`) live side by side without + overlapping responsibilities. +- ✅ Mirrors the established convention of one script per + responsibility under `scripts/`. +- ✅ The baseline file lives under the spec dir + (`.kiro/specs/i18n-ci-guard/baseline.txt`), matching the ticket's + "baseline must be committed and reviewable" requirement. +- ❌ One more file in the repo, but the file is small (~150 LoC). 
+ +### Option C — Hybrid: shared `cjk_scan.py` helper + thin guard script + +Factor the regex + git-grep logic into a tiny shared helper consumed by +both `check_i18n_logs.py` and the new guard. + +- ✅ DRY for the regex constant. +- ❌ Premature abstraction: today the only shared element is one + one-line regex. The two scripts have different scopes, output + formats, and consumers. Pulling a helper out now satisfies + consistency without paying for itself; defer until a third caller + appears. + +### Recommendation + +**Option B**. It matches the project's established "one focused script +per responsibility" convention, isolates the new CI surface from +existing i18n scripts, and keeps the baseline file collocated with +spec metadata where reviewers expect to find it. + +## 4. Research Items for Design Phase + +- **Baseline file format**: prefer a stable, line-oriented text format + over JSON to minimize diff churn (e.g., `path<TAB>count` per line, + trailing newline). Confirm in design. +- **`git grep` invocation portability**: `git grep -nIP` works on any + git build linked against PCRE (PCRE2 support landed in git 2.14). `ubuntu-latest` ships ≥2.40. + No portability concern; record the assumption explicitly. +- **`fetch-depth`** for the `actions/checkout@v4` step: `git grep` + scans the working tree, not history, so a shallow clone (`fetch-depth: + 1`) is sufficient. +- **Workflow timeout budget**: capture the empirical runtime of the + full scan locally (already measured: a single `git grep` over the + scoped paths runs in <2 seconds with ~3.6k matches). The 60-second + ceiling in Req 5 is comfortable. +- **Failure-message refresh command** wording: the design should pin + the exact command shown to contributors so it stays one stable + string developers can copy. +- **Initial baseline values**: with `git grep -nIP '[\x{4e00}-\x{9fff}]'` + on the current branch — `backend/app` = 2707, `frontend/src` = 902, + `locales/en.json` = 0. 
The committed baseline must be regenerated + against `main` at implementation time so it reflects the merge target. + +## 5. Effort & Risk + +- **Effort**: **S** (1–3 days). Small, self-contained additions + (one Python script, one workflow file, one baseline file, plus the + spec). All patterns already exist in the repo. +- **Risk**: **Low**. No production-source changes, no new dependencies, + no architectural shifts. The only failure mode is a noisy guard + blocking unrelated PRs — mitigated by the per-path baseline ratchet. + +## 6. Recommendations for Design Phase + +- Adopt **Option B** (new focused script + new workflow + baseline file + under spec dir). +- Lock in the canonical regex `[一-鿿]` and the canonical scan command + `git grep -nIP '[\x{4e00}-\x{9fff}]' -- ` to keep this guard + bytewise-aligned with the audit pipeline. +- Use a line-oriented baseline format keyed by scoped path; explicit + `--refresh-baseline` (or equivalent) subcommand updates it; no + implicit overwrite. +- Output: machine-friendly findings on stderr, summary on stdout, + exit `0`/`1`. +- The workflow should run only on `pull_request` to `main` (Req 5.1) + with `fetch-depth: 1` and `actions/setup-python@v5`. No third-party + packages. +- Baseline counts must be recomputed against `main` before the PR + ships; do not commit baselines from a feature branch's working tree. diff --git a/.kiro/specs/i18n-ci-guard/requirements.md b/.kiro/specs/i18n-ci-guard/requirements.md new file mode 100644 index 00000000..78eb6139 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/requirements.md @@ -0,0 +1,189 @@ +# Requirements Document + +## Project Description (Input) +Add a permanent CI guard that runs an i18n CJK audit on every pull request. + +Linked GitHub issue: #26 (.ticket/26.md). + +The guard must fail a PR build when: +1. locales/en.json contains any CJK character (range U+4E00..U+9FFF), or +2. The total count of CJK matches across backend/app/ and frontend/src/ regresses (i.e. 
exceeds) a committed baseline value. + +## Introduction + +The i18n initiative has driven the project toward English-by-default UI, logs, +prompts, and documentation. Manual audits (see PR #27, the +`i18n-e2e-english-verification` spec) have repeatedly surfaced regressions +where Chinese strings re-enter the codebase. This spec installs a permanent, +self-contained CI guard that runs on every pull request and fails the build +when (a) `locales/en.json` is no longer CJK-clean, or (b) the total CJK match +count under `backend/app/` and `frontend/src/` regresses against a committed +baseline. + +The guard is intentionally minimal: it captures the two highest-signal checks +from the larger audit pipeline so it can run on every PR with a sub-minute +budget and without depending on the (currently unmerged) verification spec. +The committed baseline lets the project ratchet down gaps over time without +blocking unrelated PRs on pre-existing CJK content. + +## Boundary Context + +- **In scope**: + - A locally runnable Python script that performs both guard checks on the + current working tree. + - A baseline file committed under the spec directory recording the + accepted CJK match counts per scoped path. + - A GitHub Actions workflow that runs the script on every pull request + targeting `main` and fails the build when either check fails. + - A clear, actionable failure message (which path regressed, baseline + value, current value, command to update the baseline). +- **Out of scope**: + - The full classification pipeline (`classify.py`, `render_report.py`, + `post_comment.sh`) from the unmerged `i18n-e2e-english-verification` + spec — those scripts perform deeper audit work and are not required + for the PR-time guard. + - Auto-updating the baseline on `main` (the baseline is a normal + reviewable file). + - Translation work itself; this spec only enforces a regression gate. 
+ - Any change to production source under `backend/app/`, `frontend/src/`, + or `locales/` apart from translations needed to satisfy the guard + against its own initial baseline. +- **Adjacent expectations**: + - PR #27 (`chore/i18n-10-e2e-english-verification`) provides the + methodology referenced here. This spec must remain functional whether + PR #27 has been merged or not. + - The guard reuses the canonical CJK regex range + `[一-鿿]` already established by that audit. + +## Requirements + +### Requirement 1: Locale-catalogue CJK cleanliness check + +**Objective:** As a maintainer of the English locale catalogue, I want every +PR to fail when `locales/en.json` reintroduces any CJK character, so that the +English catalogue stays CJK-free. + +#### Acceptance Criteria + +1. When the guard script is run from the repository root, the i18n CI Guard + shall scan the contents of `locales/en.json` for any character in the + range `U+4E00..U+9FFF`. +2. If `locales/en.json` contains at least one such character, the i18n CI + Guard shall exit with a non-zero status and report each offending + `key:line` pair on standard output. +3. While `locales/en.json` contains zero such characters, the i18n CI Guard + shall report the catalogue as CJK-clean. +4. If `locales/en.json` is missing or unreadable, the i18n CI Guard shall + exit with a non-zero status and emit an explicit error message naming + the missing file. + +### Requirement 2: Backend/frontend CJK regression check against committed baseline + +**Objective:** As a maintainer of English support across the codebase, I +want every PR to fail when the total CJK match count under `backend/app/` +or `frontend/src/` exceeds a committed baseline, so that the codebase +ratchets monotonically toward English-only without blocking PRs on +pre-existing CJK content. + +#### Acceptance Criteria + +1. 
When the guard script is run, the i18n CI Guard shall count the total + number of CJK matches (range `U+4E00..U+9FFF`, line-level, text files + only) under each of the scoped paths `backend/app/` and `frontend/src/`. +2. The i18n CI Guard shall read the baseline counts from a single + committed baseline file under the spec directory. +3. If the current count for any scoped path exceeds the baseline count for + that path, the i18n CI Guard shall exit with a non-zero status. +4. While the current count for every scoped path is less than or equal to + the baseline, the i18n CI Guard shall exit with status zero for this + check. +5. The i18n CI Guard shall ignore matches inside binary files + (image, font, archive, lockfile, or other non-text formats) by relying + on `git grep -I` semantics. +6. The i18n CI Guard shall scope its scan to tracked files only (matches + in untracked or ignored files shall not contribute to the count). + +### Requirement 3: Actionable failure messaging + +**Objective:** As a contributor whose PR was rejected by the guard, I want +the failure message to tell me exactly what regressed and how to fix it, +so that I can either translate the offending content or — when intentional — +update the baseline through normal review. + +#### Acceptance Criteria + +1. If the locale-catalogue check fails, the i18n CI Guard shall print, for + each offending entry: the dotted catalogue key, the line number in + `locales/en.json`, and a truncated snippet of the value. +2. If the regression check fails, the i18n CI Guard shall print, for each + regressed scoped path: the path name, the baseline count, the current + count, and the delta. +3. If the regression check fails, the i18n CI Guard shall print the exact + shell command a contributor must run locally to refresh the baseline + file so the PR can be re-reviewed against the new value. +4. 
The i18n CI Guard shall print, on success, a one-line summary per check + confirming the catalogue is CJK-clean and the per-path counts are at or + below baseline. + +### Requirement 4: Baseline file lifecycle + +**Objective:** As a reviewer enforcing English support, I want the baseline +to live in the repository as a small, human-readable file that only changes +through code review, so that downward ratcheting is intentional and +auditable. + +#### Acceptance Criteria + +1. The i18n CI Guard shall store the baseline as a single committed file + under `.kiro/specs/i18n-ci-guard/`. +2. The baseline file shall record one count per scoped path, in a stable, + diff-friendly text format (no JSON line shuffling, no trailing + whitespace). +3. When the guard script is invoked with an explicit "refresh baseline" + subcommand or flag, the i18n CI Guard shall overwrite the baseline file + with the current per-path counts and exit with status zero. +4. While no refresh flag is supplied, the i18n CI Guard shall never modify + the baseline file. +5. If the baseline file is missing at check time, the i18n CI Guard shall + exit with a non-zero status and instruct the contributor to refresh it. + +### Requirement 5: GitHub Actions PR integration + +**Objective:** As a project maintainer, I want every pull request targeting +`main` to be gated by the guard, so that no merge silently regresses the +English-only state of the catalogue or codebase. + +#### Acceptance Criteria + +1. The i18n CI Guard workflow shall trigger on every `pull_request` event + whose base ref is `main`. +2. While the workflow runs, the i18n CI Guard shall check out the PR head + commit with enough history for `git grep` to scan tracked files (a + shallow `fetch-depth: 1` clone is sufficient). +3. When the guard script exits with non-zero status, the workflow shall + fail and surface the script's standard output and standard error in the + GitHub Actions log. +4. When the guard script exits with status zero, the workflow shall pass. +5. 
The workflow shall use only Python from the standard + `actions/setup-python` distribution and tools already available on the + GitHub-hosted `ubuntu-latest` runner (`bash`, `git`); it shall not + install third-party Python packages. +6. The workflow shall complete within sixty seconds of wall-clock time on + a clean `ubuntu-latest` runner. + +### Requirement 6: Local reproducibility + +**Objective:** As a developer preparing a PR, I want to run the same guard +locally before pushing, so that I can catch regressions before CI does. + +#### Acceptance Criteria + +1. When the guard script is invoked from a developer machine that has + Python 3.11 or newer and `git` available, the i18n CI Guard shall + produce the same pass/fail result and the same per-path counts that + it would produce in CI for the same working tree. +2. The i18n CI Guard shall expose a single, stable invocation entry point + (a script under `scripts/ci/`) documented in the spec's design and + README touchpoints. +3. The i18n CI Guard shall require zero environment variables or secrets + to run locally. diff --git a/.kiro/specs/i18n-ci-guard/research.md b/.kiro/specs/i18n-ci-guard/research.md new file mode 100644 index 00000000..65171669 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/research.md @@ -0,0 +1,175 @@ +# Research & Design Decisions — i18n-ci-guard + +## Summary +- **Feature**: `i18n-ci-guard` +- **Discovery Scope**: Simple Addition (one Python script + one GH Actions + workflow + one baseline file). Extension-flavoured because it builds on + established `scripts/` conventions and the canonical CJK regex used by + the larger audit pipeline. +- **Key Findings**: + - The canonical CJK match command `git grep -nIP '[\x{4e00}-\x{9fff}]' + -- ` is already used by the unmerged audit pipeline (PR #27) + and is portable on every git ≥2.4 (`ubuntu-latest` ships ≥2.40). 
+ - `scripts/check_i18n_logs.py` is a strong CLI/style precedent: + Python-stdlib-only, exit `0`/`1`, output as `<file>:<line>: + <kind>: <message>`, canonical regex `[一-鿿]`. + - The repository has no existing `pull_request`-triggered GH Actions + workflow; this guard introduces the first one. The only existing + workflow (`.github/workflows/docker-image.yml`) runs on tag pushes + only. + - Current per-path counts on this branch: + `backend/app=2707, frontend/src=902, locales/en.json=0`. These are + sample counts; the committed baseline must be regenerated against + `main` at implementation time. + +## Research Log + +### Canonical scan command +- **Context**: Requirement 2 needs a stable per-path CJK count and + Requirement 5.5 forbids third-party packages. +- **Sources Consulted**: + - `audit_cjk.sh` from PR #27 commit `3481408`. + - `git grep` man page. +- **Findings**: + - `git grep -nIP '[\x{4e00}-\x{9fff}]' -- <path>` returns one match + per matching line in tracked, text-only files. `-I` excludes binary + files; `-P` enables PCRE2 so the `\x{...}` Unicode range works. + - This matches the input format consumed by the existing audit + classifier, so the guard's match counts are directly comparable + across pipelines. +- **Implications**: + - The guard re-uses this exact command; no new dependencies. + - Because `-I` skips binary files and tracked-only is the default, + Requirements 2.5 and 2.6 are satisfied by the command itself + rather than by additional script logic. + +### Baseline file format +- **Context**: Requirement 4 needs a diff-friendly committed baseline. +- **Sources Consulted**: + - Diff churn behaviour of JSON vs. line-oriented text in this repo's + history (e.g. `locales/*.json` PR diffs frequently re-key, while + plain-text `parity.txt` from PR #27 reads cleanly). +- **Findings**: + - Line-oriented `<path>\t<count>` files produce minimal diffs and + require no JSON parser. 
+ - A two-line file (one per scoped path) is large enough to be + self-explanatory and small enough to never line-shuffle. +- **Implications**: + - Use plain text, sorted by path, single trailing newline. Reject + the file as malformed if the script cannot parse it (Req 4.5). + +### Locale-catalogue scan path +- **Context**: Requirement 1 wants `key:line` per CJK offender in + `locales/en.json`. +- **Sources Consulted**: + - `scripts/check_i18n_logs.py` (`flatten_keys` reuse pattern). + - `check_parity.py` from PR #27 (`flatten`, `[cjk-in-en]` block). +- **Findings**: + - Both precedents flatten the locale dict and run the canonical + regex against each leaf string value. Line numbers are derivable + by re-reading the file as text and matching the value's first + occurrence (good enough for an actionable error message). + - Empty-string values and non-string leaf values (booleans, null) + are skipped. +- **Implications**: + - Implement a tiny flatten-then-scan helper inside the guard + script; do not add a new shared utility module. + +### GH Actions trigger and budget +- **Context**: Requirements 5.1, 5.5, 5.6. +- **Sources Consulted**: + - GitHub-hosted runners reference (`ubuntu-latest`). + - `actions/setup-python@v5` README. +- **Findings**: + - `ubuntu-latest` has Python 3.10+ pre-installed; `actions/setup-python@v5` + pins to 3.11 in <5 s. + - A single `git grep` over the scoped paths runs in <2 s on this + repo (~3.6k matches). End-to-end the workflow comfortably fits + inside the 60 s ceiling. +- **Implications**: + - Use `actions/checkout@v4` with `fetch-depth: 1`, + `actions/setup-python@v5` with `python-version: '3.11'`, and run + the script directly. No caching layer needed. + +## Architecture Pattern Evaluation + +| Option | Description | Strengths | Risks / Limitations | Notes | +|--------|-------------|-----------|---------------------|-------| +| A. 
Extend `check_i18n_logs.py` | Add `--cjk-guard` mode to existing script | Reuses one file | Conflates two scopes; existing script is module-scoped, guard is subtree-scoped | Rejected | +| B. New `scripts/ci/i18n_cjk_guard.py` + new workflow | Single-purpose script + workflow + baseline file | Clean SRP; matches "one script per responsibility" precedent | One additional file | **Selected** | +| C. Shared `cjk_scan.py` helper + thin guard | Factor regex/git-grep into helper | DRY for regex constant | Premature abstraction; only one shared symbol today | Rejected | + +## Design Decisions + +### Decision: Single-purpose CI script + GH Actions workflow (Option B) +- **Context**: Requirements 1–6 demand a small, self-contained guard. +- **Alternatives Considered**: A (extend), C (shared helper). +- **Selected Approach**: New script `scripts/ci/i18n_cjk_guard.py`, + new workflow `.github/workflows/i18n-cjk-guard.yml`, baseline file + `.kiro/specs/i18n-ci-guard/baseline.txt`. +- **Rationale**: Matches the project's "one focused script per + responsibility" convention; isolates a CI-blocking surface from the + existing i18n developer scripts; keeps the baseline collocated with + the spec for review traceability. +- **Trade-offs**: One more file in `scripts/` vs. tighter cohesion. +- **Follow-up**: When a third caller wants the canonical regex, factor + it out then. + +### Decision: Plain-text baseline format +- **Context**: Requirement 4.2 demands stable, diff-friendly format. +- **Alternatives Considered**: JSON, YAML. +- **Selected Approach**: One line per scoped path: `<path>\t<count>`, + sorted lexicographically by path, single trailing newline. +- **Rationale**: Zero parser dependency; predictable diffs; trivial + to refresh atomically. +- **Trade-offs**: Less expressive than JSON (no nested structure), but + the data model is two integers — nesting is unnecessary. 
+ +### Decision: Refresh via `--update-baseline` subcommand-style flag +- **Context**: Requirement 4.3 needs an explicit refresh path. +- **Alternatives Considered**: Separate `update_baseline.py` script; + Makefile target. +- **Selected Approach**: Single script with two modes: default (check + + exit 0/1) and `--update-baseline` (overwrite baseline + exit 0). +- **Rationale**: One CLI surface to remember; the failure message + prints the exact command to run. +- **Trade-offs**: Slightly more conditional logic in one script; + acceptable given the small total LoC. + +### Decision: Workflow runs only on `pull_request` to `main` +- **Context**: Requirement 5.1. +- **Alternatives Considered**: Run on `push` to all branches as well; + run on `pull_request` to any base branch. +- **Selected Approach**: `on.pull_request.branches: [main]` only. +- **Rationale**: Aligns with how the existing project uses `main` as + the protected branch (see `gh pr list` history; every feature PR + targets `main`). Avoids redundant runs on intra-branch chains. +- **Trade-offs**: A direct push to `main` would not be guarded — but + branch protection already discourages that path (per + `dev-guidelines.md`). + +## Risks & Mitigations + +- **Risk**: Baseline drifts upward unintentionally during + `--update-baseline` runs, hiding real regressions. + - *Mitigation*: Failure message instructs contributors to refresh + *only when intentional*; the baseline file is reviewed in the same + PR diff. Acceptance Criteria 3.3 makes this explicit. +- **Risk**: `git grep -P` not built with PCRE on a developer's local + git build (rare on Linux/macOS, possible on minimal Windows builds). + - *Mitigation*: The guard prints a clear error if `git grep` exits + non-zero with PCRE mode; documents Python ≥3.11 + git ≥2.20 as + prerequisites. +- **Risk**: Baseline counts captured on a feature branch include + changes not yet on `main`, mis-anchoring the ratchet. 
+ - *Mitigation*: The implementation task explicitly recomputes + baseline against `origin/main` before committing; documented in + `tasks.md`. + +## References +- PR #27 audit pipeline (`audit_cjk.sh`, `check_parity.py`, + `classify.py`) — methodology source of truth. +- `scripts/check_i18n_logs.py` — CLI/style precedent. +- `git grep` man page — `-n`, `-I`, `-P` flag semantics. +- GitHub Actions `actions/setup-python@v5` and `actions/checkout@v4` + README pages. diff --git a/.kiro/specs/i18n-ci-guard/spec.json b/.kiro/specs/i18n-ci-guard/spec.json new file mode 100644 index 00000000..3a251576 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/spec.json @@ -0,0 +1,24 @@ +{ + "feature_name": "i18n-ci-guard", + "created_at": "2026-05-08T00:25:37Z", + "updated_at": "2026-05-08T00:40:00Z", + "language": "en", + "phase": "tasks-generated", + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true, + "ticket": "26", + "ticket_url": "https://github.com/salestech-group/MiroFish/issues/26" +} diff --git a/.kiro/specs/i18n-ci-guard/tasks.md b/.kiro/specs/i18n-ci-guard/tasks.md new file mode 100644 index 00000000..cf5e6ad1 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/tasks.md @@ -0,0 +1,157 @@ +# Implementation Tasks — i18n-ci-guard + +> Approved spec: see `requirements.md`, `design.md`, `research.md`, +> `gap-analysis.md` in this directory. + +## Tasks + +- [x] 1. Foundation: scaffold the CI guard script with stable CLI surface and stdlib-only dependencies +- [x] 1.1 Create the empty guard script and CLI skeleton + - Place the new script at the path designated by the design (`scripts/ci/`). 
+ - Establish the module docstring, the canonical CJK regex constant, the + scoped-paths constant tuple, and the `argparse` parser exposing default + check mode plus an explicit `--update-baseline` flag and a + `--baseline` path override. + - Confirm the script exits 0 on a smoke `--help` invocation and rejects + unknown flags with non-zero exit. + - Observable: running `python scripts/ci/i18n_cjk_guard.py --help` from + the repo root prints usage text containing every documented flag and + exits 0; running with an unknown flag exits non-zero. + - _Requirements: 5.5, 6.2, 6.3_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2. Core: implement the two CJK checks +- [x] 2.1 Implement the locale-catalogue scan + - Recursively walk the parsed `locales/en.json` dict, applying the + canonical regex to every string leaf to gather offending entries. + - Compute the source line number by re-reading the file as text and + matching the value's first textual occurrence; truncate snippets to + the documented snippet length. + - On a missing or unreadable catalogue file, emit a clear stderr + message and exit non-zero. + - Observable: against a synthetic clean catalogue, the function returns + an empty list; against a synthetic catalogue with one CJK value, it + returns exactly one finding tuple with the correct dotted key and + line number. + - _Requirements: 1.1, 1.2, 1.3, 1.4, 3.1_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2.2 (P) Implement the per-path CJK count via `git grep` + - Invoke `git grep -nIP '[\x{4e00}-\x{9fff}]' -- ` for each + scoped path; treat exit codes 0 (matches found) and 1 (no matches) as + success, any other exit code as a hard error reported on stderr. + - Count lines of stdout; the result for a zero-match path must be the + integer `0`, never an exception. + - Reject working-tree states where `git` is not available or PCRE is + not enabled, with a clear stderr message. 
+ - Observable: against a tmp git repository with N planted CJK lines + under a scoped path, the function returns N; with zero CJK content, + it returns 0; binary files and untracked files do not contribute. + - _Requirements: 2.1, 2.4, 2.5, 2.6_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2.3 Implement baseline file read/write with strict format + - Parse the baseline file as `<path>\t<count>` lines, ignoring `#` + comments and blank lines, raising a typed error on malformed input + or missing file. + - Write atomically (`tmp + os.replace`) with sorted entries, a single + header comment block, and a single trailing newline. + - Observable: a round-trip write/read of a deterministic counts dict + yields the same dict; a baseline file containing a non-tab line is + rejected with a clear error; the baseline file ends with exactly one + `\n`. + - _Requirements: 4.2, 4.3_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 3. Integration: wire the two checks into the default and refresh modes +- [x] 3.1 Compose the default check mode + - Run both checks under all conditions (do not short-circuit), so a + single CI log shows every failure in one pass. + - Print a one-line success summary per check on stdout when both pass. + - On locale failure, print `<path>:<line>: <kind>: <message>` lines + on stderr and a trailing `N issues` summary; on regression failure, + print `<path>: cjk-regression: baseline=<n> current=<n> delta=+<n>` + lines plus the exact verbatim refresh command. + - Surface a non-zero exit when either check fails and exit 0 only when + both pass. + - Observable: against a working tree with the committed baseline at or + above the current count and a CJK-clean en.json, exit code is 0 and + stdout contains the success summary; planting one CJK char in + en.json or planting enough new CJK lines to break the baseline + yields exit 1 and the documented stderr text. 
+ - _Requirements: 1.2, 1.3, 1.4, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.4, 4.5_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 3.2 Compose the `--update-baseline` mode + - When the flag is provided, recompute current per-path counts and + overwrite the baseline file via the atomic writer; print the new + counts on stdout; exit 0. + - When the flag is absent, never write the baseline file under any + code path. + - Observable: invoking with `--update-baseline` rewrites the baseline + file's contents to match current counts and exits 0; running the + default mode immediately afterward exits 0. + - _Requirements: 4.3, 4.4_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 4. Establish the committed baseline anchored to `main` +- [x] 4.1 Capture initial baseline counts against `main` + - Operate from a tree that reflects `origin/main`'s state for the + scoped paths (e.g., a fresh checkout, a worktree at `origin/main`, + or `git checkout origin/main -- backend/app frontend/src` followed + by a clean revert) so the committed baseline does not over- or + under-count relative to the merge target. + - Run `--update-baseline` to materialize the counts; confirm the + resulting file is exactly two non-comment data lines (one per + scoped path) sorted lexicographically. + - Observable: the baseline file is committed to + `.kiro/specs/i18n-ci-guard/baseline.txt` and `python scripts/ci/i18n_cjk_guard.py` + against the same `main`-aligned tree exits 0. + - _Requirements: 4.1, 4.2_ + - _Boundary: baseline.txt_ + +- [x] 5. Wire the guard into GitHub Actions on every PR to `main` +- [x] 5.1 Add the PR-time workflow + - Create the workflow file at the path designated by the design, + triggered on `pull_request` whose base ref is `main`. + - Set explicit minimal permissions (`contents: read`), a one-minute + job timeout, `actions/checkout@v4` with `fetch-depth: 1`, and + `actions/setup-python@v5` pinned to Python 3.11. 
+ - The single executable step invokes the guard script with no + arguments; the workflow surfaces the script's stdout and stderr in + the GitHub Actions log without filtering. + - Observable: the workflow YAML parses cleanly; on a PR with no CJK + regression, the job passes; on a PR that introduces a CJK regression + or CJK in en.json, the job fails and the log shows the documented + failure messages. + - _Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 5.6_ + - _Boundary: i18n-cjk-guard.yml_ + +- [x] 6. Validation: tests and end-to-end checks +- [x] 6.1 Add unit and integration tests for the guard script + - Cover the locale scan against a synthetic clean catalogue and a + synthetic CJK-tainted catalogue, asserting findings tuples match. + - Cover the per-path counter against a tmp git repo with both N>0 + and N=0 planted CJK lines, asserting the zero-match path exits + cleanly with a count of 0. + - Cover the baseline read/write round-trip and the malformed-input + rejection path. + - Cover the default mode end-to-end (pass and fail paths) with the + expected exit codes and stderr fragments, including the verbatim + refresh command on regression failure. + - Observable: `python -m pytest scripts/ci/tests/test_i18n_cjk_guard.py` + from the repo root passes locally with stdlib-only Python. + - _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.4, 2.5, 2.6, 3.3, 4.3, 4.5, 6.1, 6.3_ + - _Boundary: scripts/ci/tests/_ + +- [x] 6.2 Run the guard locally to confirm reproducibility against the committed baseline + - From a clean working tree at `main` (or a worktree at `origin/main` + + this branch's new files merged on top), invoke the guard with no + arguments and confirm exit code 0 and the success summary. + - Confirm the same command is the documented developer entry point + referenced from the failure-message refresh hint. 
+ - Observable: terminal session shows exit code 0 and the documented + one-line per-check success summary; the same script path (`scripts/ci/i18n_cjk_guard.py`) + appears verbatim in the regression-failure refresh hint. + - _Requirements: 6.1, 6.2, 6.3_ + - _Boundary: i18n_cjk_guard.py, baseline.txt_ diff --git a/scripts/ci/i18n_cjk_guard.py b/scripts/ci/i18n_cjk_guard.py new file mode 100755 index 00000000..dd955826 --- /dev/null +++ b/scripts/ci/i18n_cjk_guard.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 +"""i18n CJK guard for pull-request CI. + +Run from the repository root:: + + python scripts/ci/i18n_cjk_guard.py + python scripts/ci/i18n_cjk_guard.py --update-baseline + +Two checks always run (no short-circuit): + +* ``locales/en.json`` must contain zero CJK characters + (range ``U+4E00..U+9FFF``). +* CJK match counts under ``backend/app/`` and ``frontend/src/`` must not + exceed the committed per-path baseline at + ``.kiro/specs/i18n-ci-guard/baseline.txt``. + +Both checks rely on the canonical scan +``git grep -nIP '[\\x{4e00}-\\x{9fff}]' -- `` so the guard +stays bytewise-aligned with the broader audit pipeline. + +Stdlib only. Exit code is 0 on success and 1 on any failure or hard +error. +""" +from __future__ import annotations + +import argparse +import json +import os +import re +import subprocess +import sys +from pathlib import Path + +CJK_RE: re.Pattern[str] = re.compile(r"[一-鿿]") +CJK_PATTERN: str = r"[\x{4e00}-\x{9fff}]" +SCOPED_PATHS: tuple[str, ...] 
= ("backend/app", "frontend/src") +EN_JSON_REL_PATH: str = "locales/en.json" +DEFAULT_BASELINE_REL_PATH: str = ".kiro/specs/i18n-ci-guard/baseline.txt" +SNIPPET_MAX_LEN: int = 80 +REFRESH_COMMAND: str = "python scripts/ci/i18n_cjk_guard.py --update-baseline" +REFRESH_HINT: str = f"# refresh via: {REFRESH_COMMAND}" + +LocaleFinding = tuple[str, int, str] + + +class BaselineError(Exception): + """Raised when the baseline file is missing or malformed.""" + + +def _truncate(text: str, limit: int = SNIPPET_MAX_LEN) -> str: + if len(text) <= limit: + return text + return text[: limit - 3] + "..." + + +def _flatten(prefix: str, value: object, out: list[tuple[str, object]]) -> None: + if isinstance(value, dict): + for key, child in value.items(): + child_prefix = f"{prefix}.{key}" if prefix else str(key) + _flatten(child_prefix, child, out) + else: + out.append((prefix, value)) + + +def _value_line_number(text_lines: list[str], value: str) -> int: + """Best-effort line number for ``value`` in the original JSON text. + + Tries the raw value first (matches when the JSON file was written with + ``ensure_ascii=False``), then the JSON-escaped form, then falls back to + line 1 so callers always have a usable integer. + """ + candidates: list[str] = [value] + escaped = json.dumps(value)[1:-1] + if escaped not in candidates: + candidates.append(escaped) + for candidate in candidates: + if not candidate: + continue + for index, line in enumerate(text_lines, start=1): + if candidate in line: + return index + return 1 + + +def scan_locale_cjk(en_json_path: Path) -> list[LocaleFinding]: + """Return ``(dotted_key, line_number, snippet)`` for every CJK leaf. + + Args: + en_json_path: Path to ``locales/en.json``. + + Returns: + A list of findings in document order. Empty when the catalogue is + CJK-clean. Non-string leaves and empty strings are skipped. + + Raises: + FileNotFoundError: If ``en_json_path`` does not exist. + json.JSONDecodeError: If the file is not valid JSON. 
+ """ + raw = en_json_path.read_text(encoding="utf-8") + data = json.loads(raw) + flat: list[tuple[str, object]] = [] + _flatten("", data, flat) + text_lines = raw.splitlines() + findings: list[LocaleFinding] = [] + for key, value in flat: + if not isinstance(value, str) or not value: + continue + if not CJK_RE.search(value): + continue + line_no = _value_line_number(text_lines, value) + findings.append((key, line_no, _truncate(value))) + return findings + + +def count_path_cjk(repo_root: Path, scoped_path: str) -> int: + """Count CJK match lines under ``scoped_path`` via ``git grep -nIP``. + + Args: + repo_root: Working-tree root used as ``git`` CWD. + scoped_path: Repo-relative path to scan (e.g. ``backend/app``). + + Returns: + The number of matching tracked-text lines. ``-I`` excludes binary + files; untracked files are excluded by default. + + Raises: + RuntimeError: If ``git grep`` fails for any reason other than + "no matches" (exit code 1, which is treated as zero matches). + """ + cmd = ["git", "grep", "-nIP", CJK_PATTERN, "--", scoped_path] + proc = subprocess.run( + cmd, + cwd=repo_root, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if proc.returncode not in (0, 1): + raise RuntimeError( + f"git grep failed (exit {proc.returncode}) for {scoped_path}: " + f"{proc.stderr.strip()}" + ) + if not proc.stdout: + return 0 + return sum(1 for line in proc.stdout.splitlines() if line) + + +def read_baseline(baseline_path: Path) -> dict[str, int]: + """Parse the baseline file and return ``{scoped_path: count}``. + + Args: + baseline_path: Absolute path to the baseline file. + + Returns: + A dict keyed by scoped path with non-negative integer counts. + + Raises: + BaselineError: If the file is missing or contains a malformed line. 
def read_baseline(baseline_path: Path) -> dict[str, int]:
    """Parse the baseline file and return ``{scoped_path: count}``.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    must be a path, one tab, and a non-negative decimal count.

    Args:
        baseline_path: Absolute path to the baseline file.

    Returns:
        A dict keyed by scoped path with non-negative integer counts.

    Raises:
        BaselineError: If the file is missing, unreadable, not UTF-8, or
            contains a malformed line.
    """
    try:
        text = baseline_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Read first and handle failure: covers a missing file as well as a
        # directory, a permission problem or undecodable bytes.
        raise BaselineError(
            f"{baseline_path}: missing or malformed; "
            f"refresh via: {REFRESH_COMMAND}"
        ) from exc
    counts: dict[str, int] = {}
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        if not line or line.startswith("#"):
            continue
        path, separator, count_str = line.partition("\t")
        # isascii() guards int(): str.isdigit() alone accepts characters such
        # as superscript digits that int() cannot parse.
        count_ok = count_str.isascii() and count_str.isdigit()
        if not separator or not path or not count_ok:
            raise BaselineError(
                f"{baseline_path}: malformed line {raw_line!r}; "
                f"expected '<path>\\t<count>'"
            )
        counts[path] = int(count_str)
    return counts
Sorted lexicographically.\n" + f"# Refresh via: {REFRESH_COMMAND}\n" + ) + body_lines = [f"{path}\t{counts[path]}" for path in sorted(counts)] + body = "\n".join(body_lines) + "\n" + contents = header + body + baseline_path.parent.mkdir(parents=True, exist_ok=True) + tmp = baseline_path.with_suffix(baseline_path.suffix + ".tmp") + tmp.write_text(contents, encoding="utf-8") + os.replace(tmp, baseline_path) + + +def _format_locale_finding(key: str, line_no: int, snippet: str) -> str: + return f"{EN_JSON_REL_PATH}:{line_no}: cjk-in-en: {key} = {snippet}" + + +def _format_regression_line(path: str, baseline: int, current: int) -> str: + delta = current - baseline + sign = "+" if delta > 0 else "" + return ( + f"{path}: cjk-regression: baseline={baseline} " + f"current={current} delta={sign}{delta}" + ) + + +def run_check(repo_root: Path, baseline_path: Path) -> int: + """Run both guard checks and return the script exit code. + + Args: + repo_root: Working-tree root passed to ``git grep``. + baseline_path: Path to the baseline file. + + Returns: + ``0`` when both checks pass, ``1`` otherwise. 
def _check_locale(repo_root: Path) -> bool:
    """Report CJK findings in the English catalogue on stderr.

    Returns:
        ``True`` when the catalogue exists, parses and is CJK-clean;
        ``False`` after printing the reason otherwise.
    """
    try:
        findings = scan_locale_cjk(repo_root / EN_JSON_REL_PATH)
    except FileNotFoundError:
        print(f"{EN_JSON_REL_PATH}: missing catalogue file", file=sys.stderr)
        return False
    except json.JSONDecodeError as exc:
        print(
            f"{EN_JSON_REL_PATH}: invalid JSON: {exc.msg}",
            file=sys.stderr,
        )
        return False
    except (OSError, UnicodeDecodeError) as exc:
        # Present but unreadable (directory, permissions, not UTF-8): fail
        # the check with a message instead of a traceback.
        print(f"{EN_JSON_REL_PATH}: unreadable: {exc}", file=sys.stderr)
        return False
    for key, line_no, snippet in findings:
        print(_format_locale_finding(key, line_no, snippet), file=sys.stderr)
    if findings:
        print(f"{len(findings)} issues", file=sys.stderr)
    return not findings


def run_check(repo_root: Path, baseline_path: Path) -> int:
    """Run both guard checks and return the script exit code.

    Failures are reported on stderr. The two ``OK`` summary lines are
    printed on stdout only when both checks pass.

    Args:
        repo_root: Working-tree root passed to ``git grep``.
        baseline_path: Path to the baseline file.

    Returns:
        ``0`` when both checks pass, ``1`` otherwise.
    """
    locale_ok = _check_locale(repo_root)

    try:
        baseline = read_baseline(baseline_path)
    except BaselineError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    try:
        current_counts = {
            path: count_path_cjk(repo_root, path) for path in SCOPED_PATHS
        }
    except RuntimeError as exc:
        print(f"git grep failed: {exc}", file=sys.stderr)
        return 1

    # A scoped path missing from the baseline is held to a baseline of 0.
    regressions = [
        _format_regression_line(
            path, baseline.get(path, 0), current_counts[path]
        )
        for path in SCOPED_PATHS
        if current_counts[path] > baseline.get(path, 0)
    ]
    if regressions:
        for line in regressions:
            print(line, file=sys.stderr)
        print(REFRESH_HINT, file=sys.stderr)
        return 1
    if not locale_ok:
        return 1

    per_path = ", ".join(
        f"{path}={current_counts[path]}<={baseline.get(path, 0)}"
        for path in SCOPED_PATHS
    )
    print("OK locales/en.json is CJK-clean")
    print(f"OK per-path counts within baseline ({per_path})")
    return 0
+ baseline_path: Target baseline file path; created if missing. + + Returns: + ``0`` on success. + """ + counts: dict[str, int] = {} + for path in SCOPED_PATHS: + counts[path] = count_path_cjk(repo_root, path) + write_baseline(baseline_path, counts) + print(f"baseline updated: {baseline_path}") + for path in sorted(counts): + print(f" {path}\t{counts[path]}") + return 0 + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="i18n_cjk_guard", + description=( + "PR-time guard: fail when locales/en.json contains CJK or when " + "backend/app + frontend/src CJK match counts exceed the " + "committed baseline." + ), + ) + parser.add_argument( + "--update-baseline", + action="store_true", + help=( + "overwrite the baseline file with current counts and exit 0" + ), + ) + parser.add_argument( + "--baseline", + type=Path, + default=None, + help=( + f"path to the baseline file (default: {DEFAULT_BASELINE_REL_PATH})" + ), + ) + parser.add_argument( + "--repo-root", + type=Path, + default=None, + help=( + "repository root (default: detected via " + "`git rev-parse --show-toplevel`)" + ), + ) + return parser + + +def _detect_repo_root(explicit: Path | None) -> Path: + if explicit is not None: + return explicit.resolve() + proc = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + if proc.returncode != 0: + raise RuntimeError( + f"unable to detect repository root: {proc.stderr.strip()}" + ) + return Path(proc.stdout.strip()) + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point. 
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, resolve paths and dispatch to the chosen action.

    Returns the process exit code: ``1`` when the repository root cannot be
    determined, otherwise whatever ``update_baseline`` or ``run_check``
    returns.
    """
    args = _build_parser().parse_args(argv)
    try:
        repo_root = _detect_repo_root(args.repo_root)
    except RuntimeError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    # An explicit --baseline wins; otherwise use the default under the root.
    chosen = args.baseline
    if chosen is None:
        chosen = repo_root / DEFAULT_BASELINE_REL_PATH
    baseline_path = chosen.resolve()

    action = update_baseline if args.update_baseline else run_check
    return action(repo_root, baseline_path)


if __name__ == "__main__":
    sys.exit(main())
def _git(repo: Path, *args: str) -> subprocess.CompletedProcess[str]:
    """Execute ``git`` with ``args`` inside ``repo``; raise on a non-zero exit."""
    command = ["git", *args]
    return subprocess.run(
        command,
        cwd=repo,
        check=True,
        capture_output=True,
        text=True,
    )


def _make_repo(tmp: Path) -> Path:
    """Turn ``tmp`` into a fresh git repository with a test identity."""
    setup_commands = (
        ("init", "-q", "-b", "main"),
        ("config", "user.email", "test@example.com"),
        ("config", "user.name", "Test"),
    )
    for git_args in setup_commands:
        _git(tmp, *git_args)
    return tmp


def _commit_file(repo: Path, rel: str, content: str | bytes) -> None:
    """Create ``rel`` inside ``repo`` with ``content`` and commit it."""
    target = repo / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    if isinstance(content, bytes):
        target.write_bytes(content)
    else:
        target.write_text(content, encoding="utf-8")
    _git(repo, "add", "--", rel)
    _git(repo, "commit", "-q", "-m", f"add {rel}")


class ScanLocaleCjkTests(unittest.TestCase):
    """``scan_locale_cjk`` returns one ``LocaleFinding`` per CJK leaf string."""

    def _scan(self, payload: dict, **dump_options: object) -> list:
        """Serialize ``payload`` to a temporary ``en.json`` and scan it."""
        with tempfile.TemporaryDirectory() as tmp:
            en_path = Path(tmp) / "en.json"
            en_path.write_text(
                json.dumps(payload, **dump_options), encoding="utf-8"
            )
            return guard.scan_locale_cjk(en_path)

    def test_clean_catalogue_returns_empty_list(self) -> None:
        catalogue = {"common": {"confirm": "Confirm", "cancel": "Cancel"}}
        self.assertEqual(self._scan(catalogue, indent=2), [])

    def test_planted_cjk_returns_one_finding(self) -> None:
        catalogue = {"common": {"confirm": "Confirm", "cancel": "取消"}}
        findings = self._scan(catalogue, indent=2, ensure_ascii=False)
        self.assertEqual(len(findings), 1)
        key, line_no, snippet = findings[0]
        self.assertEqual(key, "common.cancel")
        self.assertGreaterEqual(line_no, 1)
        self.assertIn("取消", snippet)

    def test_long_value_is_truncated(self) -> None:
        long_value = "前置" + ("x" * 200)
        findings = self._scan({"k": long_value}, ensure_ascii=False)
        self.assertEqual(len(findings), 1)
        snippet = findings[0][2]
        self.assertLessEqual(len(snippet), guard.SNIPPET_MAX_LEN)
class CountPathCjkTests(unittest.TestCase):
    """``count_path_cjk`` shells out to ``git grep -nIP``."""

    def _count_in_src(
        self,
        rel: str,
        content: str | bytes,
        untracked: dict[str, str] | None = None,
    ) -> int:
        """Commit one file, optionally drop untracked files, count ``src``."""
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            _commit_file(repo, rel, content)
            for name, text in (untracked or {}).items():
                (repo / name).write_text(text, encoding="utf-8")
            return guard.count_path_cjk(repo, "src")

    def test_returns_zero_for_empty_match(self) -> None:
        self.assertEqual(self._count_in_src("src/a.txt", "hello world\n"), 0)

    def test_counts_planted_cjk_lines(self) -> None:
        source = "# 一\nprint('hi')\n# 二三\nx = '四'\n"
        # Three lines contain CJK: # 一 ; # 二三 ; x = '四'.
        self.assertEqual(self._count_in_src("src/a.py", source), 3)

    def test_skips_binary_files(self) -> None:
        # A "binary" blob containing CJK bytes; -I should exclude it.
        blob = b"\x00\x01\x02\xe4\xb8\x80\x00\xff"
        self.assertEqual(self._count_in_src("src/blob.bin", blob), 0)

    def test_skips_untracked_files(self) -> None:
        count = self._count_in_src(
            "src/.gitkeep",
            "",
            untracked={"src/untracked.py": "x = '中'\n"},
        )
        self.assertEqual(count, 0)
class BaselineRoundTripTests(unittest.TestCase):
    """``read_baseline`` and ``write_baseline`` round-trip cleanly."""

    def setUp(self) -> None:
        # One scratch directory per test, removed automatically afterwards.
        scratch = tempfile.TemporaryDirectory()
        self.addCleanup(scratch.cleanup)
        self.scratch = Path(scratch.name)

    def test_round_trip(self) -> None:
        path = self.scratch / "baseline.txt"
        counts = {"backend/app": 2792, "frontend/src": 902}
        guard.write_baseline(path, counts)
        self.assertTrue(path.read_text().endswith("\n"))
        self.assertEqual(guard.read_baseline(path), counts)

    def test_sorted_lexicographically_and_single_trailing_newline(self) -> None:
        path = self.scratch / "baseline.txt"
        guard.write_baseline(path, {"frontend/src": 1, "backend/app": 2})
        text = path.read_text(encoding="utf-8")
        data_lines = []
        for line in text.splitlines():
            if not line.startswith("#"):
                data_lines.append(line)
        self.assertEqual(data_lines, ["backend/app\t2", "frontend/src\t1"])
        self.assertTrue(text.endswith("\n"))
        self.assertFalse(text.endswith("\n\n"))

    def test_missing_file_raises_baseline_error(self) -> None:
        absent = self.scratch / "missing.txt"
        with self.assertRaises(guard.BaselineError):
            guard.read_baseline(absent)

    def test_malformed_line_raises_baseline_error(self) -> None:
        path = self.scratch / "baseline.txt"
        # Space-separated instead of tab-separated.
        path.write_text("# header\nbackend/app 100\n", encoding="utf-8")
        with self.assertRaises(guard.BaselineError):
            guard.read_baseline(path)
class RunCheckEndToEndTests(unittest.TestCase):
    """End-to-end test of ``run_check`` against a synthetic repo."""

    def _make_full_repo(
        self,
        tmp: Path,
        *,
        en_json: dict,
        backend_lines: int,
        frontend_lines: int,
    ) -> tuple[Path, Path]:
        """Build a repo with a catalogue and planted CJK lines.

        Returns ``(repo, baseline_path)``; the baseline file itself is not
        written so each test can choose its own counts.
        """
        repo = _make_repo(tmp)
        _commit_file(
            repo,
            "locales/en.json",
            json.dumps(en_json, indent=2, ensure_ascii=False),
        )
        if backend_lines:
            content = "\n".join(f"# 中{i}" for i in range(backend_lines)) + "\n"
            _commit_file(repo, "backend/app/x.py", content)
        else:
            _commit_file(repo, "backend/app/.gitkeep", "")
        if frontend_lines:
            content = "\n".join(f"// 中{i}" for i in range(frontend_lines)) + "\n"
            _commit_file(repo, "frontend/src/x.js", content)
        else:
            _commit_file(repo, "frontend/src/.gitkeep", "")
        return repo, repo / "baseline.txt"

    def test_pass_within_baseline(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=3,
                frontend_lines=2,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 5, "frontend/src": 5},
            )
            self.assertEqual(guard.run_check(repo, baseline_path), 0)

    def test_fail_on_locale_cjk(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "中文"},
                backend_lines=0,
                frontend_lines=0,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            self.assertEqual(guard.run_check(repo, baseline_path), 1)

    def test_fail_on_regression_with_refresh_hint(self) -> None:
        import contextlib
        import io

        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=10,
                frontend_lines=0,
            )
            guard.write_baseline(
                baseline_path,
                {"backend/app": 5, "frontend/src": 0},
            )
            # redirect_stderr restores sys.stderr on exit, even on error.
            captured_err = io.StringIO()
            with contextlib.redirect_stderr(captured_err):
                rc = guard.run_check(repo, baseline_path)
            self.assertEqual(rc, 1)
            err_text = captured_err.getvalue()
            self.assertIn("cjk-regression", err_text)
            self.assertIn(
                "python scripts/ci/i18n_cjk_guard.py --update-baseline",
                err_text,
            )

    def test_missing_en_json_fails(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            _commit_file(repo, "backend/app/.gitkeep", "")
            _commit_file(repo, "frontend/src/.gitkeep", "")
            baseline_path = repo / "baseline.txt"
            guard.write_baseline(
                baseline_path,
                {"backend/app": 0, "frontend/src": 0},
            )
            self.assertEqual(guard.run_check(repo, baseline_path), 1)

    def test_missing_baseline_fails(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo, baseline_path = self._make_full_repo(
                Path(tmp),
                en_json={"k": "Confirm"},
                backend_lines=0,
                frontend_lines=0,
            )
            # Do not write the baseline.
            self.assertFalse(baseline_path.exists())
            self.assertEqual(guard.run_check(repo, baseline_path), 1)
class UpdateBaselineTests(unittest.TestCase):
    """``update_baseline`` writes current counts and exits 0."""

    def test_update_then_check_passes(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            repo = _make_repo(Path(tmp))
            committed = (
                ("locales/en.json", json.dumps({"k": "Confirm"}, indent=2)),
                ("backend/app/x.py", "# 一\n# 二\n"),
                ("frontend/src/.gitkeep", ""),
            )
            for rel, content in committed:
                _commit_file(repo, rel, content)
            baseline_path = repo / "baseline.txt"

            self.assertEqual(guard.update_baseline(repo, baseline_path), 0)

            counts = guard.read_baseline(baseline_path)
            self.assertEqual(counts["backend/app"], 2)
            self.assertEqual(counts["frontend/src"], 0)
            self.assertEqual(guard.run_check(repo, baseline_path), 0)


class CliSmokeTests(unittest.TestCase):
    """``main`` exposes the documented CLI surface."""

    def _run_cli(self, *cli_args: str) -> subprocess.CompletedProcess[str]:
        """Run the guard script in a subprocess and capture its output."""
        guard_script = _GUARD_DIR / "i18n_cjk_guard.py"
        return subprocess.run(
            [sys.executable, str(guard_script), *cli_args],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

    def test_help_flag_exits_zero(self) -> None:
        proc = self._run_cli("--help")
        self.assertEqual(proc.returncode, 0)
        for flag in ("--update-baseline", "--baseline", "--repo-root"):
            self.assertIn(flag, proc.stdout)

    def test_unknown_flag_exits_nonzero(self) -> None:
        proc = self._run_cli("--no-such-flag")
        self.assertNotEqual(proc.returncode, 0)


if __name__ == "__main__":
    unittest.main()