MicroFish/.kiro/specs/i18n-e2e-english-verification/audit/scripts/audit_cjk.sh

63 lines
1.7 KiB
Bash
Executable File

#!/usr/bin/env bash
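# Run the canonical CJK grep with PCRE, then write the raw output and a
# bucketed summary partitioned by top-level path. Binary-file matches
# (e.g. .jpeg) are excluded, since ripgrep / git grep would otherwise
# report them alongside the text matches.
#
# Usage: audit_cjk.sh <sha-dir>
# Writes <sha-dir>/cjk-grep.txt (raw matches) and
# <sha-dir>/cjk-grep-bucketed.txt (matches grouped by top-level path).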
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Exactly one argument is accepted: the directory that receives the audit
# output. Any other argument count prints usage to stderr and exits 64.
case "$#" in
  1) ;;
  *)
    printf 'usage: %s <sha-dir>\n' "$0" >&2
    exit 64
    ;;
esac

# Both report files live directly inside the output directory, which is
# created (with any missing parents) when it does not exist yet.
sha_dir="$1"
raw="${sha_dir}/cjk-grep.txt"
bucketed="${sha_dir}/cjk-grep-bucketed.txt"
mkdir -p "${sha_dir}"
#######################################
# Canonical PCRE grep against the three top-level paths owned by this audit.
# git grep -P uses PCRE2 - ranges like \x{4e00}-\x{9fff} are valid here.
# `-I` (--no-binary) excludes binary-file matches outright so the audit
# reports only text content.
#
# Arguments: $1 - file that receives the raw `path:line:content` matches
# Outputs:   diagnostics on stderr when the grep or the write fails
# Returns:   0 when the grep ran (with or without matches); non-zero when
#            the destination is unwritable or git grep itself failed
#######################################
collect_cjk_matches() {
  local out_file=$1
  local grep_status=0

  # Truncate first so an unwritable destination is reported as a write
  # failure instead of being mistaken for "no matches" (both are status 1).
  if ! true > "${out_file}"; then
    printf 'error: cannot write %s\n' "${out_file}" >&2
    return 1
  fi

  git grep -nIP '[\x{4e00}-\x{9fff}]' \
    -- backend/app frontend/src locales/en.json \
    >> "${out_file}" \
    || grep_status=$?

  # git grep exits 1 when nothing matched, which is a valid (clean) audit
  # result. Anything above 1 means the grep itself failed (for example a
  # fatal git error), so the output must not be presented as "0 lines".
  if ((grep_status > 1)); then
    printf 'error: git grep failed with status %d; %s is incomplete\n' \
      "${grep_status}" "${out_file}" >&2
    return "${grep_status}"
  fi
  return 0
}

# Called as a plain command so the script-level `set -e` aborts the audit
# with the helper's status when the grep could not be run.
collect_cjk_matches "${raw}"
# awk program that groups the raw match lines by top-level path. The path of
# a record is the text before its first ":". Each bucket is printed in a
# fixed order as a "[name] (N lines)" header, its lines in input order, and
# a trailing blank line; buckets without matches still get a header.
awk_script='
BEGIN {
  # Field 1 is the path portion of a "path:line:content" record.
  FS = ":"
  bucket_total = split("backend/app frontend/src locales/en.json other", bucket_names, " ")
}

# True when text begins with prefix.
function has_prefix(text, prefix) {
  return index(text, prefix) == 1
}

# Map a path to the name of the bucket it is reported under.
function classify(file) {
  if (has_prefix(file, "backend/app/")) return "backend/app"
  if (has_prefix(file, "frontend/src/")) return "frontend/src"
  if (has_prefix(file, "locales/en.json")) return "locales/en.json"
  return "other"
}

{
  name = classify($1)
  tally[name]++
  # Keep each record newline-terminated so the bucket prints verbatim.
  body[name] = body[name] $0 "\n"
}

END {
  for (k = 1; k <= bucket_total; k++) {
    name = bucket_names[k]
    printf("[%s] (%d lines)\n", name, tally[name])
    # The extra newline is the blank separator after every bucket.
    printf("%s\n", body[name])
  }
}
'
awk "${awk_script}" "${raw}" > "${bucketed}"
# Summarize the run on stdout: the number of raw match lines, then a note
# that the bucketed report was written.
match_total=$(wc -l < "${raw}")
# Some wc implementations pad the count with spaces; drop them.
match_total=${match_total// /}
printf ' cjk-grep.txt: %s lines\n' "${match_total}"
printf ' cjk-grep-bucketed.txt: written\n'