# Source path (truncated by the file viewer): MicroFish/backend/scripts/test_ontology_prompts_no_cj...
# Viewer metadata: 122 lines, 4.0 KiB, Python

"""Static guard: assert ontology prompt strings contain no CJK characters.

This script enforces the i18n contract for `ontology_generator.py` (issue #2):
the module-level system prompt constant and every string literal contributed
by `_build_user_message` (excluding the method's docstring) must contain
zero CJK characters.

Logger calls, docstrings, and inline comments elsewhere in the same module are
explicitly out of scope (issues #6 and #7) and are not inspected here. String
literals inside `_build_user_message` itself are all inspected, regardless of
the call they appear in.

The check is purely AST-based to avoid coupling to the heavy Flask /
LLM client import chain. Exit 0 on success, non-zero on regression.
"""
import ast
import os
import re
import sys
# Matches a single code point in U+4E00..U+9FFF (the CJK Unified Ideographs
# block). Kana, Hangul, fullwidth punctuation and the ideograph extension
# blocks lie outside this range and are therefore not detected.
CJK_PATTERN = re.compile(r"[一-鿿]")
def _string_literals_in_function(node: ast.FunctionDef) -> list[str]:
"""Return all string-literal payloads inside a function body, except the
function's own docstring.
Both plain strings (`ast.Constant` of type `str`) and f-strings
(`ast.JoinedStr`) are included. For f-strings, only the static text
portions (`ast.Constant` children) are returned — interpolation
placeholders cannot contain CJK literals, so they are irrelevant.
"""
docstring = ast.get_docstring(node, clean=False)
pieces: list[str] = []
for child in ast.walk(node):
if isinstance(child, ast.Constant) and isinstance(child.value, str):
pieces.append(child.value)
elif isinstance(child, ast.JoinedStr):
for part in child.values:
if isinstance(part, ast.Constant) and isinstance(part.value, str):
pieces.append(part.value)
if docstring is not None:
try:
pieces.remove(docstring)
except ValueError:
pass
return pieces
def _module_constant_value(tree: ast.Module, name: str) -> str:
for node in tree.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == name:
if isinstance(node.value, ast.Constant) and isinstance(
node.value.value, str
):
return node.value.value
raise SystemExit(f"Could not locate string constant '{name}' in source.")
def _find_method(tree: ast.Module, class_name: str, method_name: str) -> ast.FunctionDef:
for node in tree.body:
if isinstance(node, ast.ClassDef) and node.name == class_name:
for item in node.body:
if isinstance(item, ast.FunctionDef) and item.name == method_name:
return item
raise SystemExit(f"Could not locate method '{class_name}.{method_name}'.")
def _assert_no_cjk(label: str, text: str) -> int:
    """Report whether ``text`` contains any character matched by CJK_PATTERN.

    A clean result is printed to stdout. A failure is printed to stderr
    together with the number of matches and up to the first 30 matched
    characters.

    Args:
        label: Human-readable name of the inspected text, used in the report.
        text: The text to scan.

    Returns:
        0 when nothing matched, 1 otherwise.
    """
    hits = CJK_PATTERN.findall(text)
    if not hits:
        print(f"OK: {label} is CJK-free ({len(text)} chars inspected).")
        return 0

    # Cap the preview so a fully untranslated prompt does not flood stderr.
    preview = "".join(hits[:30])
    print(
        f"FAIL: {label} contains {len(hits)} CJK character(s). "
        f"First few: {preview!r}",
        file=sys.stderr,
    )
    return 1
def main() -> int:
    """Run both CJK checks against ``app/services/ontology_generator.py``.

    The target path is resolved relative to this script: two directory levels
    up from the script file, then ``app/services/ontology_generator.py``. The
    file is parsed with ``ast`` only and is never imported.

    Returns:
        0 when both checks pass, 1 when at least one fails. A missing
        constant or method ends the run through ``SystemExit`` raised by the
        lookup helpers.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    target = os.path.join(
        os.path.dirname(script_dir),
        "app",
        "services",
        "ontology_generator.py",
    )
    with open(target, "r", encoding="utf-8") as handle:
        tree = ast.parse(handle.read())

    # Check 1: the module-level system prompt constant.
    prompt_failures = _assert_no_cjk(
        "ONTOLOGY_SYSTEM_PROMPT",
        _module_constant_value(tree, "ONTOLOGY_SYSTEM_PROMPT"),
    )

    # Check 2: all string literals of the user-message builder, joined so a
    # single report covers the whole method.
    builder = _find_method(tree, "OntologyGenerator", "_build_user_message")
    joined_literals = "\n".join(_string_literals_in_function(builder))
    builder_failures = _assert_no_cjk(
        "_build_user_message string literals (excl. docstring)", joined_literals
    )

    failures = prompt_failures + builder_failures
    if not failures:
        print("\nAll CJK-regression checks passed.")
        return 0

    print(f"\n{failures} CJK-regression check(s) failed.", file=sys.stderr)
    return 1
# Run the checks only when executed as a script; main()'s return value is
# passed to sys.exit() and becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())