MicroFish/backend/tests/test_text_processor.py

65 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path
import sys
def _load_text_processor():
    """Load the ``TextProcessor`` class directly from its source file.

    The module at ``<backend>/app/services/text_processor.py`` is executed
    from its file path through ``importlib`` instead of a regular ``import``
    statement, under the module name ``app.services.text_processor_test``.
    The backend directory (one level above this tests directory) is put at
    the front of ``sys.path`` first -- presumably so that absolute imports
    made by the loaded module can resolve; confirm against that module.

    Returns:
        The ``TextProcessor`` attribute of the freshly executed module.

    Raises:
        ImportError: If no import spec or loader can be created for the
            module path.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    if str(backend_dir) not in sys.path:
        sys.path.insert(0, str(backend_dir))

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)
    # Validate BEFORE handing the spec to module_from_spec(): with a None
    # spec that call fails with an unrelated AttributeError, so a check
    # placed after it can never report the real problem.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor


# Resolved once at import time and shared by every test in this module.
TextProcessor = _load_text_processor()
def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Check that tool-facing notes and episode titles are removed.

    The sample mixes instructions aimed at the graph tool, episode
    codes/titles, and character facts. After preprocessing, the tool name
    and episode references must be gone while the character names remain.
    """
    raw_document = """
本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。
## 给 MiroFish / 知识图谱的约定(请先读)
1. **剧集怎样引用**S04E13 *Face Off* 不是角色,勿单独建 Agent 节点。
## Walter White一类
高中化学教师,已以 **Heisenberg** 身份与 Jesse 制毒。
7. *End Times*:利用 Brock 急病建构 Gus 陷害叙事。
8. *Face Off* 正史:与 Hector 合谋炸弹杀 Gus。
"""
    cleaned = TextProcessor.preprocess_text(raw_document)

    # (substring, whether it must still be present), checked in this order.
    expectations = (
        ("MiroFish", False),
        ("8. *Face Off*", False),
        ("S04E13", False),
        ("Walter White", True),
        ("Heisenberg", True),
        ("Gus", True),
    )
    for fragment, must_remain in expectations:
        assert (fragment in cleaned) is must_remain
def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Check that naming-table rows survive while episode noise is removed.

    The sample holds a naming-convention table followed by an episode
    code/title section. After preprocessing, the people and organisation
    text from the table must remain, and the episode code and titles must
    be gone. The output must also not contain "Graph Extraction Hints".
    """
    raw_document = """
## 命名规范(抽取实体时请合并为单一节点)
| Walter White | 亦称 Walt**Heisenberg** 仍指同一人,勿拆节点。 |
| Hank Schrader | DEA 探员。**DEA 为机构**,勿与 Hank 重复为同级「人物节点」。 |
## 剧集编号与惯用标题(防误抽)
| **S04E13** | *Face Off* |
- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
"""
    cleaned = TextProcessor.preprocess_text(raw_document)

    # (substring, whether it must still be present), checked in this order.
    expectations = (
        ("Graph Extraction Hints", False),
        ("Walter White", True),
        ("Heisenberg", True),
        ("DEA 为机构", True),
        ("S04E13", False),
        ("Face Off", False),
        ("Mandala", False),
    )
    for fragment, must_remain in expectations:
        assert (fragment in cleaned) is must_remain