65 lines
2.2 KiB
Python
65 lines
2.2 KiB
Python
from importlib.util import module_from_spec, spec_from_file_location
|
||
from pathlib import Path
|
||
import sys
|
||
|
||
|
||
def _load_text_processor():
    """Load the ``TextProcessor`` class directly from its source file.

    The backend directory is taken to be the parent of the directory that
    contains this file. It is placed at the front of ``sys.path`` when it is
    not already listed, then ``app/services/text_processor.py`` beneath it is
    executed as a module named ``app.services.text_processor_test``.

    Returns:
        The ``TextProcessor`` attribute of the freshly executed module.

    Raises:
        ImportError: If no import spec, or no loader, can be created for the
            module path.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    backend_entry = str(backend_dir)
    if backend_entry not in sys.path:
        # NOTE(review): presumably needed so imports performed by the loaded
        # module can be resolved from the backend directory -- confirm.
        sys.path.insert(0, backend_entry)

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)

    # Validate the spec BEFORE handing it to module_from_spec(): that call
    # dereferences the spec, so checking afterwards could never report a
    # missing spec. An explicit raise also survives ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor
|
||
|
||
|
||
# Load TextProcessor once at import time; the tests in this module call
# TextProcessor.preprocess_text through this module-level name.
TextProcessor = _load_text_processor()
|
||
|
||
|
||
def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Check which fragments of a character-profile document survive preprocessing.

    The sample mixes tool-facing instructions and episode references with
    character facts. After ``TextProcessor.preprocess_text`` the tool name,
    the numbered episode-title entry and the episode code must be absent,
    while the character names must still be present.
    """
    raw_document = """
本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。

## 给 MiroFish / 知识图谱的约定(请先读)
1. **剧集怎样引用**:S04E13 *Face Off* 不是角色,勿单独建 Agent 节点。

## Walter White(一类)
高中化学教师,已以 **Heisenberg** 身份与 Jesse 制毒。
7. *End Times*:利用 Brock 急病建构 Gus 陷害叙事。
8. *Face Off* 正史:与 Hector 合谋炸弹杀 Gus。
"""

    cleaned = TextProcessor.preprocess_text(raw_document)

    # Fragments that must have been stripped from the output.
    for dropped in ("MiroFish", "8. *Face Off*", "S04E13"):
        assert dropped not in cleaned

    # Fragments that must still appear in the output.
    for retained in ("Walter White", "Heisenberg", "Gus"):
        assert retained in cleaned
|
||
|
||
|
||
def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Check that naming-table content is kept and episode-title content is dropped.

    The sample holds a naming-convention table followed by an episode-number
    section. After ``TextProcessor.preprocess_text`` the table's names and
    notes must still be present, while the episode code and episode titles
    must be absent, as must the literal "Graph Extraction Hints".
    """
    raw_document = """
## 命名规范(抽取实体时请合并为单一节点)
| Walter White | 亦称 Walt;**Heisenberg** 仍指同一人,勿拆节点。 |
| Hank Schrader | DEA 探员。**DEA 为机构**,勿与 Hank 重复为同级「人物节点」。 |

## 剧集编号与惯用标题(防误抽)
| **S04E13** | *Face Off* |
- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
"""

    cleaned = TextProcessor.preprocess_text(raw_document)

    # (fragment, should_be_present) pairs, evaluated in order.
    expectations = (
        ("Graph Extraction Hints", False),
        ("Walter White", True),
        ("Heisenberg", True),
        ("DEA 为机构", True),
        ("S04E13", False),
        ("Face Off", False),
        ("Mandala", False),
    )
    for fragment, should_be_present in expectations:
        assert (fragment in cleaned) is should_be_present
|