This commit is contained in:
YeJe-cpu 2026-05-28 17:41:09 -04:00 committed by GitHub
commit 4ae911aef2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 146 additions and 11 deletions

View File

@ -2,6 +2,7 @@
文本处理服务
"""
import re
from typing import List, Optional
from ..utils.file_parser import FileParser, split_text_into_chunks
@ -9,6 +10,24 @@ from ..utils.file_parser import FileParser, split_text_into_chunks
class TextProcessor:
"""文本处理器"""
# Sections and notes of this kind are meta-text about "how to feed the system";
# they should not be sent directly into graph extraction.
#
# Section-level patterns: preprocess_text searches these (case-insensitively)
# against lines that start with "## ". A matching heading is dropped and every
# following line is skipped until the next "## " heading that does not match.
GRAPH_META_SECTION_PATTERNS = (
r"\s*MiroFish\s*/\s*知识图谱的约定",
r"剧集编号与惯用标题",
r"防误抽",
)
# Line-level patterns: preprocess_text searches these (case-insensitively)
# against each remaining non-empty line and drops any line that matches.
GRAPH_META_LINE_PATTERNS = (
r"本文供\s*\*{0,2}MiroFish",
r"\s*MiroFish.*抽取.*推演",
r"请勿.*Agent",
r"请勿.*节点",
r"非角色",
r"抽取实体时请合并",
r"平行假设放哪",
r"勿把平行结局写进本文件正文",
)
@staticmethod
def extract_from_files(file_paths: List[str]) -> str:
"""从多个文件提取文本"""
@ -31,6 +50,7 @@ class TextProcessor:
Returns:
文本块列表
"""
text = TextProcessor.preprocess_text(text)
return split_text_into_chunks(text, chunk_size, overlap)
@staticmethod
@ -39,6 +59,8 @@ class TextProcessor:
预处理文本
- 移除多余空白
- 标准化换行
- 保守清理说明层/导航层文本
- 删除剧集编号与英文单集标题等非主体锚点
Args:
text: 原始文本
@ -46,18 +68,68 @@ class TextProcessor:
Returns:
处理后的文本
"""
import re
# 标准化换行
text = text.replace('\r\n', '\n').replace('\r', '\n')
# 移除连续空行(保留最多两个换行)
raw_lines = [line.strip() for line in text.split('\n')]
title_pattern = re.compile(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)")
non_subject_terms = set()
for line in raw_lines:
if "惯用标题" in line or ("|" in line and re.search(r"\bS\d{2}E\d{2}\b", line)):
for match in title_pattern.findall(line):
non_subject_terms.add(match.strip())
cleaned_lines: List[str] = []
skip_section = False
in_naming_section = False
for line in raw_lines:
if not line:
cleaned_lines.append("")
continue
if line.startswith("## "):
in_naming_section = "命名规范" in line
if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_SECTION_PATTERNS):
skip_section = True
continue
skip_section = False
if skip_section:
continue
if in_naming_section and "|" in line:
columns = [part.strip() for part in line.strip("|").split("|")]
if len(columns) >= 2:
canonical = re.sub(r"\*+", "", columns[0]).strip()
note = columns[1]
line = f"| {canonical} | {note.strip()} |"
if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS):
continue
line = re.sub(r"\bS\d{2}E\d{2}\b", "", line)
line = re.sub(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)", "", line)
for term in sorted(non_subject_terms, key=len, reverse=True):
line = re.sub(rf"\*{{0,3}}{re.escape(term)}\*{{0,3}}", "", line)
if re.fullmatch(r"[|\-:\s]+", line):
continue
simplified = re.sub(r"[\s|:\-*`_]+", "", line)
if not simplified:
continue
line = re.sub(r"\s{2,}", " ", line).strip()
if not line:
continue
cleaned_lines.append(line)
text = "\n".join(cleaned_lines)
text = re.sub(r'\n{3,}', '\n\n', text)
# 移除行首行尾空白
lines = [line.strip() for line in text.split('\n')]
text = '\n'.join(lines)
return text.strip()
@staticmethod
@ -68,4 +140,3 @@ class TextProcessor:
"total_lines": text.count('\n') + 1,
"total_words": len(text.split()),
}

View File

@ -0,0 +1,64 @@
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path
import sys
def _load_text_processor():
    """Load ``TextProcessor`` directly from ``app/services/text_processor.py``.

    The backend directory is taken to be the parent of the directory that
    contains this test file. It is inserted at the front of ``sys.path`` (once)
    before the module is executed -- presumably so the module's own
    package-relative imports can be resolved; confirm against the package
    layout.

    Returns:
        The ``TextProcessor`` attribute of the freshly executed module.

    Raises:
        ImportError: If no import spec (or no loader) can be built for the
            module path.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    backend_dir_str = str(backend_dir)
    if backend_dir_str not in sys.path:
        sys.path.insert(0, backend_dir_str)

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)
    # Validate the spec *before* using it: module_from_spec(None) would fail
    # with an opaque AttributeError, and an ``assert`` guard disappears under -O.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor
# Resolve the class once at import time; the tests below all use this binding.
TextProcessor = _load_text_processor()
def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Check which fragments survive ``preprocess_text`` on a meta-heavy sample.

    "MiroFish", "8. *Face Off*" and "S04E13" must be absent from the result,
    while "Walter White", "Heisenberg" and "Gus" must still be present.
    """
    sample = """
本文供 **MiroFish** 等工具做人物与关系底座抽取与推演
## 给 MiroFish / 知识图谱的约定(请先读)
1. **剧集怎样引用**S04E13 *Face Off* 不是角色勿单独建 Agent 节点
## Walter White一类
高中化学教师已以 **Heisenberg** 身份与 Jesse 制毒
7. *End Times*利用 Brock 急病建构 Gus 陷害叙事
8. *Face Off* 正史 Hector 合谋炸弹杀 Gus
"""
    # (substring, must it remain in the output?) -- evaluated in this order.
    expectations = (
        ("MiroFish", False),
        ("8. *Face Off*", False),
        ("S04E13", False),
        ("Walter White", True),
        ("Heisenberg", True),
        ("Gus", True),
    )

    result = TextProcessor.preprocess_text(sample)

    for needle, should_remain in expectations:
        assert (needle in result) is should_remain
def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Check a sample with a naming table followed by an episode-title section.

    The naming-table content ("Walter White", "Heisenberg", "DEA 为机构") must
    remain, while "S04E13", "Face Off" and "Mandala" must be gone. The output
    must also not contain the text "Graph Extraction Hints".
    """
    sample = """
## 命名规范(抽取实体时请合并为单一节点)
| Walter White | 亦称 Walt**Heisenberg** 仍指同一人勿拆节点 |
| Hank Schrader | DEA 探员**DEA 为机构**勿与 Hank 重复为同级人物节点 |
## 剧集编号与惯用标题(防误抽)
| **S04E13** | *Face Off* |
- **请勿** *Mandala**Face Off* 等注册为同级人物 Agent它们是**集名**
"""
    # (substring, must it remain in the output?) -- evaluated in this order.
    expectations = (
        ("Graph Extraction Hints", False),
        ("Walter White", True),
        ("Heisenberg", True),
        ("DEA 为机构", True),
        ("S04E13", False),
        ("Face Off", False),
        ("Mandala", False),
    )

    result = TextProcessor.preprocess_text(sample)

    for needle, should_remain in expectations:
        assert (needle in result) is should_remain