Merge 67fbbc16f8 into 96096ea0ff
This commit is contained in:
commit
4ae911aef2
|
|
@ -2,13 +2,32 @@
|
|||
文本处理服务
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Optional
|
||||
from ..utils.file_parser import FileParser, split_text_into_chunks
|
||||
|
||||
|
||||
class TextProcessor:
|
||||
"""文本处理器"""
|
||||
|
||||
|
||||
# 这类章节和说明更像“如何喂系统”的元文本,不应直接送入图谱抽取。
|
||||
GRAPH_META_SECTION_PATTERNS = (
|
||||
r"给\s*MiroFish\s*/\s*知识图谱的约定",
|
||||
r"剧集编号与惯用标题",
|
||||
r"防误抽",
|
||||
)
|
||||
|
||||
GRAPH_META_LINE_PATTERNS = (
|
||||
r"本文供\s*\*{0,2}MiroFish",
|
||||
r"供\s*MiroFish.*抽取.*推演",
|
||||
r"请勿.*Agent",
|
||||
r"请勿.*节点",
|
||||
r"非角色",
|
||||
r"抽取实体时请合并",
|
||||
r"平行假设放哪",
|
||||
r"勿把平行结局写进本文件正文",
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def extract_from_files(file_paths: List[str]) -> str:
|
||||
"""从多个文件提取文本"""
|
||||
|
|
@ -31,6 +50,7 @@ class TextProcessor:
|
|||
Returns:
|
||||
文本块列表
|
||||
"""
|
||||
text = TextProcessor.preprocess_text(text)
|
||||
return split_text_into_chunks(text, chunk_size, overlap)
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -39,6 +59,8 @@ class TextProcessor:
|
|||
预处理文本
|
||||
- 移除多余空白
|
||||
- 标准化换行
|
||||
- 保守清理“说明层/导航层”文本
|
||||
- 删除剧集编号与英文单集标题等非主体锚点
|
||||
|
||||
Args:
|
||||
text: 原始文本
|
||||
|
|
@ -46,18 +68,68 @@ class TextProcessor:
|
|||
Returns:
|
||||
处理后的文本
|
||||
"""
|
||||
import re
|
||||
|
||||
# 标准化换行
|
||||
text = text.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
# 移除连续空行(保留最多两个换行)
|
||||
|
||||
raw_lines = [line.strip() for line in text.split('\n')]
|
||||
title_pattern = re.compile(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)")
|
||||
non_subject_terms = set()
|
||||
|
||||
for line in raw_lines:
|
||||
if "惯用标题" in line or ("|" in line and re.search(r"\bS\d{2}E\d{2}\b", line)):
|
||||
for match in title_pattern.findall(line):
|
||||
non_subject_terms.add(match.strip())
|
||||
|
||||
cleaned_lines: List[str] = []
|
||||
skip_section = False
|
||||
in_naming_section = False
|
||||
|
||||
for line in raw_lines:
|
||||
if not line:
|
||||
cleaned_lines.append("")
|
||||
continue
|
||||
|
||||
if line.startswith("## "):
|
||||
in_naming_section = "命名规范" in line
|
||||
if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_SECTION_PATTERNS):
|
||||
skip_section = True
|
||||
continue
|
||||
skip_section = False
|
||||
|
||||
if skip_section:
|
||||
continue
|
||||
|
||||
if in_naming_section and "|" in line:
|
||||
columns = [part.strip() for part in line.strip("|").split("|")]
|
||||
if len(columns) >= 2:
|
||||
canonical = re.sub(r"\*+", "", columns[0]).strip()
|
||||
note = columns[1]
|
||||
line = f"| {canonical} | {note.strip()} |"
|
||||
|
||||
if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS):
|
||||
continue
|
||||
|
||||
line = re.sub(r"\bS\d{2}E\d{2}\b", "", line)
|
||||
line = re.sub(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)", "", line)
|
||||
|
||||
for term in sorted(non_subject_terms, key=len, reverse=True):
|
||||
line = re.sub(rf"\*{{0,3}}{re.escape(term)}\*{{0,3}}", "", line)
|
||||
|
||||
if re.fullmatch(r"[|\-:\s]+", line):
|
||||
continue
|
||||
|
||||
simplified = re.sub(r"[\s|:\-*`_]+", "", line)
|
||||
if not simplified:
|
||||
continue
|
||||
|
||||
line = re.sub(r"\s{2,}", " ", line).strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
cleaned_lines.append(line)
|
||||
|
||||
text = "\n".join(cleaned_lines)
|
||||
text = re.sub(r'\n{3,}', '\n\n', text)
|
||||
|
||||
# 移除行首行尾空白
|
||||
lines = [line.strip() for line in text.split('\n')]
|
||||
text = '\n'.join(lines)
|
||||
|
||||
return text.strip()
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -68,4 +140,3 @@ class TextProcessor:
|
|||
"total_lines": text.count('\n') + 1,
|
||||
"total_words": len(text.split()),
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,64 @@
|
|||
from importlib.util import module_from_spec, spec_from_file_location
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
|
||||
def _load_text_processor():
    """Load the ``TextProcessor`` class directly from its source file.

    The module is executed from ``app/services/text_processor.py`` below the
    backend directory (the parent of the directory holding this file) instead
    of going through a regular ``import`` statement. The backend directory is
    placed at the front of ``sys.path`` first, presumably so that imports
    performed by the loaded module can be resolved.

    Returns:
        The ``TextProcessor`` class defined by the loaded module.

    Raises:
        ImportError: If no import spec (or no loader) can be built for the
            source file.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    backend_path = str(backend_dir)
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)

    # Validate before module_from_spec(): that call dereferences the spec, so a
    # check placed after it could never trigger. An explicit exception is used
    # instead of ``assert`` because asserts are removed under ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor
|
||||
|
||||
|
||||
# Resolved once at import time; the tests below call this class directly.
TextProcessor = _load_text_processor()
|
||||
|
||||
|
||||
def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Meta text and episode anchors are removed while story content survives.

    The fixture contains an instruction line addressed to the tool, a whole
    "conventions" section, and body lines carrying italic English episode
    titles. Only the character section's actual content should remain.
    """
    text = """
本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。

## 给 MiroFish / 知识图谱的约定(请先读)
1. **剧集怎样引用**:S04E13 *Face Off* 不是角色,勿单独建 Agent 节点。

## Walter White(一类)
高中化学教师,已以 **Heisenberg** 身份与 Jesse 制毒。
7. *End Times*:利用 Brock 急病建构 Gus 陷害叙事。
8. *Face Off* 正史:与 Hector 合谋炸弹杀 Gus。
"""

    processed = TextProcessor.preprocess_text(text)

    # Instruction line and the conventions section are both gone.
    assert "MiroFish" not in processed
    assert "S04E13" not in processed

    # Italic episode titles are stripped from body lines. Checking the bare
    # titles as well catches a regression where only the asterisks disappear.
    assert "8. *Face Off*" not in processed
    assert "Face Off" not in processed
    assert "End Times" not in processed

    # Genuine subject matter is preserved.
    assert "Walter White" in processed
    assert "Heisenberg" in processed
    assert "Gus" in processed
|
||||
|
||||
def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Naming-table rows are kept; the episode-title section is dropped.

    The fixture has a naming-convention table whose rows describe real
    entities, followed by an episode-number/title section that exists only to
    warn against mis-extraction.
    """
    text = """
## 命名规范(抽取实体时请合并为单一节点)
| Walter White | 亦称 Walt;**Heisenberg** 仍指同一人,勿拆节点。 |
| Hank Schrader | DEA 探员。**DEA 为机构**,勿与 Hank 重复为同级「人物节点」。 |

## 剧集编号与惯用标题(防误抽)
| **S04E13** | *Face Off* |
- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
"""

    processed = TextProcessor.preprocess_text(text)

    # This string does not occur in the fixture, so the check only guards
    # against the processor injecting such a heading on its own.
    assert "Graph Extraction Hints" not in processed

    # Rows of the naming table survive, including bold non-title emphasis.
    assert "Walter White" in processed
    assert "Heisenberg" in processed
    assert "DEA 为机构" in processed

    # The episode section is removed entirely: heading, table row and bullet.
    assert "惯用标题" not in processed
    assert "S04E13" not in processed
    assert "Face Off" not in processed
    assert "Mandala" not in processed
|
||||
Loading…
Reference in New Issue