Reduce structural noise before graph extraction

2026-04-27 00:02:59 +08:00 · 2026-04-27 00:02:59 +08:00 · 67fbbc16f8
parent fa0f6519b1
commit 67fbbc16f8
2 changed files with 146 additions and 11 deletions
--- a/backend/app/services/text_processor.py
+++ b/backend/app/services/text_processor.py
@ -2,13 +2,32 @@
 文本处理服务
 """
 import re
 from typing import List, Optional
 from ..utils.file_parser import FileParser, split_text_into_chunks
 class TextProcessor:
    """文本处理器"""
-    
+
    # Sections and notes like these are meta-text about "how to feed the
    # system" and should not be passed straight into graph extraction.
    #
    # Section-level patterns: each regex is tried with re.search
    # (re.IGNORECASE) against lines that start with "## ". A matching heading
    # is dropped and the lines after it are skipped until a later "## "
    # heading that does not match.
    GRAPH_META_SECTION_PATTERNS = (
        r"给\s*MiroFish\s*/\s*知识图谱的约定",
        r"剧集编号与惯用标题",
        r"防误抽",
    )
    # Line-level patterns: each regex is tried with re.search (re.IGNORECASE)
    # against individual lines outside a skipped section; a line matching any
    # of them is dropped on its own without affecting its neighbours.
    GRAPH_META_LINE_PATTERNS = (
        r"本文供\s*\*{0,2}MiroFish",
        r"供\s*MiroFish.*抽取.*推演",
        r"请勿.*Agent",
        r"请勿.*节点",
        r"非角色",
        r"抽取实体时请合并",
        r"平行假设放哪",
        r"勿把平行结局写进本文件正文",
    )
    @staticmethod
    def extract_from_files(file_paths: List[str]) -> str:
        """从多个文件提取文本"""
@ -31,6 +50,7 @@ class TextProcessor:
        Returns:
            文本块列表
        """
        text = TextProcessor.preprocess_text(text)
        return split_text_into_chunks(text, chunk_size, overlap)
    @staticmethod
@ -39,6 +59,8 @@ class TextProcessor:
        预处理文本
        - 移除多余空白
        - 标准化换行
        - 保守清理“说明层/导航层”文本
        - 删除剧集编号与英文单集标题等非主体锚点
        Args:
            text: 原始文本
@ -46,18 +68,68 @@ class TextProcessor:
        Returns:
            处理后的文本
        """
        import re
        # 标准化换行
        text = text.replace('\r\n', '\n').replace('\r', '\n')
-        
+
-        # 移除连续空行（保留最多两个换行）
+        raw_lines = [line.strip() for line in text.split('\n')]
        title_pattern = re.compile(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)")
        non_subject_terms = set()
        for line in raw_lines:
            if "惯用标题" in line or ("|" in line and re.search(r"\bS\d{2}E\d{2}\b", line)):
                for match in title_pattern.findall(line):
                    non_subject_terms.add(match.strip())
        cleaned_lines: List[str] = []
        skip_section = False
        in_naming_section = False
        for line in raw_lines:
            if not line:
                cleaned_lines.append("")
                continue
            if line.startswith("## "):
                in_naming_section = "命名规范" in line
                if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_SECTION_PATTERNS):
                    skip_section = True
                    continue
                skip_section = False
            if skip_section:
                continue
            if in_naming_section and "|" in line:
                columns = [part.strip() for part in line.strip("|").split("|")]
                if len(columns) >= 2:
                    canonical = re.sub(r"\*+", "", columns[0]).strip()
                    note = columns[1]
                    line = f"| {canonical} | {note.strip()} |"
            if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS):
                continue
            line = re.sub(r"\bS\d{2}E\d{2}\b", "", line)
            line = re.sub(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)", "", line)
            for term in sorted(non_subject_terms, key=len, reverse=True):
                line = re.sub(rf"\*{{0,3}}{re.escape(term)}\*{{0,3}}", "", line)
            if re.fullmatch(r"[|\-:\s]+", line):
                continue
            simplified = re.sub(r"[\s|:\-*`_]+", "", line)
            if not simplified:
                continue
            line = re.sub(r"\s{2,}", " ", line).strip()
            if not line:
                continue
            cleaned_lines.append(line)
        text = "\n".join(cleaned_lines)
        text = re.sub(r'\n{3,}', '\n\n', text)
        # 移除行首行尾空白
        lines = [line.strip() for line in text.split('\n')]
        text = '\n'.join(lines)
        return text.strip()
    @staticmethod
@ -68,4 +140,3 @@ class TextProcessor:
            "total_lines": text.count('\n') + 1,
            "total_words": len(text.split()),
        }
--- a/backend/tests/test_text_processor.py
+++ b/backend/tests/test_text_processor.py
@ -0,0 +1,64 @@
 from importlib.util import module_from_spec, spec_from_file_location
 from pathlib import Path
 import sys
 def _load_text_processor():
    """Load ``TextProcessor`` directly from its source file.

    The module is executed from ``app/services/text_processor.py`` under the
    synthetic name ``app.services.text_processor_test`` instead of being
    imported through the package. The backend directory is put on
    ``sys.path`` first (presumably so the module's relative imports of the
    ``app`` package resolve -- confirm against the package layout).

    Returns:
        The ``TextProcessor`` class defined by the loaded module.

    Raises:
        ImportError: if no import spec or loader can be built for the file.
    """
    backend_dir = Path(__file__).resolve().parents[1]
    backend_path = str(backend_dir)
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)

    module_path = backend_dir / "app" / "services" / "text_processor.py"
    spec = spec_from_file_location("app.services.text_processor_test", module_path)
    # Validate *before* use: module_from_spec(None) would otherwise fail with
    # an opaque AttributeError and the guard would never be reached.
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {module_path}")

    module = module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.TextProcessor
 # Resolve the class once at import time; the tests below all use this binding.
 TextProcessor = _load_text_processor()
 def test_preprocess_text_removes_graph_meta_and_episode_titles():
    """Meta instructions and episode markers go; character content stays."""
    sample = """
 本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。
 ## 给 MiroFish / 知识图谱的约定（请先读）
 1. **剧集怎样引用**：S04E13 *Face Off* 不是角色，勿单独建 Agent 节点。
 ## Walter White（一类）
 高中化学教师，已以 **Heisenberg** 身份与 Jesse 制毒。
 7. *End Times*：利用 Brock 急病建构 Gus 陷害叙事。
 8. *Face Off* 正史：与 Hector 合谋炸弹杀 Gus。
 """
    cleaned = TextProcessor.preprocess_text(sample)

    # Tool-facing notes, the italic episode title and the episode code
    # must all be gone from the cleaned text.
    for noise in ("MiroFish", "8. *Face Off*", "S04E13"):
        assert noise not in cleaned

    # The character heading, alias and plot participants must survive.
    for kept in ("Walter White", "Heisenberg", "Gus"):
        assert kept in cleaned
 def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
    """Naming-table rows are kept while the episode-title section is dropped."""
    sample = """
 ## 命名规范（抽取实体时请合并为单一节点）
 | Walter White | 亦称 Walt；**Heisenberg** 仍指同一人，勿拆节点。 |
 | Hank Schrader | DEA 探员。**DEA 为机构**，勿与 Hank 重复为同级「人物节点」。 |
 ## 剧集编号与惯用标题（防误抽）
 | **S04E13** | *Face Off* |
 - **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
 """
    cleaned = TextProcessor.preprocess_text(sample)

    # NOTE(review): the fixture never contains "Graph Extraction Hints", so
    # this check can only fail if the processor itself emits that text --
    # confirm it is still a meaningful guard.
    assert "Graph Extraction Hints" not in cleaned

    # Canonical names and the notes from the naming table are preserved.
    for kept in ("Walter White", "Heisenberg", "DEA 为机构"):
        assert kept in cleaned

    # Episode code and episode titles must not leak into the output.
    for noise in ("S04E13", "Face Off", "Mandala"):
        assert noise not in cleaned