Merge 67fbbc16f8 into 96096ea0ff

2026-05-28 17:41:09 -04:00 · 2026-05-28 17:41:09 -04:00 · 4ae911aef2
parent 96096ea0ff 67fbbc16f8
commit 4ae911aef2
2 changed files with 146 additions and 11 deletions
--- a/backend/app/services/text_processor.py
+++ b/backend/app/services/text_processor.py
@ -2,6 +2,7 @@
 文本处理服务
 """

+import re
 from typing import List, Optional
 from ..utils.file_parser import FileParser, split_text_into_chunks

@ -9,6 +10,24 @@ from ..utils.file_parser import FileParser, split_text_into_chunks
 class TextProcessor:
    """文本处理器"""

+    # Sections and notes like these are meta-text about "how to feed the system" and should not go straight into graph extraction.
+    GRAPH_META_SECTION_PATTERNS = (
+        r"给\s*MiroFish\s*/\s*知识图谱的约定",
+        r"剧集编号与惯用标题",
+        r"防误抽",
+    )
+
+    GRAPH_META_LINE_PATTERNS = (
+        r"本文供\s*\*{0,2}MiroFish",
+        r"供\s*MiroFish.*抽取.*推演",
+        r"请勿.*Agent",
+        r"请勿.*节点",
+        r"非角色",
+        r"抽取实体时请合并",
+        r"平行假设放哪",
+        r"勿把平行结局写进本文件正文",
+    )
+
    @staticmethod
    def extract_from_files(file_paths: List[str]) -> str:
        """从多个文件提取文本"""
@ -31,6 +50,7 @@ class TextProcessor:
        Returns:
            文本块列表
        """
+        text = TextProcessor.preprocess_text(text)
        return split_text_into_chunks(text, chunk_size, overlap)
    
    @staticmethod
@ -39,6 +59,8 @@ class TextProcessor:
        预处理文本
        - 移除多余空白
        - 标准化换行
+        - 保守清理“说明层/导航层”文本
+        - 删除剧集编号与英文单集标题等非主体锚点
        
        Args:
            text: 原始文本
@ -46,18 +68,68 @@ class TextProcessor:
        Returns:
            处理后的文本
        """
-        import re
-        
        # 标准化换行
        text = text.replace('\r\n', '\n').replace('\r', '\n')

-        # 移除连续空行（保留最多两个换行）
+        raw_lines = [line.strip() for line in text.split('\n')]
+        title_pattern = re.compile(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)")
+        non_subject_terms = set()
+
+        for line in raw_lines:
+            if "惯用标题" in line or ("|" in line and re.search(r"\bS\d{2}E\d{2}\b", line)):
+                for match in title_pattern.findall(line):
+                    non_subject_terms.add(match.strip())
+
+        cleaned_lines: List[str] = []
+        skip_section = False
+        in_naming_section = False
+
+        for line in raw_lines:
+            if not line:
+                cleaned_lines.append("")
+                continue
+
+            if line.startswith("## "):
+                in_naming_section = "命名规范" in line
+                if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_SECTION_PATTERNS):
+                    skip_section = True
+                    continue
+                skip_section = False
+
+            if skip_section:
+                continue
+
+            if in_naming_section and "|" in line:
+                columns = [part.strip() for part in line.strip("|").split("|")]
+                if len(columns) >= 2:
+                    canonical = re.sub(r"\*+", "", columns[0]).strip()
+                    note = columns[1]
+                    line = f"| {canonical} | {note.strip()} |"
+
+            if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS):
+                continue
+
+            line = re.sub(r"\bS\d{2}E\d{2}\b", "", line)
+            line = re.sub(r"(?<!\*)\*([A-Za-z][A-Za-z0-9 '\-]{1,60})\*(?!\*)", "", line)
+
+            for term in sorted(non_subject_terms, key=len, reverse=True):
+                line = re.sub(rf"\*{{0,3}}{re.escape(term)}\*{{0,3}}", "", line)
+
+            if re.fullmatch(r"[|\-:\s]+", line):
+                continue
+
+            simplified = re.sub(r"[\s|:\-*`_]+", "", line)
+            if not simplified:
+                continue
+
+            line = re.sub(r"\s{2,}", " ", line).strip()
+            if not line:
+                continue
+
+            cleaned_lines.append(line)
+
+        text = "\n".join(cleaned_lines)
        text = re.sub(r'\n{3,}', '\n\n', text)
-        
-        # 移除行首行尾空白
-        lines = [line.strip() for line in text.split('\n')]
-        text = '\n'.join(lines)
-        
        return text.strip()
    
    @staticmethod
@ -68,4 +140,3 @@ class TextProcessor:
            "total_lines": text.count('\n') + 1,
            "total_words": len(text.split()),
        }
-
--- a/backend/tests/test_text_processor.py
+++ b/backend/tests/test_text_processor.py
@ -0,0 +1,64 @@
+from importlib.util import module_from_spec, spec_from_file_location
+from pathlib import Path
+import sys
+
+
+def _load_text_processor():
+    """Load text_processor.py by file path (backend dir on sys.path) and return TextProcessor."""
+    backend_dir = Path(__file__).resolve().parents[1]
+    if str(backend_dir) not in sys.path:
+        sys.path.insert(0, str(backend_dir))
+    module_path = backend_dir / "app" / "services" / "text_processor.py"
+    spec = spec_from_file_location("app.services.text_processor_test", module_path)
+    assert spec and spec.loader  # validate first: module_from_spec(None) would raise AttributeError
+    module = module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module.TextProcessor
+
+
+TextProcessor = _load_text_processor()
+
+
+def test_preprocess_text_removes_graph_meta_and_episode_titles():
+    """Meta lines/sections and italic episode titles are dropped; story text stays."""
+    text = """
+本文供 **MiroFish** 等工具做人物与关系底座抽取与推演。
+
+## 给 MiroFish / 知识图谱的约定（请先读）
+1. **剧集怎样引用**：S04E13 *Face Off* 不是角色，勿单独建 Agent 节点。
+
+## Walter White（一类）
+高中化学教师，已以 **Heisenberg** 身份与 Jesse 制毒。
+7. *End Times*：利用 Brock 急病建构 Gus 陷害叙事。
+8. *Face Off* 正史：与 Hector 合谋炸弹杀 Gus。
+"""
+    processed = TextProcessor.preprocess_text(text)
+
+    assert "MiroFish" not in processed
+    assert "Face Off" not in processed  # bare title, not just the "8. *Face Off*" markup form
+    assert "S04E13" not in processed
+    assert "Walter White" in processed
+    assert "Heisenberg" in processed
+    assert "Gus" in processed
+
+
+def test_preprocess_text_keeps_naming_table_but_strips_episode_title_noise():
+    """Naming-table rows survive while the episode-title section is removed entirely."""
+    text = """
+## 命名规范（抽取实体时请合并为单一节点）
+| Walter White | 亦称 Walt；**Heisenberg** 仍指同一人，勿拆节点。 |
+| Hank Schrader | DEA 探员。**DEA 为机构**，勿与 Hank 重复为同级「人物节点」。 |
+
+## 剧集编号与惯用标题（防误抽）
+| **S04E13** | *Face Off* |
+- **请勿**将 *Mandala*、*Face Off* 等注册为同级人物 Agent——它们是**集名**。
+"""
+    processed = TextProcessor.preprocess_text(text)
+
+    assert "惯用标题" not in processed  # heading of the skipped episode-title section
+    assert "Walter White" in processed
+    assert "Heisenberg" in processed
+    assert "DEA 为机构" in processed
+    assert "S04E13" not in processed
+    assert "Face Off" not in processed
+    assert "Mandala" not in processed