diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32acc..87ad7193 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -2,13 +2,32 @@ 文本处理服务 """ +import re from typing import List, Optional from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: """文本处理器""" - + + # 这类章节和说明更像“如何喂系统”的元文本,不应直接送入图谱抽取。 + GRAPH_META_SECTION_PATTERNS = ( + r"给\s*MiroFish\s*/\s*知识图谱的约定", + r"剧集编号与惯用标题", + r"防误抽", + ) + + GRAPH_META_LINE_PATTERNS = ( + r"本文供\s*\*{0,2}MiroFish", + r"供\s*MiroFish.*抽取.*推演", + r"请勿.*Agent", + r"请勿.*节点", + r"非角色", + r"抽取实体时请合并", + r"平行假设放哪", + r"勿把平行结局写进本文件正文", + ) + @staticmethod def extract_from_files(file_paths: List[str]) -> str: """从多个文件提取文本""" @@ -31,6 +50,7 @@ class TextProcessor: Returns: 文本块列表 """ + text = TextProcessor.preprocess_text(text) return split_text_into_chunks(text, chunk_size, overlap) @staticmethod @@ -39,6 +59,8 @@ class TextProcessor: 预处理文本 - 移除多余空白 - 标准化换行 + - 保守清理“说明层/导航层”文本 + - 删除剧集编号与英文单集标题等非主体锚点 Args: text: 原始文本 @@ -46,18 +68,68 @@ class TextProcessor: Returns: 处理后的文本 """ - import re - # 标准化换行 text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + raw_lines = [line.strip() for line in text.split('\n')] + title_pattern = re.compile(r"(?= 2: + canonical = re.sub(r"\*+", "", columns[0]).strip() + note = columns[1] + line = f"| {canonical} | {note.strip()} |" + + if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS): + continue + + line = re.sub(r"\bS\d{2}E\d{2}\b", "", line) + line = re.sub(r"(?