""" 文本处理服务 """ import re from typing import List, Optional from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: """文本处理器""" # 这类章节和说明更像“如何喂系统”的元文本,不应直接送入图谱抽取。 GRAPH_META_SECTION_PATTERNS = ( r"给\s*MiroFish\s*/\s*知识图谱的约定", r"剧集编号与惯用标题", r"防误抽", ) GRAPH_META_LINE_PATTERNS = ( r"本文供\s*\*{0,2}MiroFish", r"供\s*MiroFish.*抽取.*推演", r"请勿.*Agent", r"请勿.*节点", r"非角色", r"抽取实体时请合并", r"平行假设放哪", r"勿把平行结局写进本文件正文", ) @staticmethod def extract_from_files(file_paths: List[str]) -> str: """从多个文件提取文本""" return FileParser.extract_from_multiple(file_paths) @staticmethod def split_text( text: str, chunk_size: int = 500, overlap: int = 50 ) -> List[str]: """ 分割文本 Args: text: 原始文本 chunk_size: 块大小 overlap: 重叠大小 Returns: 文本块列表 """ text = TextProcessor.preprocess_text(text) return split_text_into_chunks(text, chunk_size, overlap) @staticmethod def preprocess_text(text: str) -> str: """ 预处理文本 - 移除多余空白 - 标准化换行 - 保守清理“说明层/导航层”文本 - 删除剧集编号与英文单集标题等非主体锚点 Args: text: 原始文本 Returns: 处理后的文本 """ # 标准化换行 text = text.replace('\r\n', '\n').replace('\r', '\n') raw_lines = [line.strip() for line in text.split('\n')] title_pattern = re.compile(r"(?= 2: canonical = re.sub(r"\*+", "", columns[0]).strip() note = columns[1] line = f"| {canonical} | {note.strip()} |" if any(re.search(pattern, line, re.IGNORECASE) for pattern in TextProcessor.GRAPH_META_LINE_PATTERNS): continue line = re.sub(r"\bS\d{2}E\d{2}\b", "", line) line = re.sub(r"(? dict: """获取文本统计信息""" return { "total_chars": len(text), "total_lines": text.count('\n') + 1, "total_words": len(text.split()), }