# MicroFish/backend/app/services/text_processor.py
#
# 72 lines
# 1.7 KiB
# Python

"""
Text processing service
"""
from typing import List, Optional
from ..utils.file_parser import FileParser, split_text_into_chunks
class TextProcessor:
    """Stateless helpers for extracting, chunking, cleaning and measuring text.

    Every method is a ``@staticmethod``; the class acts purely as a namespace.
    """

    @staticmethod
    def extract_from_files(file_paths: List[str]) -> str:
        """Extract text from multiple files.

        Thin wrapper that delegates to ``FileParser.extract_from_multiple``.

        Args:
            file_paths: paths of the files to extract text from.

        Returns:
            The extracted text as returned by the parser.
        """
        return FileParser.extract_from_multiple(file_paths)

    @staticmethod
    def split_text(
        text: str,
        chunk_size: int = 500,
        overlap: int = 50
    ) -> List[str]:
        """Split text into chunks.

        Thin wrapper that delegates to ``split_text_into_chunks``; the exact
        unit of ``chunk_size`` / ``overlap`` is defined by that helper
        (presumably characters -- confirm against ``utils.file_parser``).

        Args:
            text: raw text.
            chunk_size: chunk size.
            overlap: overlap size between neighbouring chunks.

        Returns:
            List of text chunks.
        """
        return split_text_into_chunks(text, chunk_size, overlap)

    @staticmethod
    def preprocess_text(text: str) -> str:
        """Normalize line endings and remove excess whitespace.

        Steps, in order:
        - Convert ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``.
        - Strip leading/trailing whitespace from each line.
        - Collapse runs of three or more newlines into exactly two
          (i.e. keep at most one blank line between paragraphs).
        - Strip leading/trailing whitespace from the whole text.

        Args:
            text: raw text.

        Returns:
            Processed text.
        """
        import re

        # Normalize line endings; '\r\n' must be handled before the lone '\r'
        # so a Windows line ending does not turn into two newlines.
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # Strip each line BEFORE collapsing blank lines: whitespace-only lines
        # only become empty after stripping, so collapsing first would leave
        # runs of more than two newlines in the result.
        text = '\n'.join(line.strip() for line in text.split('\n'))

        # Remove consecutive blank lines (keep at most two newlines).
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """Get basic text statistics.

        Args:
            text: text to measure.

        Returns:
            A dict with:
            - ``total_chars``: ``len(text)``.
            - ``total_lines``: number of ``\\n`` characters plus one, so an
              empty string is reported as one line.
            - ``total_words``: number of whitespace-separated tokens.
        """
        return {
            "total_chars": len(text),
            "total_lines": text.count('\n') + 1,
            "total_words": len(text.split()),
        }