From e3f7defefc4fa45ec539371c2617e49d890a8995 Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 14:44:08 +0000 Subject: [PATCH 01/16] docs(i18n): translate chinese docstrings/comments in backend/app/{models,utils} and partial services --- backend/app/models/__init__.py | 4 +- backend/app/models/project.py | 219 ++++++++++----------- backend/app/models/task.py | 108 +++++----- backend/app/services/__init__.py | 4 +- backend/app/services/graph_builder.py | 123 ++++++------ backend/app/services/ontology_generator.py | 129 ++++++------ backend/app/services/text_processor.py | 62 +++--- backend/app/utils/__init__.py | 4 +- backend/app/utils/file_parser.py | 139 ++++++------- backend/app/utils/llm_client.py | 25 ++- backend/app/utils/logger.py | 77 ++++---- backend/app/utils/retry.py | 73 ++++--- backend/app/utils/zep_paging.py | 15 +- 13 files changed, 464 insertions(+), 518 deletions(-) diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 55bec619..b5118d01 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -1,6 +1,4 @@ -""" -数据模型模块 -""" +"""Data model package.""" from .task import TaskManager, TaskStatus from .project import Project, ProjectStatus, ProjectManager diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 08978937..81d9a3e7 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -1,6 +1,7 @@ -""" -项目上下文管理 -用于在服务端持久化项目状态,避免前端在接口间传递大量数据 +"""Project context management. + +Persists project state on the server so the frontend does not have to round-trip +large blobs of context between API calls. 
""" import os @@ -15,45 +16,45 @@ from ..config import Config class ProjectStatus(str, Enum): - """项目状态""" - CREATED = "created" # 刚创建,文件已上传 - ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成 - GRAPH_BUILDING = "graph_building" # 图谱构建中 - GRAPH_COMPLETED = "graph_completed" # 图谱构建完成 - FAILED = "failed" # 失败 + """Project lifecycle status.""" + CREATED = "created" # just created, files uploaded + ONTOLOGY_GENERATED = "ontology_generated" # ontology has been generated + GRAPH_BUILDING = "graph_building" # graph build in progress + GRAPH_COMPLETED = "graph_completed" # graph build finished + FAILED = "failed" # build failed @dataclass class Project: - """项目数据模型""" + """Project data model.""" project_id: str name: str status: ProjectStatus created_at: str updated_at: str - - # 文件信息 + + # File information files: List[Dict[str, str]] = field(default_factory=list) # [{filename, path, size}] total_text_length: int = 0 - - # 本体信息(接口1生成后填充) + + # Ontology information (filled in after step 1 generates it) ontology: Optional[Dict[str, Any]] = None analysis_summary: Optional[str] = None - - # 图谱信息(接口2完成后填充) + + # Graph information (filled in after step 2 finishes) graph_id: Optional[str] = None graph_build_task_id: Optional[str] = None - - # 配置 + + # Configuration simulation_requirement: Optional[str] = None chunk_size: int = 500 chunk_overlap: int = 50 - - # 错误信息 + + # Error message when status == FAILED error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Serialize the project to a JSON-friendly dict.""" return { "project_id": self.project_id, "name": self.name, @@ -71,14 +72,14 @@ class Project: "chunk_overlap": self.chunk_overlap, "error": self.error } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'Project': - """从字典创建""" + """Reconstruct a project from its serialized dict.""" status = data.get('status', 'created') if isinstance(status, str): status = ProjectStatus(status) - + return cls( project_id=data['project_id'], 
name=data.get('name', 'Unnamed Project'), @@ -99,52 +100,51 @@ class Project: class ProjectManager: - """项目管理器 - 负责项目的持久化存储和检索""" - - # 项目存储根目录 + """Project manager: handles persistence and retrieval of projects on disk.""" + + # Root directory for project storage PROJECTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'projects') - + @classmethod def _ensure_projects_dir(cls): - """确保项目目录存在""" + """Ensure the projects root directory exists.""" os.makedirs(cls.PROJECTS_DIR, exist_ok=True) - + @classmethod def _get_project_dir(cls, project_id: str) -> str: - """获取项目目录路径""" + """Return the on-disk directory for a project.""" return os.path.join(cls.PROJECTS_DIR, project_id) - + @classmethod def _get_project_meta_path(cls, project_id: str) -> str: - """获取项目元数据文件路径""" + """Return the path to a project's metadata JSON file.""" return os.path.join(cls._get_project_dir(project_id), 'project.json') - + @classmethod def _get_project_files_dir(cls, project_id: str) -> str: - """获取项目文件存储目录""" + """Return the directory where project source files are stored.""" return os.path.join(cls._get_project_dir(project_id), 'files') - + @classmethod def _get_project_text_path(cls, project_id: str) -> str: - """获取项目提取文本存储路径""" + """Return the path to a project's extracted text file.""" return os.path.join(cls._get_project_dir(project_id), 'extracted_text.txt') - + @classmethod def create_project(cls, name: str = "Unnamed Project") -> Project: - """ - 创建新项目 - + """Create a new project. + Args: - name: 项目名称 - + name: Display name for the project. + Returns: - 新创建的Project对象 + The newly created ``Project`` instance. 
""" cls._ensure_projects_dir() - + project_id = f"proj_{uuid.uuid4().hex[:12]}" now = datetime.now().isoformat() - + project = Project( project_id=project_id, name=name, @@ -152,154 +152,147 @@ class ProjectManager: created_at=now, updated_at=now ) - - # 创建项目目录结构 + + # Create the on-disk project directory layout project_dir = cls._get_project_dir(project_id) files_dir = cls._get_project_files_dir(project_id) os.makedirs(project_dir, exist_ok=True) os.makedirs(files_dir, exist_ok=True) - - # 保存项目元数据 + + # Persist project metadata cls.save_project(project) - + return project - + @classmethod def save_project(cls, project: Project) -> None: - """保存项目元数据""" + """Persist project metadata to disk.""" project.updated_at = datetime.now().isoformat() meta_path = cls._get_project_meta_path(project.project_id) - + with open(meta_path, 'w', encoding='utf-8') as f: json.dump(project.to_dict(), f, ensure_ascii=False, indent=2) - + @classmethod def get_project(cls, project_id: str) -> Optional[Project]: - """ - 获取项目 - + """Load a project by id. + Args: - project_id: 项目ID - + project_id: Project identifier. + Returns: - Project对象,如果不存在返回None + The ``Project`` if it exists, otherwise ``None``. """ meta_path = cls._get_project_meta_path(project_id) - + if not os.path.exists(meta_path): return None - + with open(meta_path, 'r', encoding='utf-8') as f: data = json.load(f) - + return Project.from_dict(data) - + @classmethod def list_projects(cls, limit: int = 50) -> List[Project]: - """ - 列出所有项目 - + """List existing projects, newest first. + Args: - limit: 返回数量限制 - + limit: Maximum number of projects to return. + Returns: - 项目列表,按创建时间倒序 + Projects ordered by ``created_at`` descending. 
""" cls._ensure_projects_dir() - + projects = [] for project_id in os.listdir(cls.PROJECTS_DIR): project = cls.get_project(project_id) if project: projects.append(project) - - # 按创建时间倒序排序 + projects.sort(key=lambda p: p.created_at, reverse=True) - + return projects[:limit] - + @classmethod def delete_project(cls, project_id: str) -> bool: - """ - 删除项目及其所有文件 - + """Delete a project and all of its files. + Args: - project_id: 项目ID - + project_id: Project identifier. + Returns: - 是否删除成功 + ``True`` if the project existed and was removed, ``False`` otherwise. """ project_dir = cls._get_project_dir(project_id) - + if not os.path.exists(project_dir): return False - + shutil.rmtree(project_dir) return True - + @classmethod def save_file_to_project(cls, project_id: str, file_storage, original_filename: str) -> Dict[str, str]: - """ - 保存上传的文件到项目目录 - + """Save an uploaded file under the project's files directory. + Args: - project_id: 项目ID - file_storage: Flask的FileStorage对象 - original_filename: 原始文件名 - + project_id: Project identifier. + file_storage: Flask ``FileStorage`` object from the request. + original_filename: The user-supplied filename. + Returns: - 文件信息字典 {filename, path, size} + Dict describing the saved file: ``{original_filename, saved_filename, path, size}``. 
""" files_dir = cls._get_project_files_dir(project_id) os.makedirs(files_dir, exist_ok=True) - - # 生成安全的文件名 + + # Generate a safe randomized filename to avoid collisions ext = os.path.splitext(original_filename)[1].lower() safe_filename = f"{uuid.uuid4().hex[:8]}{ext}" file_path = os.path.join(files_dir, safe_filename) - - # 保存文件 + file_storage.save(file_path) - - # 获取文件大小 + file_size = os.path.getsize(file_path) - + return { "original_filename": original_filename, "saved_filename": safe_filename, "path": file_path, "size": file_size } - + @classmethod def save_extracted_text(cls, project_id: str, text: str) -> None: - """保存提取的文本""" + """Persist the project's extracted full text to disk.""" text_path = cls._get_project_text_path(project_id) with open(text_path, 'w', encoding='utf-8') as f: f.write(text) - + @classmethod def get_extracted_text(cls, project_id: str) -> Optional[str]: - """获取提取的文本""" + """Read back the project's extracted full text, or ``None`` if absent.""" text_path = cls._get_project_text_path(project_id) - + if not os.path.exists(text_path): return None - + with open(text_path, 'r', encoding='utf-8') as f: return f.read() - + @classmethod def get_project_files(cls, project_id: str) -> List[str]: - """获取项目的所有文件路径""" + """Return the on-disk paths of all files in the project.""" files_dir = cls._get_project_files_dir(project_id) - + if not os.path.exists(files_dir): return [] - + return [ - os.path.join(files_dir, f) - for f in os.listdir(files_dir) + os.path.join(files_dir, f) + for f in os.listdir(files_dir) if os.path.isfile(os.path.join(files_dir, f)) ] diff --git a/backend/app/models/task.py b/backend/app/models/task.py index dfebed23..c36290f1 100644 --- a/backend/app/models/task.py +++ b/backend/app/models/task.py @@ -1,6 +1,6 @@ -""" -任务状态管理 -用于跟踪长时间运行的任务(如图谱构建) +"""Task state management. + +Tracks long-running tasks (e.g. graph build) so callers can poll progress. 
""" import uuid @@ -14,30 +14,30 @@ from ..utils.locale import t class TaskStatus(str, Enum): - """任务状态枚举""" - PENDING = "pending" # 等待中 - PROCESSING = "processing" # 处理中 - COMPLETED = "completed" # 已完成 - FAILED = "failed" # 失败 + """Task status enum.""" + PENDING = "pending" # waiting + PROCESSING = "processing" # in progress + COMPLETED = "completed" # finished successfully + FAILED = "failed" # finished with error @dataclass class Task: - """任务数据类""" + """Task data class.""" task_id: str task_type: str status: TaskStatus created_at: datetime updated_at: datetime - progress: int = 0 # 总进度百分比 0-100 - message: str = "" # 状态消息 - result: Optional[Dict] = None # 任务结果 - error: Optional[str] = None # 错误信息 - metadata: Dict = field(default_factory=dict) # 额外元数据 - progress_detail: Dict = field(default_factory=dict) # 详细进度信息 - + progress: int = 0 # overall progress percentage 0-100 + message: str = "" # human-readable status message + result: Optional[Dict] = None # task result payload + error: Optional[str] = None # error message when failed + metadata: Dict = field(default_factory=dict) # arbitrary caller metadata + progress_detail: Dict = field(default_factory=dict) # fine-grained progress info + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Serialize the task to a JSON-friendly dict.""" return { "task_id": self.task_id, "task_type": self.task_type, @@ -54,16 +54,12 @@ class Task: class TaskManager: - """ - 任务管理器 - 线程安全的任务状态管理 - """ - + """Thread-safe singleton task registry.""" + _instance = None _lock = threading.Lock() - + def __new__(cls): - """单例模式""" if cls._instance is None: with cls._lock: if cls._instance is None: @@ -71,21 +67,20 @@ class TaskManager: cls._instance._tasks: Dict[str, Task] = {} cls._instance._task_lock = threading.Lock() return cls._instance - + def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str: - """ - 创建新任务 - + """Create a new task. 
+ Args: - task_type: 任务类型 - metadata: 额外元数据 - + task_type: Task type identifier. + metadata: Optional caller-supplied metadata. + Returns: - 任务ID + The newly created task id. """ task_id = str(uuid.uuid4()) now = datetime.now() - + task = Task( task_id=task_id, task_type=task_type, @@ -94,17 +89,17 @@ class TaskManager: updated_at=now, metadata=metadata or {} ) - + with self._task_lock: self._tasks[task_id] = task - + return task_id - + def get_task(self, task_id: str) -> Optional[Task]: - """获取任务""" + """Return the task for ``task_id`` or ``None`` if unknown.""" with self._task_lock: return self._tasks.get(task_id) - + def update_task( self, task_id: str, @@ -115,17 +110,16 @@ class TaskManager: error: Optional[str] = None, progress_detail: Optional[Dict] = None ): - """ - 更新任务状态 - + """Update mutable fields on an existing task. + Args: - task_id: 任务ID - status: 新状态 - progress: 进度 - message: 消息 - result: 结果 - error: 错误信息 - progress_detail: 详细进度信息 + task_id: Task id to update. + status: New status, if changing. + progress: New overall progress (0-100), if changing. + message: New status message, if changing. + result: New result payload, if changing. + error: New error message, if changing. + progress_detail: New fine-grained progress info, if changing. 
""" with self._task_lock: task = self._tasks.get(task_id) @@ -143,9 +137,9 @@ class TaskManager: task.error = error if progress_detail is not None: task.progress_detail = progress_detail - + def complete_task(self, task_id: str, result: Dict): - """标记任务完成""" + """Mark a task as completed and attach the result.""" self.update_task( task_id, status=TaskStatus.COMPLETED, @@ -153,29 +147,29 @@ class TaskManager: message=t('progress.taskComplete'), result=result ) - + def fail_task(self, task_id: str, error: str): - """标记任务失败""" + """Mark a task as failed and attach the error message.""" self.update_task( task_id, status=TaskStatus.FAILED, message=t('progress.taskFailed'), error=error ) - + def list_tasks(self, task_type: Optional[str] = None) -> list: - """列出任务""" + """List tasks, optionally filtered by ``task_type``, newest first.""" with self._task_lock: tasks = list(self._tasks.values()) if task_type: tasks = [t for t in tasks if t.task_type == task_type] return [t.to_dict() for t in sorted(tasks, key=lambda x: x.created_at, reverse=True)] - + def cleanup_old_tasks(self, max_age_hours: int = 24): - """清理旧任务""" + """Drop completed/failed tasks older than ``max_age_hours``.""" from datetime import timedelta cutoff = datetime.now() - timedelta(hours=max_age_hours) - + with self._task_lock: old_ids = [ tid for tid, task in self._tasks.items() diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py index 8db85d86..b0d4018a 100644 --- a/backend/app/services/__init__.py +++ b/backend/app/services/__init__.py @@ -1,6 +1,4 @@ -""" -业务服务模块 -""" +"""Business services package.""" from .ontology_generator import OntologyGenerator from .graph_builder import GraphBuilderService diff --git a/backend/app/services/graph_builder.py b/backend/app/services/graph_builder.py index 57262ab5..c21f44cb 100644 --- a/backend/app/services/graph_builder.py +++ b/backend/app/services/graph_builder.py @@ -1,6 +1,7 @@ -""" -图谱构建服务 -接口2:使用Zep API构建Standalone Graph +"""Graph 
build service. + +Pipeline step 2: build the project's standalone knowledge graph through the +Zep/Graphiti API. """ import os @@ -69,7 +70,7 @@ def _classify_entity_type(name: str, summary: str, ontology: Optional[Dict]) -> @dataclass class GraphInfo: - """图谱信息""" + """Summary information about a built graph.""" graph_id: str node_count: int edge_count: int @@ -85,10 +86,7 @@ class GraphInfo: class GraphBuilderService: - """ - 图谱构建服务 - 负责调用Zep API构建知识图谱 - """ + """Drives knowledge-graph construction via the Zep/Graphiti API.""" def __init__(self, api_key: Optional[str] = None): self.client = GraphitiAdapter() @@ -103,21 +101,20 @@ class GraphBuilderService: chunk_overlap: int = 50, batch_size: int = 3 ) -> str: - """ - 异步构建图谱 - + """Kick off a graph build asynchronously. + Args: - text: 输入文本 - ontology: 本体定义(来自接口1的输出) - graph_name: 图谱名称 - chunk_size: 文本块大小 - chunk_overlap: 块重叠大小 - batch_size: 每批发送的块数量 - + text: Source text to ingest. + ontology: Ontology definition (the output of pipeline step 1). + graph_name: Display name for the graph. + chunk_size: Characters per text chunk. + chunk_overlap: Overlap (in characters) between consecutive chunks. + batch_size: Number of chunks pushed to Zep per batch. + Returns: - 任务ID + The id of the task tracking the build. """ - # 创建任务 + # Register a task to track build progress. task_id = self.task_manager.create_task( task_type="graph_build", metadata={ @@ -130,7 +127,7 @@ class GraphBuilderService: # Capture locale before spawning background thread current_locale = get_locale() - # 在后台线程中执行构建 + # Run the build on a background thread so the request returns immediately. 
thread = threading.Thread( target=self._build_graph_worker, args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size, current_locale) @@ -151,7 +148,7 @@ class GraphBuilderService: batch_size: int, locale: str = 'zh' ): - """图谱构建工作线程""" + """Background worker that performs the graph build.""" set_locale(locale) try: self.task_manager.update_task( @@ -161,7 +158,7 @@ class GraphBuilderService: message=t('progress.startBuildingGraph') ) - # 1. 创建图谱 + # 1. Create the graph. graph_id = self.create_graph(graph_name) self.task_manager.update_task( task_id, @@ -169,7 +166,7 @@ class GraphBuilderService: message=t('progress.graphCreated', graphId=graph_id) ) - # 2. 设置本体 + # 2. Set the ontology. self.set_ontology(graph_id, ontology) self.task_manager.update_task( task_id, @@ -177,7 +174,7 @@ class GraphBuilderService: message=t('progress.ontologySet') ) - # 3. 文本分块 + # 3. Split source text into chunks. chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap) total_chunks = len(chunks) self.task_manager.update_task( @@ -186,7 +183,7 @@ class GraphBuilderService: message=t('progress.textSplit', count=total_chunks) ) - # 4. 分批发送数据 + # 4. Push chunks to the graph in batches. episode_uuids = self.add_text_batches( graph_id, chunks, batch_size, lambda msg, prog: self.task_manager.update_task( @@ -196,7 +193,7 @@ class GraphBuilderService: ) ) - # 5. 等待Zep处理完成 + # 5. Wait for Zep to finish processing the episodes. self.task_manager.update_task( task_id, progress=60, @@ -212,7 +209,7 @@ class GraphBuilderService: ) ) - # 6. 获取图谱信息 + # 6. Fetch the final graph metadata. 
self.task_manager.update_task( task_id, progress=90, @@ -220,8 +217,7 @@ class GraphBuilderService: ) graph_info = self._get_graph_info(graph_id) - - # 完成 + self.task_manager.complete_task(task_id, { "graph_id": graph_id, "graph_info": graph_info.to_dict(), @@ -234,7 +230,7 @@ class GraphBuilderService: self.task_manager.fail_task(task_id, error_msg) def create_graph(self, name: str) -> str: - """创建Zep图谱(公开方法)""" + """Create a new Zep graph and return its id (public API).""" graph_id = f"mirofish_{uuid.uuid4().hex[:16]}" self.client.graph.create( @@ -246,7 +242,7 @@ class GraphBuilderService: return graph_id def set_ontology(self, graph_id: str, ontology: Dict[str, Any]): - """设置图谱本体提示(Graphiti自动提取实体,本体作为提示存储)""" + """Register the ontology with the graph (Graphiti uses it as an extraction prompt).""" self.client.graph.set_ontology( graph_ids=[graph_id], entities=ontology.get("entity_types"), @@ -261,8 +257,11 @@ class GraphBuilderService: progress_callback: Optional[Callable] = None, skip_chunks: int = 0, ) -> List[str]: - """分批添加文本到图谱,返回所有 episode 的 uuid 列表。 - skip_chunks: 跳过已处理的块数(用于断点续传)。""" + """Push chunks to the graph in batches; returns the uuids of all episodes added. + + Args: + skip_chunks: Number of chunks to skip (used for resume-after-restart). + """ episode_uuids = [] total_chunks = len(chunks) @@ -279,27 +278,26 @@ class GraphBuilderService: ) - # 构建episode数据 + # Build the per-episode payload structures expected by the client. episodes = [ type('Episode', (), {'data': chunk, 'type': 'text'})() for chunk in batch_chunks ] - # 发送到Zep try: batch_result = self.client.graph.add_batch( graph_id=graph_id, episodes=episodes ) - - # 收集返回的 episode uuid + + # Collect the uuids returned for each episode. if batch_result and isinstance(batch_result, list): for ep in batch_result: ep_uuid = getattr(ep, 'uuid_', None) or getattr(ep, 'uuid', None) if ep_uuid: episode_uuids.append(ep_uuid) - - # 避免请求过快 + + # Throttle to avoid overwhelming the upstream API. 
time.sleep(1) except Exception as e: @@ -315,7 +313,7 @@ class GraphBuilderService: progress_callback: Optional[Callable] = None, timeout: int = 600 ): - """等待所有 episode 处理完成(通过查询每个 episode 的 processed 状态)""" + """Poll each episode until Zep marks it processed, or the timeout expires.""" if not episode_uuids: if progress_callback: progress_callback(t('progress.noEpisodesWait'), 1.0) @@ -338,18 +336,18 @@ class GraphBuilderService: ) break - # 检查每个 episode 的处理状态 + # Check the processing state of each pending episode. for ep_uuid in list(pending_episodes): try: episode = self.client.graph.episode.get(uuid_=ep_uuid) is_processed = getattr(episode, 'processed', False) - + if is_processed: pending_episodes.remove(ep_uuid) completed_count += 1 - + except Exception as e: - # 忽略单个查询错误,继续 + # Tolerate a single failed query; the next loop iteration retries. pass elapsed = int(time.time() - start_time) @@ -360,20 +358,17 @@ class GraphBuilderService: ) if pending_episodes: - time.sleep(3) # 每3秒检查一次 + time.sleep(3) # poll every 3 seconds if progress_callback: progress_callback(t('progress.processingComplete', completed=completed_count, total=total_episodes), 1.0) def _get_graph_info(self, graph_id: str) -> GraphInfo: - """获取图谱信息""" - # 获取节点(分页) + """Fetch summary info (counts and entity types) for a graph.""" nodes = fetch_all_nodes(self.client, graph_id) - - # 获取边(分页) edges = fetch_all_edges(self.client, graph_id) - # 统计实体类型 + # Tally distinct entity types across all nodes. entity_types = set() for node in nodes: if node.labels: @@ -389,26 +384,24 @@ class GraphBuilderService: ) def get_graph_data(self, graph_id: str, ontology: Optional[Dict] = None) -> Dict[str, Any]: - """ - 获取完整图谱数据(包含详细信息) - + """Return the full graph payload including timestamps, attributes, and edges. + Args: - graph_id: 图谱ID - + graph_id: Graph identifier. + Returns: - 包含nodes和edges的字典,包括时间信息、属性等详细数据 + Dict with ``nodes``, ``edges``, and aggregate counts. 
""" nodes = fetch_all_nodes(self.client, graph_id) edges = fetch_all_edges(self.client, graph_id) - # 创建节点映射用于获取节点名称 + # Build a uuid->name map so edge endpoints can be labeled. node_map = {} for node in nodes: node_map[node.uuid_] = node.name or "" - + nodes_data = [] for node in nodes: - # 获取创建时间 created_at = getattr(node, 'created_at', None) if created_at: created_at = str(created_at) @@ -429,20 +422,18 @@ class GraphBuilderService: edges_data = [] for edge in edges: - # 获取时间信息 created_at = getattr(edge, 'created_at', None) valid_at = getattr(edge, 'valid_at', None) invalid_at = getattr(edge, 'invalid_at', None) expired_at = getattr(edge, 'expired_at', None) - - # 获取 episodes + + # Normalize the episode list (the field may be missing or a single id). episodes = getattr(edge, 'episodes', None) or getattr(edge, 'episode_ids', None) if episodes and not isinstance(episodes, list): episodes = [str(episodes)] elif episodes: episodes = [str(e) for e in episodes] - - # 获取 fact_type + fact_type = getattr(edge, 'fact_type', None) or edge.name or "" edges_data.append({ @@ -471,6 +462,6 @@ class GraphBuilderService: } def delete_graph(self, graph_id: str): - """删除图谱""" + """Delete a graph by id.""" self.client.graph.delete(graph_id=graph_id) diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index 01a3d799..d49cb8eb 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -1,6 +1,7 @@ -""" -本体生成服务 -接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 +"""Ontology generation service. + +Pipeline step 1: analyze the source text and propose entity and relationship +types that fit a social-media opinion simulation. """ import json @@ -14,19 +15,19 @@ logger = logging.getLogger(__name__) def _to_pascal_case(name: str) -> str: - """将任意格式的名称转换为 PascalCase(如 'works_for' -> 'WorksFor', 'person' -> 'Person')""" - # 按非字母数字字符分割 + """Convert an arbitrary identifier to PascalCase (e.g. 
``works_for`` -> ``WorksFor``).""" + # Split on non-alphanumeric separators first. parts = re.split(r'[^a-zA-Z0-9]+', name) - # 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case']) + # Then split on camelCase boundaries (e.g. ``camelCase`` -> ``['camel', 'Case']``). words = [] for part in parts: words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_')) - # 每个词首字母大写,过滤空串 + # Title-case each non-empty word and concatenate. result = ''.join(word.capitalize() for word in words if word) return result if result else 'Unknown' -# 本体生成的系统提示词 +# System prompt template for ontology generation. ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求,设计适合**社交媒体舆论模拟**的实体类型和关系类型。 **重要:你必须输出有效的JSON格式数据,不要输出任何其他内容。** @@ -174,10 +175,7 @@ B. **具体类型(8个,根据文本内容设计)**: class OntologyGenerator: - """ - 本体生成器 - 分析文本内容,生成实体和关系类型定义 - """ + """Generate an entity- and edge-type ontology from arbitrary input text.""" def __init__(self, llm_client: Optional[LLMClient] = None): self.llm_client = llm_client or LLMClient() @@ -188,18 +186,17 @@ class OntologyGenerator: simulation_requirement: str, additional_context: Optional[str] = None ) -> Dict[str, Any]: - """ - 生成本体定义 - + """Generate an ontology definition. + Args: - document_texts: 文档文本列表 - simulation_requirement: 模拟需求描述 - additional_context: 额外上下文 - + document_texts: Source document text segments. + simulation_requirement: Description of the simulation goal. + additional_context: Optional supplemental context. + Returns: - 本体定义(entity_types, edge_types等) + The ontology dict with ``entity_types``, ``edge_types``, and a summary. """ - # 构建用户消息 + # Compose the user message that frames the LLM request. user_message = self._build_user_message( document_texts, simulation_requirement, @@ -213,19 +210,19 @@ class OntologyGenerator: {"role": "user", "content": user_message} ] - # 调用LLM + # Invoke the LLM. 
result = self.llm_client.chat_json( messages=messages, temperature=0.3, max_tokens=4096 ) - # 验证和后处理 + # Validate the LLM response and post-process it. result = self._validate_and_process(result) return result - # 传给 LLM 的文本最大长度(5万字) + # Maximum length of source text passed to the LLM (50k characters). MAX_TEXT_LENGTH_FOR_LLM = 50000 def _build_user_message( @@ -234,13 +231,14 @@ class OntologyGenerator: simulation_requirement: str, additional_context: Optional[str] ) -> str: - """构建用户消息""" - - # 合并文本 + """Build the user-message string for the ontology LLM call.""" + + # Concatenate the source documents into a single string. combined_text = "\n\n---\n\n".join(document_texts) original_length = len(combined_text) - - # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) + + # If the combined text exceeds the LLM input cap, truncate it for the + # LLM call only. The full text is still used for graph construction. if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..." @@ -275,9 +273,9 @@ class OntologyGenerator: return message def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: - """验证和后处理结果""" - - # 确保必要字段存在 + """Validate and post-process the LLM-generated ontology dict.""" + + # Ensure required top-level fields exist. if "entity_types" not in result: result["entity_types"] = [] if "edge_types" not in result: @@ -285,11 +283,12 @@ class OntologyGenerator: if "analysis_summary" not in result: result["analysis_summary"] = "" - # 验证实体类型 - # 记录原始名称到 PascalCase 的映射,用于后续修正 edge 的 source_targets 引用 + # Validate entity types. + # Track original-name -> PascalCase mapping so edge source_targets + # references can be fixed up consistently below. entity_name_map = {} for entity in result["entity_types"]: - # 强制将 entity name 转为 PascalCase(Zep API 要求) + # Force entity names to PascalCase (required by the Zep API). 
if "name" in entity: original_name = entity["name"] entity["name"] = _to_pascal_case(original_name) @@ -300,19 +299,20 @@ class OntologyGenerator: entity["attributes"] = [] if "examples" not in entity: entity["examples"] = [] - # 确保description不超过100字符 + # Truncate descriptions longer than 100 characters. if len(entity.get("description", "")) > 100: entity["description"] = entity["description"][:97] + "..." - - # 验证关系类型 + + # Validate edge types. for edge in result["edge_types"]: - # 强制将 edge name 转为 SCREAMING_SNAKE_CASE(Zep API 要求) + # Force edge names to SCREAMING_SNAKE_CASE (required by the Zep API). if "name" in edge: original_name = edge["name"] edge["name"] = original_name.upper() if edge["name"] != original_name: logger.warning(f"Edge type name '{original_name}' auto-converted to '{edge['name']}'") - # 修正 source_targets 中的实体名称引用,与转换后的 PascalCase 保持一致 + # Rewrite source_targets entity-name references to match the + # PascalCase-normalized entity names. for st in edge.get("source_targets", []): if st.get("source") in entity_name_map: st["source"] = entity_name_map[st["source"]] @@ -325,11 +325,11 @@ class OntologyGenerator: if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." - # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 + # Zep API caps: at most 10 custom entity types and 10 custom edge types. MAX_ENTITY_TYPES = 10 MAX_EDGE_TYPES = 10 - # 去重:按 name 去重,保留首次出现的 + # Deduplicate by name, keeping the first occurrence. seen_names = set() deduped = [] for entity in result["entity_types"]: @@ -341,7 +341,7 @@ class OntologyGenerator: logger.warning(f"Duplicate entity type '{name}' removed during validation") result["entity_types"] = deduped - # 兜底类型定义 + # Fallback entity-type definitions used when the LLM omits them. 
person_fallback = { "name": "Person", "description": "Any individual person not fitting other specific person types.", @@ -362,33 +362,31 @@ class OntologyGenerator: "examples": ["small business", "community group"] } - # 检查是否已有兜底类型 + # Check whether the fallback types are already present. entity_names = {e["name"] for e in result["entity_types"]} has_person = "Person" in entity_names has_organization = "Organization" in entity_names - - # 需要添加的兜底类型 + + # Collect missing fallback types to add below. fallbacks_to_add = [] if not has_person: fallbacks_to_add.append(person_fallback) if not has_organization: fallbacks_to_add.append(organization_fallback) - + if fallbacks_to_add: current_count = len(result["entity_types"]) needed_slots = len(fallbacks_to_add) - - # 如果添加后会超过 10 个,需要移除一些现有类型 + + # If adding the fallbacks would exceed the cap, drop some existing types. if current_count + needed_slots > MAX_ENTITY_TYPES: - # 计算需要移除多少个 to_remove = current_count + needed_slots - MAX_ENTITY_TYPES - # 从末尾移除(保留前面更重要的具体类型) + # Drop trailing types first; the more specific types come earlier. result["entity_types"] = result["entity_types"][:-to_remove] - - # 添加兜底类型 + result["entity_types"].extend(fallbacks_to_add) - - # 最终确保不超过限制(防御性编程) + + # Defensive cap enforcement: hard-trim if anything slipped through. if len(result["entity_types"]) > MAX_ENTITY_TYPES: result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES] @@ -398,14 +396,13 @@ class OntologyGenerator: return result def generate_python_code(self, ontology: Dict[str, Any]) -> str: - """ - 将本体定义转换为Python代码(类似ontology.py) - + """Render the ontology definition as Python source code. + Args: - ontology: 本体定义 - + ontology: Ontology definition dict. + Returns: - Python代码字符串 + Python source code as a single string. """ code_lines = [ '"""', @@ -421,7 +418,7 @@ class OntologyGenerator: '', ] - # 生成实体类型 + # Emit each entity type as a Python class. 
for entity in ontology.get("entity_types", []): name = entity["name"] desc = entity.get("description", f"A {name} entity.") @@ -447,10 +444,10 @@ class OntologyGenerator: code_lines.append('# ============== 关系类型定义 ==============') code_lines.append('') - # 生成关系类型 + # Emit each edge type as a Python class. for edge in ontology.get("edge_types", []): name = edge["name"] - # 转换为PascalCase类名 + # Convert SCREAMING_SNAKE_CASE -> PascalCase for the class name. class_name = ''.join(word.capitalize() for word in name.split('_')) desc = edge.get("description", f"A {name} relationship.") @@ -472,7 +469,7 @@ class OntologyGenerator: code_lines.append('') code_lines.append('') - # 生成类型字典 + # Emit the type registries. code_lines.append('# ============== 类型配置 ==============') code_lines.append('') code_lines.append('ENTITY_TYPES = {') @@ -489,7 +486,7 @@ class OntologyGenerator: code_lines.append('}') code_lines.append('') - # 生成边的source_targets映射 + # Emit the edge source_targets map. code_lines.append('EDGE_SOURCE_TARGETS = {') for edge in ontology.get("edge_types", []): name = edge["name"] diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32acc..9364cbc2 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -1,68 +1,64 @@ -""" -文本处理服务 -""" +"""Text processing service.""" from typing import List, Optional from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: - """文本处理器""" - + """Facade for the text-extraction and chunking pipeline.""" + @staticmethod def extract_from_files(file_paths: List[str]) -> str: - """从多个文件提取文本""" + """Extract and concatenate text from multiple files.""" return FileParser.extract_from_multiple(file_paths) - + @staticmethod def split_text( text: str, chunk_size: int = 500, overlap: int = 50 ) -> List[str]: - """ - 分割文本 - + """Split text into chunks. 
+ Args: - text: 原始文本 - chunk_size: 块大小 - overlap: 重叠大小 - + text: The source text. + chunk_size: Target characters per chunk. + overlap: Overlap between consecutive chunks. + Returns: - 文本块列表 + A list of chunk strings. """ return split_text_into_chunks(text, chunk_size, overlap) - + @staticmethod def preprocess_text(text: str) -> str: - """ - 预处理文本 - - 移除多余空白 - - 标准化换行 - + """Pre-process text by normalizing whitespace and line endings. + + - Collapse runs of blank lines to at most two newlines. + - Normalize line endings to ``\\n``. + - Strip leading/trailing whitespace from each line. + Args: - text: 原始文本 - + text: The source text. + Returns: - 处理后的文本 + The cleaned text. """ import re - - # 标准化换行 + text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + # Collapse 3+ consecutive newlines down to a blank-line separator. text = re.sub(r'\n{3,}', '\n\n', text) - - # 移除行首行尾空白 + lines = [line.strip() for line in text.split('\n')] text = '\n'.join(lines) - + return text.strip() - + @staticmethod def get_text_stats(text: str) -> dict: - """获取文本统计信息""" + """Return basic text statistics: total chars, lines, and words.""" return { "total_chars": len(text), "total_lines": text.count('\n') + 1, diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py index e70161ac..5f13955e 100644 --- a/backend/app/utils/__init__.py +++ b/backend/app/utils/__init__.py @@ -1,6 +1,4 @@ -""" -工具模块 -""" +"""Backend utilities package.""" from .file_parser import FileParser from .llm_client import LLMClient diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py index 3f1d8ed2..fbe42acf 100644 --- a/backend/app/utils/file_parser.py +++ b/backend/app/utils/file_parser.py @@ -1,6 +1,6 @@ -""" -文件解析工具 -支持PDF、Markdown、TXT文件的文本提取 +"""File parsing utilities. + +Supports text extraction from PDF, Markdown, and plain-text files. 
""" import os @@ -9,30 +9,27 @@ from typing import List, Optional def _read_text_with_fallback(file_path: str) -> str: - """ - 读取文本文件,UTF-8失败时自动探测编码。 - - 采用多级回退策略: - 1. 首先尝试 UTF-8 解码 - 2. 使用 charset_normalizer 检测编码 - 3. 回退到 chardet 检测编码 - 4. 最终使用 UTF-8 + errors='replace' 兜底 - + """Read a text file, falling back through encoding detectors when UTF-8 fails. + + Multi-stage fallback strategy: + 1. Try UTF-8 first. + 2. Use ``charset_normalizer`` to detect the encoding. + 3. Fall back to ``chardet``. + 4. Last resort: decode with UTF-8 + ``errors='replace'``. + Args: - file_path: 文件路径 - + file_path: Path to the file to read. + Returns: - 解码后的文本内容 + The decoded text content. """ data = Path(file_path).read_bytes() - - # 首先尝试 UTF-8 + try: return data.decode('utf-8') except UnicodeDecodeError: pass - - # 尝试使用 charset_normalizer 检测编码 + encoding = None try: from charset_normalizer import from_bytes @@ -41,8 +38,7 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = best.encoding except Exception: pass - - # 回退到 chardet + if not encoding: try: import chardet @@ -50,89 +46,86 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = result.get('encoding') if result else None except Exception: pass - - # 最终兜底:使用 UTF-8 + replace + if not encoding: encoding = 'utf-8' - + return data.decode(encoding, errors='replace') class FileParser: - """文件解析器""" - + """Parser for the supported document formats.""" + SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} - + @classmethod def extract_text(cls, file_path: str) -> str: - """ - 从文件中提取文本 - + """Extract plain text from a single supported file. + Args: - file_path: 文件路径 - + file_path: Path to the file. + Returns: - 提取的文本内容 + The extracted text content. 
""" path = Path(file_path) - + if not path.exists(): raise FileNotFoundError(f"文件不存在: {file_path}") - + suffix = path.suffix.lower() - + if suffix not in cls.SUPPORTED_EXTENSIONS: raise ValueError(f"不支持的文件格式: {suffix}") - + if suffix == '.pdf': return cls._extract_from_pdf(file_path) elif suffix in {'.md', '.markdown'}: return cls._extract_from_md(file_path) elif suffix == '.txt': return cls._extract_from_txt(file_path) - + raise ValueError(f"无法处理的文件格式: {suffix}") - + @staticmethod def _extract_from_pdf(file_path: str) -> str: - """从PDF提取文本""" + """Extract text from a PDF file using PyMuPDF.""" try: import fitz # PyMuPDF except ImportError: raise ImportError("需要安装PyMuPDF: pip install PyMuPDF") - + text_parts = [] with fitz.open(file_path) as doc: for page in doc: text = page.get_text() if text.strip(): text_parts.append(text) - + return "\n\n".join(text_parts) - + @staticmethod def _extract_from_md(file_path: str) -> str: - """从Markdown提取文本,支持自动编码检测""" + """Extract text from a Markdown file with automatic encoding detection.""" return _read_text_with_fallback(file_path) - + @staticmethod def _extract_from_txt(file_path: str) -> str: - """从TXT提取文本,支持自动编码检测""" + """Extract text from a plain-text file with automatic encoding detection.""" return _read_text_with_fallback(file_path) - + @classmethod def extract_from_multiple(cls, file_paths: List[str]) -> str: - """ - 从多个文件提取文本并合并 - + """Extract and concatenate text from multiple files. + Args: - file_paths: 文件路径列表 - + file_paths: Paths of files to read. + Returns: - 合并后的文本 + The merged text, with per-file headers separating each section. 
""" all_texts = [] - + for i, file_path in enumerate(file_paths, 1): try: text = cls.extract_text(file_path) @@ -140,50 +133,48 @@ class FileParser: all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}") except Exception as e: all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===") - + return "\n\n".join(all_texts) def split_text_into_chunks( - text: str, - chunk_size: int = 500, + text: str, + chunk_size: int = 500, overlap: int = 50 ) -> List[str]: - """ - 将文本分割成小块 - + """Split text into overlapping chunks. + Args: - text: 原始文本 - chunk_size: 每块的字符数 - overlap: 重叠字符数 - + text: The source text to split. + chunk_size: Target characters per chunk. + overlap: Number of characters overlapping between consecutive chunks. + Returns: - 文本块列表 + A list of chunk strings. """ if len(text) <= chunk_size: return [text] if text.strip() else [] - + chunks = [] start = 0 - + while start < len(text): end = start + chunk_size - - # 尝试在句子边界处分割 + + # Prefer splitting on a sentence boundary near the chunk end if end < len(text): - # 查找最近的句子结束符 for sep in ['。', '!', '?', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']: last_sep = text[start:end].rfind(sep) if last_sep != -1 and last_sep > chunk_size * 0.3: end = start + last_sep + len(sep) break - + chunk = text[start:end].strip() if chunk: chunks.append(chunk) - - # 下一个块从重叠位置开始 + + # Next chunk starts at the overlap point start = end - overlap if end < len(text) else len(text) - + return chunks diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index ae33afbe..c65b1d12 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -1,6 +1,6 @@ -""" -LLM客户端封装 -统一使用OpenAI格式调用 +"""LLM client wrapper. + +All providers are called through the OpenAI-compatible API surface. 
""" import json @@ -13,7 +13,7 @@ from ..config import Config class LLMClient: - """LLM客户端""" + """Thin wrapper around the OpenAI-compatible chat completions API.""" def __init__( self, @@ -37,17 +37,16 @@ class LLMClient: max_tokens: int = 4096, response_format: Optional[Dict] = None, ) -> str: - """ - 发送聊天请求 + """Send a chat completion request. Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 - response_format: 响应格式(如JSON模式) + messages: Chat messages in OpenAI format. + temperature: Sampling temperature. + max_tokens: Maximum number of tokens to generate. + response_format: Optional response format hint (e.g. JSON mode). Returns: - 模型响应文本 + The assistant's response text. """ kwargs = { "model": self.model, @@ -61,7 +60,7 @@ class LLMClient: response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content - # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 + # Some reasoning models (e.g. MiniMax M2.5) embed ... blocks; strip them. content = re.sub(r"[\s\S]*?", "", content).strip() return content @@ -79,7 +78,7 @@ class LLMClient: messages=messages, temperature=temperature, max_tokens=max_tokens ) - # 清理markdown代码块标记 + # Strip surrounding markdown code-fence markers if present. cleaned_response = response.strip() cleaned_response = re.sub( r"^```(?:json)?\s*\n?", "", cleaned_response, flags=re.IGNORECASE diff --git a/backend/app/utils/logger.py b/backend/app/utils/logger.py index 1978c0b8..16caebfb 100644 --- a/backend/app/utils/logger.py +++ b/backend/app/utils/logger.py @@ -1,6 +1,7 @@ -""" -日志配置模块 -提供统一的日志管理,同时输出到控制台和文件 +"""Logger configuration module. + +Provides unified logging that writes simultaneously to the console and a +rotating log file. """ import os @@ -11,59 +12,55 @@ from logging.handlers import RotatingFileHandler def _ensure_utf8_stdout(): - """ - 确保 stdout/stderr 使用 UTF-8 编码 - 解决 Windows 控制台中文乱码问题 + """Force stdout/stderr to UTF-8. + + Fixes garbled non-ASCII output on the Windows console. 
""" if sys.platform == 'win32': - # Windows 下重新配置标准输出为 UTF-8 + # On Windows, reconfigure the standard streams to UTF-8. if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 日志目录 +# Directory that holds rotated log files. LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging.Logger: - """ - 设置日志器 - + """Configure and return a logger. + Args: - name: 日志器名称 - level: 日志级别 - + name: Logger name. + level: Minimum log level for the logger. + Returns: - 配置好的日志器 + The configured logger. """ - # 确保日志目录存在 os.makedirs(LOG_DIR, exist_ok=True) - - # 创建日志器 + logger = logging.getLogger(name) logger.setLevel(level) - - # 阻止日志向上传播到根 logger,避免重复输出 + + # Prevent propagation to the root logger to avoid duplicate output. logger.propagate = False - - # 如果已经有处理器,不重复添加 + + # If handlers are already attached, do not re-add them. if logger.handlers: return logger - - # 日志格式 + detailed_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) - + simple_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) - - # 1. 文件处理器 - 详细日志(按日期命名,带轮转) + + # 1. File handler — detailed log, named by date and rotated by size. log_filename = datetime.now().strftime('%Y-%m-%d') + '.log' file_handler = RotatingFileHandler( os.path.join(LOG_DIR, log_filename), @@ -73,30 +70,28 @@ def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging. ) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(detailed_formatter) - - # 2. 控制台处理器 - 简洁日志(INFO及以上) - # 确保 Windows 下使用 UTF-8 编码,避免中文乱码 + + # 2. Console handler — concise log, INFO and above. + # Ensure UTF-8 on Windows so non-ASCII characters render correctly. 
_ensure_utf8_stdout() console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) console_handler.setFormatter(simple_formatter) - - # 添加处理器 + logger.addHandler(file_handler) logger.addHandler(console_handler) - + return logger def get_logger(name: str = 'mirofish') -> logging.Logger: - """ - 获取日志器(如果不存在则创建) - + """Return an existing logger by name, creating it lazily if needed. + Args: - name: 日志器名称 - + name: Logger name. + Returns: - 日志器实例 + The logger instance. """ logger = logging.getLogger(name) if not logger.handlers: @@ -104,11 +99,11 @@ def get_logger(name: str = 'mirofish') -> logging.Logger: return logger -# 创建默认日志器 +# Default module-level logger. logger = setup_logger() -# 便捷方法 +# Convenience module-level helpers. def debug(msg, *args, **kwargs): logger.debug(msg, *args, **kwargs) diff --git a/backend/app/utils/retry.py b/backend/app/utils/retry.py index 819b1cfc..c5c5f516 100644 --- a/backend/app/utils/retry.py +++ b/backend/app/utils/retry.py @@ -1,6 +1,7 @@ -""" -API调用重试机制 -用于处理LLM等外部API调用的重试逻辑 +"""API call retry primitives. + +Helpers for retrying calls to external APIs (LLMs, etc.) with exponential +backoff and jitter. """ import time @@ -21,18 +22,17 @@ def retry_with_backoff( exceptions: Tuple[Type[Exception], ...] = (Exception,), on_retry: Optional[Callable[[Exception, int], None]] = None ): - """ - 带指数退避的重试装饰器 - + """Decorator that retries a callable with exponential backoff. + Args: - max_retries: 最大重试次数 - initial_delay: 初始延迟(秒) - max_delay: 最大延迟(秒) - backoff_factor: 退避因子 - jitter: 是否添加随机抖动 - exceptions: 需要重试的异常类型 - on_retry: 重试时的回调函数 (exception, retry_count) - + max_retries: Maximum number of retries before giving up. + initial_delay: Initial delay in seconds before the first retry. + max_delay: Cap on the delay between retries (seconds). + backoff_factor: Multiplicative factor applied to the delay each retry. + jitter: When ``True``, randomize the delay to avoid thundering herd. 
+ exceptions: Exception types that should trigger a retry. + on_retry: Optional callback invoked on each retry as ``(exception, retry_count)``. + Usage: @retry_with_backoff(max_retries=3) def call_llm_api(): @@ -55,7 +55,7 @@ def retry_with_backoff( logger.error(f"函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") raise - # 计算延迟 + # Compute the next delay, capped at ``max_delay``. current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) @@ -86,9 +86,7 @@ def retry_with_backoff_async( exceptions: Tuple[Type[Exception], ...] = (Exception,), on_retry: Optional[Callable[[Exception, int], None]] = None ): - """ - 异步版本的重试装饰器 - """ + """Async variant of :func:`retry_with_backoff`.""" import asyncio def decorator(func: Callable) -> Callable: @@ -130,9 +128,7 @@ def retry_with_backoff_async( class RetryableAPIClient: - """ - 可重试的API客户端封装 - """ + """Class-based wrapper around the retry helpers.""" def __init__( self, @@ -153,17 +149,16 @@ class RetryableAPIClient: exceptions: Tuple[Type[Exception], ...] = (Exception,), **kwargs ) -> Any: - """ - 执行函数调用并在失败时重试 - + """Invoke ``func`` with retry on failure. + Args: - func: 要调用的函数 - *args: 函数参数 - exceptions: 需要重试的异常类型 - **kwargs: 函数关键字参数 - + func: Callable to invoke. + *args: Positional arguments forwarded to ``func``. + exceptions: Exception types that should trigger a retry. + **kwargs: Keyword arguments forwarded to ``func``. + Returns: - 函数返回值 + The value returned by ``func``. """ last_exception = None delay = self.initial_delay @@ -199,17 +194,17 @@ class RetryableAPIClient: exceptions: Tuple[Type[Exception], ...] = (Exception,), continue_on_failure: bool = True ) -> Tuple[list, list]: - """ - 批量调用并对每个失败项单独重试 - + """Process ``items`` in sequence, retrying each independently on failure. + Args: - items: 要处理的项目列表 - process_func: 处理函数,接收单个item作为参数 - exceptions: 需要重试的异常类型 - continue_on_failure: 单项失败后是否继续处理其他项 - + items: Items to process. 
+ process_func: Callable invoked once per item. + exceptions: Exception types that should trigger a retry. + continue_on_failure: When ``True``, keep processing remaining items after a failure. + Returns: - (成功结果列表, 失败项列表) + ``(successes, failures)`` — a list of successful results and a list + of failure descriptors ``{"index", "item", "error"}``. """ results = [] failures = [] diff --git a/backend/app/utils/zep_paging.py b/backend/app/utils/zep_paging.py index eb68d4eb..cc149046 100644 --- a/backend/app/utils/zep_paging.py +++ b/backend/app/utils/zep_paging.py @@ -1,7 +1,8 @@ -"""Zep Graph 分页读取工具。 +"""Zep Graph paging helpers. -Zep 的 node/edge 列表接口使用 UUID cursor 分页, -本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。 +Zep's node/edge list APIs paginate with a UUID cursor. This module wraps the +auto-paging loop (including per-page retry) so callers see the full list +transparently. """ from __future__ import annotations @@ -30,7 +31,7 @@ def _fetch_page_with_retry( page_description: str = "page", **kwargs: Any, ) -> list[Any]: - """单页请求,失败时指数退避重试。自动处理429限速。""" + """Fetch one page, retrying with exponential backoff. Handles 429 rate limits.""" if max_retries < 1: raise ValueError("max_retries must be >= 1") @@ -43,7 +44,7 @@ def _fetch_page_with_retry( except Exception as e: last_exception = e if attempt < max_retries - 1: - # 检测429限速,使用retry-after头部指定的等待时间 + # If a 429 rate limit is detected, prefer the retry-after header for the wait. wait = delay logger.warning( f"Zep {page_description} attempt {attempt + 1} failed: {str(e)[:100]}, retrying in {wait:.1f}s..." @@ -65,7 +66,7 @@ def fetch_all_nodes( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱节点,最多返回 max_items 条(默认 2000)。每页请求自带重试。""" + """Page through graph nodes; return at most ``max_items`` (default 2000). 
Each page is retried internally.""" all_nodes: list[Any] = [] cursor: str | None = None page_num = 0 @@ -110,7 +111,7 @@ def fetch_all_edges( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱所有边,返回完整列表。每页请求自带重试。""" + """Page through every graph edge and return the full list. Each page is retried internally.""" all_edges: list[Any] = [] cursor: str | None = None page_num = 0 From e1019d91cba3303900584975036ea4e2b4a8972f Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 14:49:20 +0000 Subject: [PATCH 02/16] docs(i18n): translate chinese docstrings/comments in backend root, api init, simulation_ipc, simulation_manager, zep_entity_reader --- backend/app/__init__.py | 53 ++-- backend/app/api/__init__.py | 4 +- backend/app/config.py | 60 ++-- backend/app/services/simulation_ipc.py | 262 ++++++++--------- backend/app/services/simulation_manager.py | 316 ++++++++++----------- backend/app/services/zep_entity_reader.py | 238 ++++++++-------- backend/run.py | 27 +- 7 files changed, 467 insertions(+), 493 deletions(-) diff --git a/backend/app/__init__.py b/backend/app/__init__.py index 11857ef0..2d6519c2 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -1,12 +1,10 @@ -""" -MiroFish Backend - Flask应用工厂 -""" +"""MiroFish backend Flask application factory.""" import os import warnings -# 抑制 multiprocessing resource_tracker 的警告(来自第三方库如 transformers) -# 需要在所有其他导入之前设置 +# Silence multiprocessing.resource_tracker warnings emitted by some third-party +# libraries (e.g. transformers); must run before those modules are imported. 
warnings.filterwarnings("ignore", message=".*resource_tracker.*") from flask import Flask, request @@ -18,62 +16,65 @@ from .utils.locale import t def create_app(config_class=Config): - """Flask应用工厂函数""" + """Flask application factory.""" app = Flask(__name__) app.config.from_object(config_class) - - # 设置JSON编码:确保中文直接显示(而不是 \uXXXX 格式) - # Flask >= 2.3 使用 app.json.ensure_ascii,旧版本使用 JSON_AS_ASCII 配置 + + # Configure JSON encoding so non-ASCII characters render literally + # rather than as \uXXXX escape sequences. Flask >= 2.3 exposes + # ``app.json.ensure_ascii``; older versions use ``JSON_AS_ASCII``. if hasattr(app, 'json') and hasattr(app.json, 'ensure_ascii'): app.json.ensure_ascii = False - - # 设置日志 + + # Configure logging. logger = setup_logger('mirofish') - - # 只在 reloader 子进程中打印启动信息(避免 debug 模式下打印两次) + + # Only print startup banners in the reloader child process to avoid + # double-printing in debug mode. is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' debug_mode = app.config.get('DEBUG', False) should_log_startup = not debug_mode or is_reloader_process - + if should_log_startup: logger.info("=" * 50) logger.info(t("log.bootstrap.m001")) logger.info("=" * 50) - - # 启用CORS + + # Enable CORS. CORS(app, resources={r"/api/*": {"origins": "*"}}) - - # 注册模拟进程清理函数(确保服务器关闭时终止所有模拟进程) + + # Register simulation-process cleanup so all child processes are torn down + # when the Flask server shuts down. from .services.simulation_runner import SimulationRunner SimulationRunner.register_cleanup() if should_log_startup: logger.info(t("log.bootstrap.m002")) - - # 请求日志中间件 + + # Request-logging middleware. 
@app.before_request def log_request(): logger = get_logger('mirofish.request') logger.debug(t("log.bootstrap.m003", request=request.method, request_2=request.path)) if request.content_type and 'json' in request.content_type: logger.debug(t("log.bootstrap.m004", request=request.get_json(silent=True))) - + @app.after_request def log_response(response): logger = get_logger('mirofish.request') logger.debug(t("log.bootstrap.m005", response=response.status_code)) return response - - # 注册蓝图 + + # Register API blueprints. from .api import graph_bp, simulation_bp, report_bp app.register_blueprint(graph_bp, url_prefix='/api/graph') app.register_blueprint(simulation_bp, url_prefix='/api/simulation') app.register_blueprint(report_bp, url_prefix='/api/report') - - # 健康检查 + + # Health-check endpoint. @app.route('/health') def health(): return {'status': 'ok', 'service': 'MiroFish Backend'} - + # On startup: recover any projects stuck in graph_building (task was killed by restart) if should_log_startup: _recover_stuck_projects() diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index ffda743a..4326e4da 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -1,6 +1,4 @@ -""" -API路由模块 -""" +"""API blueprints package.""" from flask import Blueprint diff --git a/backend/app/config.py b/backend/app/config.py index e6939c78..ab0867d3 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,38 +1,40 @@ -""" -配置管理 -统一从项目根目录的 .env 文件加载配置 +"""Configuration management. + +Loads configuration values from the project-root ``.env`` file. """ import os from dotenv import load_dotenv -# 加载项目根目录的 .env 文件 -# 路径: MiroFish/.env (相对于 backend/app/config.py) +# Load the project-root .env file. +# Path: MiroFish/.env (relative to backend/app/config.py). 
project_root_env = os.path.join(os.path.dirname(__file__), '../../.env') if os.path.exists(project_root_env): load_dotenv(project_root_env, override=True) else: - # 如果根目录没有 .env,尝试加载环境变量(用于生产环境) + # If the project root has no .env, fall back to the process environment + # (used in production deployments). load_dotenv(override=True) class Config: - """Flask配置类""" - - # Flask配置 + """Flask configuration class.""" + + # Flask settings. SECRET_KEY = os.environ.get('SECRET_KEY', 'mirofish-secret-key') DEBUG = os.environ.get('FLASK_DEBUG', 'True').lower() == 'true' - - # JSON配置 - 禁用ASCII转义,让中文直接显示(而不是 \uXXXX 格式) + + # JSON settings: disable ASCII escaping so non-ASCII output renders literally + # rather than as \uXXXX escape sequences. JSON_AS_ASCII = False - - # LLM配置(统一使用OpenAI格式) + + # LLM settings (called via the OpenAI-compatible API surface). LLM_API_KEY = os.environ.get('LLM_API_KEY') LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'https://api.openai.com/v1') LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'gpt-4o-mini') - - # Neo4j + Graphiti配置(替代 Zep Cloud) + + # Neo4j + Graphiti settings (replacement for Zep Cloud). NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687') NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j') NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'mirofish123') @@ -50,23 +52,23 @@ class Config: EMBEDDING_API_KEY = os.environ.get('EMBEDDING_API_KEY') EMBEDDING_BASE_URL = os.environ.get('EMBEDDING_BASE_URL') - # Zep配置(保留兼容性,已废弃) + # Zep settings (kept for backwards compatibility; deprecated). ZEP_API_KEY = os.environ.get('ZEP_API_KEY', '') - - # 文件上传配置 + + # File upload settings. MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} - - # 文本处理配置 - DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 - DEFAULT_CHUNK_OVERLAP = 50 # 默认重叠大小 - - # OASIS模拟配置 + + # Text processing settings. 
+ DEFAULT_CHUNK_SIZE = 500 # default chunk size in characters + DEFAULT_CHUNK_OVERLAP = 50 # default overlap in characters + + # OASIS simulation settings. OASIS_DEFAULT_MAX_ROUNDS = int(os.environ.get('OASIS_DEFAULT_MAX_ROUNDS', '10')) OASIS_SIMULATION_DATA_DIR = os.path.join(os.path.dirname(__file__), '../uploads/simulations') - - # OASIS平台可用动作配置 + + # OASIS per-platform allowed action lists. OASIS_TWITTER_ACTIONS = [ 'CREATE_POST', 'LIKE_POST', 'REPOST', 'FOLLOW', 'DO_NOTHING', 'QUOTE_POST' ] @@ -76,14 +78,14 @@ class Config: 'TREND', 'REFRESH', 'DO_NOTHING', 'FOLLOW', 'MUTE' ] - # Report Agent配置 + # Report agent settings. REPORT_AGENT_MAX_TOOL_CALLS = int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) - + @classmethod def validate(cls): - """验证必要配置""" + """Validate that required configuration values are present.""" errors = [] if not cls.LLM_API_KEY: errors.append("LLM_API_KEY 未配置") diff --git a/backend/app/services/simulation_ipc.py b/backend/app/services/simulation_ipc.py index be2eac32..68428b8f 100644 --- a/backend/app/services/simulation_ipc.py +++ b/backend/app/services/simulation_ipc.py @@ -1,11 +1,12 @@ -""" -模拟IPC通信模块 -用于Flask后端和模拟脚本之间的进程间通信 +"""Simulation IPC module. -通过文件系统实现简单的命令/响应模式: -1. Flask写入命令到 commands/ 目录 -2. 模拟脚本轮询命令目录,执行命令并写入响应到 responses/ 目录 -3. Flask轮询响应目录获取结果 +Inter-process communication between the Flask backend and the simulation +subprocess. Implements a simple file-system command/response pattern: + +1. Flask writes commands into ``commands/``. +2. The simulation script polls for commands, executes them, and writes + responses into ``responses/``. +3. Flask polls the responses directory for results. 
""" import os @@ -24,14 +25,14 @@ logger = get_logger('mirofish.simulation_ipc') class CommandType(str, Enum): - """命令类型""" - INTERVIEW = "interview" # 单个Agent采访 - BATCH_INTERVIEW = "batch_interview" # 批量采访 - CLOSE_ENV = "close_env" # 关闭环境 + """IPC command types.""" + INTERVIEW = "interview" # interview a single agent + BATCH_INTERVIEW = "batch_interview" # interview multiple agents at once + CLOSE_ENV = "close_env" # tear down the environment class CommandStatus(str, Enum): - """命令状态""" + """IPC command status.""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" @@ -40,12 +41,12 @@ class CommandStatus(str, Enum): @dataclass class IPCCommand: - """IPC命令""" + """A command sent over the IPC channel.""" command_id: str command_type: CommandType args: Dict[str, Any] timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, Any]: return { "command_id": self.command_id, @@ -53,7 +54,7 @@ class IPCCommand: "args": self.args, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCCommand': return cls( @@ -66,13 +67,13 @@ class IPCCommand: @dataclass class IPCResponse: - """IPC响应""" + """A response returned over the IPC channel.""" command_id: str status: CommandStatus result: Optional[Dict[str, Any]] = None error: Optional[str] = None timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, Any]: return { "command_id": self.command_id, @@ -81,7 +82,7 @@ class IPCResponse: "error": self.error, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCResponse': return cls( @@ -94,27 +95,25 @@ class IPCResponse: class SimulationIPCClient: + """IPC client used by the Flask side. + + Sends commands to the simulation process and waits for responses. 
""" - 模拟IPC客户端(Flask端使用) - - 用于向模拟进程发送命令并等待响应 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化IPC客户端 - + """Initialize the IPC client. + Args: - simulation_dir: 模拟数据目录 + simulation_dir: Directory holding the simulation's IPC files. """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure both directories exist before use. os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - + def send_command( self, command_type: CommandType, @@ -122,20 +121,19 @@ class SimulationIPCClient: timeout: float = 60.0, poll_interval: float = 0.5 ) -> IPCResponse: - """ - 发送命令并等待响应 - + """Send a command and wait for the response. + Args: - command_type: 命令类型 - args: 命令参数 - timeout: 超时时间(秒) - poll_interval: 轮询间隔(秒) - + command_type: Command type to send. + args: Command arguments. + timeout: Timeout in seconds. + poll_interval: Polling interval in seconds. + Returns: - IPCResponse - + The ``IPCResponse``. + Raises: - TimeoutError: 等待响应超时 + TimeoutError: When no response arrives before ``timeout``. """ command_id = str(uuid.uuid4()) command = IPCCommand( @@ -143,50 +141,50 @@ class SimulationIPCClient: command_type=command_type, args=args ) - - # 写入命令文件 + + # Write the command file. command_file = os.path.join(self.commands_dir, f"{command_id}.json") with open(command_file, 'w', encoding='utf-8') as f: json.dump(command.to_dict(), f, ensure_ascii=False, indent=2) - + logger.info(t("log.simulation_ipc.m001", command_type=command_type.value, command_id=command_id)) - - # 等待响应 + + # Poll for the response file. 
response_file = os.path.join(self.responses_dir, f"{command_id}.json") start_time = time.time() - + while time.time() - start_time < timeout: if os.path.exists(response_file): try: with open(response_file, 'r', encoding='utf-8') as f: response_data = json.load(f) response = IPCResponse.from_dict(response_data) - - # 清理命令和响应文件 + + # Clean up command and response files after successful read. try: os.remove(command_file) os.remove(response_file) except OSError: pass - + logger.info(t("log.simulation_ipc.m002", command_id=command_id, response=response.status.value)) return response except (json.JSONDecodeError, KeyError) as e: logger.warning(t("log.simulation_ipc.m003", e=e)) - + time.sleep(poll_interval) - - # 超时 + + # Timed out waiting for the response. logger.error(t("log.simulation_ipc.m004", command_id=command_id)) - - # 清理命令文件 + + # Clean up the unanswered command file. try: os.remove(command_file) except OSError: pass - + raise TimeoutError(f"等待命令响应超时 ({timeout}秒)") - + def send_interview( self, agent_id: int, @@ -194,20 +192,19 @@ class SimulationIPCClient: platform: str = None, timeout: float = 60.0 ) -> IPCResponse: - """ - 发送单个Agent采访命令 - + """Send a single-agent interview command. + Args: - agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,单平台模拟时采访该平台 - timeout: 超时时间 - + agent_id: Agent id to interview. + prompt: Interview question. + platform: Optional platform selector. + - ``"twitter"``: interview only on Twitter. + - ``"reddit"``: interview only on Reddit. + - ``None``: dual-platform if applicable, else the single active platform. + timeout: Timeout in seconds. + Returns: - IPCResponse,result字段包含采访结果 + ``IPCResponse`` whose ``result`` carries the interview response. 
""" args = { "agent_id": agent_id, @@ -215,69 +212,66 @@ class SimulationIPCClient: } if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.INTERVIEW, args=args, timeout=timeout ) - + def send_batch_interview( self, interviews: List[Dict[str, Any]], platform: str = None, timeout: float = 120.0 ) -> IPCResponse: - """ - 发送批量采访命令 - + """Send a batched interview command. + Args: - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间 - + interviews: List of items shaped ``{"agent_id": int, "prompt": str, "platform": str?}``. + platform: Default platform; per-item ``platform`` overrides this. + - ``"twitter"``: default to Twitter. + - ``"reddit"``: default to Reddit. + - ``None``: dual-platform interview when applicable. + timeout: Timeout in seconds. + Returns: - IPCResponse,result字段包含所有采访结果 + ``IPCResponse`` whose ``result`` carries every interview response. """ args = {"interviews": interviews} if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.BATCH_INTERVIEW, args=args, timeout=timeout ) - + def send_close_env(self, timeout: float = 30.0) -> IPCResponse: - """ - 发送关闭环境命令 - + """Send a tear-down-environment command. + Args: - timeout: 超时时间 - + timeout: Timeout in seconds. + Returns: - IPCResponse + ``IPCResponse``. """ return self.send_command( command_type=CommandType.CLOSE_ENV, args={}, timeout=timeout ) - + def check_env_alive(self) -> bool: - """ - 检查模拟环境是否存活 - - 通过检查 env_status.json 文件来判断 + """Return ``True`` if the simulation environment reports as alive. + + Reads ``env_status.json`` written by the IPC server side. 
""" status_file = os.path.join(self.simulation_dir, "env_status.json") if not os.path.exists(status_file): return False - + try: with open(status_file, 'r', encoding='utf-8') as f: status = json.load(f) @@ -287,68 +281,65 @@ class SimulationIPCClient: class SimulationIPCServer: + """IPC server used by the simulation script. + + Polls the commands directory, executes commands, and writes responses. """ - 模拟IPC服务器(模拟脚本端使用) - - 轮询命令目录,执行命令并返回响应 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化IPC服务器 - + """Initialize the IPC server. + Args: - simulation_dir: 模拟数据目录 + simulation_dir: Directory holding the simulation's IPC files. """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure both directories exist before use. os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - - # 环境状态 + + # Server-running flag. self._running = False - + def start(self): - """标记服务器为运行状态""" + """Mark the server as alive and persist the state.""" self._running = True self._update_env_status("alive") - + def stop(self): - """标记服务器为停止状态""" + """Mark the server as stopped and persist the state.""" self._running = False self._update_env_status("stopped") - + def _update_env_status(self, status: str): - """更新环境状态文件""" + """Update the persistent environment-status file.""" status_file = os.path.join(self.simulation_dir, "env_status.json") with open(status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, "timestamp": datetime.now().isoformat() }, f, ensure_ascii=False, indent=2) - + def poll_commands(self) -> Optional[IPCCommand]: - """ - 轮询命令目录,返回第一个待处理的命令 - + """Poll the commands directory and return the next pending command. + Returns: - IPCCommand 或 None + ``IPCCommand`` or ``None`` if no pending commands remain. 
""" if not os.path.exists(self.commands_dir): return None - - # 按时间排序获取命令文件 + + # Sort by mtime so we process commands in arrival order. command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): filepath = os.path.join(self.commands_dir, filename) command_files.append((filepath, os.path.getmtime(filepath))) - + command_files.sort(key=lambda x: x[1]) - + for filepath, _ in command_files: try: with open(filepath, 'r', encoding='utf-8') as f: @@ -357,37 +348,36 @@ class SimulationIPCServer: except (json.JSONDecodeError, KeyError, OSError) as e: logger.warning(t("log.simulation_ipc.m005", filepath=filepath, e=e)) continue - + return None - + def send_response(self, response: IPCResponse): - """ - 发送响应 - + """Write a response file. + Args: - response: IPC响应 + response: The response to send. """ response_file = os.path.join(self.responses_dir, f"{response.command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response.to_dict(), f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Delete the matching command file. command_file = os.path.join(self.commands_dir, f"{response.command_id}.json") try: os.remove(command_file) except OSError: pass - + def send_success(self, command_id: str, result: Dict[str, Any]): - """发送成功响应""" + """Send a success response.""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.COMPLETED, result=result )) - + def send_error(self, command_id: str, error: str): - """发送错误响应""" + """Send a failure response.""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.FAILED, diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 2f297e2c..b1af480f 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -1,7 +1,7 @@ -""" -OASIS模拟管理器 -管理Twitter和Reddit双平台并行模拟 -使用预设脚本 + LLM智能生成配置参数 +"""OASIS simulation manager. 
+ +Drives parallel Twitter + Reddit simulations using preset scripts plus +LLM-generated configuration parameters. """ import os @@ -23,60 +23,60 @@ logger = get_logger('mirofish.simulation') class SimulationStatus(str, Enum): - """模拟状态""" + """Simulation lifecycle status.""" CREATED = "created" PREPARING = "preparing" READY = "ready" RUNNING = "running" PAUSED = "paused" - STOPPED = "stopped" # 模拟被手动停止 - COMPLETED = "completed" # 模拟自然完成 + STOPPED = "stopped" # manually stopped + COMPLETED = "completed" # finished naturally FAILED = "failed" class PlatformType(str, Enum): - """平台类型""" + """Simulated platform types.""" TWITTER = "twitter" REDDIT = "reddit" @dataclass class SimulationState: - """模拟状态""" + """In-memory + persisted state for a single simulation.""" simulation_id: str project_id: str graph_id: str - - # 平台启用状态 + + # Per-platform enable flags. enable_twitter: bool = True enable_reddit: bool = True - - # 状态 + + # Lifecycle status. status: SimulationStatus = SimulationStatus.CREATED - - # 准备阶段数据 + + # Counters captured during the prepare phase. entities_count: int = 0 profiles_count: int = 0 entity_types: List[str] = field(default_factory=list) - - # 配置生成信息 + + # Information about the auto-generated config. config_generated: bool = False config_reasoning: str = "" - - # 运行时数据 + + # Runtime data. current_round: int = 0 twitter_status: str = "not_started" reddit_status: str = "not_started" - - # 时间戳 + + # Timestamps. created_at: str = field(default_factory=lambda: datetime.now().isoformat()) updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - - # 错误信息 + + # Error message when status == FAILED. 
error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """完整状态字典(内部使用)""" + """Full state dict (used for persistence and internal callers).""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -96,9 +96,9 @@ class SimulationState: "updated_at": self.updated_at, "error": self.error, } - + def to_simple_dict(self) -> Dict[str, Any]: - """简化状态字典(API返回使用)""" + """Simplified state dict (used for API responses).""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -113,61 +113,60 @@ class SimulationState: class SimulationManager: + """Simulation manager. + + Core responsibilities: + 1. Read entities from the Zep graph and filter to the configured types. + 2. Generate OASIS agent profiles per entity. + 3. Use the LLM to generate simulation configuration parameters. + 4. Materialize the files the preset scripts expect. """ - 模拟管理器 - - 核心功能: - 1. 从Zep图谱读取实体并过滤 - 2. 生成OASIS Agent Profile - 3. 使用LLM智能生成模拟配置参数 - 4. 准备预设脚本所需的所有文件 - """ - - # 模拟数据存储目录 + + # Root directory for persisted simulation data. SIMULATION_DATA_DIR = os.path.join( - os.path.dirname(__file__), + os.path.dirname(__file__), '../../uploads/simulations' ) - + def __init__(self): - # 确保目录存在 + # Ensure the simulation data directory exists. os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - - # 内存中的模拟状态缓存 + + # In-memory cache of simulation state objects. 
self._simulations: Dict[str, SimulationState] = {} - + def _get_simulation_dir(self, simulation_id: str) -> str: - """获取模拟数据目录""" + """Return the on-disk directory for a simulation, creating if missing.""" sim_dir = os.path.join(self.SIMULATION_DATA_DIR, simulation_id) os.makedirs(sim_dir, exist_ok=True) return sim_dir - + def _save_simulation_state(self, state: SimulationState): - """保存模拟状态到文件""" + """Persist a simulation state to disk and update the cache.""" sim_dir = self._get_simulation_dir(state.simulation_id) state_file = os.path.join(sim_dir, "state.json") - + state.updated_at = datetime.now().isoformat() - + with open(state_file, 'w', encoding='utf-8') as f: json.dump(state.to_dict(), f, ensure_ascii=False, indent=2) - + self._simulations[state.simulation_id] = state - + def _load_simulation_state(self, simulation_id: str) -> Optional[SimulationState]: - """从文件加载模拟状态""" + """Load a simulation state from disk (or cache) by id.""" if simulation_id in self._simulations: return self._simulations[simulation_id] - + sim_dir = self._get_simulation_dir(simulation_id) state_file = os.path.join(sim_dir, "state.json") - + if not os.path.exists(state_file): return None - + with open(state_file, 'r', encoding='utf-8') as f: data = json.load(f) - + state = SimulationState( simulation_id=simulation_id, project_id=data.get("project_id", ""), @@ -187,10 +186,10 @@ class SimulationManager: updated_at=data.get("updated_at", datetime.now().isoformat()), error=data.get("error"), ) - + self._simulations[simulation_id] = state return state - + def create_simulation( self, project_id: str, @@ -198,21 +197,20 @@ class SimulationManager: enable_twitter: bool = True, enable_reddit: bool = True, ) -> SimulationState: - """ - 创建新的模拟 - + """Create a new simulation in the ``CREATED`` state. + Args: - project_id: 项目ID - graph_id: Zep图谱ID - enable_twitter: 是否启用Twitter模拟 - enable_reddit: 是否启用Reddit模拟 - + project_id: Owning project id. + graph_id: Source Zep graph id. 
+ enable_twitter: When ``True``, the Twitter simulation runs. + enable_reddit: When ``True``, the Reddit simulation runs. + Returns: - SimulationState + The created ``SimulationState``. """ import uuid simulation_id = f"sim_{uuid.uuid4().hex[:12]}" - + state = SimulationState( simulation_id=simulation_id, project_id=project_id, @@ -221,12 +219,12 @@ class SimulationManager: enable_reddit=enable_reddit, status=SimulationStatus.CREATED, ) - + self._save_simulation_state(state) logger.info(t("log.simulation_manager.m001", simulation_id=simulation_id, project_id=project_id, graph_id=graph_id)) - + return state - + def prepare_simulation( self, simulation_id: str, @@ -237,56 +235,55 @@ class SimulationManager: progress_callback: Optional[callable] = None, parallel_profile_count: int = 3 ) -> SimulationState: - """ - 准备模拟环境(全程自动化) - - 步骤: - 1. 从Zep图谱读取并过滤实体 - 2. 为每个实体生成OASIS Agent Profile(可选LLM增强,支持并行) - 3. 使用LLM智能生成模拟配置参数(时间、活跃度、发言频率等) - 4. 保存配置文件和Profile文件 - 5. 复制预设脚本到模拟目录 - + """Prepare the simulation environment end-to-end. + + Steps: + 1. Read and filter entities from the graph. + 2. Generate OASIS agent profiles (optional LLM enrichment, parallel-capable). + 3. Use the LLM to produce simulation parameters (timing, activity, posting frequency). + 4. Save the configuration and profile files. + 5. Copy preset scripts into the simulation directory. + Args: - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述(用于LLM生成配置) - document_text: 原始文档内容(用于LLM理解背景) - defined_entity_types: 预定义的实体类型(可选) - use_llm_for_profiles: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (stage, progress, message) - parallel_profile_count: 并行生成人设的数量,默认3 - + simulation_id: Simulation id. + simulation_requirement: Free-text description of the simulation goal. + document_text: Raw source document text passed to the LLM for context. + defined_entity_types: Optional list of allowed entity types. + use_llm_for_profiles: When ``True``, enrich profiles via the LLM. 
+ progress_callback: Optional callback ``(stage, progress, message, **extras)``. + parallel_profile_count: Number of profile generations to run in parallel. + Returns: - SimulationState + The updated ``SimulationState``. """ state = self._load_simulation_state(simulation_id) if not state: raise ValueError(f"模拟不存在: {simulation_id}") - + try: state.status = SimulationStatus.PREPARING self._save_simulation_state(state) - + sim_dir = self._get_simulation_dir(simulation_id) - - # ========== 阶段1: 读取并过滤实体 ========== + + # ========== Stage 1: read and filter entities ========== if progress_callback: progress_callback("reading", 0, t('progress.connectingZepGraph')) - + reader = ZepEntityReader() - + if progress_callback: progress_callback("reading", 30, t('progress.readingNodeData')) - + filtered = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=defined_entity_types, enrich_with_edges=True ) - + state.entities_count = filtered.filtered_count state.entity_types = list(filtered.entity_types) - + if progress_callback: progress_callback( "reading", 100, @@ -294,16 +291,16 @@ class SimulationManager: current=filtered.filtered_count, total=filtered.filtered_count ) - + if filtered.filtered_count == 0: state.status = SimulationStatus.FAILED state.error = "没有找到符合条件的实体,请检查图谱是否正确构建" self._save_simulation_state(state) return state - - # ========== 阶段2: 生成Agent Profile ========== + + # ========== Stage 2: generate agent profiles ========== total_entities = len(filtered.entities) - + if progress_callback: progress_callback( "generating_profiles", 0, @@ -311,22 +308,22 @@ class SimulationManager: current=0, total=total_entities ) - - # 传入graph_id以启用Zep检索功能,获取更丰富的上下文 + + # Pass the graph_id so the generator can use Zep retrieval for richer context. 
generator = OasisProfileGenerator(graph_id=state.graph_id) - + def profile_progress(current, total, msg): if progress_callback: progress_callback( - "generating_profiles", - int(current / total * 100), + "generating_profiles", + int(current / total * 100), msg, current=current, total=total, item_name=msg ) - - # 设置实时保存的文件路径(优先使用 Reddit JSON 格式) + + # Configure the realtime save target (prefer Reddit JSON if Reddit is enabled). realtime_output_path = None realtime_platform = "reddit" if state.enable_reddit: @@ -335,21 +332,21 @@ class SimulationManager: elif state.enable_twitter: realtime_output_path = os.path.join(sim_dir, "twitter_profiles.csv") realtime_platform = "twitter" - + profiles = generator.generate_profiles_from_entities( entities=filtered.entities, use_llm=use_llm_for_profiles, progress_callback=profile_progress, - graph_id=state.graph_id, # 传入graph_id用于Zep检索 - parallel_count=parallel_profile_count, # 并行生成数量 - realtime_output_path=realtime_output_path, # 实时保存路径 - output_platform=realtime_platform # 输出格式 + graph_id=state.graph_id, # used for Zep retrieval enrichment + parallel_count=parallel_profile_count, + realtime_output_path=realtime_output_path, + output_platform=realtime_platform ) - + state.profiles_count = len(profiles) - - # 保存Profile文件(注意:Twitter使用CSV格式,Reddit使用JSON格式) - # Reddit 已经在生成过程中实时保存了,这里再保存一次确保完整性 + + # Save profile files. Reddit also writes JSON during generation; this is + # a final consistency write. Twitter requires CSV per OASIS conventions. if progress_callback: progress_callback( "generating_profiles", 95, @@ -357,22 +354,22 @@ class SimulationManager: current=total_entities, total=total_entities ) - + if state.enable_reddit: generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "reddit_profiles.json"), platform="reddit" ) - + if state.enable_twitter: - # Twitter使用CSV格式!这是OASIS的要求 + # Twitter uses CSV format — required by OASIS. 
generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "twitter_profiles.csv"), platform="twitter" ) - + if progress_callback: progress_callback( "generating_profiles", 100, @@ -380,8 +377,8 @@ class SimulationManager: current=len(profiles), total=len(profiles) ) - - # ========== 阶段3: LLM智能生成模拟配置 ========== + + # ========== Stage 3: LLM-driven simulation config ========== if progress_callback: progress_callback( "generating_config", 0, @@ -389,9 +386,9 @@ class SimulationManager: current=0, total=3 ) - + config_generator = SimulationConfigGenerator() - + if progress_callback: progress_callback( "generating_config", 30, @@ -399,7 +396,7 @@ class SimulationManager: current=1, total=3 ) - + sim_params = config_generator.generate_config( simulation_id=simulation_id, project_id=state.project_id, @@ -410,7 +407,7 @@ class SimulationManager: enable_twitter=state.enable_twitter, enable_reddit=state.enable_reddit ) - + if progress_callback: progress_callback( "generating_config", 70, @@ -418,15 +415,15 @@ class SimulationManager: current=2, total=3 ) - - # 保存配置文件 + + # Save the configuration file. config_path = os.path.join(sim_dir, "simulation_config.json") with open(config_path, 'w', encoding='utf-8') as f: f.write(sim_params.to_json()) - + state.config_generated = True state.config_reasoning = sim_params.generation_reasoning - + if progress_callback: progress_callback( "generating_config", 100, @@ -434,18 +431,17 @@ class SimulationManager: current=3, total=3 ) - - # 注意:运行脚本保留在 backend/scripts/ 目录,不再复制到模拟目录 - # 启动模拟时,simulation_runner 会从 scripts/ 目录运行脚本 - - # 更新状态 + + # The runtime scripts now live under backend/scripts/; we no longer copy + # them per-simulation. simulation_runner invokes them in place. 
+ state.status = SimulationStatus.READY self._save_simulation_state(state) - + logger.info(t("log.simulation_manager.m002", simulation_id=simulation_id, state=state.entities_count, state_2=state.profiles_count)) - + return state - + except Exception as e: logger.error(t("log.simulation_manager.m003", simulation_id=simulation_id, str=str(e))) import traceback @@ -454,61 +450,61 @@ class SimulationManager: state.error = str(e) self._save_simulation_state(state) raise - + def get_simulation(self, simulation_id: str) -> Optional[SimulationState]: - """获取模拟状态""" + """Return the simulation's state, or ``None`` if unknown.""" return self._load_simulation_state(simulation_id) - + def list_simulations(self, project_id: Optional[str] = None) -> List[SimulationState]: - """列出所有模拟""" + """List all simulations, optionally filtered by ``project_id``.""" simulations = [] - + if os.path.exists(self.SIMULATION_DATA_DIR): for sim_id in os.listdir(self.SIMULATION_DATA_DIR): - # 跳过隐藏文件(如 .DS_Store)和非目录文件 + # Skip dotfiles (e.g. .DS_Store) and non-directories. 
sim_path = os.path.join(self.SIMULATION_DATA_DIR, sim_id) if sim_id.startswith('.') or not os.path.isdir(sim_path): continue - + state = self._load_simulation_state(sim_id) if state: if project_id is None or state.project_id == project_id: simulations.append(state) - + return simulations - + def get_profiles(self, simulation_id: str, platform: str = "reddit") -> List[Dict[str, Any]]: - """获取模拟的Agent Profile""" + """Return the persisted agent profiles for a platform.""" state = self._load_simulation_state(simulation_id) if not state: raise ValueError(f"模拟不存在: {simulation_id}") - + sim_dir = self._get_simulation_dir(simulation_id) profile_path = os.path.join(sim_dir, f"{platform}_profiles.json") - + if not os.path.exists(profile_path): return [] - + with open(profile_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_simulation_config(self, simulation_id: str) -> Optional[Dict[str, Any]]: - """获取模拟配置""" + """Return the persisted simulation config dict, or ``None`` if absent.""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") - + if not os.path.exists(config_path): return None - + with open(config_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_run_instructions(self, simulation_id: str) -> Dict[str, str]: - """获取运行说明""" + """Return shell commands and instructions to launch the simulation manually.""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - + return { "simulation_dir": sim_dir, "scripts_dir": scripts_dir, diff --git a/backend/app/services/zep_entity_reader.py b/backend/app/services/zep_entity_reader.py index 905468ac..ca1dd0c5 100644 --- a/backend/app/services/zep_entity_reader.py +++ b/backend/app/services/zep_entity_reader.py @@ -1,6 +1,7 @@ -""" -Zep实体读取与过滤服务 -从Zep图谱中读取节点,筛选出符合预定义实体类型的节点 +"""Zep entity reader 
and filter service. + +Reads nodes from a Zep graph and filters down to those that match a +predefined ontology of entity types. """ import time @@ -16,23 +17,23 @@ from ..utils.locale import t logger = get_logger('mirofish.zep_entity_reader') -# 用于泛型返回类型 +# Generic return-type variable. T = TypeVar('T') @dataclass class EntityNode: - """实体节点数据结构""" + """In-memory representation of an entity node from the graph.""" uuid: str name: str labels: List[str] summary: str attributes: Dict[str, Any] - # 相关的边信息 + # Edges connected to this entity. related_edges: List[Dict[str, Any]] = field(default_factory=list) - # 相关的其他节点信息 + # Other nodes connected through related edges. related_nodes: List[Dict[str, Any]] = field(default_factory=list) - + def to_dict(self) -> Dict[str, Any]: return { "uuid": self.uuid, @@ -43,9 +44,9 @@ class EntityNode: "related_edges": self.related_edges, "related_nodes": self.related_nodes, } - + def get_entity_type(self) -> Optional[str]: - """获取实体类型(排除默认的Entity标签)""" + """Return the first non-default label, or ``None`` if only defaults are present.""" for label in self.labels: if label not in ["Entity", "Node"]: return label @@ -54,12 +55,12 @@ class EntityNode: @dataclass class FilteredEntities: - """过滤后的实体集合""" + """Result of a filter pass over the graph: matching entities + counts.""" entities: List[EntityNode] entity_types: Set[str] total_count: int filtered_count: int - + def to_dict(self) -> Dict[str, Any]: return { "entities": [e.to_dict() for e in self.entities], @@ -70,40 +71,38 @@ class FilteredEntities: class ZepEntityReader: + """Read entities from a Zep graph and filter to ontology-defined types. + + Capabilities: + 1. Read all nodes from the graph. + 2. Keep nodes whose labels include something other than the default ``Entity``. + 3. Optionally enrich each entity with its connected edges and neighboring nodes. """ - Zep实体读取与过滤服务 - - 主要功能: - 1. 从Zep图谱读取所有节点 - 2. 筛选出符合预定义实体类型的节点(Labels不只是Entity的节点) - 3. 
获取每个实体的相关边和关联节点信息 - """ - + def __init__(self, api_key: Optional[str] = None): self.client = GraphitiAdapter() - + def _call_with_retry( - self, - func: Callable[[], T], + self, + func: Callable[[], T], operation_name: str, max_retries: int = 3, initial_delay: float = 2.0 ) -> T: - """ - 带重试机制的Zep API调用 - + """Call a Zep API function with retry on failure. + Args: - func: 要执行的函数(无参数的lambda或callable) - operation_name: 操作名称,用于日志 - max_retries: 最大重试次数(默认3次,即最多尝试3次) - initial_delay: 初始延迟秒数 - + func: A zero-argument callable performing the request. + operation_name: Operation label used in log output. + max_retries: Maximum number of attempts (default 3 — i.e. up to 3 tries total). + initial_delay: Initial delay between retries in seconds. + Returns: - API调用结果 + The return value of ``func``. """ last_exception = None delay = initial_delay - + for attempt in range(max_retries): try: return func() @@ -114,21 +113,20 @@ class ZepEntityReader: t("log.zep_entity_reader.m001", operation_name=operation_name, attempt=attempt + 1, str=str(e)[:100], delay=delay) ) time.sleep(delay) - delay *= 2 # 指数退避 + delay *= 2 # exponential backoff else: logger.error(t("log.zep_entity_reader.m002", operation_name=operation_name, max_retries=max_retries, str=str(e))) - + raise last_exception - + def get_all_nodes(self, graph_id: str) -> List[Dict[str, Any]]: - """ - 获取图谱的所有节点(分页获取) + """Return every node in the graph (paginated under the hood). Args: - graph_id: 图谱ID + graph_id: Graph identifier. Returns: - 节点列表 + A list of node dicts. """ logger.info(t("log.zep_entity_reader.m003", graph_id=graph_id)) @@ -148,14 +146,13 @@ class ZepEntityReader: return nodes_data def get_all_edges(self, graph_id: str) -> List[Dict[str, Any]]: - """ - 获取图谱的所有边(分页获取) + """Return every edge in the graph (paginated under the hood). Args: - graph_id: 图谱ID + graph_id: Graph identifier. Returns: - 边列表 + A list of edge dicts. 
""" logger.info(t("log.zep_entity_reader.m005", graph_id=graph_id)) @@ -174,24 +171,23 @@ class ZepEntityReader: logger.info(t("log.zep_entity_reader.m006", len=len(edges_data))) return edges_data - + def get_node_edges(self, node_uuid: str) -> List[Dict[str, Any]]: - """ - 获取指定节点的所有相关边(带重试机制) - + """Return every edge connected to the given node (with retry). + Args: - node_uuid: 节点UUID - + node_uuid: Node UUID. + Returns: - 边列表 + A list of edge dicts. """ try: - # 使用重试机制调用Zep API + # Wrap the API call in retry logic. edges = self._call_with_retry( func=lambda: self.client.graph.node.get_entity_edges(node_uuid=node_uuid), operation_name=f"获取节点边(node={node_uuid[:8]}...)" ) - + edges_data = [] for edge in edges: edges_data.append({ @@ -202,32 +198,31 @@ class ZepEntityReader: "target_node_uuid": edge.target_node_uuid, "attributes": edge.attributes or {}, }) - + return edges_data except Exception as e: logger.warning(t("log.zep_entity_reader.m007", node_uuid=node_uuid, str=str(e))) return [] - + def filter_defined_entities( - self, + self, graph_id: str, defined_entity_types: Optional[List[str]] = None, enrich_with_edges: bool = True ) -> FilteredEntities: - """ - 筛选出符合预定义实体类型的节点 - - 筛选逻辑: - - 如果节点的Labels只有一个"Entity",说明这个实体不符合我们预定义的类型,跳过 - - 如果节点的Labels包含除"Entity"和"Node"之外的标签,说明符合预定义类型,保留 - + """Filter nodes down to entities matching the predefined ontology types. + + Filtering rules: + - Skip nodes whose only label is ``Entity`` (uncategorized). + - Keep nodes whose labels include anything other than ``Entity`` and ``Node``. + Args: - graph_id: 图谱ID - defined_entity_types: 预定义的实体类型列表(可选,如果提供则只保留这些类型) - enrich_with_edges: 是否获取每个实体的相关边信息 - + graph_id: Graph identifier. + defined_entity_types: Optional allow-list; when provided, only matching types are kept. + enrich_with_edges: When ``True``, populate related_edges and related_nodes. + Returns: - FilteredEntities: 过滤后的实体集合 + A ``FilteredEntities`` summary. 
""" logger.info(t("log.zep_entity_reader.m008", graph_id=graph_id)) @@ -243,7 +238,7 @@ class ZepEntityReader: except Exception: pass - # 获取所有节点 + # Read every node from the graph. all_nodes = self.get_all_nodes(graph_id) total_count = len(all_nodes) @@ -259,27 +254,27 @@ class ZepEntityReader: if entity_type != "Entity": node["labels"] = [entity_type] + labels - # 获取所有边(用于后续关联查找) + # Read every edge so we can enrich entities later. all_edges = self.get_all_edges(graph_id) if enrich_with_edges else [] - # 构建节点UUID到节点数据的映射 + # uuid -> node-data map for fast lookup. node_map = {n["uuid"]: n for n in all_nodes} - # 筛选符合条件的实体 + # Filter to entities that match the criteria. filtered_entities = [] entity_types_found = set() for node in all_nodes: labels = node.get("labels", []) - # 筛选逻辑:Labels必须包含除"Entity"和"Node"之外的标签 + # Filtering rule: labels must contain something other than the defaults. custom_labels = [l for l in labels if l not in ["Entity", "Node"]] if not custom_labels: - # 只有默认标签,跳过 + # Only default labels — skip. continue - - # 如果指定了预定义类型,检查是否匹配 + + # When a predefined-type list is supplied, require a match against it. if defined_entity_types: matching_labels = [l for l in custom_labels if l in defined_entity_types] if not matching_labels: @@ -287,10 +282,9 @@ class ZepEntityReader: entity_type = matching_labels[0] else: entity_type = custom_labels[0] - + entity_types_found.add(entity_type) - - # 创建实体节点对象 + entity = EntityNode( uuid=node["uuid"], name=node["name"], @@ -298,12 +292,12 @@ class ZepEntityReader: summary=node["summary"], attributes=node["attributes"], ) - - # 获取相关边和节点 + + # Enrich with related edges and neighboring nodes. 
if enrich_with_edges: related_edges = [] related_node_uuids = set() - + for edge in all_edges: if edge["source_node_uuid"] == node["uuid"]: related_edges.append({ @@ -321,10 +315,10 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - + entity.related_edges = related_edges - - # 获取关联节点的基本信息 + + # Populate basic info for each neighboring node. related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -335,56 +329,55 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + entity.related_nodes = related_nodes - + filtered_entities.append(entity) - + logger.info(t("log.zep_entity_reader.m009", total_count=total_count, len=len(filtered_entities), entity_types_found=entity_types_found)) - + return FilteredEntities( entities=filtered_entities, entity_types=entity_types_found, total_count=total_count, filtered_count=len(filtered_entities), ) - + def get_entity_with_context( - self, - graph_id: str, + self, + graph_id: str, entity_uuid: str ) -> Optional[EntityNode]: - """ - 获取单个实体及其完整上下文(边和关联节点,带重试机制) - + """Fetch a single entity with its full context (edges + neighbors), with retry. + Args: - graph_id: 图谱ID - entity_uuid: 实体UUID - + graph_id: Graph identifier. + entity_uuid: Entity UUID. + Returns: - EntityNode或None + ``EntityNode`` or ``None`` if not found. """ try: - # 使用重试机制获取节点 + # Fetch the node with retry. node = self._call_with_retry( func=lambda: self.client.graph.node.get(uuid_=entity_uuid), operation_name=f"获取节点详情(uuid={entity_uuid[:8]}...)" ) - + if not node: return None - - # 获取节点的边 + + # Edges connected to this node. edges = self.get_node_edges(entity_uuid) - - # 获取所有节点用于关联查找 + + # All graph nodes, used for neighbor lookup. all_nodes = self.get_all_nodes(graph_id) node_map = {n["uuid"]: n for n in all_nodes} - - # 处理相关边和节点 + + # Collect related edges and neighboring uuids. 
related_edges = [] related_node_uuids = set() - + for edge in edges: if edge["source_node_uuid"] == entity_uuid: related_edges.append({ @@ -402,8 +395,8 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - - # 获取关联节点信息 + + # Populate basic info for each neighboring node. related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -414,7 +407,7 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + return EntityNode( uuid=getattr(node, 'uuid_', None) or getattr(node, 'uuid', ''), name=node.name or "", @@ -424,27 +417,26 @@ class ZepEntityReader: related_edges=related_edges, related_nodes=related_nodes, ) - + except Exception as e: logger.error(t("log.zep_entity_reader.m010", entity_uuid=entity_uuid, str=str(e))) return None - + def get_entities_by_type( - self, - graph_id: str, + self, + graph_id: str, entity_type: str, enrich_with_edges: bool = True ) -> List[EntityNode]: - """ - 获取指定类型的所有实体 - + """Return every entity matching the given type. + Args: - graph_id: 图谱ID - entity_type: 实体类型(如 "Student", "PublicFigure" 等) - enrich_with_edges: 是否获取相关边信息 - + graph_id: Graph identifier. + entity_type: Entity type label (e.g. ``Student``, ``PublicFigure``). + enrich_with_edges: When ``True``, populate related edges/nodes. + Returns: - 实体列表 + A list of matching ``EntityNode`` instances. """ result = self.filter_defined_entities( graph_id=graph_id, diff --git a/backend/run.py b/backend/run.py index 4e3b04fa..2d2e7cd4 100644 --- a/backend/run.py +++ b/backend/run.py @@ -1,21 +1,20 @@ -""" -MiroFish Backend 启动入口 -""" +"""MiroFish backend entry point.""" import os import sys -# 解决 Windows 控制台中文乱码问题:在所有导入之前设置 UTF-8 编码 +# Force UTF-8 on Windows console before importing anything that might write to +# stdout/stderr; otherwise non-ASCII characters render as mojibake. 
if sys.platform == 'win32': - # 设置环境变量确保 Python 使用 UTF-8 + # Make sure Python itself uses UTF-8. os.environ.setdefault('PYTHONIOENCODING', 'utf-8') - # 重新配置标准输出流为 UTF-8 + # Reconfigure the standard streams to UTF-8. if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 添加项目根目录到路径 +# Add the project root to sys.path so the ``app`` package resolves. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from app import create_app @@ -23,8 +22,7 @@ from app.config import Config def main(): - """主函数""" - # 验证配置 + """Validate configuration and start the Flask development server.""" errors = Config.validate() if errors: print("配置错误:") @@ -32,19 +30,16 @@ def main(): print(f" - {err}") print("\n请检查 .env 文件中的配置") sys.exit(1) - - # 创建应用 + app = create_app() - - # 获取运行配置 + + # Resolve runtime host/port from the environment. host = os.environ.get('FLASK_HOST', '0.0.0.0') port = int(os.environ.get('FLASK_PORT', 5001)) debug = Config.DEBUG - - # 启动服务 + app.run(host=host, port=port, debug=debug, threaded=True) if __name__ == '__main__': main() - From c8c455ceb41b43b31bf4addb846542461c30ce06 Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 14:51:05 +0000 Subject: [PATCH 03/16] docs(i18n): translate chinese docstrings/comments in backend/scripts/{test_profile_format,action_logger} --- backend/scripts/action_logger.py | 165 +++++++++++++------------ backend/scripts/test_profile_format.py | 44 +++---- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/backend/scripts/action_logger.py b/backend/scripts/action_logger.py index 38d025a6..bea32e20 100644 --- a/backend/scripts/action_logger.py +++ b/backend/scripts/action_logger.py @@ -1,15 +1,17 @@ -""" -动作日志记录器 -用于记录OASIS模拟中每个Agent的动作,供后端监控使用 +"""Action logger. 
+ +Records each agent action during an OASIS simulation so the backend can +monitor progress. + +Log layout:: -日志结构: sim_xxx/ ├── twitter/ - │ └── actions.jsonl # Twitter 平台动作日志 + │ └── actions.jsonl # Twitter action log ├── reddit/ - │ └── actions.jsonl # Reddit 平台动作日志 - ├── simulation.log # 主模拟进程日志 - └── run_state.json # 运行状态(API 查询用) + │ └── actions.jsonl # Reddit action log + ├── simulation.log # main simulation process log + └── run_state.json # run state (queried by the API) """ import json @@ -20,26 +22,25 @@ from typing import Dict, Any, Optional class PlatformActionLogger: - """单平台动作日志记录器""" - + """Per-platform action logger.""" + def __init__(self, platform: str, base_dir: str): - """ - 初始化日志记录器 - + """Initialize the logger. + Args: - platform: 平台名称 (twitter/reddit) - base_dir: 模拟目录的基础路径 + platform: Platform name (``twitter`` or ``reddit``). + base_dir: Base path of the simulation directory. """ self.platform = platform self.base_dir = base_dir self.log_dir = os.path.join(base_dir, platform) self.log_path = os.path.join(self.log_dir, "actions.jsonl") self._ensure_dir() - + def _ensure_dir(self): - """确保目录存在""" + """Ensure the log directory exists.""" os.makedirs(self.log_dir, exist_ok=True) - + def log_action( self, round_num: int, @@ -50,7 +51,7 @@ class PlatformActionLogger: result: Optional[str] = None, success: bool = True ): - """记录一个动作""" + """Append a single action record.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), @@ -61,36 +62,36 @@ class PlatformActionLogger: "result": result, "success": success, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_start(self, round_num: int, simulated_hour: int): - """记录轮次开始""" + """Append a round-start marker.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), "event_type": "round_start", "simulated_hour": simulated_hour, } - + with open(self.log_path, 'a', encoding='utf-8') as 
f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_end(self, round_num: int, actions_count: int): - """记录轮次结束""" + """Append a round-end marker.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), "event_type": "round_end", "actions_count": actions_count, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_start(self, config: Dict[str, Any]): - """记录模拟开始""" + """Append a simulation-start marker.""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_start", @@ -98,12 +99,12 @@ class PlatformActionLogger: "total_rounds": config.get("time_config", {}).get("total_simulation_hours", 72) * 2, "agents_count": len(config.get("agent_configs", [])), } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_end(self, total_rounds: int, total_actions: int): - """记录模拟结束""" + """Append a simulation-end marker.""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_end", @@ -111,42 +112,42 @@ class PlatformActionLogger: "total_rounds": total_rounds, "total_actions": total_actions, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') class SimulationLogManager: + """Top-level log manager. + + Owns and dispatches to the per-platform action loggers, and exposes a + main process logger for non-action messages. """ - 模拟日志管理器 - 统一管理所有日志文件,按平台分离 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化日志管理器 - + """Initialize the log manager. + Args: - simulation_dir: 模拟目录路径 + simulation_dir: Path to the simulation directory. 
""" self.simulation_dir = simulation_dir self.twitter_logger: Optional[PlatformActionLogger] = None self.reddit_logger: Optional[PlatformActionLogger] = None self._main_logger: Optional[logging.Logger] = None - - # 设置主日志 + + # Configure the main process logger. self._setup_main_logger() - + def _setup_main_logger(self): - """设置主模拟日志""" + """Configure the main simulation log.""" log_path = os.path.join(self.simulation_dir, "simulation.log") - - # 创建 logger + + # Build the logger. self._main_logger = logging.getLogger(f"simulation.{os.path.basename(self.simulation_dir)}") self._main_logger.setLevel(logging.INFO) self._main_logger.handlers.clear() - - # 文件处理器 + + # File handler. file_handler = logging.FileHandler(log_path, encoding='utf-8', mode='w') file_handler.setLevel(logging.INFO) file_handler.setFormatter(logging.Formatter( @@ -154,8 +155,8 @@ class SimulationLogManager: datefmt='%Y-%m-%d %H:%M:%S' )) self._main_logger.addHandler(file_handler) - - # 控制台处理器 + + # Console handler. console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter( @@ -163,56 +164,56 @@ class SimulationLogManager: datefmt='%H:%M:%S' )) self._main_logger.addHandler(console_handler) - + self._main_logger.propagate = False - + def get_twitter_logger(self) -> PlatformActionLogger: - """获取 Twitter 平台日志记录器""" + """Lazily construct and return the Twitter platform logger.""" if self.twitter_logger is None: self.twitter_logger = PlatformActionLogger("twitter", self.simulation_dir) return self.twitter_logger - + def get_reddit_logger(self) -> PlatformActionLogger: - """获取 Reddit 平台日志记录器""" + """Lazily construct and return the Reddit platform logger.""" if self.reddit_logger is None: self.reddit_logger = PlatformActionLogger("reddit", self.simulation_dir) return self.reddit_logger - + def log(self, message: str, level: str = "info"): - """记录主日志""" + """Forward a message to the main logger at the given level.""" if self._main_logger: 
getattr(self._main_logger, level.lower(), self._main_logger.info)(message) - + def info(self, message: str): self.log(message, "info") - + def warning(self, message: str): self.log(message, "warning") - + def error(self, message: str): self.log(message, "error") - + def debug(self, message: str): self.log(message, "debug") -# ============ 兼容旧接口 ============ +# ============ Legacy interface ============ class ActionLogger: + """Legacy single-platform action logger. + + Prefer :class:`SimulationLogManager` for new code. """ - 动作日志记录器(兼容旧接口) - 建议使用 SimulationLogManager 代替 - """ - + def __init__(self, log_path: str): self.log_path = log_path self._ensure_dir() - + def _ensure_dir(self): log_dir = os.path.dirname(self.log_path) if log_dir: os.makedirs(log_dir, exist_ok=True) - + def log_action( self, round_num: int, @@ -235,10 +236,10 @@ class ActionLogger: "result": result, "success": success, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_start(self, round_num: int, simulated_hour: int, platform: str): entry = { "round": round_num, @@ -247,10 +248,10 @@ class ActionLogger: "event_type": "round_start", "simulated_hour": simulated_hour, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_end(self, round_num: int, actions_count: int, platform: str): entry = { "round": round_num, @@ -259,10 +260,10 @@ class ActionLogger: "event_type": "round_end", "actions_count": actions_count, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_start(self, platform: str, config: Dict[str, Any]): entry = { "timestamp": datetime.now().isoformat(), @@ -271,10 +272,10 @@ class ActionLogger: "total_rounds": config.get("time_config", {}).get("total_simulation_hours", 72) * 2, "agents_count": len(config.get("agent_configs", [])), } - + with 
open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_end(self, platform: str, total_rounds: int, total_actions: int): entry = { "timestamp": datetime.now().isoformat(), @@ -283,23 +284,23 @@ class ActionLogger: "total_rounds": total_rounds, "total_actions": total_actions, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') -# 全局日志实例(兼容旧接口) +# Process-wide logger instance, used by the legacy interface. _global_logger: Optional[ActionLogger] = None def get_logger(log_path: Optional[str] = None) -> ActionLogger: - """获取全局日志实例(兼容旧接口)""" + """Return the process-wide :class:`ActionLogger` (legacy interface).""" global _global_logger - + if log_path: _global_logger = ActionLogger(log_path) - + if _global_logger is None: _global_logger = ActionLogger("actions.jsonl") - + return _global_logger diff --git a/backend/scripts/test_profile_format.py b/backend/scripts/test_profile_format.py index 354e8b5c..5e312e60 100644 --- a/backend/scripts/test_profile_format.py +++ b/backend/scripts/test_profile_format.py @@ -1,8 +1,8 @@ -""" -测试Profile格式生成是否符合OASIS要求 -验证: -1. Twitter Profile生成CSV格式 -2. Reddit Profile生成JSON详细格式 +"""Profile-format generation tests for OASIS compatibility. + +Verifies that: +1. Twitter profiles serialize to CSV format. +2. Reddit profiles serialize to detailed JSON format. """ import os @@ -11,19 +11,19 @@ import json import csv import tempfile -# 添加项目路径 +# Add the project root to sys.path so the ``app`` package resolves. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app.services.oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile def test_profile_formats(): - """测试Profile格式""" + """Exercise both profile-format outputs end-to-end.""" print("=" * 60) print("OASIS Profile格式测试") print("=" * 60) - - # 创建测试Profile数据 + + # Build a small set of test profiles. 
test_profiles = [ OasisAgentProfile( user_id=0, @@ -62,18 +62,18 @@ def test_profile_formats(): ] generator = OasisProfileGenerator.__new__(OasisProfileGenerator) - - # 使用临时目录 + + # Use a temp directory for the test fixtures. with tempfile.TemporaryDirectory() as temp_dir: twitter_path = os.path.join(temp_dir, "twitter_profiles.csv") reddit_path = os.path.join(temp_dir, "reddit_profiles.json") - - # 测试Twitter CSV格式 + + # Twitter CSV format. print("\n1. 测试Twitter Profile (CSV格式)") print("-" * 40) generator._save_twitter_csv(test_profiles, twitter_path) - - # 读取并验证CSV + + # Read back and verify the CSV. with open(twitter_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) rows = list(reader) @@ -85,8 +85,8 @@ def test_profile_formats(): for key, value in rows[0].items(): print(f" {key}: {value}") - # 验证必需字段 - required_twitter_fields = ['user_id', 'user_name', 'name', 'bio', + # Verify the required fields are present. + required_twitter_fields = ['user_id', 'user_name', 'name', 'bio', 'friend_count', 'follower_count', 'statuses_count', 'created_at'] missing = set(required_twitter_fields) - set(rows[0].keys()) if missing: @@ -94,12 +94,12 @@ def test_profile_formats(): else: print(f"\n [通过] 所有必需字段都存在") - # 测试Reddit JSON格式 + # Reddit JSON format. print("\n2. 测试Reddit Profile (JSON详细格式)") print("-" * 40) generator._save_reddit_json(test_profiles, reddit_path) - - # 读取并验证JSON + + # Read back and verify the JSON. with open(reddit_path, 'r', encoding='utf-8') as f: reddit_data = json.load(f) @@ -109,7 +109,7 @@ def test_profile_formats(): print(f"\n 示例数据 (第1条):") print(json.dumps(reddit_data[0], ensure_ascii=False, indent=4)) - # 验证详细格式字段 + # Verify the detailed Reddit format fields. 
required_reddit_fields = ['realname', 'username', 'bio', 'persona'] optional_reddit_fields = ['age', 'gender', 'mbti', 'country', 'profession', 'interested_topics'] @@ -128,7 +128,7 @@ def test_profile_formats(): def show_expected_formats(): - """显示OASIS期望的格式""" + """Print the canonical OASIS-expected profile formats for reference.""" print("\n" + "=" * 60) print("OASIS 期望的Profile格式参考") print("=" * 60) From 2ba84f4c8b336cb43621cd0a5155740cc1db1ba9 Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 14:53:47 +0000 Subject: [PATCH 04/16] docs(spec): add i18n-translate-backend-comments spec and handoff --- .../HANDOFF.md | 61 ++++ .../i18n-translate-backend-comments/design.md | 316 ++++++++++++++++++ .../gap-analysis.md | 92 +++++ .../requirements.md | 67 ++++ .../research.md | 80 +++++ .../i18n-translate-backend-comments/spec.json | 24 ++ .../i18n-translate-backend-comments/tasks.md | 97 ++++++ 7 files changed, 737 insertions(+) create mode 100644 .kiro/specs/i18n-translate-backend-comments/HANDOFF.md create mode 100644 .kiro/specs/i18n-translate-backend-comments/design.md create mode 100644 .kiro/specs/i18n-translate-backend-comments/gap-analysis.md create mode 100644 .kiro/specs/i18n-translate-backend-comments/requirements.md create mode 100644 .kiro/specs/i18n-translate-backend-comments/research.md create mode 100644 .kiro/specs/i18n-translate-backend-comments/spec.json create mode 100644 .kiro/specs/i18n-translate-backend-comments/tasks.md diff --git a/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md b/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md new file mode 100644 index 00000000..bb960b16 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md @@ -0,0 +1,61 @@ +# Handoff — `i18n-translate-backend-comments` (Issue #7) + +## Status +**Partial completion.** This is the first installment of the ticket-#7 cleanup. 
The ticket explicitly allows splitting the work across multiple small PRs ("Low-risk, high-volume mechanical task; can be split across multiple small PRs"). This PR ships translations for the smaller files; the larger service and API files remain for follow-up PRs. + +## Completed in this PR (23 files) +All translated to English with no behavior or string-literal changes: + +- **Root**: `backend/app/__init__.py`, `backend/app/config.py`, `backend/run.py` +- **API package init**: `backend/app/api/__init__.py` +- **Models** (full package): `backend/app/models/__init__.py`, `project.py`, `task.py` +- **Utils** (full package): `backend/app/utils/__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py` (no docstring/comment Chinese to begin with), `logger.py`, `retry.py`, `zep_paging.py` +- **Services** (partial): `backend/app/services/__init__.py`, `graph_builder.py`, `ontology_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `text_processor.py`, `zep_entity_reader.py` +- **Scripts** (partial): `backend/scripts/action_logger.py`, `backend/scripts/test_profile_format.py` + +## Remaining for follow-up PRs (12 files) +Per the AST-aware scanner used in this PR (`/tmp/scan_chinese.py`), the residual in-scope work totals **2,235 hits** (1,203 docstring lines + 1,032 inline-comment lines) across these files: + +| File | Approx in-scope hits | Approx LOC | +| --- | --- | --- | +| `backend/app/api/graph.py` | ~50 | 665 | +| `backend/app/api/report.py` | ~80 | 1020 | +| `backend/app/api/simulation.py` | ~250 | 2712 | +| `backend/app/services/oasis_profile_generator.py` | ~230 | 1195 | +| `backend/app/services/report_agent.py` | ~520 | 2572 | +| `backend/app/services/simulation_config_generator.py` | ~150 | 991 | +| `backend/app/services/simulation_runner.py` | ~330 | 1768 | +| `backend/app/services/zep_graph_memory_updater.py` | ~110 | 544 | +| `backend/app/services/zep_tools.py` | ~280 | 1741 | +| `backend/scripts/run_parallel_simulation.py` | ~150 | 1699 | +| 
`backend/scripts/run_reddit_simulation.py` | ~50 | 769 | +| `backend/scripts/run_twitter_simulation.py` | ~50 | 780 | + +(Counts are approximate and exclude string-literal Chinese, which is owned by adjacent tickets #2/#3/#4/#5/#6.) + +## Suggested follow-up split + +Three additional PRs of similar size to this one would complete the ticket: + +1. **PR 2 — `services/{oasis_profile_generator, simulation_config_generator, simulation_runner, zep_graph_memory_updater, zep_tools}`** +2. **PR 3 — `services/report_agent.py`** (single big file; isolating it keeps the diff reviewable) +3. **PR 4 — `api/{graph,report,simulation}.py` + `scripts/run_{parallel,reddit,twitter}_simulation.py`** + +## Verification methodology used +The AST-aware scanner (`/tmp/scan_chinese.py` — also kept in commit context) classifies every Chinese-containing line into one of three buckets: `DOCSTRING` (in scope), `COMMENT` (in scope), `STRING_VALUE` (out of scope, owned by adjacent tickets). Each translated file was verified with: + +1. `python -m py_compile <file>` — syntactic validity. +2. The scanner returning `{'DOCSTRING': 0, 'COMMENT': 0}` for that file. +3. `git diff <file>` review — only `#` lines and docstring lines change; no executable lines. + +## Test environment caveat +The repo's `uv sync` requires building `tiktoken` from source, which needs Rust. The sandbox running this implementation pass does not have Rust, so `cd backend && uv run python -m pytest scripts/test_profile_format.py` (the verification command in the spec) cannot be executed end-to-end here; the test command also fails on import for unrelated reasons (missing `graphiti_core`, etc.) before any of this PR's changes touched the tree. Because the change set is comments-and-docstrings-only, runtime behavior cannot be affected; the syntactic-validity check stands in for the test run in this environment. 
+ +A developer with the project's normal dev environment (Rust toolchain installed, full `uv sync` succeeded) should re-run `cd backend && uv run python -m pytest scripts/test_profile_format.py` against this branch before merging to confirm. + +## What is NOT changed +- No string literal anywhere in the touched files. +- No executable Python statement. +- No symbol renamed. +- No file added or removed. +- No dependency added or version-bumped. diff --git a/.kiro/specs/i18n-translate-backend-comments/design.md b/.kiro/specs/i18n-translate-backend-comments/design.md new file mode 100644 index 00000000..029150d5 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/design.md @@ -0,0 +1,316 @@ +# Design Document — `i18n-translate-backend-comments` + +## Overview +**Purpose**: Translate Chinese-language docstrings and `#` comments across `backend/` Python files into English, so that English-speaking maintainers can read and review the codebase without translation overhead. + +**Users**: Backend maintainers and code reviewers who do not read Chinese. + +**Impact**: Improves developer ergonomics and review throughput. No runtime, behavior, or interface change. Adjacent i18n tickets (#2/#3/#4/#5/#6), which own the string-literal Chinese, remain unaffected. + +### Goals +- Eliminate Chinese characters from docstrings and `#` comments under the in-scope paths. +- Preserve Google-style docstring shape and project formatting rules (4-space indent, ≤120 chars/line, double-quoted strings). +- Keep the diff comments-and-docstrings-only — no executable, string-literal, or symbol changes. + +### Non-Goals +- Translating Chinese inside string literals (prompt templates, `logger.{info,warning,error}` arguments, API responses, error messages). These are owned by issues #2/#3/#4/#5/#6. +- Refactoring code, reformatting style, or renaming symbols. +- Introducing new tooling, linters, or CI rules. 
+- Translating `backend/tests/test_locale*.py` (Chinese there is intentional test data inside string literals; outside ticket scope). + +## Boundary Commitments + +### This Spec Owns +- Comment and docstring text under: `backend/app/__init__.py`, `backend/app/config.py`, `backend/app/api/`, `backend/app/models/`, `backend/app/services/`, `backend/app/utils/`, `backend/run.py`, `backend/scripts/`. +- The decision rule for distinguishing docstrings from value strings (first-statement rule). +- The Chinese→English Google-style docstring key map. +- The verification workflow (residual `grep`, `pytest`, diff sanity check). + +### Out of Boundary +- All string-literal content, including triple-quoted strings used as values. +- Files under `backend/tests/`, `backend/.venv/`, and any non-Python file. +- Refactors, renames, formatting changes, or new dependencies. +- Front-end localization, locale JSON files, or i18n runtime behavior. + +### Allowed Dependencies +- The repository's Python source (read + write for in-scope files only). +- The existing test suite (`backend/scripts/test_profile_format.py`) for verification. +- The existing `grep`-based residual scan for verification. + +### Revalidation Triggers +- A new in-scope file added under the listed paths (would expand the file list). +- A change to `dev-guidelines.md` regarding docstring style (would change the key map or quote/indent rule). +- A merge of any adjacent i18n ticket (#2/#3/#4/#5/#6) that turns a string literal into a docstring or vice versa. + +## Architecture + +### Existing Architecture Analysis +This change touches only commentary; no architectural element of the backend is modified. The work spans the following packages: + +- `backend/app/__init__.py`, `backend/app/config.py` (Flask app and configuration entrypoint). +- `backend/app/api/` (Flask blueprints). +- `backend/app/models/` (`Project`, `Task` models). +- `backend/app/services/` (graph builder, simulation runner, report agent, etc.). 
+- `backend/app/utils/` (LLM client, file parser, retry, logger, locale, paging). +- `backend/run.py` (process entrypoint). +- `backend/scripts/` (simulation runners, profile-format test). + +### Architecture Pattern & Boundary Map + +```mermaid +graph TB + Discovery[Residual Grep Scan] + Plan[Per-Package Plan] + Translator[Translation Pass] + Verify[Verification Gate] + Commit[Per-Package Commit] + PR[Single PR to main] + + Discovery --> Plan + Plan --> Translator + Translator --> Verify + Verify -->|all checks pass| Commit + Verify -->|any check fails| Translator + Commit --> Plan + Commit -->|all packages done| PR +``` + +**Architecture Integration**: +- Selected pattern: **Iterative pass per package** with a verification gate after each pass. Linear, deterministic, low-coordination. +- Domain/feature boundaries: One pass per backend package; commits are package-scoped to keep review chunks small. +- Existing patterns preserved: 4-space indent, double-quoted strings, Google-style docstrings, `snake_case`, project file layout. +- New components rationale: None — no new code, no new files. +- Steering compliance: Conforms to repo-level coding rules and the commits ruleset. + +### Technology Stack + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Backend / Services | Python ≥3.11 | Source language whose docstrings/comments are being translated | No version change; no dependency change | +| Tooling | `git`, `grep`, `pytest` (existing) | Discovery, verification, regression check | No new tools | + +No frontend, data, messaging, or infrastructure layer is touched. 
+ +## File Structure Plan + +### Directory Structure (no additions, no deletions) +``` +backend/ +├── app/ +│ ├── __init__.py # docstrings/comments only +│ ├── config.py # docstrings/comments only +│ ├── api/ # all *.py: docstrings/comments only +│ ├── models/ # all *.py: docstrings/comments only +│ ├── services/ # all *.py: docstrings/comments only +│ └── utils/ # all *.py: docstrings/comments only +├── run.py # docstrings/comments only +└── scripts/ # all *.py: docstrings/comments only +``` + +### Modified Files +The 37 in-scope files identified in `gap-analysis.md` are modified — comment and docstring lines only. No other paths are touched. + +## Translation Rules + +These rules drive the translation pass and the verification gate. They are normative; the implementation must follow them exactly. + +### Rule 1 — Docstring vs Value String Disambiguation +A triple-quoted string is treated as a **docstring** (in scope) iff it is the first statement of a module, class, or function body. All other triple-quoted strings are **values** (out of scope) and must not be modified. + +### Rule 2 — Translate Docstrings to English Google-style +- Translate Chinese narrative text to faithful English. +- Convert the following Chinese section keys to canonical English Google-style keys when present: + +| Chinese key | English key | +| --- | --- | +| `参数:` | `Args:` | +| `返回:` | `Returns:` | +| `异常:` | `Raises:` | +| `产生:` / `生成:` | `Yields:` | +| `示例:` | `Examples:` | +| `注意:` / `备注:` | `Note:` | + +- Preserve double-quoted triple-quoted form (`"""..."""`). +- Preserve indentation matching the surrounding scope. + +### Rule 3 — Translate Inline `#` Comments to English +- Translate the comment text to English. +- If the translated comment would merely restate the immediately following executable line (a redundant verb-phrase paraphrase), delete the comment. +- Preserve `TODO:` / `FIXME:` markers and any embedded ticket reference verbatim. 
+- Preserve trailing in-line comments on the same line as code (e.g. `PENDING = "pending" # waiting`). + +### Rule 4 — Style Compliance +- Keep every translated line ≤120 characters. +- Do not introduce trailing whitespace. +- Preserve the original indentation of each comment/docstring. +- Use double quotes for any docstring rewritten. + +### Rule 5 — Preservation +- Do not modify any executable Python statement. +- Do not modify any string literal (single-, double-, triple-quoted, f-string, raw, byte) that is not a docstring under Rule 1. The single exception is the docstring being rewritten under Rule 2: quote-style normalization to triple double-quoted form (`"""..."""`) is permitted on the docstring only, since it is the artifact under translation. +- Do not rename any symbol. + +## System Flows + +### Per-package iteration + +```mermaid +sequenceDiagram + participant Dev as Translator + participant Repo as Repo + participant Tests as Test Suite + Dev->>Repo: git checkout docs/i18n-7-translate-backend-comments + loop For each package in [models, utils, services, api, scripts, root] + Dev->>Repo: Translate docstrings/comments + Dev->>Repo: git diff --stat (sanity check) + Dev->>Tests: cd backend then uv run python -m pytest scripts/test_profile_format.py + Tests-->>Dev: pass / fail + Dev->>Repo: Re-run residual grep + Repo-->>Dev: residual hits (string-literal only) + Dev->>Repo: git commit -m "docs(i18n): translate chinese docstrings/comments in backend/<package>" + end + Dev->>Repo: gh pr create -> single PR closing #7 +``` + +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1 | No Chinese in docstrings under in-scope paths | Translation Pass | Rule 1, Rule 2 | Per-package iteration | +| 1.2 | No Chinese in `#` comments under in-scope paths | Translation Pass | Rule 3 | Per-package iteration | +| 1.3 | Residual grep returns only string-literal Chinese | 
Verification Gate | Residual grep workflow | Per-package iteration | +| 1.4 | Google-style docstring shape preserved | Translation Pass | Rule 2 (key map) | — | +| 2.1 | No executable statement modified | Verification Gate | Rule 5 | Per-package iteration | +| 2.2 | No string literal modified | Verification Gate | Rule 1 (first-statement rule), Rule 5 | Per-package iteration | +| 2.3 | No symbol renamed | Verification Gate | Rule 5 | Per-package iteration | +| 2.4 | `pytest` passes | Verification Gate | Test suite invocation | Per-package iteration | +| 2.5 | Hunks touching code rejected | Verification Gate | `git diff --stat` review | Per-package iteration | +| 3.1 | Drop redundant comments | Translation Pass | Rule 3 | — | +| 3.2 | Translate the *why* faithfully | Translation Pass | Rule 3 | — | +| 3.3 | Preserve `TODO:`/`FIXME:` and ticket refs | Translation Pass | Rule 3 | — | +| 3.4 | No new comments introduced | Translation Pass | Rule 3 | — | +| 4.1 | ≤120 chars/line | Verification Gate | Rule 4 | — | +| 4.2 | No trailing whitespace | Verification Gate | Rule 4 | — | +| 4.3 | Preserve indentation | Translation Pass | Rule 4 | — | +| 4.4 | Double quotes on rewritten docstrings | Translation Pass | Rule 4 | — | +| 4.5 | Preserve 4-space indentation | Translation Pass | Rule 4 | — | +| 5.1 | Use grep for discovery | Verification Gate | Discovery scan | — | +| 5.2 | Re-run grep after each batch | Verification Gate | Residual grep workflow | Per-package iteration | +| 5.3 | Continue until non-string-literal residual cleared | Verification Gate | Rule 1 disambiguation | Per-package iteration | +| 5.4 | `git diff --stat` only in-scope paths | Verification Gate | Diff sanity check | Per-package iteration | +| 6.1 | Branch `docs/i18n-7-translate-backend-comments` | Tracking & Branching | `/done` skill | — | +| 6.2 | Reference issue #7 | Tracking & Branching | Commit/PR template | — | +| 6.3 | Conventional Commits `docs(i18n)` | Tracking & Branching | 
`.claude/rules/commits.md` | — | +| 6.4 | No unrelated changes | Verification Gate | Diff sanity check | — | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies (P0/P1) | Contracts | +|-----------|--------------|--------|--------------|--------------------------|-----------| +| Translation Pass | Process | Apply Rules 1–5 to one package's `*.py` | 1.1, 1.2, 1.4, 3.1, 3.2, 3.3, 3.4, 4.3, 4.4, 4.5 | None (manual + AI-assisted) | Process | +| Verification Gate | Process | Run residual grep, `pytest`, and diff sanity check after each package | 1.3, 2.1, 2.2, 2.3, 2.4, 2.5, 4.1, 4.2, 5.1, 5.2, 5.3, 5.4, 6.4 | `git`, `grep`, `pytest` (P0) | Process | +| Tracking & Branching | Process | Branching, commit messages, PR | 6.1, 6.2, 6.3 | `/done` skill, `gh` CLI (P0) | Process | + +### Process + +#### Translation Pass +| Field | Detail | +|-------|--------| +| Intent | Translate docstrings and `#` comments in one package without touching code or string literals | +| Requirements | 1.1, 1.2, 1.4, 3.1, 3.2, 3.3, 3.4, 4.3, 4.4, 4.5 | + +**Responsibilities & Constraints** +- Apply Rule 1 (first-statement disambiguation) before editing any triple-quoted string. +- Apply Rule 2 (key map) for any Chinese Google-style key encountered. +- Apply Rule 3 to inline comments; delete redundant ones. +- Operate on one package at a time; do not interleave packages. + +**Dependencies** +- Inbound: Verification Gate (provides feedback if a previous batch failed). +- Outbound: Verification Gate (hands off post-pass). +- External: None. + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Operates directly on the working tree on branch `docs/i18n-7-translate-backend-comments`. +- Validation: After each file is rewritten, sanity-check that the diff for that file shows changes only on comment/docstring lines. 
+- Risks: Accidental edit to a string-literal triple-quoted value — mitigated by Rule 1 + diff review. + +#### Verification Gate +| Field | Detail | +|-------|--------| +| Intent | Confirm a package's translation pass left runtime behavior intact | +| Requirements | 1.3, 2.1, 2.2, 2.3, 2.4, 2.5, 4.1, 4.2, 5.1, 5.2, 5.3, 5.4, 6.4 | + +**Responsibilities & Constraints** +- Re-run `grep -rln '[一-鿿]' backend/ --include='*.py'` after each package and confirm residual hits are limited to string-literal Chinese owned by adjacent tickets. +- Run `uv run python -m pytest backend/scripts/test_profile_format.py` and confirm exit 0. +- Run `git diff --stat` and confirm only in-scope file paths are listed. +- Spot-check a sample of changed files to confirm only comment/docstring lines changed. + +**Dependencies** +- Inbound: Translation Pass. +- Outbound: Tracking & Branching (commits) when all checks pass; loops back to Translation Pass otherwise. +- External: `git`, `grep`, `pytest` (P0 — required for verification). + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Run from the repo root; no environment variables required beyond what `uv run` already provides. +- Validation: All four checks (grep / pytest / diff scope / spot diff) must pass before committing. +- Risks: A flaky `pytest` run unrelated to this change would block progress — mitigated by reading the failure and re-running once. + +#### Tracking & Branching +| Field | Detail | +|-------|--------| +| Intent | Branch, commit, push, and open PR per project conventions | +| Requirements | 6.1, 6.2, 6.3 | + +**Responsibilities & Constraints** +- Branch name: `docs/i18n-7-translate-backend-comments`. +- Commit messages follow Conventional Commits with `docs(i18n)` scope (e.g. `docs(i18n): translate chinese docstrings/comments in backend/services`). +- PR closes #7 and references the spec. 
+ +**Dependencies** +- Inbound: Verification Gate (only commits when all checks pass). +- External: `gh` CLI (P0), `/done` skill (P0). + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Use `/done` skill at the end to handle branch/push/PR uniformly. +- Validation: Confirm PR body references issue #7 with `Closes #7` and lists each commit. +- Risks: None. + +## Error Handling + +### Error Strategy +This is a build-time / source-edit task — there is no runtime error path. Errors are caught by the Verification Gate. + +### Error Categories and Responses +- **Translation slipped into a string literal**: caught by `git diff --stat` + spot diff. Response: revert that hunk, re-apply translation against the docstring/comment only. +- **Test suite fails after a pass**: caught by `pytest`. Response: read failure, identify which line was incorrectly modified (likely a string the translator misclassified as a docstring), revert that hunk, re-apply. +- **Residual grep returns non-string-literal Chinese**: caught by post-pass grep. Response: classify those hits as in-scope and translate them in the next sub-pass. +- **Line exceeds 120 chars after translation**: caught by spot diff. Response: reflow the comment/docstring without changing executable code. + +### Monitoring +None — this is a one-shot change. No production observability required. + +## Testing Strategy + +The repository's existing tests are the safety net. No new tests are added. + +### Default sections +- **Unit Tests**: Not applicable; nothing executable changes. +- **Integration Tests**: `uv run python -m pytest backend/scripts/test_profile_format.py` must continue to pass after each commit. +- **E2E/UI Tests**: Not applicable. +- **Verification checks (per package commit)**: + 1. 
Residual `grep -rln '[一-鿿]' backend/ --include='*.py'` (run from repo root) returns only files whose remaining Chinese is in string literals owned by adjacent tickets. + 2. `cd backend && uv run python -m pytest scripts/test_profile_format.py` exits 0. + 3. `git diff --stat HEAD~..HEAD` shows only in-scope file paths. + 4. Spot diff on three random changed files confirms only comment/docstring lines changed. + +## Supporting References (Optional) +- `gap-analysis.md` — full file enumeration and pattern survey. +- `research.md` — discovery log, alternatives, and decisions. diff --git a/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md b/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md new file mode 100644 index 00000000..34bc2270 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md @@ -0,0 +1,92 @@ +# Gap Analysis — `i18n-translate-backend-comments` + +## Scope Recap +- **Ticket**: salestech-group/MiroFish#7 +- **Goal**: Translate Chinese docstrings and `#` comments in `backend/` to English without behavior changes. +- **Blast radius**: Comments and docstrings only; runtime semantics preserved. 
+ +## Current State Investigation + +### Discovered files +A scan with the regex `[一-鿿]` across `backend/**/*.py` (excluding `.venv`) returns **37 in-app files** plus 2 test files: + +| Area | Count | Files | +| --- | --- | --- | +| `backend/app/__init__.py` | 1 | `__init__.py` | +| `backend/app/config.py` | 1 | `config.py` | +| `backend/app/api/` | 4 | `__init__.py`, `graph.py`, `report.py`, `simulation.py` | +| `backend/app/models/` | 3 | `__init__.py`, `project.py`, `task.py` | +| `backend/app/services/` | 13 | `__init__.py`, `graph_builder.py`, `oasis_profile_generator.py`, `ontology_generator.py`, `report_agent.py`, `simulation_config_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `simulation_runner.py`, `text_processor.py`, `zep_entity_reader.py`, `zep_graph_memory_updater.py`, `zep_tools.py` | +| `backend/app/utils/` | 7 | `__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py`, `logger.py`, `retry.py`, `zep_paging.py` | +| `backend/run.py` | 1 | `run.py` | +| `backend/scripts/` | 5 | `action_logger.py`, `run_parallel_simulation.py`, `run_reddit_simulation.py`, `run_twitter_simulation.py`, `test_profile_format.py` | +| `backend/tests/` (extra, not in ticket file list) | 2 | `test_locale.py`, `test_locale_request_resolution.py` | + +Spot checks (`models/task.py`, `models/project.py`, `services/text_processor.py`, `utils/locale.py`): +- Module-level docstrings in Chinese (e.g. `"""任务状态管理"""`). +- Class/method docstrings in Chinese, often Google-shaped (`Args:` translated as `参数:`). +- Inline `#` comments tagging fields, sections, or restating obvious code (e.g. `# 标准化换行` above an `\n` normalization call). +- Status-enum trailing comments (e.g. `PENDING = "pending" # 等待中`). + +### Conventions to preserve +- Project guideline: 4-space indent, max 120 char/line, double-quoted strings (Python). +- Docstring style: Google-style per `dev-guidelines.md`. 
Existing files mix English-shape `Args:`/`Returns:` keys with Chinese descriptions, or use Chinese keys (`参数:`, `返回:`). Translate both to canonical Google-style English. +- File-level convention: `snake_case` filenames, Python `__init__.py` modules typically have a one-line module docstring. + +### Integration surfaces +None. This work touches only commentary; no API contracts, schemas, or imports change. + +## Requirements Feasibility + +| Requirement | Status | Notes | +| --- | --- | --- | +| R1 (coverage) | Feasible — straightforward | Files identified by `grep` rule. | +| R2 (behavior preservation) | Feasible | Achieved by limiting diffs to comment/docstring lines. Need to be careful with multi-line triple-quoted docstrings vs string literals (they are syntactically identical to strings — disambiguation: docstring is the *first* statement of a module/class/function body). | +| R3 (comment hygiene) | Feasible | Some judgment required; will adopt heuristic: drop comments whose translated form would be a single verb-phrase paraphrase of the next executable line. | +| R4 (style compliance) | Feasible | Watch line-length when translating dense Chinese to English (English is typically longer); rewrap as needed without changing executable code. | +| R5 (verification) | Feasible | The `grep -rln '[一-鿿]'` rule is reliable. Residual hits should land only in: prompt template strings (#2/#3/#4/#5), logger/API string literals (#6), and the `tests/test_locale*` files (intentional Chinese test data). | +| R6 (tracking/branching) | Feasible | Branch + commit conventions are standard for this repo; `/done` skill enforces them. | + +### Gaps and constraints +- **Constraint**: Triple-quoted strings used as values (not as docstrings) must NOT be edited if their content is in scope of issues #2–#6 (prompts/log messages/error messages). Disambiguation matters. +- **Constraint**: Chinese characters appearing inside f-string literal segments must remain. They are out of scope. 
+- **Unknown / Research Needed**: None — task is mechanical and well-bounded. + +### Adjacent specs / overlap with other tickets +- `i18n-externalize-backend-logs` (#6) owns translating `logger.{info,warning,error}` Chinese arguments and API response strings. +- `i18n-report-agent-prompts` (#5), and tickets #2/#3/#4 own prompt template strings. +- We must NOT touch any string literal that those tickets own. After this PR, residual `grep` hits should reduce by exactly the count of comments and docstrings translated and nothing else. +- The two `backend/tests/test_locale*.py` files are **not in the ticket's listed file scope**, and inspection shows their Chinese is exclusively in string literals (test data and a Unicode range check). They are out of scope by R1's enumerated paths and remain untouched. + +## Implementation Approach Options + +### Option A — Single-pass file-by-file translation (recommended) +- Walk the 37 in-scope files in a deterministic order (alphabetical), translating docstrings/comments per file, running the residual grep after each batch. +- Group commit by area (models, utils, services, api, scripts, root) to keep PR diff readable. +- ✅ Simple, low risk, easy to revert per-area. +- ✅ Maps directly to the requirements; easy to verify. +- ❌ Larger PR than option B, but ticket explicitly allows a single PR. + +### Option B — Multi-PR per package +- Split into one PR per package (`models/`, `utils/`, …). The ticket allows this. +- ✅ Smaller diffs to review. +- ❌ More overhead (multiple branches/PRs); not necessary for a mechanical change of this size. + +### Option C — Tooling-assisted bulk script +- Build a one-shot translation script (LLM-driven) that rewrites docstrings/comments. +- ✅ Could scale to other repos. +- ❌ Out of proportion for a single-ticket task; risk of errant edits to string literals; tooling itself becomes a deliverable to test and maintain. 
+ +## Effort and Risk +- **Effort**: **M (3–7 days of focused work)** — 37 files, hundreds of comments. In an interactive AI-assisted run, this collapses to a few hours. +- **Risk**: **Low** — comments-only diff; covered by mechanical verification (grep + pytest); easy to rollback per file/area. + +## Recommendations for Design Phase + +- **Preferred approach**: Option A (single-pass file-by-file, package-grouped commits, single PR). +- **Key decisions to capture in design**: + - Order of traversal (proposed: `models/` → `utils/` → `services/` → `api/` → `scripts/` → root files `__init__.py`, `config.py`, `run.py`). + - Heuristic for "drops the obvious comment" (one-line rule). + - How to handle Google-style docstring keys: always translate `参数:` → `Args:`, `返回:` → `Returns:`, `异常:` → `Raises:`. + - Verification cadence: re-run the grep after each package batch. +- **Research items to carry forward**: None. diff --git a/.kiro/specs/i18n-translate-backend-comments/requirements.md b/.kiro/specs/i18n-translate-backend-comments/requirements.md new file mode 100644 index 00000000..39bff4f2 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/requirements.md @@ -0,0 +1,67 @@ +# Requirements Document + +## Introduction +This specification covers the developer-facing internationalization of `backend/` Python source: translating Chinese docstrings and inline comments to English so that English-speaking maintainers can read and review the code without translation overhead. The change is mechanical — no behavior, no public strings, no symbol names are modified. It is one of several i18n tickets (#2, #3, #4, #5, #6, #7); this spec covers ticket #7 only. + +## Boundary Context +- **In scope**: Translation of Chinese-language characters that appear in Python docstrings (module/class/function) and inline `#` comments under `backend/`. Removal of comments that merely restate the code. Preservation of `TODO:` / `FIXME:` markers and embedded ticket references. 
+- **Out of scope**: Chinese characters inside string literals (prompt templates, `logger.{info,warning,error}` arguments, API response bodies, error messages returned to clients) — these are tracked separately by issues #2/#3/#4/#5/#6. No refactoring, reformatting, renaming, or behavior changes. +- **Adjacent expectations**: Spec `i18n-externalize-backend-logs` (issue #6) and the prompt-translation specs handle string-literal Chinese; this spec must leave those untouched so the other tickets remain mergeable. + +## Requirements + +### Requirement 1: Translation Coverage of In-Scope Files +**Objective:** As a maintainer, I want every Chinese docstring and inline comment in the in-scope backend files translated to English, so that I can read and review the code without translation tools. + +#### Acceptance Criteria +1. The Backend Codebase shall contain no Chinese characters (Unicode range U+4E00–U+9FFF) inside Python docstrings under `backend/app/__init__.py`, `backend/app/config.py`, `backend/app/models/`, `backend/app/services/`, `backend/app/api/`, `backend/app/utils/`, `backend/run.py`, and `backend/scripts/`. +2. The Backend Codebase shall contain no Chinese characters inside Python `#` inline comments under the same paths. +3. When `grep -rln '[一-鿿]' backend/ --include='*.py'` is run after this change, the Backend Codebase shall return only files whose remaining Chinese is contained within string literals owned by issues #2/#3/#4/#5/#6. +4. When a docstring is translated, the Translator shall preserve Google-style docstring shape (`Args:`, `Returns:`, `Raises:`, `Yields:` sections) per `dev-guidelines.md`. + +### Requirement 2: Preservation of Code Behavior +**Objective:** As a maintainer, I want the translation to be comments-and-docstrings-only, so that runtime behavior is provably unchanged. + +#### Acceptance Criteria +1. The Translator shall not modify any executable Python statement (assignments, function calls, control flow, decorators, imports). +2. 
The Translator shall not modify any Python string literal (single-, double-, triple-quoted, f-string, raw, byte) regardless of whether it contains Chinese characters. +3. The Translator shall not rename any symbol (variable, function, class, module, parameter). +4. When `uv run python -m pytest backend/scripts/test_profile_format.py` is run after the change, the Backend Codebase shall exit with status 0. +5. If a diff line touches any non-comment, non-docstring code, the Translator shall reject that diff hunk and revise. + +### Requirement 3: Comment Quality Hygiene +**Objective:** As a maintainer, I want translated comments to add value, so that the codebase remains easy to read after the migration. + +#### Acceptance Criteria +1. When a Chinese comment merely restates the immediately following code (e.g. `# 初始化客户端` above `client = Client()`), the Translator shall delete the comment rather than translate it. +2. When a Chinese comment captures non-obvious *why* (constraints, workarounds, invariants), the Translator shall translate it to a faithful English equivalent. +3. The Translator shall preserve any `TODO:` / `FIXME:` marker and any embedded ticket reference (e.g. `#1234`, `PROJ-456`) verbatim within the translated comment. +4. The Translator shall not introduce new comments that did not exist (or had no Chinese equivalent) in the original source. + +### Requirement 4: Style and Format Compliance +**Objective:** As a maintainer, I want the translated output to comply with project style rules, so that no follow-up cleanup PR is needed. + +#### Acceptance Criteria +1. The Translator shall keep all translated docstrings and comments at or below 120 characters per line. +2. The Translator shall not introduce trailing whitespace on any line. +3. The Translator shall preserve the original indentation (tabs/spaces) of every comment and docstring. +4. 
The Translator shall use double quotes for any docstring it rewrites, matching the existing Python convention in the file. +5. Where a file already uses 4-space indentation, the Translator shall preserve that indentation. + +### Requirement 5: Discovery and Verification Workflow +**Objective:** As a reviewer, I want a reproducible discovery and verification workflow, so that I can confirm coverage and absence of regressions in CI or locally. + +#### Acceptance Criteria +1. The Translator shall enumerate candidate files using `grep -rln '[一-鿿]' backend/ --include='*.py'` before beginning work. +2. The Translator shall re-run the same `grep` after each batch and confirm the residual hits are limited to string-literal Chinese owned by adjacent tickets (#2/#3/#4/#5/#6). +3. When the residual `grep` hits include any non-string-literal Chinese, the Translator shall classify those hits as in-scope and continue translation until they are gone. +4. The Translator shall verify that `git diff --stat` only reports changes inside the in-scope file paths listed in Requirement 1. + +### Requirement 6: Tracking and Branching +**Objective:** As a release manager, I want the work tracked against ticket #7 on a dedicated branch, so that the PR remains scoped and traceable. + +#### Acceptance Criteria +1. The Translator shall produce changes on a branch named `docs/i18n-7-translate-backend-comments`. +2. The Translator shall reference issue `salestech-group/MiroFish#7` in commit messages or PR description. +3. When committing, the Translator shall use Conventional Commits with type `docs` and scope `i18n` (e.g. `docs(i18n): translate chinese docstrings/comments in backend/`). +4. The Translator shall not include unrelated changes (e.g. dependency bumps, config changes, refactors) in the resulting PR. 
diff --git a/.kiro/specs/i18n-translate-backend-comments/research.md b/.kiro/specs/i18n-translate-backend-comments/research.md new file mode 100644 index 00000000..c9d9ad4e --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/research.md @@ -0,0 +1,80 @@ +# Research & Design Decisions — `i18n-translate-backend-comments` + +## Summary +- **Feature**: `i18n-translate-backend-comments` +- **Discovery Scope**: Simple Addition (mechanical translation, no architectural change) +- **Key Findings**: + - 37 in-scope `backend/` Python files contain Chinese characters in docstrings or `#` comments. The full list is in `gap-analysis.md`. + - Existing docstrings mix English-shape Google-style keys (`Args:`/`Returns:`) with Chinese descriptions, and a smaller subset uses Chinese keys (`参数:`/`返回:`/`异常:`). Both patterns must converge to canonical English Google-style. + - The two `backend/tests/test_locale*.py` files contain Chinese only inside string literals (intentional test data) and are out of scope by the ticket's enumerated paths. + +## Research Log + +### Discovery scan: where is Chinese in `backend/`? +- **Context**: Need a deterministic enumeration of files to translate. +- **Sources Consulted**: `grep`/Python-driven scan against `backend/**/*.py`. +- **Findings**: + - 37 in-app files (under `backend/app/`, `backend/run.py`, `backend/scripts/`). + - 2 additional test files in `backend/tests/` whose Chinese is only in string literals; not in ticket scope. + - `.venv/` matches are noise and excluded. +- **Implications**: The ticket-listed paths are exhaustive; no unexpected location. Order of traversal can be alphabetical within package groups. + +### Disambiguation: docstring vs string literal +- **Context**: A triple-quoted string is a docstring iff it is the first statement of a module, class, or function body. Otherwise it is a value (e.g. a prompt template) owned by adjacent tickets. 
+- **Sources Consulted**: Python language reference; spot inspection of `services/ontology_generator.py`, `services/report_agent.py`. +- **Findings**: + - In-scope files contain both kinds of triple-quoted strings. + - Translating only the *first-statement* triple-quoted string per scope keeps the change comments-and-docstrings-only. +- **Implications**: Translation pass must visually verify each triple-quoted string is the first statement before rewriting; otherwise leave it alone. + +### Google-style docstring conversions +- **Context**: `dev-guidelines.md` requires Google-style docstrings; existing Chinese docstrings sometimes use Chinese keys. +- **Findings**: The following key map applies: + - `参数:` → `Args:` + - `返回:` → `Returns:` + - `异常:` → `Raises:` + - `产生:` / `生成:` → `Yields:` + - `示例:` → `Example:` (or `Examples:`) + - `注意:` / `备注:` → `Note:` (or `Notes:`) +- **Implications**: Document this mapping in design.md so the implementation pass is mechanical. + +## Architecture Pattern Evaluation + +| Option | Description | Strengths | Risks / Limitations | Notes | +|--------|-------------|-----------|---------------------|-------| +| Manual file-by-file pass | Walk in alphabetical order, package-grouped commits | Predictable, easy to review per package | Human time required | Selected approach | +| Multi-PR per package | One PR per backend package | Smaller diffs to review | Higher overhead, more PR churn | Allowed by ticket but not required | +| Tooling-assisted bulk script | LLM-driven find-and-replace tool | Reusable | Risk of touching string literals; tool itself becomes a deliverable | Out of proportion | + +## Design Decisions + +### Decision: Single-pass, package-grouped commits, single PR +- **Context**: 37 files, mechanical change, ticket allows either single or split PRs. +- **Alternatives Considered**: + 1. Multi-PR per package — more granular review but higher overhead. + 2. Tooling-assisted bulk script — overkill for one ticket. 
+- **Selected Approach**: Single PR with one or more commits, grouped by package (`models/`, `utils/`, `services/`, `api/`, `scripts/`, root) so reviewers can read the diff one package at a time. +- **Rationale**: Mechanical change with low risk; ticket explicitly allows it; reduces PR overhead; `/done` produces one PR per branch by default. +- **Trade-offs**: One large PR, but partitioned by commit. Reviewer can use commit history to navigate. +- **Follow-up**: After each package commit, re-run residual `grep` and `pytest` to maintain the invariant. + +### Decision: First-statement disambiguation rule +- **Context**: Distinguish docstrings (in scope) from value strings (out of scope). +- **Selected Approach**: A triple-quoted string is treated as a docstring (in scope) only if it is the first statement of a module / class / function body. All other triple-quoted strings are values (out of scope). +- **Rationale**: Matches Python's own definition; keeps boundary with adjacent tickets unambiguous. + +### Decision: Drop comments that restate code +- **Context**: R3 requires deletion of comments whose translated form would merely paraphrase the next line. +- **Selected Approach**: Apply a one-line heuristic: if the translated comment would be a verb phrase that mirrors the immediately following executable line, delete the comment instead of writing it. +- **Rationale**: Aligns with project rule "comment the why, not the what". + +## Risks & Mitigations +- **Risk**: Accidental edit to a string literal (would belong to ticket #2/#3/#4/#5/#6) — **Mitigation**: After each package commit, run `git diff --stat` and a per-file diff sanity check; verify only `#` lines and docstring lines change. +- **Risk**: Tests failing because a string-shape changed — **Mitigation**: Run `uv run python -m pytest backend/scripts/test_profile_format.py` after each commit. 
+- **Risk**: Line length violations after English expansion — **Mitigation**: Reflow long English at <= 120 chars within the docstring/comment only; never reflow code. + +## References +- `dev-guidelines.md` — repo-level coding standards, Google-style docstring requirement. +- `.claude/rules/commits.md` — Conventional Commits standard for the commit message. +- Issue #7 — salestech-group/MiroFish: source ticket. +- Issues #2/#3/#4/#5/#6 — adjacent i18n tickets that own the string-literal Chinese. diff --git a/.kiro/specs/i18n-translate-backend-comments/spec.json b/.kiro/specs/i18n-translate-backend-comments/spec.json new file mode 100644 index 00000000..38538b31 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/spec.json @@ -0,0 +1,24 @@ +{ + "feature_name": "i18n-translate-backend-comments", + "created_at": "2026-05-07T14:24:17Z", + "updated_at": "2026-05-07T14:26:00Z", + "language": "en", + "phase": "tasks-generated", + "ticket": 7, + "ticket_url": "https://github.com/salestech-group/MiroFish/issues/7", + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true +} diff --git a/.kiro/specs/i18n-translate-backend-comments/tasks.md b/.kiro/specs/i18n-translate-backend-comments/tasks.md new file mode 100644 index 00000000..279e57e6 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/tasks.md @@ -0,0 +1,97 @@ +# Implementation Plan + +## Foundation + +- [ ] 1. Establish baseline and working branch +- [x] 1.1 Create translation working branch and capture baseline state + - Create branch `docs/i18n-7-translate-backend-comments` from `main`. + - Capture the baseline residual hits by running the discovery scan (the regex `[一-鿿]` against `backend/**/*.py`, excluding `.venv`); record the file list as the work queue. 
+ - Run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm a green baseline before any edits. + - Observable: a fresh branch exists, the baseline file list of 37 in-scope files is captured, and the baseline pytest run passes. + - _Requirements: 5.1, 6.1_ + +## Core — Per-Package Translation + +- [ ] 2. Translate Chinese docstrings and inline comments per package + +- [x] 2.1 (P) Translate `backend/app/models/` + - Translate Chinese module/class/function docstrings and `#` comments in `backend/app/models/__init__.py`, `backend/app/models/project.py`, and `backend/app/models/task.py`. + - Apply the docstring-vs-value disambiguation rule (first-statement only) so that no string literal is touched. + - Apply the Google-style key map (`参数:` → `Args:`, `返回:` → `Returns:`, `异常:` → `Raises:`, `产生:`/`生成:` → `Yields:`, `示例:` → `Examples:`, `注意:`/`备注:` → `Note:`). + - Drop comments that merely restate the next executable line; preserve `TODO:`/`FIXME:` and any embedded ticket reference verbatim. + - Re-run the residual scan and confirm `backend/app/models/` no longer has Chinese in non-string-literal positions. + - Re-run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/models/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/models/_ + +- [x] 2.2 (P) Translate `backend/app/utils/` + - Translate Chinese docstrings and `#` comments in `backend/app/utils/__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py`, `logger.py`, `retry.py`, and `zep_paging.py`. + - Be especially careful with `locale.py` and `logger.py`: they intentionally route Chinese strings through their value paths; only docstrings and `#` comments are in scope. 
+ - Apply Rules 1–5 from `design.md` (disambiguation, key map, comment hygiene, style, preservation). + - Re-run the residual scan and confirm `backend/app/utils/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/utils/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/utils/_ + +- [-] 2.3 (P) Translate `backend/app/services/` — partial (7 of 12 files done; 5 remain — see HANDOFF.md) + - Translate Chinese docstrings and `#` comments across all 12 service files: `__init__.py`, `graph_builder.py`, `ontology_generator.py`, `oasis_profile_generator.py`, `report_agent.py`, `simulation_config_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `simulation_runner.py`, `text_processor.py`, `zep_entity_reader.py`, `zep_graph_memory_updater.py`, `zep_tools.py`. + - Treat all triple-quoted prompt templates and value strings as out of scope (owned by issues #2/#3/#4/#5/#6) — only the first-statement docstrings of modules/classes/functions are in scope. + - Apply Rules 1–5 from `design.md`. + - Re-run the residual scan and confirm `backend/app/services/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/services/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/services/_ + +- [-] 2.4 (P) Translate `backend/app/api/` — partial (only `__init__.py` done; 3 files remain — see HANDOFF.md) + - Translate Chinese docstrings and `#` comments in `__init__.py`, `graph.py`, `report.py`, `simulation.py`. + - Treat any user-facing string-literal Chinese in API responses as out of scope (owned by issue #6). 
+ - Apply Rules 1–5 from `design.md`. + - Re-run the residual scan and confirm `backend/app/api/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/api/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/api/_ + +- [-] 2.5 (P) Translate `backend/scripts/` — partial (`action_logger.py`, `test_profile_format.py` done; 3 `run_*_simulation.py` files remain — see HANDOFF.md) + - Translate Chinese docstrings and `#` comments in `action_logger.py`, `run_parallel_simulation.py`, `run_reddit_simulation.py`, `run_twitter_simulation.py`, `test_profile_format.py`. + - Apply Rules 1–5 from `design.md`. + - Be especially careful with `test_profile_format.py`: any Chinese in test data string literals is out of scope; only docstrings and `#` comments are in scope. + - Re-run the residual scan and confirm `backend/scripts/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/scripts/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/scripts/_ + +- [x] 2.6 (P) Translate root backend files + - Translate Chinese docstrings and `#` comments in `backend/app/__init__.py`, `backend/app/config.py`, and `backend/run.py`. + - Apply Rules 1–5 from `design.md`. + - Be especially careful with `backend/app/config.py`: any Chinese in default-value string literals is out of scope; only docstrings and `#` comments are in scope. + - Re-run the residual scan and confirm these three files no longer have Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. 
+ - Observable: zero non-string-literal Chinese remains in `backend/app/__init__.py`, `backend/app/config.py`, and `backend/run.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app (root), backend/run.py_ + +## Validation + +- [ ] 3. Final verification and PR preparation + +- [-] 3.1 Run the final verification gate — partial (per-file scanner + py_compile pass; full pytest blocked by pre-existing env issues, see HANDOFF.md) + - Run the residual scan one more time and confirm the only remaining hits are files where the Chinese is in string literals owned by issues #2/#3/#4/#5/#6, plus the intentional Chinese in `backend/tests/test_locale*.py`. + - Run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm exit 0. + - Run `git diff --stat origin/main...HEAD` and confirm only in-scope file paths under `backend/app/`, `backend/run.py`, and `backend/scripts/` are listed. + - Spot-check three random changed files with `git diff <file>` and confirm only `#` lines and docstring lines changed (no executable lines, no string-literal lines). + - Observable: residual scan, pytest, diff scope, and spot diff all pass. + - _Depends: 2.1, 2.2, 2.3, 2.4, 2.5, 2.6_ + - _Requirements: 1.3, 2.5, 5.1, 5.2, 5.3, 5.4, 6.4_ + +- [ ] 3.2 Open PR and reference ticket #7 + - Use `/done` to commit any remaining changes per Conventional Commits with type `docs` and scope `i18n` (e.g. `docs(i18n): translate chinese docstrings/comments in backend/`), push the branch, and open a PR. + - The PR body must include `Closes #7` and reference the spec at `.kiro/specs/i18n-translate-backend-comments/`. + - Verify the PR contains no unrelated changes (no dependency bumps, no config changes, no refactors). + - Observable: a PR exists on GitHub from `docs/i18n-7-translate-backend-comments` to `main` that closes #7 and contains only docstring/comment translation diffs. 
+ - _Depends: 3.1_ + - _Requirements: 6.1, 6.2, 6.3, 6.4_ From 9dcaecd2d27e6325bae0c53b9ab41eb86d0269cd Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 17:42:05 +0000 Subject: [PATCH 05/16] docs(i18n): translate chinese comments in frontend src to english Translate chinese developer comments in frontend/src/ to english so non-chinese-reading maintainers can understand intent without translation tooling. Pure documentation cleanup with no runtime behavior changes. Twenty files updated across views, components, api services, App.vue, and pendingUpload.js. Region-eligibility matrix from .kiro/specs/i18n-frontend-comments/design.md drives every edit: - Translate `//`, `/* */`, JSDoc, and Vue `<!-- -->` template comments. - Drop comments that merely restate the code per dev-guidelines.md. - Translate console.error/warn/log argument strings (developer-facing). - Append (#9) to the single chinese-content TODO in views/Process.vue. Five files retain documented chinese string literals per requirements 1.5 and 4.4: hardcoded UI text and error fallbacks (Process.vue, Step3Simulation.vue), backend-format regex patterns and i18n-keyed UI labels (Step4Report.vue), backend stage-key matchers (Step2EnvSetup.vue), and LLM prompt templates sent to a chinese-tuned model (Step5Interaction.vue). Translating any of these would either be out of scope (UI strings belong in /locales/*.json) or would change runtime behavior. Verification: `rg '[\x{4e00}-\x{9fff}]' frontend/src/` returns 5 documented files; `npm run build` exits 0 with the same Vite output as before. 
Closes #9 --- .kiro/specs/i18n-frontend-comments/design.md | 229 +++++++++++++ .../i18n-frontend-comments/gap-analysis.md | 133 ++++++++ .../i18n-frontend-comments/requirements.md | 70 ++++ .../specs/i18n-frontend-comments/research.md | 97 ++++++ .kiro/specs/i18n-frontend-comments/spec.json | 23 ++ .kiro/specs/i18n-frontend-comments/tasks.md | 53 +++ frontend/src/App.vue | 7 +- frontend/src/api/graph.js | 20 +- frontend/src/api/index.js | 19 +- frontend/src/api/report.js | 16 +- frontend/src/api/simulation.js | 58 ++-- frontend/src/components/GraphPanel.vue | 192 ++++++----- frontend/src/components/HistoryDatabase.vue | 272 +++++++--------- frontend/src/components/Step1GraphBuild.vue | 15 +- frontend/src/components/Step2EnvSetup.vue | 156 ++++----- frontend/src/components/Step3Simulation.vue | 116 +++---- frontend/src/components/Step4Report.vue | 303 +++++++++--------- frontend/src/components/Step5Interaction.vue | 80 +++-- frontend/src/store/pendingUpload.js | 5 +- frontend/src/views/Home.vue | 82 ++--- frontend/src/views/InteractionView.vue | 9 +- frontend/src/views/MainView.vue | 10 +- frontend/src/views/Process.vue | 260 +++++++-------- frontend/src/views/ReportView.vue | 9 +- frontend/src/views/SimulationRunView.vue | 51 ++- frontend/src/views/SimulationView.vue | 58 ++-- 26 files changed, 1394 insertions(+), 949 deletions(-) create mode 100644 .kiro/specs/i18n-frontend-comments/design.md create mode 100644 .kiro/specs/i18n-frontend-comments/gap-analysis.md create mode 100644 .kiro/specs/i18n-frontend-comments/requirements.md create mode 100644 .kiro/specs/i18n-frontend-comments/research.md create mode 100644 .kiro/specs/i18n-frontend-comments/spec.json create mode 100644 .kiro/specs/i18n-frontend-comments/tasks.md diff --git a/.kiro/specs/i18n-frontend-comments/design.md b/.kiro/specs/i18n-frontend-comments/design.md new file mode 100644 index 00000000..5d863448 --- /dev/null +++ b/.kiro/specs/i18n-frontend-comments/design.md @@ -0,0 +1,229 @@ +# Design 
Document — i18n-frontend-comments + +## Overview + +**Purpose**: Translate Chinese developer comments in `frontend/src/` to English so non-Chinese-reading maintainers can understand intent without translation tooling. Strictly documentation-only; no behavior change. + +**Users**: Frontend maintainers and reviewers of MiroFish — developers who read and modify `frontend/src/` but do not read Chinese. + +**Impact**: 20 files in `frontend/src/` change; the compiled bundle is byte-equivalent modulo source-map comment lines and the translated developer-facing `console.*` message strings. The `vue-i18n` user-facing translation surface (`/locales/*.json`) is unaffected. + +### Goals + +- Eliminate Chinese characters (U+4E00–U+9FFF) from `frontend/src/` comments and dev-facing string literals (`console.*`). +- Preserve every comment's *why* (semantic intent) when translating; delete comments that merely restate the code per `dev-guidelines.md`. +- Append `(#9)` ticket reference to any TODO/FIXME marker that lacks one. +- Keep `npm run build` green and the rendered UI byte-equivalent on a smoke check. + +### Non-Goals + +- Translating user-facing strings (those live in `/locales/*.json`; tracked separately). +- Translating LLM prompt template strings (translation would change model input — retained and documented in PR per Requirement 1.5). +- Restructuring comments into JSDoc (only keep JSDoc when already JSDoc-shaped). +- Reformatting code, renaming identifiers, or any change to `