diff --git a/azure/1-infra.sh b/azure/1-infra.sh index 3bfdb4aa..0f46c7bb 100755 --- a/azure/1-infra.sh +++ b/azure/1-infra.sh @@ -20,15 +20,15 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # ── Carregar configuració ───────────────────────────────────────────────────── -CONFIG_FILE="${SCRIPT_DIR}/config.sh" -if [[ ! -f "$CONFIG_FILE" ]]; then - echo "ERROR: No s'ha trobat azure/config.sh" - echo " Còpia l'exemple: cp azure/config.sh.example azure/config.sh" - echo " Després omple els valors i torna a executar." - exit 1 -fi +#CONFIG_FILE="${SCRIPT_DIR}/config.sh" +#if [[ ! -f "$CONFIG_FILE" ]]; then +# echo "ERROR: No s'ha trobat azure/config.sh" +# echo " Còpia l'exemple: cp azure/config.sh.example azure/config.sh" +# echo " Després omple els valors i torna a executar." +# exit 1 +#fi # shellcheck source=config.sh.example -source "$CONFIG_FILE" +#source "$CONFIG_FILE" # ── Validar variables obligatòries ─────────────────────────────────────────── REQUIRED_VARS=( diff --git a/azure/2-build-deploy.sh b/azure/2-build-deploy.sh index b52cedb3..024d4c36 100755 --- a/azure/2-build-deploy.sh +++ b/azure/2-build-deploy.sh @@ -17,14 +17,14 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" # ── Carregar configuració ───────────────────────────────────────────────────── -CONFIG_FILE="${SCRIPT_DIR}/config.sh" -if [[ ! -f "$CONFIG_FILE" ]]; then - echo "ERROR: No s'ha trobat azure/config.sh" - echo " Còpia l'exemple: cp azure/config.sh.example azure/config.sh" - exit 1 -fi +#CONFIG_FILE="${SCRIPT_DIR}/config.sh" +#if [[ ! 
-f "$CONFIG_FILE" ]]; then +# echo "ERROR: No s'ha trobat azure/config.sh" +# echo " Còpia l'exemple: cp azure/config.sh.example azure/config.sh" +# exit 1 +#fi # shellcheck source=config.sh.example -source "$CONFIG_FILE" +#source "$CONFIG_FILE" # ── Validar variables obligatòries ─────────────────────────────────────────── REQUIRED_VARS=( diff --git a/backend/app/__init__.py b/backend/app/__init__.py index e5f3cf2d..e8bd7e47 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -1,12 +1,12 @@ """ -MiroFish Backend - Flask应用工厂 +MiroFish Backend - Flask application factory """ import os import warnings -# 抑制 multiprocessing resource_tracker 的警告(来自第三方库如 transformers) -# 需要在所有其他导入之前设置 +# Suppress multiprocessing resource_tracker warnings (from third-party libraries like transformers) +# Must be set before all other imports warnings.filterwarnings("ignore", message=".*resource_tracker.*") import jwt @@ -21,36 +21,36 @@ _PUBLIC_PATHS = {'/health', '/api/auth/login'} def create_app(config_class=Config): - """Flask应用工厂函数""" + """Flask application factory""" app = Flask(__name__) app.config.from_object(config_class) - # 设置JSON编码:确保中文直接显示(而不是 \uXXXX 格式) - # Flask >= 2.3 使用 app.json.ensure_ascii,旧版本使用 JSON_AS_ASCII 配置 + # Configure JSON encoding: ensure non-ASCII characters are output directly (not as \uXXXX) + # Flask >= 2.3 uses app.json.ensure_ascii; older versions use JSON_AS_ASCII config if hasattr(app, 'json') and hasattr(app.json, 'ensure_ascii'): app.json.ensure_ascii = False - # 设置日志 + # Set up logging logger = setup_logger('mirofish') - # 只在 reloader 子进程中打印启动信息(避免 debug 模式下打印两次) + # Only log startup info in the reloader subprocess (avoids double-printing in debug mode) is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' debug_mode = app.config.get('DEBUG', False) should_log_startup = not debug_mode or is_reloader_process if should_log_startup: logger.info("=" * 50) - logger.info("MiroFish Backend 启动中...") + logger.info("MiroFish 
Backend starting...") logger.info("=" * 50) - # 启用CORS + # Enable CORS CORS(app, resources={r"/api/*": {"origins": "*"}}) - # 注册模拟进程清理函数(确保服务器关闭时终止所有模拟进程) + # Register simulation process cleanup (ensures all simulation processes are terminated on server shutdown) from .services.simulation_runner import SimulationRunner SimulationRunner.register_cleanup() if should_log_startup: - logger.info("已注册模拟进程清理函数") + logger.info("Simulation process cleanup handler registered") # Middleware d'autenticació JWT — s'executa ABANS del log_request (ordre FIFO) @app.before_request @@ -70,28 +70,28 @@ def create_app(config_class=Config): except jwt.InvalidTokenError: return jsonify({'success': False, 'error': 'Invalid token'}), 401 - # 请求日志中间件 + # Request logging middleware @app.before_request def log_request(): logger = get_logger('mirofish.request') - logger.debug(f"请求: {request.method} {request.path}") + logger.debug(f"Request: {request.method} {request.path}") if request.content_type and 'json' in request.content_type: - logger.debug(f"请求体: {request.get_json(silent=True)}") + logger.debug(f"Request body: {request.get_json(silent=True)}") @app.after_request def log_response(response): logger = get_logger('mirofish.request') - logger.debug(f"响应: {response.status_code}") + logger.debug(f"Response: {response.status_code}") return response - # 注册蓝图 (auth primer, luego els existents) + # Register blueprints (auth first, then the rest) from .api import graph_bp, simulation_bp, report_bp, auth_bp app.register_blueprint(auth_bp, url_prefix='/api/auth') app.register_blueprint(graph_bp, url_prefix='/api/graph') app.register_blueprint(simulation_bp, url_prefix='/api/simulation') app.register_blueprint(report_bp, url_prefix='/api/report') - # 健康检查 + # Health check @app.route('/health') def health(): return {'status': 'ok', 'service': 'MiroFish Backend'} @@ -111,6 +111,6 @@ def create_app(config_class=Config): return _send_file(_os.path.join(_dist, 'index.html')) if should_log_startup: - 
logger.info("MiroFish Backend 启动完成") + logger.info("MiroFish Backend startup complete") return app diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index 3155d510..7ea18388 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -1,5 +1,5 @@ """ -API路由模块 +API routes module """ from flask import Blueprint diff --git a/backend/app/api/graph.py b/backend/app/api/graph.py index 759ff48b..6d22cb9e 100644 --- a/backend/app/api/graph.py +++ b/backend/app/api/graph.py @@ -1,6 +1,6 @@ """ -图谱相关API路由 -采用项目上下文机制,服务端持久化状态 +Graph-related API routes +Uses project context mechanism with server-side persistent state """ import os @@ -19,24 +19,24 @@ from ..utils.locale import t, get_locale, set_locale from ..models.task import TaskManager, TaskStatus from ..models.project import ProjectManager, ProjectStatus -# 获取日志器 +# Get logger logger = get_logger('mirofish.api') def allowed_file(filename: str) -> bool: - """检查文件扩展名是否允许""" + """Check if the file extension is allowed""" if not filename or '.' 
not in filename: return False ext = os.path.splitext(filename)[1].lower().lstrip('.') return ext in Config.ALLOWED_EXTENSIONS -# ============== 项目管理接口 ============== +# ============== Project management endpoints ============== @graph_bp.route('/project/', methods=['GET']) def get_project(project_id: str): """ - 获取项目详情 + Get project details """ project = ProjectManager.get_project(project_id) @@ -55,7 +55,7 @@ def get_project(project_id: str): @graph_bp.route('/project/list', methods=['GET']) def list_projects(): """ - 列出所有项目 + List all projects """ limit = request.args.get('limit', 50, type=int) projects = ProjectManager.list_projects(limit=limit) @@ -70,7 +70,7 @@ def list_projects(): @graph_bp.route('/project/', methods=['DELETE']) def delete_project(project_id: str): """ - 删除项目 + Delete a project """ success = ProjectManager.delete_project(project_id) @@ -89,7 +89,7 @@ def delete_project(project_id: str): @graph_bp.route('/project//reset', methods=['POST']) def reset_project(project_id: str): """ - 重置项目状态(用于重新构建图谱) + Reset project status (used to rebuild the graph) """ project = ProjectManager.get_project(project_id) @@ -99,7 +99,7 @@ def reset_project(project_id: str): "error": t('api.projectNotFound', id=project_id) }), 404 - # 重置到本体已生成状态 + # Reset to ontology-generated status if project.ontology: project.status = ProjectStatus.ONTOLOGY_GENERATED else: @@ -117,22 +117,22 @@ def reset_project(project_id: str): }) -# ============== 接口1:上传文件并生成本体 ============== +# ============== Endpoint 1: Upload files and generate ontology ============== @graph_bp.route('/ontology/generate', methods=['POST']) def generate_ontology(): """ - 接口1:上传文件,分析生成本体定义 - - 请求方式:multipart/form-data - - 参数: - files: 上传的文件(PDF/MD/TXT),可多个 - simulation_requirement: 模拟需求描述(必填) - project_name: 项目名称(可选) - additional_context: 额外说明(可选) - - 返回: + Endpoint 1: Upload files and generate ontology definition + + Request method: multipart/form-data + + Parameters: + files: Uploaded files (PDF/MD/TXT), 
multiple allowed + simulation_requirement: Simulation requirement description (required) + project_name: Project name (optional) + additional_context: Additional context (optional) + + Returns: { "success": true, "data": { @@ -148,15 +148,15 @@ def generate_ontology(): } """ try: - logger.info("=== 开始生成本体定义 ===") - - # 获取参数 + logger.info("=== Starting ontology generation ===") + + # Get parameters simulation_requirement = request.form.get('simulation_requirement', '') project_name = request.form.get('project_name', 'Unnamed Project') additional_context = request.form.get('additional_context', '') - - logger.debug(f"项目名称: {project_name}") - logger.debug(f"模拟需求: {simulation_requirement[:100]}...") + + logger.debug(f"Project name: {project_name}") + logger.debug(f"Simulation requirement: {simulation_requirement[:100]}...") if not simulation_requirement: return jsonify({ @@ -164,68 +164,68 @@ def generate_ontology(): "error": t('api.requireSimulationRequirement') }), 400 - # 获取上传的文件 + # Get uploaded files uploaded_files = request.files.getlist('files') if not uploaded_files or all(not f.filename for f in uploaded_files): return jsonify({ "success": False, "error": t('api.requireFileUpload') }), 400 - - # 创建项目 + + # Create project project = ProjectManager.create_project(name=project_name) project.simulation_requirement = simulation_requirement - logger.info(f"创建项目: {project.project_id}") - - # 保存文件并提取文本 + logger.info(f"Project created: {project.project_id}") + + # Save files and extract text document_texts = [] all_text = "" - + for file in uploaded_files: if file and file.filename and allowed_file(file.filename): - # 保存文件到项目目录 + # Save file to project directory file_info = ProjectManager.save_file_to_project( - project.project_id, - file, + project.project_id, + file, file.filename ) project.files.append({ "filename": file_info["original_filename"], "size": file_info["size"] }) - - # 提取文本 + + # Extract text text = FileParser.extract_text(file_info["path"]) text = 
TextProcessor.preprocess_text(text) document_texts.append(text) all_text += f"\n\n=== {file_info['original_filename']} ===\n{text}" - + if not document_texts: ProjectManager.delete_project(project.project_id) return jsonify({ "success": False, "error": t('api.noDocProcessed') }), 400 - - # 保存提取的文本 + + # Save extracted text project.total_text_length = len(all_text) ProjectManager.save_extracted_text(project.project_id, all_text) - logger.info(f"文本提取完成,共 {len(all_text)} 字符") - - # 生成本体 - logger.info("调用 LLM 生成本体定义...") + logger.info(f"Text extraction complete, total {len(all_text)} characters") + + # Generate ontology + logger.info("Calling LLM to generate ontology definition...") generator = OntologyGenerator() ontology = generator.generate( document_texts=document_texts, simulation_requirement=simulation_requirement, additional_context=additional_context if additional_context else None ) - - # 保存本体到项目 + + # Save ontology to project entity_count = len(ontology.get("entity_types", [])) edge_count = len(ontology.get("edge_types", [])) - logger.info(f"本体生成完成: {entity_count} 个实体类型, {edge_count} 个关系类型") - + logger.info(f"Ontology generation complete: {entity_count} entity types, {edge_count} relationship types") + project.ontology = { "entity_types": ontology.get("entity_types", []), "edge_types": ontology.get("edge_types", []) @@ -233,7 +233,7 @@ def generate_ontology(): project.analysis_summary = ontology.get("analysis_summary", "") project.status = ProjectStatus.ONTOLOGY_GENERATED ProjectManager.save_project(project) - logger.info(f"=== 本体生成完成 === 项目ID: {project.project_id}") + logger.info(f"=== Ontology generation complete === Project ID: {project.project_id}") return jsonify({ "success": True, @@ -255,49 +255,49 @@ def generate_ontology(): }), 500 -# ============== 接口2:构建图谱 ============== +# ============== Endpoint 2: Build graph ============== @graph_bp.route('/build', methods=['POST']) def build_graph(): """ - 接口2:根据project_id构建图谱 - - 请求(JSON): + Endpoint 2: Build 
graph from project_id + + Request (JSON): { - "project_id": "proj_xxxx", // 必填,来自接口1 - "graph_name": "图谱名称", // 可选 - "chunk_size": 500, // 可选,默认500 - "chunk_overlap": 50 // 可选,默认50 + "project_id": "proj_xxxx", // required, from endpoint 1 + "graph_name": "Graph name", // optional + "chunk_size": 500, // optional, default 500 + "chunk_overlap": 50 // optional, default 50 } - - 返回: + + Returns: { "success": true, "data": { "project_id": "proj_xxxx", "task_id": "task_xxxx", - "message": "图谱构建任务已启动" + "message": "Graph build task started" } } """ try: - logger.info("=== 开始构建图谱 ===") - - # 检查配置 + logger.info("=== Starting graph build ===") + + # Check configuration errors = [] if not Config.ZEP_API_KEY: errors.append(t('api.zepApiKeyMissing')) if errors: - logger.error(f"配置错误: {errors}") + logger.error(f"Configuration error: {errors}") return jsonify({ "success": False, "error": t('api.configError', details="; ".join(errors)) }), 500 - - # 解析请求 + + # Parse request data = request.get_json() or {} project_id = data.get('project_id') - logger.debug(f"请求参数: project_id={project_id}") + logger.debug(f"Request parameters: project_id={project_id}") if not project_id: return jsonify({ @@ -305,7 +305,7 @@ def build_graph(): "error": t('api.requireProjectId') }), 400 - # 获取项目 + # Get project project = ProjectManager.get_project(project_id) if not project: return jsonify({ @@ -313,83 +313,83 @@ def build_graph(): "error": t('api.projectNotFound', id=project_id) }), 404 - # 检查项目状态 - force = data.get('force', False) # 强制重新构建 - + # Check project status + force = data.get('force', False) # Force rebuild + if project.status == ProjectStatus.CREATED: return jsonify({ "success": False, "error": t('api.ontologyNotGenerated') }), 400 - + if project.status == ProjectStatus.GRAPH_BUILDING and not force: return jsonify({ "success": False, "error": t('api.graphBuilding'), "task_id": project.graph_build_task_id }), 400 - - # 如果强制重建,重置状态 + + # If force rebuild, reset status if force and 
project.status in [ProjectStatus.GRAPH_BUILDING, ProjectStatus.FAILED, ProjectStatus.GRAPH_COMPLETED]: project.status = ProjectStatus.ONTOLOGY_GENERATED project.graph_id = None project.graph_build_task_id = None project.error = None - - # 获取配置 + + # Get configuration graph_name = data.get('graph_name', project.name or 'MiroFish Graph') chunk_size = data.get('chunk_size', project.chunk_size or Config.DEFAULT_CHUNK_SIZE) chunk_overlap = data.get('chunk_overlap', project.chunk_overlap or Config.DEFAULT_CHUNK_OVERLAP) - - # 更新项目配置 + + # Update project configuration project.chunk_size = chunk_size project.chunk_overlap = chunk_overlap - - # 获取提取的文本 + + # Get extracted text text = ProjectManager.get_extracted_text(project_id) if not text: return jsonify({ "success": False, "error": t('api.textNotFound') }), 400 - - # 获取本体 + + # Get ontology ontology = project.ontology if not ontology: return jsonify({ "success": False, "error": t('api.ontologyNotFound') }), 400 - - # 创建异步任务 + + # Create async task task_manager = TaskManager() - task_id = task_manager.create_task(f"构建图谱: {graph_name}") - logger.info(f"创建图谱构建任务: task_id={task_id}, project_id={project_id}") - - # 更新项目状态 + task_id = task_manager.create_task(f"Build graph: {graph_name}") + logger.info(f"Graph build task created: task_id={task_id}, project_id={project_id}") + + # Update project status project.status = ProjectStatus.GRAPH_BUILDING project.graph_build_task_id = task_id ProjectManager.save_project(project) - + # Capture locale before spawning background thread current_locale = get_locale() - # 启动后台任务 + # Start background task def build_task(): set_locale(current_locale) build_logger = get_logger('mirofish.build') try: - build_logger.info(f"[{task_id}] 开始构建图谱...") + build_logger.info(f"[{task_id}] Starting graph build...") task_manager.update_task( task_id, status=TaskStatus.PROCESSING, message=t('progress.initGraphService') ) - # 创建图谱构建服务 + # Create graph builder service builder = 
GraphBuilderService(api_key=Config.ZEP_API_KEY) - - # 分块 + + # Split into chunks task_manager.update_task( task_id, message=t('progress.textChunking'), @@ -402,7 +402,7 @@ def build_graph(): ) total_chunks = len(chunks) - # 创建图谱 + # Create graph task_manager.update_task( task_id, message=t('progress.creatingZepGraph'), @@ -410,11 +410,11 @@ def build_graph(): ) graph_id = builder.create_graph(name=graph_name) - # 更新项目的graph_id + # Update project graph_id project.graph_id = graph_id ProjectManager.save_project(project) - - # 设置本体 + + # Set ontology task_manager.update_task( task_id, message=t('progress.settingOntology'), @@ -422,7 +422,7 @@ def build_graph(): ) builder.set_ontology(graph_id, ontology) - # 添加文本(progress_callback 签名是 (msg, progress_ratio)) + # Add text (progress_callback signature: (msg, progress_ratio)) def add_progress_callback(msg, progress_ratio): progress = 15 + int(progress_ratio * 40) # 15% - 55% task_manager.update_task( @@ -444,7 +444,7 @@ def build_graph(): progress_callback=add_progress_callback ) - # 等待Zep处理完成(查询每个episode的processed状态) + # Wait for Zep processing to complete (poll each episode's processed status) task_manager.update_task( task_id, message=t('progress.waitingZepProcess'), @@ -461,7 +461,7 @@ def build_graph(): builder._wait_for_episodes(episode_uuids, wait_progress_callback) - # 获取图谱数据 + # Fetch graph data task_manager.update_task( task_id, message=t('progress.fetchingGraphData'), @@ -469,15 +469,15 @@ def build_graph(): ) graph_data = builder.get_graph_data(graph_id) - # 更新项目状态 + # Update project status project.status = ProjectStatus.GRAPH_COMPLETED ProjectManager.save_project(project) - + node_count = graph_data.get("node_count", 0) edge_count = graph_data.get("edge_count", 0) - build_logger.info(f"[{task_id}] 图谱构建完成: graph_id={graph_id}, 节点={node_count}, 边={edge_count}") - - # 完成 + build_logger.info(f"[{task_id}] Graph build complete: graph_id={graph_id}, nodes={node_count}, edges={edge_count}") + + # Complete 
task_manager.update_task( task_id, status=TaskStatus.COMPLETED, @@ -493,8 +493,8 @@ def build_graph(): ) except Exception as e: - # 更新项目状态为失败 - build_logger.error(f"[{task_id}] 图谱构建失败: {str(e)}") + # Update project status to failed + build_logger.error(f"[{task_id}] Graph build failed: {str(e)}") build_logger.debug(traceback.format_exc()) project.status = ProjectStatus.FAILED @@ -508,7 +508,7 @@ def build_graph(): error=traceback.format_exc() ) - # 启动后台线程 + # Start background thread thread = threading.Thread(target=build_task, daemon=True) thread.start() @@ -529,12 +529,12 @@ def build_graph(): }), 500 -# ============== 任务查询接口 ============== +# ============== Task query endpoints ============== @graph_bp.route('/task/', methods=['GET']) def get_task(task_id: str): """ - 查询任务状态 + Query task status """ task = TaskManager().get_task(task_id) @@ -553,7 +553,7 @@ def get_task(task_id: str): @graph_bp.route('/tasks', methods=['GET']) def list_tasks(): """ - 列出所有任务 + List all tasks """ tasks = TaskManager().list_tasks() @@ -564,12 +564,12 @@ def list_tasks(): }) -# ============== 图谱数据接口 ============== +# ============== Graph data endpoints ============== @graph_bp.route('/data/', methods=['GET']) def get_graph_data(graph_id: str): """ - 获取图谱数据(节点和边) + Get graph data (nodes and edges) """ try: if not Config.ZEP_API_KEY: @@ -597,7 +597,7 @@ def get_graph_data(graph_id: str): @graph_bp.route('/delete/', methods=['DELETE']) def delete_graph(graph_id: str): """ - 删除Zep图谱 + Delete a Zep graph """ try: if not Config.ZEP_API_KEY: diff --git a/backend/app/api/report.py b/backend/app/api/report.py index d7f2a4d0..ae776e79 100644 --- a/backend/app/api/report.py +++ b/backend/app/api/report.py @@ -1,6 +1,6 @@ """ -Report API路由 -提供模拟报告生成、获取、对话等接口 +Report API routes +Provides simulation report generation, retrieval, and chat endpoints """ import os @@ -20,30 +20,30 @@ from ..utils.locale import t, get_locale, set_locale logger = get_logger('mirofish.api.report') -# ============== 
报告生成接口 ============== +# ============== Report generation endpoints ============== @report_bp.route('/generate', methods=['POST']) def generate_report(): """ - 生成模拟分析报告(异步任务) - - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/report/generate/status 查询进度 - - 请求(JSON): + Generate a simulation analysis report (async task) + + This is a long-running operation; the endpoint returns task_id immediately. + Use GET /api/report/generate/status to poll progress. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "force_regenerate": false // 可选,强制重新生成 + "simulation_id": "sim_xxxx", // required, simulation ID + "force_regenerate": false // optional, force regeneration } - - 返回: + + Returns: { "success": true, "data": { "simulation_id": "sim_xxxx", "task_id": "task_xxxx", "status": "generating", - "message": "报告生成任务已启动" + "message": "Report generation task started" } } """ @@ -59,17 +59,17 @@ def generate_report(): force_regenerate = data.get('force_regenerate', False) - # 获取模拟信息 + # Get simulation info manager = SimulationManager() state = manager.get_simulation(simulation_id) - + if not state: return jsonify({ "success": False, "error": t('api.simulationNotFound', id=simulation_id) }), 404 - # 检查是否已有报告 + # Check if a report already exists if not force_regenerate: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -84,7 +84,7 @@ def generate_report(): } }) - # 获取项目信息 + # Get project info project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ @@ -106,11 +106,11 @@ def generate_report(): "error": t('api.missingSimRequirement') }), 400 - # 提前生成 report_id,以便立即返回给前端 + # Pre-generate report_id so it can be returned immediately import uuid report_id = f"report_{uuid.uuid4().hex[:12]}" - - # 创建异步任务 + + # Create async task task_manager = TaskManager() task_id = task_manager.create_task( task_type="report_generate", @@ -124,7 +124,7 @@ def 
generate_report(): # Capture locale before spawning background thread current_locale = get_locale() - # 定义后台任务 + # Define background task def run_generate(): set_locale(current_locale) try: @@ -134,29 +134,29 @@ def generate_report(): progress=0, message=t('api.initReportAgent') ) - - # 创建Report Agent + + # Create Report Agent agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, simulation_requirement=simulation_requirement ) - # 进度回调 + # Progress callback def progress_callback(stage, progress, message): task_manager.update_task( task_id, progress=progress, message=f"[{stage}] {message}" ) - - # 生成报告(传入预先生成的 report_id) + + # Generate report (pass pre-generated report_id) report = agent.generate_report( progress_callback=progress_callback, report_id=report_id ) - # 保存报告 + # Save report ReportManager.save_report(report) if report.status == ReportStatus.COMPLETED: @@ -172,10 +172,10 @@ def generate_report(): task_manager.fail_task(task_id, report.error or t('api.reportGenerateFailed')) except Exception as e: - logger.error(f"报告生成失败: {str(e)}") + logger.error(f"Report generation failed: {str(e)}") task_manager.fail_task(task_id, str(e)) - - # 启动后台线程 + + # Start background thread thread = threading.Thread(target=run_generate, daemon=True) thread.start() @@ -192,7 +192,7 @@ def generate_report(): }) except Exception as e: - logger.error(f"启动报告生成任务失败: {str(e)}") + logger.error(f"Failed to start report generation task: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -203,15 +203,15 @@ def generate_report(): @report_bp.route('/generate/status', methods=['POST']) def get_generate_status(): """ - 查询报告生成任务进度 - - 请求(JSON): + Query report generation task progress + + Request (JSON): { - "task_id": "task_xxxx", // 可选,generate返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID + "task_id": "task_xxxx", // optional, task_id from generate + "simulation_id": "sim_xxxx" // optional, simulation ID } - - 返回: + + Returns: { "success": true, "data": { @@ 
-228,7 +228,7 @@ def get_generate_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已有完成的报告 + # If simulation_id is provided, check whether a completed report exists if simulation_id: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -265,21 +265,21 @@ def get_generate_status(): }) except Exception as e: - logger.error(f"查询任务状态失败: {str(e)}") + logger.error(f"Failed to query task status: {str(e)}") return jsonify({ "success": False, "error": str(e) }), 500 -# ============== 报告获取接口 ============== +# ============== Report retrieval endpoints ============== @report_bp.route('/', methods=['GET']) def get_report(report_id: str): """ - 获取报告详情 - - 返回: + Get report details + + Returns: { "success": true, "data": { @@ -308,7 +308,7 @@ def get_report(report_id: str): }) except Exception as e: - logger.error(f"获取报告失败: {str(e)}") + logger.error(f"Failed to get report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -319,9 +319,9 @@ def get_report(report_id: str): @report_bp.route('/by-simulation/', methods=['GET']) def get_report_by_simulation(simulation_id: str): """ - 根据模拟ID获取报告 - - 返回: + Get report by simulation ID + + Returns: { "success": true, "data": { @@ -347,7 +347,7 @@ def get_report_by_simulation(simulation_id: str): }) except Exception as e: - logger.error(f"获取报告失败: {str(e)}") + logger.error(f"Failed to get report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -358,13 +358,13 @@ def get_report_by_simulation(simulation_id: str): @report_bp.route('/list', methods=['GET']) def list_reports(): """ - 列出所有报告 - - Query参数: - simulation_id: 按模拟ID过滤(可选) - limit: 返回数量限制(默认50) - - 返回: + List all reports + + Query parameters: + simulation_id: filter by simulation ID (optional) + limit: result count limit (default 50) + + Returns: { "success": true, "data": [...], @@ -387,7 +387,7 @@ def 
list_reports(): }) except Exception as e: - logger.error(f"列出报告失败: {str(e)}") + logger.error(f"Failed to list reports: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -398,9 +398,9 @@ def list_reports(): @report_bp.route('//download', methods=['GET']) def download_report(report_id: str): """ - 下载报告(Markdown格式) - - 返回Markdown文件 + Download report (Markdown format) + + Returns a Markdown file """ try: report = ReportManager.get_report(report_id) @@ -414,7 +414,7 @@ def download_report(report_id: str): md_path = ReportManager._get_report_markdown_path(report_id) if not os.path.exists(md_path): - # 如果MD文件不存在,生成一个临时文件 + # If MD file doesn't exist, create a temporary file import tempfile with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(report.markdown_content) @@ -433,7 +433,7 @@ def download_report(report_id: str): ) except Exception as e: - logger.error(f"下载报告失败: {str(e)}") + logger.error(f"Failed to download report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -443,7 +443,7 @@ def download_report(report_id: str): @report_bp.route('/', methods=['DELETE']) def delete_report(report_id: str): - """删除报告""" + """Delete a report""" try: success = ReportManager.delete_report(report_id) @@ -459,7 +459,7 @@ def delete_report(report_id: str): }) except Exception as e: - logger.error(f"删除报告失败: {str(e)}") + logger.error(f"Failed to delete report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -467,32 +467,32 @@ def delete_report(report_id: str): }), 500 -# ============== Report Agent对话接口 ============== +# ============== Report Agent chat endpoint ============== @report_bp.route('/chat', methods=['POST']) def chat_with_report_agent(): """ - 与Report Agent对话 - - Report Agent可以在对话中自主调用检索工具来回答问题 - - 请求(JSON): + Chat with the Report Agent + + The Report Agent can autonomously call retrieval tools to answer questions. 
+ + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "message": "请解释一下舆情走向", // 必填,用户消息 - "chat_history": [ // 可选,对话历史 + "simulation_id": "sim_xxxx", // required, simulation ID + "message": "Explain the trend...", // required, user message + "chat_history": [ // optional, conversation history {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } - - 返回: + + Returns: { "success": true, "data": { - "response": "Agent回复...", - "tool_calls": [调用的工具列表], - "sources": [信息来源] + "response": "Agent reply...", + "tool_calls": [list of tools called], + "sources": [information sources] } } """ @@ -515,7 +515,7 @@ def chat_with_report_agent(): "error": t('api.requireMessage') }), 400 - # 获取模拟和项目信息 + # Get simulation and project info manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -541,7 +541,7 @@ def chat_with_report_agent(): simulation_requirement = project.simulation_requirement or "" - # 创建Agent并进行对话 + # Create agent and start chat agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, @@ -556,7 +556,7 @@ def chat_with_report_agent(): }) except Exception as e: - logger.error(f"对话失败: {str(e)}") + logger.error(f"Chat failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -564,22 +564,22 @@ def chat_with_report_agent(): }), 500 -# ============== 报告进度与分章节接口 ============== +# ============== Report progress and section endpoints ============== @report_bp.route('//progress', methods=['GET']) def get_report_progress(report_id: str): """ - 获取报告生成进度(实时) - - 返回: + Get report generation progress (real-time) + + Returns: { "success": true, "data": { "status": "generating", "progress": 45, - "message": "正在生成章节: 关键发现", - "current_section": "关键发现", - "completed_sections": ["执行摘要", "模拟背景"], + "message": "Generating section: Key Findings", + "current_section": "Key Findings", + "completed_sections": ["Executive Summary", "Simulation Background"], "updated_at": "2025-12-09T..." 
} } @@ -599,7 +599,7 @@ def get_report_progress(report_id: str): }) except Exception as e: - logger.error(f"获取报告进度失败: {str(e)}") + logger.error(f"Failed to get report progress: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -610,11 +610,12 @@ def get_report_progress(report_id: str): @report_bp.route('//sections', methods=['GET']) def get_report_sections(report_id: str): """ - 获取已生成的章节列表(分章节输出) - - 前端可以轮询此接口获取已生成的章节内容,无需等待整个报告完成 - - 返回: + Get list of already-generated sections (section-by-section output) + + The frontend can poll this endpoint to get section content as it is generated, + without waiting for the full report to complete. + + Returns: { "success": true, "data": { @@ -623,7 +624,7 @@ def get_report_sections(report_id: str): { "filename": "section_01.md", "section_index": 1, - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." }, ... ], @@ -635,7 +636,7 @@ def get_report_sections(report_id: str): try: sections = ReportManager.get_generated_sections(report_id) - # 获取报告状态 + # Get report status report = ReportManager.get_report(report_id) is_complete = report is not None and report.status == ReportStatus.COMPLETED @@ -650,7 +651,7 @@ def get_report_sections(report_id: str): }) except Exception as e: - logger.error(f"获取章节列表失败: {str(e)}") + logger.error(f"Failed to get section list: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -661,14 +662,14 @@ def get_report_sections(report_id: str): @report_bp.route('//section/', methods=['GET']) def get_single_section(report_id: str, section_index: int): """ - 获取单个章节内容 - - 返回: + Get single section content + + Returns: { "success": true, "data": { "filename": "section_01.md", - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." 
} } """ @@ -694,7 +695,7 @@ def get_single_section(report_id: str, section_index: int): }) except Exception as e: - logger.error(f"获取章节内容失败: {str(e)}") + logger.error(f"Failed to get section content: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -702,16 +703,16 @@ def get_single_section(report_id: str, section_index: int): }), 500 -# ============== 报告状态检查接口 ============== +# ============== Report status check endpoint ============== @report_bp.route('/check/', methods=['GET']) def check_report_status(simulation_id: str): """ - 检查模拟是否有报告,以及报告状态 - - 用于前端判断是否解锁Interview功能 - - 返回: + Check whether a simulation has a report and its status + + Used by the frontend to determine whether to unlock the Interview feature. + + Returns: { "success": true, "data": { @@ -730,7 +731,7 @@ def check_report_status(simulation_id: str): report_status = report.status.value if report else None report_id = report.report_id if report else None - # 只有报告完成后才解锁interview + # Interview is unlocked only after the report is complete interview_unlocked = has_report and report.status == ReportStatus.COMPLETED return jsonify({ @@ -745,7 +746,7 @@ def check_report_status(simulation_id: str): }) except Exception as e: - logger.error(f"检查报告状态失败: {str(e)}") + logger.error(f"Failed to check report status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -753,22 +754,22 @@ def check_report_status(simulation_id: str): }), 500 -# ============== Agent 日志接口 ============== +# ============== Agent log endpoints ============== @report_bp.route('//agent-log', methods=['GET']) def get_agent_log(report_id: str): """ - 获取 Report Agent 的详细执行日志 - - 实时获取报告生成过程中的每一步动作,包括: - - 报告开始、规划开始/完成 - - 每个章节的开始、工具调用、LLM响应、完成 - - 报告完成或失败 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get detailed execution log of the Report Agent + + Retrieves step-by-step actions during report generation, including: + - Report start, planning start/complete + - Each section's start, tool calls, LLM 
response, completion + - Report completion or failure + + Query parameters: + from_line: start reading from this line (optional, default 0, for incremental fetch) + + Returns: { "success": true, "data": { @@ -779,7 +780,7 @@ def get_agent_log(report_id: str): "report_id": "report_xxxx", "action": "tool_call", "stage": "generating", - "section_title": "执行摘要", + "section_title": "Executive Summary", "section_index": 1, "details": { "tool_name": "insight_forge", @@ -806,7 +807,7 @@ def get_agent_log(report_id: str): }) except Exception as e: - logger.error(f"获取Agent日志失败: {str(e)}") + logger.error(f"Failed to get Agent log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -817,9 +818,9 @@ def get_agent_log(report_id: str): @report_bp.route('//agent-log/stream', methods=['GET']) def stream_agent_log(report_id: str): """ - 获取完整的 Agent 日志(一次性获取全部) - - 返回: + Get the full Agent log (fetch all at once) + + Returns: { "success": true, "data": { @@ -840,7 +841,7 @@ def stream_agent_log(report_id: str): }) except Exception as e: - logger.error(f"获取Agent日志失败: {str(e)}") + logger.error(f"Failed to get Agent log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -848,27 +849,27 @@ def stream_agent_log(report_id: str): }), 500 -# ============== 控制台日志接口 ============== +# ============== Console log endpoints ============== @report_bp.route('//console-log', methods=['GET']) def get_console_log(report_id: str): """ - 获取 Report Agent 的控制台输出日志 - - 实时获取报告生成过程中的控制台输出(INFO、WARNING等), - 这与 agent-log 接口返回的结构化 JSON 日志不同, - 是纯文本格式的控制台风格日志。 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get the console output log of the Report Agent + + Returns real-time console output (INFO, WARNING, etc.) during report generation. + Unlike the agent-log endpoint which returns structured JSON logs, + this returns plain-text console-style logs. 
+ + Query parameters: + from_line: start reading from this line (optional, default 0, for incremental fetch) + + Returns: { "success": true, "data": { "logs": [ - "[19:46:14] INFO: 搜索完成: 找到 15 条相关事实", - "[19:46:14] INFO: 图谱搜索: graph_id=xxx, query=...", + "[19:46:14] INFO: Search complete: found 15 relevant facts", + "[19:46:14] INFO: Graph search: graph_id=xxx, query=...", ... ], "total_lines": 100, @@ -888,7 +889,7 @@ def get_console_log(report_id: str): }) except Exception as e: - logger.error(f"获取控制台日志失败: {str(e)}") + logger.error(f"Failed to get console log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -899,9 +900,9 @@ def get_console_log(report_id: str): @report_bp.route('//console-log/stream', methods=['GET']) def stream_console_log(report_id: str): """ - 获取完整的控制台日志(一次性获取全部) - - 返回: + Get the full console log (fetch all at once) + + Returns: { "success": true, "data": { @@ -922,7 +923,7 @@ def stream_console_log(report_id: str): }) except Exception as e: - logger.error(f"获取控制台日志失败: {str(e)}") + logger.error(f"Failed to get console log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -930,17 +931,17 @@ def stream_console_log(report_id: str): }), 500 -# ============== 工具调用接口(供调试使用)============== +# ============== Tool call endpoints (for debugging) ============== @report_bp.route('/tools/search', methods=['POST']) def search_graph_tool(): """ - 图谱搜索工具接口(供调试使用) - - 请求(JSON): + Graph search tool endpoint (for debugging) + + Request (JSON): { "graph_id": "mirofish_xxxx", - "query": "搜索查询", + "query": "search query", "limit": 10 } """ @@ -972,7 +973,7 @@ def search_graph_tool(): }) except Exception as e: - logger.error(f"图谱搜索失败: {str(e)}") + logger.error(f"Graph search failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -983,9 +984,9 @@ def search_graph_tool(): @report_bp.route('/tools/statistics', methods=['POST']) def get_graph_statistics_tool(): """ - 图谱统计工具接口(供调试使用) - - 请求(JSON): + Graph statistics 
tool endpoint (for debugging) + + Request (JSON): { "graph_id": "mirofish_xxxx" } @@ -1012,7 +1013,7 @@ def get_graph_statistics_tool(): }) except Exception as e: - logger.error(f"获取图谱统计失败: {str(e)}") + logger.error(f"Failed to get graph statistics: {str(e)}") return jsonify({ "success": False, "error": str(e), diff --git a/backend/app/api/simulation.py b/backend/app/api/simulation.py index 3a8e1e3f..aefb6f66 100644 --- a/backend/app/api/simulation.py +++ b/backend/app/api/simulation.py @@ -1,6 +1,6 @@ """ -模拟相关API路由 -Step2: Zep实体读取与过滤、OASIS模拟准备与运行(全程自动化) +Simulation-related API routes +Step 2: Zep entity reading & filtering, OASIS simulation preparation & execution (fully automated) """ import os @@ -20,41 +20,41 @@ from ..models.project import ProjectManager logger = get_logger('mirofish.api.simulation') -# Interview prompt 优化前缀 -# 添加此前缀可以避免Agent调用工具,直接用文本回复 -INTERVIEW_PROMPT_PREFIX = "结合你的人设、所有的过往记忆与行动,不调用任何工具直接用文本回复我:" +# Interview prompt optimization prefix +# Adding this prefix prevents the Agent from calling tools and forces a direct text reply +INTERVIEW_PROMPT_PREFIX = "Based on your persona, all past memories and actions, reply to me directly in text without calling any tools: " def optimize_interview_prompt(prompt: str) -> str: """ - 优化Interview提问,添加前缀避免Agent调用工具 - + Optimize an interview question by adding a prefix to prevent tool calls. 
+ Args: - prompt: 原始提问 - + prompt: original question + Returns: - 优化后的提问 + optimized question """ if not prompt: return prompt - # 避免重复添加前缀 + # Avoid adding the prefix twice if prompt.startswith(INTERVIEW_PROMPT_PREFIX): return prompt return f"{INTERVIEW_PROMPT_PREFIX}{prompt}" -# ============== 实体读取接口 ============== +# ============== Entity retrieval endpoints ============== @simulation_bp.route('/entities/', methods=['GET']) def get_graph_entities(graph_id: str): """ - 获取图谱中的所有实体(已过滤) - - 只返回符合预定义实体类型的节点(Labels不只是Entity的节点) - - Query参数: - entity_types: 逗号分隔的实体类型列表(可选,用于进一步过滤) - enrich: 是否获取相关边信息(默认true) + Get all entities in the graph (filtered) + + Returns only nodes matching predefined entity types (nodes with Labels beyond just "Entity"). + + Query parameters: + entity_types: comma-separated entity type list (optional, for further filtering) + enrich: whether to fetch related edge info (default true) """ try: if not Config.ZEP_API_KEY: @@ -67,7 +67,7 @@ def get_graph_entities(graph_id: str): entity_types = [t.strip() for t in entity_types_str.split(',') if t.strip()] if entity_types_str else None enrich = request.args.get('enrich', 'true').lower() == 'true' - logger.info(f"获取图谱实体: graph_id={graph_id}, entity_types={entity_types}, enrich={enrich}") + logger.info(f"Fetching graph entities: graph_id={graph_id}, entity_types={entity_types}, enrich={enrich}") reader = ZepEntityReader() result = reader.filter_defined_entities( @@ -82,7 +82,7 @@ def get_graph_entities(graph_id: str): }) except Exception as e: - logger.error(f"获取图谱实体失败: {str(e)}") + logger.error(f"Failed to get graph entities: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -92,7 +92,7 @@ def get_graph_entities(graph_id: str): @simulation_bp.route('/entities//', methods=['GET']) def get_entity_detail(graph_id: str, entity_uuid: str): - """获取单个实体的详细信息""" + """Get detailed information about a single entity""" try: if not Config.ZEP_API_KEY: return jsonify({ @@ -115,7 +115,7 @@ def 
get_entity_detail(graph_id: str, entity_uuid: str): }) except Exception as e: - logger.error(f"获取实体详情失败: {str(e)}") + logger.error(f"Failed to get entity details: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -125,7 +125,7 @@ def get_entity_detail(graph_id: str, entity_uuid: str): @simulation_bp.route('/entities//by-type/', methods=['GET']) def get_entities_by_type(graph_id: str, entity_type: str): - """获取指定类型的所有实体""" + """Get all entities of a specified type""" try: if not Config.ZEP_API_KEY: return jsonify({ @@ -152,7 +152,7 @@ def get_entities_by_type(graph_id: str, entity_type: str): }) except Exception as e: - logger.error(f"获取实体失败: {str(e)}") + logger.error(f"Failed to get entities: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -160,24 +160,24 @@ def get_entities_by_type(graph_id: str, entity_type: str): }), 500 -# ============== 模拟管理接口 ============== +# ============== Simulation management endpoints ============== @simulation_bp.route('/create', methods=['POST']) def create_simulation(): """ - 创建新的模拟 - - 注意:max_rounds等参数由LLM智能生成,无需手动设置 - - 请求(JSON): + Create a new simulation + + Note: parameters like max_rounds are intelligently generated by the LLM; no manual setup needed. 
+ + Request (JSON): { - "project_id": "proj_xxxx", // 必填 - "graph_id": "mirofish_xxxx", // 可选,如不提供则从project获取 - "enable_twitter": true, // 可选,默认true - "enable_reddit": true // 可选,默认true + "project_id": "proj_xxxx", // required + "graph_id": "mirofish_xxxx", // optional, falls back to project graph_id + "enable_twitter": true, // optional, default true + "enable_reddit": true // optional, default true } - - 返回: + + Returns: { "success": true, "data": { @@ -229,7 +229,7 @@ def create_simulation(): }) except Exception as e: - logger.error(f"创建模拟失败: {str(e)}") + logger.error(f"Failed to create simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -239,17 +239,17 @@ def create_simulation(): def _check_simulation_prepared(simulation_id: str) -> tuple: """ - 检查模拟是否已经准备完成 - - 检查条件: - 1. state.json 存在且 status 为 "ready" - 2. 必要文件存在:reddit_profiles.json, twitter_profiles.csv, simulation_config.json - - 注意:运行脚本(run_*.py)保留在 backend/scripts/ 目录,不再复制到模拟目录 - + Check whether the simulation preparation is complete. + + Checks: + 1. state.json exists and status is "ready" + 2. Required files exist: reddit_profiles.json, twitter_profiles.csv, simulation_config.json + + Note: run scripts (run_*.py) stay in backend/scripts/ and are no longer copied to the simulation directory. 
+ Args: - simulation_id: 模拟ID - + simulation_id: simulation ID + Returns: (is_prepared: bool, info: dict) """ @@ -258,11 +258,11 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: simulation_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - # 检查目录是否存在 + # Check if directory exists if not os.path.exists(simulation_dir): - return False, {"reason": "模拟目录不存在"} - - # 必要文件列表(不包括脚本,脚本位于 backend/scripts/) + return False, {"reason": "Simulation directory does not exist"} + + # Required file list (scripts excluded; scripts are in backend/scripts/) required_files = [ "state.json", "simulation_config.json", @@ -270,7 +270,7 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: "twitter_profiles.csv" ] - # 检查文件是否存在 + # Check if files exist existing_files = [] missing_files = [] for f in required_files: @@ -282,12 +282,12 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: if missing_files: return False, { - "reason": "缺少必要文件", + "reason": "Missing required files", "missing_files": missing_files, "existing_files": existing_files } - - # 检查state.json中的状态 + + # Check status in state.json state_file = os.path.join(simulation_dir, "state.json") try: import json @@ -297,20 +297,20 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: status = state_data.get("status", "") config_generated = state_data.get("config_generated", False) - # 详细日志 - logger.debug(f"检测模拟准备状态: {simulation_id}, status={status}, config_generated={config_generated}") - - # 如果 config_generated=True 且文件存在,认为准备完成 - # 以下状态都说明准备工作已完成: - # - ready: 准备完成,可以运行 - # - preparing: 如果 config_generated=True 说明已完成 - # - running: 正在运行,说明准备早就完成了 - # - completed: 运行完成,说明准备早就完成了 - # - stopped: 已停止,说明准备早就完成了 - # - failed: 运行失败(但准备是完成的) + # Detailed log + logger.debug(f"Checking simulation preparation status: {simulation_id}, status={status}, config_generated={config_generated}") + + # If config_generated=True and files exist, treat preparation as complete. 
+ # All of the following statuses indicate preparation has finished: + # - ready: preparation complete, can run + # - preparing: if config_generated=True, preparation is done + # - running: currently running, meaning preparation completed long ago + # - completed: run finished, meaning preparation completed long ago + # - stopped: stopped, meaning preparation completed long ago + # - failed: run failed (but preparation was complete) prepared_statuses = ["ready", "preparing", "running", "completed", "stopped", "failed"] if status in prepared_statuses and config_generated: - # 获取文件统计信息 + # Get file statistics profiles_file = os.path.join(simulation_dir, "reddit_profiles.json") config_file = os.path.join(simulation_dir, "simulation_config.json") @@ -320,7 +320,7 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: profiles_data = json.load(f) profiles_count = len(profiles_data) if isinstance(profiles_data, list) else 0 - # 如果状态是preparing但文件已完成,自动更新状态为ready + # If status is "preparing" but files are done, auto-update status to "ready" if status == "preparing": try: state_data["status"] = "ready" @@ -328,12 +328,12 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: state_data["updated_at"] = datetime.now().isoformat() with open(state_file, 'w', encoding='utf-8') as f: json.dump(state_data, f, ensure_ascii=False, indent=2) - logger.info(f"自动更新模拟状态: {simulation_id} preparing -> ready") + logger.info(f"Auto-updated simulation status: {simulation_id} preparing -> ready") status = "ready" except Exception as e: - logger.warning(f"自动更新状态失败: {e}") - - logger.info(f"模拟 {simulation_id} 检测结果: 已准备完成 (status={status}, config_generated={config_generated})") + logger.warning(f"Failed to auto-update status: {e}") + + logger.info(f"Simulation {simulation_id} check result: preparation complete (status={status}, config_generated={config_generated})") return True, { "status": status, "entities_count": state_data.get("entities_count", 0), @@ -345,55 +345,55 @@ def 
_check_simulation_prepared(simulation_id: str) -> tuple: "existing_files": existing_files } else: - logger.warning(f"模拟 {simulation_id} 检测结果: 未准备完成 (status={status}, config_generated={config_generated})") + logger.warning(f"Simulation {simulation_id} check result: preparation not complete (status={status}, config_generated={config_generated})") return False, { - "reason": f"状态不在已准备列表中或config_generated为false: status={status}, config_generated={config_generated}", + "reason": f"Status not in prepared list or config_generated is false: status={status}, config_generated={config_generated}", "status": status, "config_generated": config_generated } - + except Exception as e: - return False, {"reason": f"读取状态文件失败: {str(e)}"} + return False, {"reason": f"Failed to read state file: {str(e)}"} @simulation_bp.route('/prepare', methods=['POST']) def prepare_simulation(): """ - 准备模拟环境(异步任务,LLM智能生成所有参数) - - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/simulation/prepare/status 查询进度 - - 特性: - - 自动检测已完成的准备工作,避免重复生成 - - 如果已准备完成,直接返回已有结果 - - 支持强制重新生成(force_regenerate=true) - - 步骤: - 1. 检查是否已有完成的准备工作 - 2. 从Zep图谱读取并过滤实体 - 3. 为每个实体生成OASIS Agent Profile(带重试机制) - 4. LLM智能生成模拟配置(带重试机制) - 5. 保存配置文件和预设脚本 - - 请求(JSON): + Prepare the simulation environment (async task, all parameters generated by LLM) + + This is a long-running operation; the endpoint returns task_id immediately. + Use GET /api/simulation/prepare/status to poll progress. + + Features: + - Automatically detects completed preparation to avoid regenerating + - Returns existing results immediately if preparation is already done + - Supports force regeneration (force_regenerate=true) + + Steps: + 1. Check whether preparation has already been completed + 2. Read and filter entities from the Zep knowledge graph + 3. Generate OASIS Agent Profiles for each entity (with retry) + 4. Intelligently generate simulation config via LLM (with retry) + 5. 
Save config files and preset scripts + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "entity_types": ["Student", "PublicFigure"], // 可选,指定实体类型 - "use_llm_for_profiles": true, // 可选,是否用LLM生成人设 - "parallel_profile_count": 5, // 可选,并行生成人设数量,默认5 - "force_regenerate": false // 可选,强制重新生成,默认false + "simulation_id": "sim_xxxx", // required, simulation ID + "entity_types": ["Student", "PublicFigure"], // optional, specify entity types + "use_llm_for_profiles": true, // optional, use LLM to generate personas + "parallel_profile_count": 5, // optional, parallel profile generation, default 5 + "force_regenerate": false // optional, force regeneration, default false } - - 返回: + + Returns: { "success": true, "data": { "simulation_id": "sim_xxxx", - "task_id": "task_xxxx", // 新任务时返回 + "task_id": "task_xxxx", // returned for new tasks "status": "preparing|ready", - "message": "准备任务已启动|已有完成的准备工作", - "already_prepared": true|false // 是否已准备完成 + "message": "Preparation task started | Preparation already complete", + "already_prepared": true|false // whether preparation was already done } } """ @@ -421,17 +421,17 @@ def prepare_simulation(): "error": t('api.simulationNotFound', id=simulation_id) }), 404 - # 检查是否强制重新生成 + # Check whether force regeneration is requested force_regenerate = data.get('force_regenerate', False) - logger.info(f"开始处理 /prepare 请求: simulation_id={simulation_id}, force_regenerate={force_regenerate}") - - # 检查是否已经准备完成(避免重复生成) + logger.info(f"Processing /prepare request: simulation_id={simulation_id}, force_regenerate={force_regenerate}") + + # Check whether preparation is already done (avoid regenerating) if not force_regenerate: - logger.debug(f"检查模拟 {simulation_id} 是否已准备完成...") + logger.debug(f"Checking whether simulation {simulation_id} is already prepared...") is_prepared, prepare_info = _check_simulation_prepared(simulation_id) - logger.debug(f"检查结果: is_prepared={is_prepared}, prepare_info={prepare_info}") + logger.debug(f"Check result: 
is_prepared={is_prepared}, prepare_info={prepare_info}") if is_prepared: - logger.info(f"模拟 {simulation_id} 已准备完成,跳过重复生成") + logger.info(f"Simulation {simulation_id} is already prepared; skipping regeneration") return jsonify({ "success": True, "data": { @@ -443,9 +443,9 @@ def prepare_simulation(): } }) else: - logger.info(f"模拟 {simulation_id} 未准备完成,将启动准备任务") - - # 从项目获取必要信息 + logger.info(f"Simulation {simulation_id} is not prepared; starting preparation task") + + # Get required info from project project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ @@ -453,41 +453,41 @@ def prepare_simulation(): "error": t('api.projectNotFound', id=state.project_id) }), 404 - # 获取模拟需求 + # Get simulation requirement simulation_requirement = project.simulation_requirement or "" if not simulation_requirement: return jsonify({ "success": False, "error": t('api.projectMissingRequirement') }), 400 - - # 获取文档文本 + + # Get document text document_text = ProjectManager.get_extracted_text(state.project_id) or "" entity_types_list = data.get('entity_types') use_llm_for_profiles = data.get('use_llm_for_profiles', True) parallel_profile_count = data.get('parallel_profile_count', 5) - # ========== 同步获取实体数量(在后台任务启动前) ========== - # 这样前端在调用prepare后立即就能获取到预期Agent总数 + # ========== Synchronously fetch entity count (before background task starts) ========== + # This lets the frontend obtain the expected total agent count immediately after calling prepare. 
try: - logger.info(f"同步获取实体数量: graph_id={state.graph_id}") + logger.info(f"Synchronously fetching entity count: graph_id={state.graph_id}") reader = ZepEntityReader() - # 快速读取实体(不需要边信息,只统计数量) + # Quick entity read (no edge info needed, just count) filtered_preview = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=entity_types_list, - enrich_with_edges=False # 不获取边信息,加快速度 + enrich_with_edges=False # Skip edge info to speed things up ) - # 保存实体数量到状态(供前端立即获取) + # Save entity count to state (so frontend can fetch it immediately) state.entities_count = filtered_preview.filtered_count state.entity_types = list(filtered_preview.entity_types) - logger.info(f"预期实体数量: {filtered_preview.filtered_count}, 类型: {filtered_preview.entity_types}") + logger.info(f"Expected entity count: {filtered_preview.filtered_count}, types: {filtered_preview.entity_types}") except Exception as e: - logger.warning(f"同步获取实体数量失败(将在后台任务中重试): {e}") - # 失败不影响后续流程,后台任务会重新获取 - - # 创建异步任务 + logger.warning(f"Failed to synchronously fetch entity count (will retry in background task): {e}") + # Failure does not block the rest of the flow; the background task will retry. 
+ + # Create async task task_manager = TaskManager() task_id = task_manager.create_task( task_type="simulation_prepare", @@ -497,14 +497,14 @@ def prepare_simulation(): } ) - # 更新模拟状态(包含预先获取的实体数量) + # Update simulation status (includes pre-fetched entity count) state.status = SimulationStatus.PREPARING manager._save_simulation_state(state) # Capture locale before spawning background thread current_locale = get_locale() - # 定义后台任务 + # Define background task def run_prepare(): set_locale(current_locale) try: @@ -514,13 +514,13 @@ def prepare_simulation(): progress=0, message=t('progress.startPreparingEnv') ) - - # 准备模拟(带进度回调) - # 存储阶段进度详情 + + # Prepare simulation (with progress callback) + # Store per-stage progress details stage_details = {} def progress_callback(stage, progress, message, **kwargs): - # 计算总进度 + # Calculate overall progress stage_weights = { "reading": (0, 20), # 0-20% "generating_profiles": (20, 70), # 20-70% @@ -531,7 +531,7 @@ def prepare_simulation(): start, end = stage_weights.get(stage, (0, 100)) current_progress = int(start + (end - start) * progress / 100) - # 构建详细进度信息 + # Build detailed progress info stage_names = { "reading": t('progress.readingGraphEntities'), "generating_profiles": t('progress.generatingProfiles'), @@ -542,7 +542,7 @@ def prepare_simulation(): stage_index = list(stage_weights.keys()).index(stage) + 1 if stage in stage_weights else 1 total_stages = len(stage_weights) - # 更新阶段详情 + # Update stage details stage_details[stage] = { "stage_name": stage_names.get(stage, stage), "stage_progress": progress, @@ -551,7 +551,7 @@ def prepare_simulation(): "item_name": kwargs.get("item_name", "") } - # 构建详细进度信息 + # Build detailed progress info detail = stage_details[stage] progress_detail_data = { "current_stage": stage, @@ -564,7 +564,7 @@ def prepare_simulation(): "item_description": message } - # 构建简洁消息 + # Build concise message if detail["total"] > 0: detailed_message = ( f"[{stage_index}/{total_stages}] {stage_names.get(stage, 
stage)}: " @@ -590,24 +590,24 @@ def prepare_simulation(): parallel_profile_count=parallel_profile_count ) - # 任务完成 + # Task complete task_manager.complete_task( task_id, result=result_state.to_simple_dict() ) except Exception as e: - logger.error(f"准备模拟失败: {str(e)}") + logger.error(f"Failed to prepare simulation: {str(e)}") task_manager.fail_task(task_id, str(e)) - - # 更新模拟状态为失败 + + # Update simulation status to failed state = manager.get_simulation(simulation_id) if state: state.status = SimulationStatus.FAILED state.error = str(e) manager._save_simulation_state(state) - # 启动后台线程 + # Start background thread thread = threading.Thread(target=run_prepare, daemon=True) thread.start() @@ -619,8 +619,8 @@ def prepare_simulation(): "status": "preparing", "message": t('api.prepareStarted'), "already_prepared": False, - "expected_entities_count": state.entities_count, # 预期的Agent总数 - "entity_types": state.entity_types # 实体类型列表 + "expected_entities_count": state.entities_count, # Expected total agent count + "entity_types": state.entity_types # Entity type list } }) @@ -629,9 +629,9 @@ def prepare_simulation(): "success": False, "error": str(e) }), 404 - + except Exception as e: - logger.error(f"启动准备任务失败: {str(e)}") + logger.error(f"Failed to start preparation task: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -642,19 +642,19 @@ def prepare_simulation(): @simulation_bp.route('/prepare/status', methods=['POST']) def get_prepare_status(): """ - 查询准备任务进度 + Query preparation task progress + + Supports two query modes: + 1. Query an in-progress task by task_id + 2. Check whether preparation is already complete via simulation_id - 支持两种查询方式: - 1. 通过task_id查询正在进行的任务进度 - 2. 
通过simulation_id检查是否已有完成的准备工作 - - 请求(JSON): + Request (JSON): { - "task_id": "task_xxxx", // 可选,prepare返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID(用于检查已完成的准备) + "task_id": "task_xxxx", // optional, task_id from prepare + "simulation_id": "sim_xxxx" // optional, simulation ID (to check completed preparation) } - - 返回: + + Returns: { "success": true, "data": { @@ -662,8 +662,8 @@ def get_prepare_status(): "status": "processing|completed|ready", "progress": 45, "message": "...", - "already_prepared": true|false, // 是否已有完成的准备 - "prepare_info": {...} // 已准备完成时的详细信息 + "already_prepared": true|false, // whether preparation was already done + "prepare_info": {...} // details when preparation is complete } } """ @@ -675,7 +675,7 @@ def get_prepare_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已准备完成 + # If simulation_id provided, check whether preparation is already complete if simulation_id: is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -691,10 +691,10 @@ def get_prepare_status(): } }) - # 如果没有task_id,返回错误 + # If no task_id, return error if not task_id: if simulation_id: - # 有simulation_id但未准备完成 + # Has simulation_id but not yet prepared return jsonify({ "success": True, "data": { @@ -714,7 +714,7 @@ def get_prepare_status(): task = task_manager.get_task(task_id) if not task: - # 任务不存在,但如果有simulation_id,检查是否已准备完成 + # Task not found; if simulation_id provided, check whether preparation is done if simulation_id: is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -745,7 +745,7 @@ def get_prepare_status(): }) except Exception as e: - logger.error(f"查询任务状态失败: {str(e)}") + logger.error(f"Failed to query task status: {str(e)}") return jsonify({ "success": False, "error": str(e) @@ -754,7 +754,7 @@ def get_prepare_status(): @simulation_bp.route('/', methods=['GET']) def get_simulation(simulation_id: str): - """获取模拟状态""" + """Get 
simulation status""" try: manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -767,7 +767,7 @@ def get_simulation(simulation_id: str): result = state.to_dict() - # 如果模拟已准备好,附加运行说明 + # If simulation is ready, attach run instructions if state.status == SimulationStatus.READY: result["run_instructions"] = manager.get_run_instructions(simulation_id) @@ -777,7 +777,7 @@ def get_simulation(simulation_id: str): }) except Exception as e: - logger.error(f"获取模拟状态失败: {str(e)}") + logger.error(f"Failed to get simulation status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -788,10 +788,10 @@ def get_simulation(simulation_id: str): @simulation_bp.route('/list', methods=['GET']) def list_simulations(): """ - 列出所有模拟 - - Query参数: - project_id: 按项目ID过滤(可选) + List all simulations + + Query parameters: + project_id: filter by project ID (optional) """ try: project_id = request.args.get('project_id') @@ -806,7 +806,7 @@ def list_simulations(): }) except Exception as e: - logger.error(f"列出模拟失败: {str(e)}") + logger.error(f"Failed to list simulations: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -816,22 +816,22 @@ def list_simulations(): def _get_report_id_for_simulation(simulation_id: str) -> str: """ - 获取 simulation 对应的最新 report_id - - 遍历 reports 目录,找出 simulation_id 匹配的 report, - 如果有多个则返回最新的(按 created_at 排序) - + Get the most recent report_id associated with a simulation. + + Scans the reports directory for reports matching simulation_id, + returning the most recent one (sorted by created_at) if multiple exist. 
+ Args: - simulation_id: 模拟ID - + simulation_id: simulation ID + Returns: - report_id 或 None + report_id or None """ import json from datetime import datetime - # reports 目录路径:backend/uploads/reports - # __file__ 是 app/api/simulation.py,需要向上两级到 backend/ + # reports directory path: backend/uploads/reports + # __file__ is app/api/simulation.py — go up two levels to backend/ reports_dir = os.path.join(os.path.dirname(__file__), '../../uploads/reports') if not os.path.exists(reports_dir): return None @@ -864,34 +864,35 @@ def _get_report_id_for_simulation(simulation_id: str) -> str: if not matching_reports: return None - # 按创建时间倒序排序,返回最新的 + # Sort by created_at descending, return the most recent matching_reports.sort(key=lambda x: x.get("created_at", ""), reverse=True) return matching_reports[0].get("report_id") - + except Exception as e: - logger.warning(f"查找 simulation {simulation_id} 的 report 失败: {e}") + logger.warning(f"Failed to find report for simulation {simulation_id}: {e}") return None @simulation_bp.route('/history', methods=['GET']) def get_simulation_history(): """ - 获取历史模拟列表(带项目详情) - - 用于首页历史项目展示,返回包含项目名称、描述等丰富信息的模拟列表 - - Query参数: - limit: 返回数量限制(默认20) - - 返回: + Get historical simulation list (with project details) + + Used for the homepage history view; returns enriched simulation list including + project name, description, etc. 
+ + Query parameters: + limit: result count limit (default 20) + + Returns: { "success": true, "data": [ { "simulation_id": "sim_xxxx", "project_id": "proj_xxxx", - "project_name": "武大舆情分析", - "simulation_requirement": "如果武汉大学发布...", + "project_name": "Public Opinion Analysis", + "simulation_requirement": "If the university announces...", "status": "completed", "entities_count": 68, "profiles_count": 68, @@ -914,18 +915,18 @@ def get_simulation_history(): manager = SimulationManager() simulations = manager.list_simulations()[:limit] - # 增强模拟数据,只从 Simulation 文件读取 + # Enrich simulation data, reading only from Simulation files enriched_simulations = [] for sim in simulations: sim_dict = sim.to_dict() - - # 获取模拟配置信息(从 simulation_config.json 读取 simulation_requirement) + + # Get simulation config info (read simulation_requirement from simulation_config.json) config = manager.get_simulation_config(sim.simulation_id) if config: sim_dict["simulation_requirement"] = config.get("simulation_requirement", "") time_config = config.get("time_config", {}) sim_dict["total_simulation_hours"] = time_config.get("total_simulation_hours", 0) - # 推荐轮数(后备值) + # Recommended rounds (fallback) recommended_rounds = int( time_config.get("total_simulation_hours", 0) * 60 / max(time_config.get("minutes_per_round", 60), 1) @@ -935,35 +936,35 @@ def get_simulation_history(): sim_dict["total_simulation_hours"] = 0 recommended_rounds = 0 - # 获取运行状态(从 run_state.json 读取用户设置的实际轮数) + # Get run state (read user-configured actual rounds from run_state.json) run_state = SimulationRunner.get_run_state(sim.simulation_id) if run_state: sim_dict["current_round"] = run_state.current_round sim_dict["runner_status"] = run_state.runner_status.value - # 使用用户设置的 total_rounds,若无则使用推荐轮数 + # Use user-configured total_rounds; fall back to recommended rounds if not set sim_dict["total_rounds"] = run_state.total_rounds if run_state.total_rounds > 0 else recommended_rounds else: sim_dict["current_round"] = 0 
sim_dict["runner_status"] = "idle" sim_dict["total_rounds"] = recommended_rounds - # 获取关联项目的文件列表(最多3个) + # Get associated project's file list (up to 3) project = ProjectManager.get_project(sim.project_id) if project and hasattr(project, 'files') and project.files: sim_dict["files"] = [ - {"filename": f.get("filename", "未知文件")} + {"filename": f.get("filename", "Unknown file")} for f in project.files[:3] ] else: sim_dict["files"] = [] - # 获取关联的 report_id(查找该 simulation 最新的 report) + # Get associated report_id (find the most recent report for this simulation) sim_dict["report_id"] = _get_report_id_for_simulation(sim.simulation_id) - - # 添加版本号 + + # Add version sim_dict["version"] = "v1.0.2" - - # 格式化日期 + + # Format date try: created_date = sim_dict.get("created_at", "")[:10] sim_dict["created_date"] = created_date @@ -979,7 +980,7 @@ def get_simulation_history(): }) except Exception as e: - logger.error(f"获取历史模拟失败: {str(e)}") + logger.error(f"Failed to get simulation history: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -990,10 +991,10 @@ def get_simulation_history(): @simulation_bp.route('//profiles', methods=['GET']) def get_simulation_profiles(simulation_id: str): """ - 获取模拟的Agent Profile - - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) + Get Agent Profiles for a simulation + + Query parameters: + platform: platform type (reddit/twitter, default reddit) """ try: platform = request.args.get('platform', 'reddit') @@ -1017,7 +1018,7 @@ def get_simulation_profiles(simulation_id: str): }), 404 except Exception as e: - logger.error(f"获取Profile失败: {str(e)}") + logger.error(f"Failed to get profiles: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1028,25 +1029,25 @@ def get_simulation_profiles(simulation_id: str): @simulation_bp.route('//profiles/realtime', methods=['GET']) def get_simulation_profiles_realtime(simulation_id: str): """ - 实时获取模拟的Agent Profile(用于在生成过程中实时查看进度) + Fetch Agent Profiles in real-time (for viewing 
progress during generation) + + Differences from /profiles: + - Reads files directly, bypassing SimulationManager + - Suitable for real-time viewing during generation + - Returns extra metadata (e.g. file modification time, whether generation is in progress) + + Query parameters: + platform: platform type (reddit/twitter, default reddit) - 与 /profiles 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) - - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) - - 返回: + Returns: { "success": true, "data": { "simulation_id": "sim_xxxx", "platform": "reddit", "count": 15, - "total_expected": 93, // 预期总数(如果有) - "is_generating": true, // 是否正在生成 + "total_expected": 93, // expected total (if available) + "is_generating": true, // whether generation is in progress "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", "profiles": [...] @@ -1056,32 +1057,32 @@ def get_simulation_profiles_realtime(simulation_id: str): import json import csv from datetime import datetime - + try: platform = request.args.get('platform', 'reddit') - # 获取模拟目录 + # Get simulation directory sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - + if not os.path.exists(sim_dir): return jsonify({ "success": False, "error": t('api.simulationNotFound', id=simulation_id) }), 404 - - # 确定文件路径 + + # Determine file path if platform == "reddit": profiles_file = os.path.join(sim_dir, "reddit_profiles.json") else: profiles_file = os.path.join(sim_dir, "twitter_profiles.csv") - # 检查文件是否存在 + # Check if file exists file_exists = os.path.exists(profiles_file) profiles = [] file_modified_at = None - + if file_exists: - # 获取文件修改时间 + # Get file modification time file_stat = os.stat(profiles_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() @@ -1094,13 +1095,13 @@ def get_simulation_profiles_realtime(simulation_id: str): reader = csv.DictReader(f) profiles = list(reader) except (json.JSONDecodeError, Exception) as e: - 
logger.warning(f"读取 profiles 文件失败(可能正在写入中): {e}") + logger.warning(f"Failed to read profiles file (may be mid-write): {e}") profiles = [] - - # 检查是否正在生成(通过 state.json 判断) + + # Check if generation is in progress (via state.json) is_generating = False total_expected = None - + state_file = os.path.join(sim_dir, "state.json") if os.path.exists(state_file): try: @@ -1127,7 +1128,7 @@ def get_simulation_profiles_realtime(simulation_id: str): }) except Exception as e: - logger.error(f"实时获取Profile失败: {str(e)}") + logger.error(f"Failed to get profiles in real-time: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1138,32 +1139,32 @@ def get_simulation_profiles_realtime(simulation_id: str): @simulation_bp.route('//config/realtime', methods=['GET']) def get_simulation_config_realtime(simulation_id: str): """ - 实时获取模拟配置(用于在生成过程中实时查看进度) + Fetch simulation config in real-time (for viewing progress during generation) + + Differences from /config: + - Reads files directly, bypassing SimulationManager + - Suitable for real-time viewing during generation + - Returns extra metadata (e.g. 
file modification time, whether generation is in progress) + - Can return partial information even before config generation is complete - 与 /config 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) - - 即使配置还没生成完也能返回部分信息 - - 返回: + Returns: { "success": true, "data": { "simulation_id": "sim_xxxx", "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", - "is_generating": true, // 是否正在生成 - "generation_stage": "generating_config", // 当前生成阶段 - "config": {...} // 配置内容(如果存在) + "is_generating": true, // whether generation is in progress + "generation_stage": "generating_config", // current generation stage + "config": {...} // config content (if it exists) } } """ import json from datetime import datetime - + try: - # 获取模拟目录 + # Get simulation directory sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1172,16 +1173,16 @@ def get_simulation_config_realtime(simulation_id: str): "error": t('api.simulationNotFound', id=simulation_id) }), 404 - # 配置文件路径 + # Config file path config_file = os.path.join(sim_dir, "simulation_config.json") - - # 检查文件是否存在 + + # Check if file exists file_exists = os.path.exists(config_file) config = None file_modified_at = None - + if file_exists: - # 获取文件修改时间 + # Get file modification time file_stat = os.stat(config_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() @@ -1189,14 +1190,14 @@ def get_simulation_config_realtime(simulation_id: str): with open(config_file, 'r', encoding='utf-8') as f: config = json.load(f) except (json.JSONDecodeError, Exception) as e: - logger.warning(f"读取 config 文件失败(可能正在写入中): {e}") + logger.warning(f"Failed to read config file (may be mid-write): {e}") config = None - - # 检查是否正在生成(通过 state.json 判断) + + # Check if generation is in progress (via state.json) is_generating = False generation_stage = None config_generated = False - + state_file = os.path.join(sim_dir, "state.json") if 
os.path.exists(state_file): try: @@ -1206,7 +1207,7 @@ def get_simulation_config_realtime(simulation_id: str): is_generating = status == "preparing" config_generated = state_data.get("config_generated", False) - # 判断当前阶段 + # Determine current stage if is_generating: if state_data.get("profiles_generated", False): generation_stage = "generating_config" @@ -1217,7 +1218,7 @@ def get_simulation_config_realtime(simulation_id: str): except Exception: pass - # 构建返回数据 + # Build response data response_data = { "simulation_id": simulation_id, "file_exists": file_exists, @@ -1228,7 +1229,7 @@ def get_simulation_config_realtime(simulation_id: str): "config": config } - # 如果配置存在,提取一些关键统计信息 + # If config exists, extract key summary stats if config: response_data["summary"] = { "total_agents": len(config.get("agent_configs", [])), @@ -1247,7 +1248,7 @@ def get_simulation_config_realtime(simulation_id: str): }) except Exception as e: - logger.error(f"实时获取Config失败: {str(e)}") + logger.error(f"Failed to get config in real-time: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1258,14 +1259,14 @@ def get_simulation_config_realtime(simulation_id: str): @simulation_bp.route('//config', methods=['GET']) def get_simulation_config(simulation_id: str): """ - 获取模拟配置(LLM智能生成的完整配置) - - 返回包含: - - time_config: 时间配置(模拟时长、轮次、高峰/低谷时段) - - agent_configs: 每个Agent的活动配置(活跃度、发言频率、立场等) - - event_config: 事件配置(初始帖子、热点话题) - - platform_configs: 平台配置 - - generation_reasoning: LLM的配置推理说明 + Get simulation config (full config intelligently generated by LLM) + + Returns: + - time_config: time configuration (simulation duration, rounds, peak/off-peak periods) + - agent_configs: per-agent activity config (activity level, post frequency, stance, etc.) 
+ - event_config: event configuration (initial posts, trending topics) + - platform_configs: platform configuration + - generation_reasoning: LLM's reasoning for the configuration """ try: manager = SimulationManager() @@ -1283,7 +1284,7 @@ def get_simulation_config(simulation_id: str): }) except Exception as e: - logger.error(f"获取配置失败: {str(e)}") + logger.error(f"Failed to get config: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1293,7 +1294,7 @@ def get_simulation_config(simulation_id: str): @simulation_bp.route('//config/download', methods=['GET']) def download_simulation_config(simulation_id: str): - """下载模拟配置文件""" + """Download simulation config file""" try: manager = SimulationManager() sim_dir = manager._get_simulation_dir(simulation_id) @@ -1312,7 +1313,7 @@ def download_simulation_config(simulation_id: str): ) except Exception as e: - logger.error(f"下载配置失败: {str(e)}") + logger.error(f"Failed to download config: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1323,19 +1324,19 @@ def download_simulation_config(simulation_id: str): @simulation_bp.route('/script//download', methods=['GET']) def download_simulation_script(script_name: str): """ - 下载模拟运行脚本文件(通用脚本,位于 backend/scripts/) - - script_name可选值: + Download a simulation run script (generic scripts in backend/scripts/) + + Valid script_name values: - run_twitter_simulation.py - run_reddit_simulation.py - run_parallel_simulation.py - action_logger.py """ try: - # 脚本位于 backend/scripts/ 目录 + # Scripts are in the backend/scripts/ directory scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - # 验证脚本名称 + # Validate script name allowed_scripts = [ "run_twitter_simulation.py", "run_reddit_simulation.py", @@ -1364,7 +1365,7 @@ def download_simulation_script(script_name: str): ) except Exception as e: - logger.error(f"下载脚本失败: {str(e)}") + logger.error(f"Failed to download script: {str(e)}") return jsonify({ "success": False, "error": 
str(e), @@ -1372,19 +1373,19 @@ def download_simulation_script(script_name: str): }), 500 -# ============== Profile生成接口(独立使用) ============== +# ============== Profile generation endpoint (standalone use) ============== @simulation_bp.route('/generate-profiles', methods=['POST']) def generate_profiles(): """ - 直接从图谱生成OASIS Agent Profile(不创建模拟) - - 请求(JSON): + Generate OASIS Agent Profiles directly from the graph (without creating a simulation) + + Request (JSON): { - "graph_id": "mirofish_xxxx", // 必填 - "entity_types": ["Student"], // 可选 - "use_llm": true, // 可选 - "platform": "reddit" // 可选 + "graph_id": "mirofish_xxxx", // required + "entity_types": ["Student"], // optional + "use_llm": true, // optional + "platform": "reddit" // optional } """ try: @@ -1438,7 +1439,7 @@ def generate_profiles(): }) except Exception as e: - logger.error(f"生成Profile失败: {str(e)}") + logger.error(f"Failed to generate profiles: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1446,35 +1447,35 @@ def generate_profiles(): }), 500 -# ============== 模拟运行控制接口 ============== +# ============== Simulation run control endpoints ============== @simulation_bp.route('/start', methods=['POST']) def start_simulation(): """ - 开始运行模拟 + Start running a simulation - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "parallel", // 可选: twitter / reddit / parallel (默认) - "max_rounds": 100, // 可选: 最大模拟轮数,用于截断过长的模拟 - "enable_graph_memory_update": false, // 可选: 是否将Agent活动动态更新到Zep图谱记忆 - "force": false // 可选: 强制重新开始(会停止运行中的模拟并清理日志) + "simulation_id": "sim_xxxx", // required, simulation ID + "platform": "parallel", // optional: twitter / reddit / parallel (default) + "max_rounds": 100, // optional: max simulation rounds, to cap long simulations + "enable_graph_memory_update": false, // optional: whether to dynamically update Agent activities to Zep graph memory + "force": false // optional: force restart (stops running simulation and clears logs) } - 关于 force 
参数: - - 启用后,如果模拟正在运行或已完成,会先停止并清理运行日志 - - 清理的内容包括:run_state.json, actions.jsonl, simulation.log 等 - - 不会清理配置文件(simulation_config.json)和 profile 文件 - - 适用于需要重新运行模拟的场景 + About force parameter: + - When enabled, if the simulation is running or already complete, it is stopped and run logs are cleared + - Cleared items: run_state.json, actions.jsonl, simulation.log, etc. + - Config files (simulation_config.json) and profile files are NOT cleared + - Use when you need to re-run a simulation - 关于 enable_graph_memory_update: - - 启用后,模拟中所有Agent的活动(发帖、评论、点赞等)都会实时更新到Zep图谱 - - 这可以让图谱"记住"模拟过程,用于后续分析或AI对话 - - 需要模拟关联的项目有有效的 graph_id - - 采用批量更新机制,减少API调用次数 + About enable_graph_memory_update: + - When enabled, all Agent activities (posts, comments, likes, etc.) are updated to Zep graph in real time + - This lets the graph "remember" the simulation process for later analysis or AI chat + - Requires the simulation's associated project to have a valid graph_id + - Uses a batch update mechanism to reduce API call count - 返回: + Returns: { "success": true, "data": { @@ -1484,8 +1485,8 @@ def start_simulation(): "twitter_running": true, "reddit_running": true, "started_at": "2025-12-01T10:00:00", - "graph_memory_update_enabled": true, // 是否启用了图谱记忆更新 - "force_restarted": true // 是否是强制重新开始 + "graph_memory_update_enabled": true, // whether graph memory update is enabled + "force_restarted": true // whether this is a forced restart } } """ @@ -1500,11 +1501,11 @@ def start_simulation(): }), 400 platform = data.get('platform', 'parallel') - max_rounds = data.get('max_rounds') # 可选:最大模拟轮数 - enable_graph_memory_update = data.get('enable_graph_memory_update', False) # 可选:是否启用图谱记忆更新 - force = data.get('force', False) # 可选:强制重新开始 + max_rounds = data.get('max_rounds') # optional: max simulation rounds + enable_graph_memory_update = data.get('enable_graph_memory_update', False) # optional: enable graph memory update + force = data.get('force', False) # optional: force restart - # 验证 max_rounds 参数 + # 
Validate max_rounds parameter if max_rounds is not None: try: max_rounds = int(max_rounds) @@ -1525,7 +1526,7 @@ def start_simulation(): "error": t('api.invalidPlatform', platform=platform) }), 400 - # 检查模拟是否已准备好 + # Check if simulation is ready manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -1536,58 +1537,58 @@ def start_simulation(): }), 404 force_restarted = False - - # 智能处理状态:如果准备工作已完成,允许重新启动 + + # Smart status handling: allow restart if preparation is complete if state.status != SimulationStatus.READY: - # 检查准备工作是否已完成 + # Check whether preparation has been completed is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: - # 准备工作已完成,检查是否有正在运行的进程 + # Preparation is complete; check if a process is still running if state.status == SimulationStatus.RUNNING: - # 检查模拟进程是否真的在运行 + # Check whether the simulation process is actually running run_state = SimulationRunner.get_run_state(simulation_id) if run_state and run_state.runner_status.value == "running": - # 进程确实在运行 + # Process is indeed running if force: - # 强制模式:停止运行中的模拟 - logger.info(f"强制模式:停止运行中的模拟 {simulation_id}") + # Force mode: stop the running simulation + logger.info(f"Force mode: stopping running simulation {simulation_id}") try: SimulationRunner.stop_simulation(simulation_id) except Exception as e: - logger.warning(f"停止模拟时出现警告: {str(e)}") + logger.warning(f"Warning while stopping simulation: {str(e)}") else: return jsonify({ "success": False, "error": t('api.simRunningForceHint') }), 400 - # 如果是强制模式,清理运行日志 + # If in force mode, clear run logs if force: - logger.info(f"强制模式:清理模拟日志 {simulation_id}") + logger.info(f"Force mode: clearing simulation logs for {simulation_id}") cleanup_result = SimulationRunner.cleanup_simulation_logs(simulation_id) if not cleanup_result.get("success"): - logger.warning(f"清理日志时出现警告: {cleanup_result.get('errors')}") + logger.warning(f"Warning while clearing logs: {cleanup_result.get('errors')}") force_restarted = True 
- # 进程不存在或已结束,重置状态为 ready - logger.info(f"模拟 {simulation_id} 准备工作已完成,重置状态为 ready(原状态: {state.status.value})") + # Process does not exist or has ended; reset status to ready + logger.info(f"Simulation {simulation_id} preparation complete; resetting status to ready (was: {state.status.value})") state.status = SimulationStatus.READY manager._save_simulation_state(state) else: - # 准备工作未完成 + # Preparation not yet complete return jsonify({ "success": False, "error": t('api.simNotReady', status=state.status.value) }), 400 - # 获取图谱ID(用于图谱记忆更新) + # Get graph ID (for graph memory update) graph_id = None if enable_graph_memory_update: - # 从模拟状态或项目中获取 graph_id + # Get graph_id from simulation state or project graph_id = state.graph_id if not graph_id: - # 尝试从项目中获取 + # Try to get from project project = ProjectManager.get_project(state.project_id) if project: graph_id = project.graph_id @@ -1598,9 +1599,9 @@ def start_simulation(): "error": t('api.graphIdRequiredForMemory') }), 400 - logger.info(f"启用图谱记忆更新: simulation_id={simulation_id}, graph_id={graph_id}") - - # 启动模拟 + logger.info(f"Graph memory update enabled: simulation_id={simulation_id}, graph_id={graph_id}") + + # Start simulation run_state = SimulationRunner.start_simulation( simulation_id=simulation_id, platform=platform, @@ -1609,7 +1610,7 @@ def start_simulation(): graph_id=graph_id ) - # 更新模拟状态 + # Update simulation status state.status = SimulationStatus.RUNNING manager._save_simulation_state(state) @@ -1633,7 +1634,7 @@ def start_simulation(): }), 400 except Exception as e: - logger.error(f"启动模拟失败: {str(e)}") + logger.error(f"Failed to start simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1644,14 +1645,14 @@ def start_simulation(): @simulation_bp.route('/stop', methods=['POST']) def stop_simulation(): """ - 停止模拟 - - 请求(JSON): + Stop the simulation + + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // required, simulation ID } - - 返回: + + 
Returns: { "success": true, "data": { @@ -1673,7 +1674,7 @@ def stop_simulation(): run_state = SimulationRunner.stop_simulation(simulation_id) - # 更新模拟状态 + # Update simulation status manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: @@ -1692,7 +1693,7 @@ def stop_simulation(): }), 400 except Exception as e: - logger.error(f"停止模拟失败: {str(e)}") + logger.error(f"Failed to stop simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1700,14 +1701,14 @@ def stop_simulation(): }), 500 -# ============== 实时状态监控接口 ============== +# ============== Real-time status monitoring endpoints ============== @simulation_bp.route('//run-status', methods=['GET']) def get_run_status(simulation_id: str): """ - 获取模拟运行实时状态(用于前端轮询) - - 返回: + Get real-time simulation run status (for frontend polling) + + Returns: { "success": true, "data": { @@ -1752,7 +1753,7 @@ def get_run_status(simulation_id: str): }) except Exception as e: - logger.error(f"获取运行状态失败: {str(e)}") + logger.error(f"Failed to get run status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1763,14 +1764,14 @@ def get_run_status(simulation_id: str): @simulation_bp.route('//run-status/detail', methods=['GET']) def get_run_status_detail(simulation_id: str): """ - 获取模拟运行详细状态(包含所有动作) - - 用于前端展示实时动态 - - Query参数: - platform: 过滤平台(twitter/reddit,可选) - - 返回: + Get detailed simulation run status (including all actions) + + Used for real-time activity display in the frontend. + + Query parameters: + platform: filter by platform (twitter/reddit, optional) + + Returns: { "success": true, "data": { @@ -1792,8 +1793,8 @@ def get_run_status_detail(simulation_id: str): }, ... ], - "twitter_actions": [...], # Twitter 平台的所有动作 - "reddit_actions": [...] # Reddit 平台的所有动作 + "twitter_actions": [...], # All actions on the Twitter platform + "reddit_actions": [...] 
# All actions on the Reddit platform } } """ @@ -1813,13 +1814,13 @@ def get_run_status_detail(simulation_id: str): } }) - # 获取完整的动作列表 + # Get full action list all_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform=platform_filter ) - # 分平台获取动作 + # Get actions per platform twitter_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform="twitter" @@ -1830,7 +1831,7 @@ def get_run_status_detail(simulation_id: str): platform="reddit" ) if not platform_filter or platform_filter == "reddit" else [] - # 获取当前轮次的动作(recent_actions 只展示最新一轮) + # Get actions for the current round (recent_actions shows only the latest round) current_round = run_state.current_round recent_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, @@ -1838,13 +1839,13 @@ def get_run_status_detail(simulation_id: str): round_num=current_round ) if current_round > 0 else [] - # 获取基础状态信息 + # Get basic status info result = run_state.to_dict() result["all_actions"] = [a.to_dict() for a in all_actions] result["twitter_actions"] = [a.to_dict() for a in twitter_actions] result["reddit_actions"] = [a.to_dict() for a in reddit_actions] result["rounds_count"] = len(run_state.rounds) - # recent_actions 只展示当前最新一轮两个平台的内容 + # recent_actions shows only the current latest round across both platforms result["recent_actions"] = [a.to_dict() for a in recent_actions] return jsonify({ @@ -1853,7 +1854,7 @@ def get_run_status_detail(simulation_id: str): }) except Exception as e: - logger.error(f"获取详细状态失败: {str(e)}") + logger.error(f"Failed to get detailed status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1864,16 +1865,16 @@ def get_run_status_detail(simulation_id: str): @simulation_bp.route('//actions', methods=['GET']) def get_simulation_actions(simulation_id: str): """ - 获取模拟中的Agent动作历史 + Get Agent action history for a simulation + + Query parameters: + limit: result count (default 100) + offset: offset (default 0) + 
platform: filter by platform (twitter/reddit) + agent_id: filter by Agent ID + round_num: filter by round number - Query参数: - limit: 返回数量(默认100) - offset: 偏移量(默认0) - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent ID - round_num: 过滤轮次 - - 返回: + Returns: { "success": true, "data": { @@ -1907,7 +1908,7 @@ def get_simulation_actions(simulation_id: str): }) except Exception as e: - logger.error(f"获取动作历史失败: {str(e)}") + logger.error(f"Failed to get action history: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1918,15 +1919,15 @@ def get_simulation_actions(simulation_id: str): @simulation_bp.route('//timeline', methods=['GET']) def get_simulation_timeline(simulation_id: str): """ - 获取模拟时间线(按轮次汇总) - - 用于前端展示进度条和时间线视图 - - Query参数: - start_round: 起始轮次(默认0) - end_round: 结束轮次(默认全部) - - 返回每轮的汇总信息 + Get simulation timeline (summarized by round) + + Used for progress bar and timeline view in the frontend. + + Query parameters: + start_round: starting round (default 0) + end_round: ending round (default all) + + Returns summary info for each round. """ try: start_round = request.args.get('start_round', 0, type=int) @@ -1947,7 +1948,7 @@ def get_simulation_timeline(simulation_id: str): }) except Exception as e: - logger.error(f"获取时间线失败: {str(e)}") + logger.error(f"Failed to get timeline: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1958,9 +1959,9 @@ def get_simulation_timeline(simulation_id: str): @simulation_bp.route('//agent-stats', methods=['GET']) def get_agent_stats(simulation_id: str): """ - 获取每个Agent的统计信息 - - 用于前端展示Agent活跃度排行、动作分布等 + Get statistics for each Agent + + Used to display agent activity rankings and action distribution in the frontend. 
""" try: stats = SimulationRunner.get_agent_stats(simulation_id) @@ -1974,7 +1975,7 @@ def get_agent_stats(simulation_id: str): }) except Exception as e: - logger.error(f"获取Agent统计失败: {str(e)}") + logger.error(f"Failed to get Agent stats: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1982,19 +1983,19 @@ def get_agent_stats(simulation_id: str): }), 500 -# ============== 数据库查询接口 ============== +# ============== Database query endpoints ============== @simulation_bp.route('//posts', methods=['GET']) def get_simulation_posts(simulation_id: str): """ - 获取模拟中的帖子 - - Query参数: - platform: 平台类型(twitter/reddit) - limit: 返回数量(默认50) - offset: 偏移量 - - 返回帖子列表(从SQLite数据库读取) + Get posts from a simulation + + Query parameters: + platform: platform type (twitter/reddit) + limit: result count (default 50) + offset: offset + + Returns list of posts (read from SQLite database) """ try: platform = request.args.get('platform', 'reddit') @@ -2054,7 +2055,7 @@ def get_simulation_posts(simulation_id: str): }) except Exception as e: - logger.error(f"获取帖子失败: {str(e)}") + logger.error(f"Failed to get posts: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2065,12 +2066,12 @@ def get_simulation_posts(simulation_id: str): @simulation_bp.route('//comments', methods=['GET']) def get_simulation_comments(simulation_id: str): """ - 获取模拟中的评论(仅Reddit) - - Query参数: - post_id: 过滤帖子ID(可选) - limit: 返回数量 - offset: 偏移量 + Get comments from a simulation (Reddit only) + + Query parameters: + post_id: filter by post ID (optional) + limit: result count + offset: offset """ try: post_id = request.args.get('post_id') @@ -2129,7 +2130,7 @@ def get_simulation_comments(simulation_id: str): }) except Exception as e: - logger.error(f"获取评论失败: {str(e)}") + logger.error(f"Failed to get comments: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2137,31 +2138,32 @@ def get_simulation_comments(simulation_id: str): }), 500 -# ============== Interview 采访接口 ============== 
+# ============== Interview endpoints ============== @simulation_bp.route('/interview', methods=['POST']) def interview_agent(): """ - 采访单个Agent + Interview a single Agent - 注意:此功能需要模拟环境处于运行状态(完成模拟循环后进入等待命令模式) + Note: this feature requires the simulation environment to be running + (after the simulation loop completes it enters command-waiting mode). - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "agent_id": 0, // 必填,Agent ID - "prompt": "你对这件事有什么看法?", // 必填,采访问题 - "platform": "twitter", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟同时采访两个平台 - "timeout": 60 // 可选,超时时间(秒),默认60 + "simulation_id": "sim_xxxx", // required, simulation ID + "agent_id": 0, // required, Agent ID + "prompt": "What do you think?", // required, interview question + "platform": "twitter", // optional, specify platform (twitter/reddit) + // if not specified: dual-platform simulation interviews both + "timeout": 60 // optional, timeout in seconds, default 60 } - 返回(不指定platform,双平台模式): + Returns (no platform specified, dual-platform mode): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "What do you think?", "result": { "agent_id": 0, "prompt": "...", @@ -2174,15 +2176,15 @@ def interview_agent(): } } - 返回(指定platform): + Returns (platform specified): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "What do you think?", "result": { "agent_id": 0, - "response": "我认为...", + "response": "I think...", "platform": "twitter", "timestamp": "2025-12-08T10:00:00" }, @@ -2196,9 +2198,9 @@ def interview_agent(): simulation_id = data.get('simulation_id') agent_id = data.get('agent_id') prompt = data.get('prompt') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter/reddit/None timeout = data.get('timeout', 60) - + if not simulation_id: return jsonify({ "success": False, @@ -2217,23 +2219,23 @@ def interview_agent(): "error": t('api.requirePrompt') }), 400 - 
# 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t('api.invalidInterviewPlatform') }), 400 - - # 检查环境状态 + + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t('api.envNotRunning') }), 400 - # 优化prompt,添加前缀避免Agent调用工具 + # Optimize prompt: add prefix to prevent Agent from calling tools optimized_prompt = optimize_interview_prompt(prompt) - + result = SimulationRunner.interview_agent( simulation_id=simulation_id, agent_id=agent_id, @@ -2260,7 +2262,7 @@ def interview_agent(): }), 504 except Exception as e: - logger.error(f"Interview失败: {str(e)}") + logger.error(f"Interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2271,30 +2273,30 @@ def interview_agent(): @simulation_bp.route('/interview/batch', methods=['POST']) def interview_agents_batch(): """ - 批量采访多个Agent + Batch interview multiple Agents - 注意:此功能需要模拟环境处于运行状态 + Note: this feature requires the simulation environment to be running. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "interviews": [ // 必填,采访列表 + "simulation_id": "sim_xxxx", // required, simulation ID + "interviews": [ // required, interview list { "agent_id": 0, - "prompt": "你对A有什么看法?", - "platform": "twitter" // 可选,指定该Agent的采访平台 + "prompt": "What do you think of A?", + "platform": "twitter" // optional, specify platform for this agent }, { "agent_id": 1, - "prompt": "你对B有什么看法?" // 不指定platform则使用默认值 + "prompt": "What do you think of B?" 
// no platform: uses default } ], - "platform": "reddit", // 可选,默认平台(被每项的platform覆盖) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 120 // 可选,超时时间(秒),默认120 + "platform": "reddit", // optional, default platform (overridden per item) + // if not specified: dual-platform each agent is interviewed on both + "timeout": 120 // optional, timeout in seconds, default 120 } - 返回: + Returns: { "success": true, "data": { @@ -2317,7 +2319,7 @@ def interview_agents_batch(): simulation_id = data.get('simulation_id') interviews = data.get('interviews') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter/reddit/None timeout = data.get('timeout', 120) if not simulation_id: @@ -2332,14 +2334,14 @@ def interview_agents_batch(): "error": t('api.requireInterviews') }), 400 - # 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t('api.invalidInterviewPlatform') }), 400 - # 验证每个采访项 + # Validate each interview item for i, interview in enumerate(interviews): if 'agent_id' not in interview: return jsonify({ @@ -2351,7 +2353,7 @@ def interview_agents_batch(): "success": False, "error": t('api.interviewListMissingPrompt', index=i+1) }), 400 - # 验证每项的platform(如果有) + # Validate per-item platform (if present) item_platform = interview.get('platform') if item_platform and item_platform not in ("twitter", "reddit"): return jsonify({ @@ -2359,14 +2361,14 @@ def interview_agents_batch(): "error": t('api.interviewListInvalidPlatform', index=i+1) }), 400 - # 检查环境状态 + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t('api.envNotRunning') }), 400 - # 优化每个采访项的prompt,添加前缀避免Agent调用工具 + # Optimize prompt for each interview item: add prefix to prevent tool calls optimized_interviews = [] for interview in interviews: optimized_interview = interview.copy() @@ -2398,7 +2400,7 @@ def 
interview_agents_batch(): }), 504 except Exception as e: - logger.error(f"批量Interview失败: {str(e)}") + logger.error(f"Batch interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2409,20 +2411,20 @@ def interview_agents_batch(): @simulation_bp.route('/interview/all', methods=['POST']) def interview_all_agents(): """ - 全局采访 - 使用相同问题采访所有Agent + Global interview - ask all Agents the same question - 注意:此功能需要模拟环境处于运行状态 + Note: this feature requires the simulation environment to be running. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "prompt": "你对这件事整体有什么看法?", // 必填,采访问题(所有Agent使用相同问题) - "platform": "reddit", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 180 // 可选,超时时间(秒),默认180 + "simulation_id": "sim_xxxx", // required, simulation ID + "prompt": "What is your overall take on this?", // required, interview question (same for all agents) + "platform": "reddit", // optional, specify platform (twitter/reddit) + // if not specified: dual-platform, each agent interviewed on both + "timeout": 180 // optional, timeout in seconds, default 180 } - 返回: + Returns: { "success": true, "data": { @@ -2444,7 +2446,7 @@ def interview_all_agents(): simulation_id = data.get('simulation_id') prompt = data.get('prompt') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter/reddit/None timeout = data.get('timeout', 180) if not simulation_id: @@ -2459,21 +2461,21 @@ def interview_all_agents(): "error": t('api.requirePrompt') }), 400 - # 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t('api.invalidInterviewPlatform') }), 400 - # 检查环境状态 + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t('api.envNotRunning') }), 400 - # 优化prompt,添加前缀避免Agent调用工具 + # Optimize prompt: add prefix to 
prevent tool calls optimized_prompt = optimize_interview_prompt(prompt) result = SimulationRunner.interview_all_agents( @@ -2501,7 +2503,7 @@ def interview_all_agents(): }), 504 except Exception as e: - logger.error(f"全局Interview失败: {str(e)}") + logger.error(f"Global interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2512,20 +2514,20 @@ def interview_all_agents(): @simulation_bp.route('/interview/history', methods=['POST']) def get_interview_history(): """ - 获取Interview历史记录 + Get Interview history records - 从模拟数据库中读取所有Interview记录 + Reads all Interview records from the simulation database. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "reddit", // 可选,平台类型(reddit/twitter) - // 不指定则返回两个平台的所有历史 - "agent_id": 0, // 可选,只获取该Agent的采访历史 - "limit": 100 // 可选,返回数量,默认100 + "simulation_id": "sim_xxxx", // required, simulation ID + "platform": "reddit", // optional, platform type (reddit/twitter) + // if not specified, returns history from both platforms + "agent_id": 0, // optional, get only this agent's interview history + "limit": 100 // optional, result count, default 100 } - 返回: + Returns: { "success": true, "data": { @@ -2533,8 +2535,8 @@ def get_interview_history(): "history": [ { "agent_id": 0, - "response": "我认为...", - "prompt": "你对这件事有什么看法?", + "response": "I think...", + "prompt": "What do you think about this?", "timestamp": "2025-12-08T10:00:00", "platform": "reddit" }, @@ -2547,7 +2549,7 @@ def get_interview_history(): data = request.get_json() or {} simulation_id = data.get('simulation_id') - platform = data.get('platform') # 不指定则返回两个平台的历史 + platform = data.get('platform') # if not specified, return history from both platforms agent_id = data.get('agent_id') limit = data.get('limit', 100) @@ -2573,7 +2575,7 @@ def get_interview_history(): }) except Exception as e: - logger.error(f"获取Interview历史失败: {str(e)}") + logger.error(f"Failed to get interview history: {str(e)}") return jsonify({ 
"success": False, "error": str(e), @@ -2584,16 +2586,16 @@ def get_interview_history(): @simulation_bp.route('/env-status', methods=['POST']) def get_env_status(): """ - 获取模拟环境状态 + Get simulation environment status - 检查模拟环境是否存活(可以接收Interview命令) + Checks whether the simulation environment is alive (can receive Interview commands). - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // required, simulation ID } - 返回: + Returns: { "success": true, "data": { @@ -2601,7 +2603,7 @@ def get_env_status(): "env_alive": true, "twitter_available": true, "reddit_available": true, - "message": "环境正在运行,可以接收Interview命令" + "message": "Environment is running and ready to receive Interview commands" } } """ @@ -2618,7 +2620,7 @@ def get_env_status(): env_alive = SimulationRunner.check_env_alive(simulation_id) - # 获取更详细的状态信息 + # Get more detailed status info env_status = SimulationRunner.get_env_status_detail(simulation_id) if env_alive: @@ -2638,7 +2640,7 @@ def get_env_status(): }) except Exception as e: - logger.error(f"获取环境状态失败: {str(e)}") + logger.error(f"Failed to get environment status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2649,24 +2651,25 @@ def get_env_status(): @simulation_bp.route('/close-env', methods=['POST']) def close_simulation_env(): """ - 关闭模拟环境 - - 向模拟发送关闭环境命令,使其优雅退出等待命令模式。 - - 注意:这不同于 /stop 接口,/stop 会强制终止进程, - 而此接口会让模拟优雅地关闭环境并退出。 - - 请求(JSON): + Close the simulation environment + + Sends a close-environment command to the simulation, causing it to exit + command-waiting mode gracefully. + + Note: this is different from /stop, which forcibly terminates the process. + This endpoint lets the simulation gracefully close the environment and exit. 
+ + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "timeout": 30 // 可选,超时时间(秒),默认30 + "simulation_id": "sim_xxxx", // required, simulation ID + "timeout": 30 // optional, timeout in seconds, default 30 } - - 返回: + + Returns: { "success": true, "data": { - "message": "环境关闭命令已发送", + "message": "Environment close command sent", "result": {...}, "timestamp": "2025-12-08T10:00:01" } @@ -2689,26 +2692,26 @@ def close_simulation_env(): timeout=timeout ) - # 更新模拟状态 + # Update simulation status manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: state.status = SimulationStatus.COMPLETED manager._save_simulation_state(state) - + return jsonify({ "success": result.get("success", False), "data": result }) - + except ValueError as e: return jsonify({ "success": False, "error": str(e) }), 400 - + except Exception as e: - logger.error(f"关闭环境失败: {str(e)}") + logger.error(f"Failed to close environment: {str(e)}") return jsonify({ "success": False, "error": str(e), diff --git a/backend/app/config.py b/backend/app/config.py index ebced558..7dfa8246 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,55 +1,55 @@ """ -配置管理 -统一从项目根目录的 .env 文件加载配置 +Configuration management +Loads config uniformly from the .env file at the project root """ import os from dotenv import load_dotenv -# 加载项目根目录的 .env 文件 -# 路径: MiroFish/.env (相对于 backend/app/config.py) +# Load the .env file from the project root +# Path: MiroFish/.env (relative to backend/app/config.py) project_root_env = os.path.join(os.path.dirname(__file__), '../../.env') if os.path.exists(project_root_env): load_dotenv(project_root_env, override=True) else: - # 如果根目录没有 .env,尝试加载环境变量(用于生产环境) + # If no root-level .env file found, load from environment variables (production) load_dotenv(override=True) class Config: - """Flask配置类""" - - # Flask配置 + """Flask configuration class""" + + # Flask settings SECRET_KEY = os.environ.get('SECRET_KEY', 'mirofish-secret-key') DEMO_PASSWORD 
= os.environ.get('DEMO_PASSWORD', '') DEBUG = os.environ.get('FLASK_DEBUG', 'True').lower() == 'true' - - # JSON配置 - 禁用ASCII转义,让中文直接显示(而不是 \uXXXX 格式) + + # JSON settings - disable ASCII escaping so non-ASCII chars are output directly (not as \uXXXX) JSON_AS_ASCII = False - - # LLM配置(统一使用OpenAI格式) + + # LLM settings (unified OpenAI-compatible format) LLM_API_KEY = os.environ.get('LLM_API_KEY') LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'https://api.openai.com/v1') LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'gpt-4o-mini') - # Zep配置 + # Zep settings ZEP_API_KEY = os.environ.get('ZEP_API_KEY') - - # 文件上传配置 + + # File upload settings MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} - # 文本处理配置 - DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 - DEFAULT_CHUNK_OVERLAP = 50 # 默认重叠大小 - - # OASIS模拟配置 + # Text processing settings + DEFAULT_CHUNK_SIZE = 500 # default chunk size + DEFAULT_CHUNK_OVERLAP = 50 # default overlap size + + # OASIS simulation settings OASIS_DEFAULT_MAX_ROUNDS = int(os.environ.get('OASIS_DEFAULT_MAX_ROUNDS', '10')) OASIS_SIMULATION_DATA_DIR = os.path.join(os.path.dirname(__file__), '../uploads/simulations') - # OASIS平台可用动作配置 + # OASIS platform available actions OASIS_TWITTER_ACTIONS = [ 'CREATE_POST', 'LIKE_POST', 'REPOST', 'FOLLOW', 'DO_NOTHING', 'QUOTE_POST' ] @@ -59,18 +59,18 @@ class Config: 'TREND', 'REFRESH', 'DO_NOTHING', 'FOLLOW', 'MUTE' ] - # Report Agent配置 + # Report Agent settings REPORT_AGENT_MAX_TOOL_CALLS = int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) @classmethod def validate(cls): - """验证必要配置""" + """Validate required configuration""" errors = [] if not cls.LLM_API_KEY: - errors.append("LLM_API_KEY 未配置") + errors.append("LLM_API_KEY 
is not configured") if not cls.ZEP_API_KEY: - errors.append("ZEP_API_KEY 未配置") + errors.append("ZEP_API_KEY is not configured") return errors diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 55bec619..b3483660 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -1,5 +1,5 @@ """ -数据模型模块 +Data models module """ from .task import TaskManager, TaskStatus diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 08978937..697d2856 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -1,6 +1,6 @@ """ -项目上下文管理 -用于在服务端持久化项目状态,避免前端在接口间传递大量数据 +Project context management +Persists project state server-side so the frontend does not need to pass large amounts of data between endpoints. """ import os @@ -15,45 +15,45 @@ from ..config import Config class ProjectStatus(str, Enum): - """项目状态""" - CREATED = "created" # 刚创建,文件已上传 - ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成 - GRAPH_BUILDING = "graph_building" # 图谱构建中 - GRAPH_COMPLETED = "graph_completed" # 图谱构建完成 - FAILED = "failed" # 失败 + """Project status""" + CREATED = "created" # Just created; files uploaded + ONTOLOGY_GENERATED = "ontology_generated" # Ontology generated + GRAPH_BUILDING = "graph_building" # Graph building in progress + GRAPH_COMPLETED = "graph_completed" # Graph build complete + FAILED = "failed" # Failed @dataclass class Project: - """项目数据模型""" + """Project data model""" project_id: str name: str status: ProjectStatus created_at: str updated_at: str - - # 文件信息 + + # File info files: List[Dict[str, str]] = field(default_factory=list) # [{filename, path, size}] total_text_length: int = 0 - - # 本体信息(接口1生成后填充) + + # Ontology info (populated after endpoint 1) ontology: Optional[Dict[str, Any]] = None analysis_summary: Optional[str] = None - - # 图谱信息(接口2完成后填充) + + # Graph info (populated after endpoint 2 completes) graph_id: Optional[str] = None graph_build_task_id: Optional[str] = None - - # 
配置 + + # Configuration simulation_requirement: Optional[str] = None chunk_size: int = 500 chunk_overlap: int = 50 - - # 错误信息 + + # Error info error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" return { "project_id": self.project_id, "name": self.name, @@ -74,7 +74,7 @@ class Project: @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'Project': - """从字典创建""" + """Create from dictionary""" status = data.get('status', 'created') if isinstance(status, str): status = ProjectStatus(status) @@ -99,52 +99,52 @@ class Project: class ProjectManager: - """项目管理器 - 负责项目的持久化存储和检索""" - - # 项目存储根目录 + """Project manager - handles persistent storage and retrieval of projects""" + + # Root directory for project storage PROJECTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'projects') - + @classmethod def _ensure_projects_dir(cls): - """确保项目目录存在""" + """Ensure the projects directory exists""" os.makedirs(cls.PROJECTS_DIR, exist_ok=True) - + @classmethod def _get_project_dir(cls, project_id: str) -> str: - """获取项目目录路径""" + """Get project directory path""" return os.path.join(cls.PROJECTS_DIR, project_id) - + @classmethod def _get_project_meta_path(cls, project_id: str) -> str: - """获取项目元数据文件路径""" + """Get project metadata file path""" return os.path.join(cls._get_project_dir(project_id), 'project.json') - + @classmethod def _get_project_files_dir(cls, project_id: str) -> str: - """获取项目文件存储目录""" + """Get project files storage directory""" return os.path.join(cls._get_project_dir(project_id), 'files') - + @classmethod def _get_project_text_path(cls, project_id: str) -> str: - """获取项目提取文本存储路径""" + """Get path for storing the extracted project text""" return os.path.join(cls._get_project_dir(project_id), 'extracted_text.txt') - + @classmethod def create_project(cls, name: str = "Unnamed Project") -> Project: """ - 创建新项目 - + Create a new project. 
+ Args: - name: 项目名称 - + name: project name + Returns: - 新创建的Project对象 + newly created Project object """ cls._ensure_projects_dir() - + project_id = f"proj_{uuid.uuid4().hex[:12]}" now = datetime.now().isoformat() - + project = Project( project_id=project_id, name=name, @@ -152,21 +152,21 @@ class ProjectManager: created_at=now, updated_at=now ) - - # 创建项目目录结构 + + # Create project directory structure project_dir = cls._get_project_dir(project_id) files_dir = cls._get_project_files_dir(project_id) os.makedirs(project_dir, exist_ok=True) os.makedirs(files_dir, exist_ok=True) - - # 保存项目元数据 + + # Save project metadata cls.save_project(project) - + return project - + @classmethod def save_project(cls, project: Project) -> None: - """保存项目元数据""" + """Save project metadata""" project.updated_at = datetime.now().isoformat() meta_path = cls._get_project_meta_path(project.project_id) @@ -176,13 +176,13 @@ class ProjectManager: @classmethod def get_project(cls, project_id: str) -> Optional[Project]: """ - 获取项目 - + Get a project. + Args: - project_id: 项目ID - + project_id: project ID + Returns: - Project对象,如果不存在返回None + Project object, or None if not found """ meta_path = cls._get_project_meta_path(project_id) @@ -197,23 +197,23 @@ class ProjectManager: @classmethod def list_projects(cls, limit: int = 50) -> List[Project]: """ - 列出所有项目 - + List all projects. + Args: - limit: 返回数量限制 - + limit: result count limit + Returns: - 项目列表,按创建时间倒序 + list of projects sorted by creation time, descending """ cls._ensure_projects_dir() - + projects = [] for project_id in os.listdir(cls.PROJECTS_DIR): project = cls.get_project(project_id) if project: projects.append(project) - - # 按创建时间倒序排序 + + # Sort by creation time, descending projects.sort(key=lambda p: p.created_at, reverse=True) return projects[:limit] @@ -221,13 +221,13 @@ class ProjectManager: @classmethod def delete_project(cls, project_id: str) -> bool: """ - 删除项目及其所有文件 - + Delete a project and all its files. 
+ Args: - project_id: 项目ID - + project_id: project ID + Returns: - 是否删除成功 + True if successfully deleted """ project_dir = cls._get_project_dir(project_id) @@ -240,28 +240,28 @@ class ProjectManager: @classmethod def save_file_to_project(cls, project_id: str, file_storage, original_filename: str) -> Dict[str, str]: """ - 保存上传的文件到项目目录 - + Save an uploaded file to the project directory. + Args: - project_id: 项目ID - file_storage: Flask的FileStorage对象 - original_filename: 原始文件名 - + project_id: project ID + file_storage: Flask FileStorage object + original_filename: original filename + Returns: - 文件信息字典 {filename, path, size} + file info dict {filename, path, size} """ files_dir = cls._get_project_files_dir(project_id) os.makedirs(files_dir, exist_ok=True) - - # 生成安全的文件名 + + # Generate a safe filename ext = os.path.splitext(original_filename)[1].lower() safe_filename = f"{uuid.uuid4().hex[:8]}{ext}" file_path = os.path.join(files_dir, safe_filename) - - # 保存文件 + + # Save file file_storage.save(file_path) - - # 获取文件大小 + + # Get file size file_size = os.path.getsize(file_path) return { @@ -273,14 +273,14 @@ class ProjectManager: @classmethod def save_extracted_text(cls, project_id: str, text: str) -> None: - """保存提取的文本""" + """Save extracted text""" text_path = cls._get_project_text_path(project_id) with open(text_path, 'w', encoding='utf-8') as f: f.write(text) @classmethod def get_extracted_text(cls, project_id: str) -> Optional[str]: - """获取提取的文本""" + """Get extracted text""" text_path = cls._get_project_text_path(project_id) if not os.path.exists(text_path): @@ -291,7 +291,7 @@ class ProjectManager: @classmethod def get_project_files(cls, project_id: str) -> List[str]: - """获取项目的所有文件路径""" + """Get all file paths for a project""" files_dir = cls._get_project_files_dir(project_id) if not os.path.exists(files_dir): diff --git a/backend/app/models/task.py b/backend/app/models/task.py index dfebed23..7a6c4f53 100644 --- a/backend/app/models/task.py +++ 
b/backend/app/models/task.py @@ -1,6 +1,6 @@ """ -任务状态管理 -用于跟踪长时间运行的任务(如图谱构建) +Task state management +Used to track long-running tasks (e.g. graph building). """ import uuid @@ -14,30 +14,30 @@ from ..utils.locale import t class TaskStatus(str, Enum): - """任务状态枚举""" - PENDING = "pending" # 等待中 - PROCESSING = "processing" # 处理中 - COMPLETED = "completed" # 已完成 - FAILED = "failed" # 失败 + """Task status enum""" + PENDING = "pending" # Waiting + PROCESSING = "processing" # In progress + COMPLETED = "completed" # Completed + FAILED = "failed" # Failed @dataclass class Task: - """任务数据类""" + """Task data class""" task_id: str task_type: str status: TaskStatus created_at: datetime updated_at: datetime - progress: int = 0 # 总进度百分比 0-100 - message: str = "" # 状态消息 - result: Optional[Dict] = None # 任务结果 - error: Optional[str] = None # 错误信息 - metadata: Dict = field(default_factory=dict) # 额外元数据 - progress_detail: Dict = field(default_factory=dict) # 详细进度信息 - + progress: int = 0 # Total progress percentage 0-100 + message: str = "" # Status message + result: Optional[Dict] = None # Task result + error: Optional[str] = None # Error info + metadata: Dict = field(default_factory=dict) # Extra metadata + progress_detail: Dict = field(default_factory=dict) # Detailed progress info + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" return { "task_id": self.task_id, "task_type": self.task_type, @@ -55,15 +55,15 @@ class Task: class TaskManager: """ - 任务管理器 - 线程安全的任务状态管理 + Task manager + Thread-safe task state management """ - + _instance = None _lock = threading.Lock() - + def __new__(cls): - """单例模式""" + """Singleton pattern""" if cls._instance is None: with cls._lock: if cls._instance is None: @@ -74,14 +74,14 @@ class TaskManager: def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str: """ - 创建新任务 - + Create a new task. 
+ Args: - task_type: 任务类型 - metadata: 额外元数据 - + task_type: task type + metadata: extra metadata + Returns: - 任务ID + task ID """ task_id = str(uuid.uuid4()) now = datetime.now() @@ -101,7 +101,7 @@ class TaskManager: return task_id def get_task(self, task_id: str) -> Optional[Task]: - """获取任务""" + """Get a task""" with self._task_lock: return self._tasks.get(task_id) @@ -116,16 +116,16 @@ class TaskManager: progress_detail: Optional[Dict] = None ): """ - 更新任务状态 - + Update task status. + Args: - task_id: 任务ID - status: 新状态 - progress: 进度 - message: 消息 - result: 结果 - error: 错误信息 - progress_detail: 详细进度信息 + task_id: task ID + status: new status + progress: progress + message: message + result: result + error: error info + progress_detail: detailed progress info """ with self._task_lock: task = self._tasks.get(task_id) @@ -145,7 +145,7 @@ class TaskManager: task.progress_detail = progress_detail def complete_task(self, task_id: str, result: Dict): - """标记任务完成""" + """Mark task as complete""" self.update_task( task_id, status=TaskStatus.COMPLETED, @@ -155,7 +155,7 @@ class TaskManager: ) def fail_task(self, task_id: str, error: str): - """标记任务失败""" + """Mark task as failed""" self.update_task( task_id, status=TaskStatus.FAILED, @@ -164,7 +164,7 @@ class TaskManager: ) def list_tasks(self, task_type: Optional[str] = None) -> list: - """列出任务""" + """List tasks""" with self._task_lock: tasks = list(self._tasks.values()) if task_type: @@ -172,7 +172,7 @@ class TaskManager: return [t.to_dict() for t in sorted(tasks, key=lambda x: x.created_at, reverse=True)] def cleanup_old_tasks(self, max_age_hours: int = 24): - """清理旧任务""" + """Clean up old tasks""" from datetime import timedelta cutoff = datetime.now() - timedelta(hours=max_age_hours) diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py index 8db85d86..96af8b71 100644 --- a/backend/app/services/__init__.py +++ b/backend/app/services/__init__.py @@ -1,5 +1,5 @@ """ -业务服务模块 +Business services 
module """ from .ontology_generator import OntologyGenerator diff --git a/backend/app/services/graph_builder.py b/backend/app/services/graph_builder.py index 37c9969c..566c4321 100644 --- a/backend/app/services/graph_builder.py +++ b/backend/app/services/graph_builder.py @@ -1,6 +1,6 @@ """ -图谱构建服务 -接口2:使用Zep API构建Standalone Graph +Graph building service +Endpoint 2: Build a Standalone Graph using the Zep API """ import os @@ -22,7 +22,7 @@ from ..utils.locale import t, get_locale, set_locale @dataclass class GraphInfo: - """图谱信息""" + """Graph info""" graph_id: str node_count: int edge_count: int @@ -39,14 +39,14 @@ class GraphInfo: class GraphBuilderService: """ - 图谱构建服务 - 负责调用Zep API构建知识图谱 + Graph building service + Responsible for calling the Zep API to build the knowledge graph. """ - + def __init__(self, api_key: Optional[str] = None): self.api_key = api_key or Config.ZEP_API_KEY if not self.api_key: - raise ValueError("ZEP_API_KEY 未配置") + raise ValueError("ZEP_API_KEY is not configured") self.client = Zep(api_key=self.api_key) self.task_manager = TaskManager() @@ -61,20 +61,20 @@ class GraphBuilderService: batch_size: int = 3 ) -> str: """ - 异步构建图谱 - + Build the graph asynchronously. 
+ Args: - text: 输入文本 - ontology: 本体定义(来自接口1的输出) - graph_name: 图谱名称 - chunk_size: 文本块大小 - chunk_overlap: 块重叠大小 - batch_size: 每批发送的块数量 - + text: input text + ontology: ontology definition (output from endpoint 1) + graph_name: graph name + chunk_size: text chunk size + chunk_overlap: chunk overlap size + batch_size: number of chunks per batch + Returns: - 任务ID + task ID """ - # 创建任务 + # Create task task_id = self.task_manager.create_task( task_type="graph_build", metadata={ @@ -87,7 +87,7 @@ class GraphBuilderService: # Capture locale before spawning background thread current_locale = get_locale() - # 在后台线程中执行构建 + # Run build in background thread thread = threading.Thread( target=self._build_graph_worker, args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size, current_locale) @@ -108,7 +108,7 @@ class GraphBuilderService: batch_size: int, locale: str = 'zh' ): - """图谱构建工作线程""" + """Graph build worker thread""" set_locale(locale) try: self.task_manager.update_task( @@ -118,7 +118,7 @@ class GraphBuilderService: message=t('progress.startBuildingGraph') ) - # 1. 创建图谱 + # 1. Create graph graph_id = self.create_graph(graph_name) self.task_manager.update_task( task_id, @@ -126,7 +126,7 @@ class GraphBuilderService: message=t('progress.graphCreated', graphId=graph_id) ) - # 2. 设置本体 + # 2. Set ontology self.set_ontology(graph_id, ontology) self.task_manager.update_task( task_id, @@ -134,7 +134,7 @@ class GraphBuilderService: message=t('progress.ontologySet') ) - # 3. 文本分块 + # 3. Split text into chunks chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap) total_chunks = len(chunks) self.task_manager.update_task( @@ -143,7 +143,7 @@ class GraphBuilderService: message=t('progress.textSplit', count=total_chunks) ) - # 4. 分批发送数据 + # 4. Send data in batches episode_uuids = self.add_text_batches( graph_id, chunks, batch_size, lambda msg, prog: self.task_manager.update_task( @@ -153,7 +153,7 @@ class GraphBuilderService: ) ) - # 5. 
等待Zep处理完成 + # 5. Wait for Zep processing to complete self.task_manager.update_task( task_id, progress=60, @@ -169,7 +169,7 @@ class GraphBuilderService: ) ) - # 6. 获取图谱信息 + # 6. Fetch graph info self.task_manager.update_task( task_id, progress=90, @@ -178,7 +178,7 @@ class GraphBuilderService: graph_info = self._get_graph_info(graph_id) - # 完成 + # Complete self.task_manager.complete_task(task_id, { "graph_id": graph_id, "graph_info": graph_info.to_dict(), @@ -191,7 +191,7 @@ class GraphBuilderService: self.task_manager.fail_task(task_id, error_msg) def create_graph(self, name: str) -> str: - """创建Zep图谱(公开方法)""" + """Create a Zep graph (public method)""" graph_id = f"mirofish_{uuid.uuid4().hex[:16]}" self.client.graph.create( @@ -203,74 +203,74 @@ class GraphBuilderService: return graph_id def set_ontology(self, graph_id: str, ontology: Dict[str, Any]): - """设置图谱本体(公开方法)""" + """Set graph ontology (public method)""" import warnings from typing import Optional from pydantic import Field from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel - - # 抑制 Pydantic v2 关于 Field(default=None) 的警告 - # 这是 Zep SDK 要求的用法,警告来自动态类创建,可以安全忽略 + + # Suppress Pydantic v2 warnings about Field(default=None) + # This is the usage required by the Zep SDK; warnings come from dynamic class creation and can be safely ignored warnings.filterwarnings('ignore', category=UserWarning, module='pydantic') - - # Zep 保留名称,不能作为属性名 + + # Zep reserved names that cannot be used as attribute names RESERVED_NAMES = {'uuid', 'name', 'group_id', 'name_embedding', 'summary', 'created_at'} - + def safe_attr_name(attr_name: str) -> str: - """将保留名称转换为安全名称""" + """Convert reserved names to safe attribute names""" if attr_name.lower() in RESERVED_NAMES: return f"entity_{attr_name}" return attr_name - # 动态创建实体类型 + # Dynamically create entity types entity_types = {} for entity_def in ontology.get("entity_types", []): name = entity_def["name"] description = entity_def.get("description", f"A 
{name} entity.") - - # 创建属性字典和类型注解(Pydantic v2 需要) + + # Build attribute dict and type annotations (required by Pydantic v2) attrs = {"__doc__": description} annotations = {} - + for attr_def in entity_def.get("attributes", []): - attr_name = safe_attr_name(attr_def["name"]) # 使用安全名称 + attr_name = safe_attr_name(attr_def["name"]) # Use safe name attr_desc = attr_def.get("description", attr_name) - # Zep API 需要 Field 的 description,这是必需的 + # Zep API requires Field description — this is mandatory attrs[attr_name] = Field(description=attr_desc, default=None) - annotations[attr_name] = Optional[EntityText] # 类型注解 - + annotations[attr_name] = Optional[EntityText] # Type annotation + attrs["__annotations__"] = annotations - - # 动态创建类 + + # Dynamically create class entity_class = type(name, (EntityModel,), attrs) entity_class.__doc__ = description entity_types[name] = entity_class - # 动态创建边类型 + # Dynamically create edge types edge_definitions = {} for edge_def in ontology.get("edge_types", []): name = edge_def["name"] description = edge_def.get("description", f"A {name} relationship.") - - # 创建属性字典和类型注解 + + # Build attribute dict and type annotations attrs = {"__doc__": description} annotations = {} - + for attr_def in edge_def.get("attributes", []): - attr_name = safe_attr_name(attr_def["name"]) # 使用安全名称 + attr_name = safe_attr_name(attr_def["name"]) # Use safe name attr_desc = attr_def.get("description", attr_name) - # Zep API 需要 Field 的 description,这是必需的 + # Zep API requires Field description — this is mandatory attrs[attr_name] = Field(description=attr_desc, default=None) - annotations[attr_name] = Optional[str] # 边属性用str类型 - + annotations[attr_name] = Optional[str] # Edge attributes use str type + attrs["__annotations__"] = annotations - - # 动态创建类 + + # Dynamically create class class_name = ''.join(word.capitalize() for word in name.split('_')) edge_class = type(class_name, (EdgeModel,), attrs) edge_class.__doc__ = description - # 构建source_targets + # Build 
source_targets source_targets = [] for st in edge_def.get("source_targets", []): source_targets.append( @@ -283,7 +283,7 @@ class GraphBuilderService: if source_targets: edge_definitions[name] = (edge_class, source_targets) - # 调用Zep API设置本体 + # Call Zep API to set ontology if entity_types or edge_definitions: self.client.graph.set_ontology( graph_ids=[graph_id], @@ -298,7 +298,7 @@ class GraphBuilderService: batch_size: int = 3, progress_callback: Optional[Callable] = None ) -> List[str]: - """分批添加文本到图谱,返回所有 episode 的 uuid 列表""" + """Add text to the graph in batches; returns a list of all episode UUIDs""" episode_uuids = [] total_chunks = len(chunks) @@ -314,27 +314,27 @@ class GraphBuilderService: progress ) - # 构建episode数据 + # Build episode data episodes = [ EpisodeData(data=chunk, type="text") for chunk in batch_chunks ] - # 发送到Zep + # Send to Zep try: batch_result = self.client.graph.add_batch( graph_id=graph_id, episodes=episodes ) - # 收集返回的 episode uuid + # Collect returned episode UUIDs if batch_result and isinstance(batch_result, list): for ep in batch_result: ep_uuid = getattr(ep, 'uuid_', None) or getattr(ep, 'uuid', None) if ep_uuid: episode_uuids.append(ep_uuid) - # 避免请求过快 + # Avoid sending requests too quickly time.sleep(1) except Exception as e: @@ -350,7 +350,7 @@ class GraphBuilderService: progress_callback: Optional[Callable] = None, timeout: int = 600 ): - """等待所有 episode 处理完成(通过查询每个 episode 的 processed 状态)""" + """Wait for all episodes to finish processing (by polling each episode's processed status)""" if not episode_uuids: if progress_callback: progress_callback(t('progress.noEpisodesWait'), 1.0) @@ -373,42 +373,42 @@ class GraphBuilderService: ) break - # 检查每个 episode 的处理状态 + # Check processing status of each episode for ep_uuid in list(pending_episodes): try: episode = self.client.graph.episode.get(uuid_=ep_uuid) is_processed = getattr(episode, 'processed', False) - + if is_processed: pending_episodes.remove(ep_uuid) completed_count += 1 - + 
except Exception as e: - # 忽略单个查询错误,继续 + # Ignore individual query errors and continue pass - + elapsed = int(time.time() - start_time) if progress_callback: progress_callback( t('progress.zepProcessing', completed=completed_count, total=total_episodes, pending=len(pending_episodes), elapsed=elapsed), completed_count / total_episodes if total_episodes > 0 else 0 ) - + if pending_episodes: - time.sleep(3) # 每3秒检查一次 + time.sleep(3) # Check every 3 seconds if progress_callback: progress_callback(t('progress.processingComplete', completed=completed_count, total=total_episodes), 1.0) def _get_graph_info(self, graph_id: str) -> GraphInfo: - """获取图谱信息""" - # 获取节点(分页) + """Retrieve graph info""" + # Fetch nodes (paginated) nodes = fetch_all_nodes(self.client, graph_id) - # 获取边(分页) + # Fetch edges (paginated) edges = fetch_all_edges(self.client, graph_id) - # 统计实体类型 + # Count entity types entity_types = set() for node in nodes: if node.labels: @@ -425,25 +425,25 @@ class GraphBuilderService: def get_graph_data(self, graph_id: str) -> Dict[str, Any]: """ - 获取完整图谱数据(包含详细信息) - + Retrieve full graph data (with detailed information). 
+ Args: - graph_id: 图谱ID - + graph_id: graph ID + Returns: - 包含nodes和edges的字典,包括时间信息、属性等详细数据 + Dictionary containing nodes and edges with timestamps, attributes, and other details """ nodes = fetch_all_nodes(self.client, graph_id) edges = fetch_all_edges(self.client, graph_id) - # 创建节点映射用于获取节点名称 + # Build node map for looking up node names node_map = {} for node in nodes: node_map[node.uuid_] = node.name or "" - + nodes_data = [] for node in nodes: - # 获取创建时间 + # Get creation timestamp created_at = getattr(node, 'created_at', None) if created_at: created_at = str(created_at) @@ -459,20 +459,20 @@ class GraphBuilderService: edges_data = [] for edge in edges: - # 获取时间信息 + # Get timestamps created_at = getattr(edge, 'created_at', None) valid_at = getattr(edge, 'valid_at', None) invalid_at = getattr(edge, 'invalid_at', None) expired_at = getattr(edge, 'expired_at', None) - - # 获取 episodes + + # Get episodes episodes = getattr(edge, 'episodes', None) or getattr(edge, 'episode_ids', None) if episodes and not isinstance(episodes, list): episodes = [str(episodes)] elif episodes: episodes = [str(e) for e in episodes] - - # 获取 fact_type + + # Get fact_type fact_type = getattr(edge, 'fact_type', None) or edge.name or "" edges_data.append({ @@ -501,6 +501,6 @@ class GraphBuilderService: } def delete_graph(self, graph_id: str): - """删除图谱""" + """Delete graph""" self.client.graph.delete(graph_id=graph_id) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 7704a627..2670454e 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -1,11 +1,12 @@ """ -OASIS Agent Profile生成器 -将Zep图谱中的实体转换为OASIS模拟平台所需的Agent Profile格式 +OASIS Agent Profile Generator +Converts entities from the Zep knowledge graph into Agent Profile format +required by the OASIS simulation platform. -优化改进: -1. 调用Zep检索功能二次丰富节点信息 -2. 优化提示词生成非常详细的人设 -3. 区分个人实体和抽象群体实体 +Improvements: +1. 
Calls Zep retrieval to enrich node information +2. Optimised prompts to generate very detailed personas +3. Distinguishes between individual entities and abstract group entities """ import json @@ -28,49 +29,49 @@ logger = get_logger('mirofish.oasis_profile') @dataclass class OasisAgentProfile: - """OASIS Agent Profile数据结构""" - # 通用字段 + """OASIS Agent Profile data structure""" + # Common fields user_id: int user_name: str name: str bio: str persona: str - - # 可选字段 - Reddit风格 + + # Optional fields - Reddit style karma: int = 1000 - - # 可选字段 - Twitter风格 + + # Optional fields - Twitter style friend_count: int = 100 follower_count: int = 150 statuses_count: int = 500 - - # 额外人设信息 + + # Additional persona information age: Optional[int] = None gender: Optional[str] = None mbti: Optional[str] = None country: Optional[str] = None profession: Optional[str] = None interested_topics: List[str] = field(default_factory=list) - - # 来源实体信息 + + # Source entity information source_entity_uuid: Optional[str] = None source_entity_type: Optional[str] = None created_at: str = field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d")) def to_reddit_format(self) -> Dict[str, Any]: - """转换为Reddit平台格式""" + """Convert to Reddit platform format""" profile = { "user_id": self.user_id, - "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) + "username": self.user_name, # OASIS library requires field name 'username' (no underscore) "name": self.name, "bio": self.bio, "persona": self.persona, "karma": self.karma, "created_at": self.created_at, } - - # 添加额外人设信息(如果有) + + # Add additional persona information (if present) if self.age: profile["age"] = self.age if self.gender: @@ -87,10 +88,10 @@ class OasisAgentProfile: return profile def to_twitter_format(self) -> Dict[str, Any]: - """转换为Twitter平台格式""" + """Convert to Twitter platform format""" profile = { "user_id": self.user_id, - "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) + "username": self.user_name, # OASIS library 
requires field name 'username' (no underscore) "name": self.name, "bio": self.bio, "persona": self.persona, @@ -99,8 +100,8 @@ class OasisAgentProfile: "statuses_count": self.statuses_count, "created_at": self.created_at, } - - # 添加额外人设信息 + + # Add additional persona information if self.age: profile["age"] = self.age if self.gender: @@ -117,7 +118,7 @@ class OasisAgentProfile: return profile def to_dict(self) -> Dict[str, Any]: - """转换为完整字典格式""" + """Convert to full dictionary format""" return { "user_id": self.user_id, "user_name": self.user_name, @@ -142,17 +143,19 @@ class OasisAgentProfile: class OasisProfileGenerator: """ - OASIS Profile生成器 - - 将Zep图谱中的实体转换为OASIS模拟所需的Agent Profile - - 优化特性: - 1. 调用Zep图谱检索功能获取更丰富的上下文 - 2. 生成非常详细的人设(包括基本信息、职业经历、性格特征、社交媒体行为等) - 3. 区分个人实体和抽象群体实体 + OASIS Profile Generator + + Converts entities from the Zep knowledge graph into Agent Profiles + required for OASIS simulations. + + Key features: + 1. Calls Zep graph retrieval to obtain richer context + 2. Generates very detailed personas (basic info, career history, personality traits, + social media behaviour, etc.) + 3. 
Distinguishes between individual entities and abstract group entities """ - - # MBTI类型列表 + + # MBTI type list MBTI_TYPES = [ "INTJ", "INTP", "ENTJ", "ENTP", "INFJ", "INFP", "ENFJ", "ENFP", @@ -160,19 +163,19 @@ class OasisProfileGenerator: "ISTP", "ISFP", "ESTP", "ESFP" ] - # 常见国家列表 + # Common country list COUNTRIES = [ "China", "US", "UK", "Japan", "Germany", "France", "Canada", "Australia", "Brazil", "India", "South Korea" ] - # 个人类型实体(需要生成具体人设) + # Individual entity types (require a concrete persona) INDIVIDUAL_ENTITY_TYPES = [ "student", "alumni", "professor", "person", "publicfigure", "expert", "faculty", "official", "journalist", "activist" ] - # 群体/机构类型实体(需要生成群体代表人设) + # Group/institution entity types (require a representative account persona) GROUP_ENTITY_TYPES = [ "university", "governmentagency", "organization", "ngo", "mediaoutlet", "company", "institution", "group", "community" @@ -191,52 +194,52 @@ class OasisProfileGenerator: self.model_name = model_name or Config.LLM_MODEL_NAME if not self.api_key: - raise ValueError("LLM_API_KEY 未配置") - + raise ValueError("LLM_API_KEY is not configured") + self.client = OpenAI( api_key=self.api_key, base_url=self.base_url ) - - # Zep客户端用于检索丰富上下文 + + # Zep client for enriching context via retrieval self.zep_api_key = zep_api_key or Config.ZEP_API_KEY self.zep_client = None self.graph_id = graph_id - + if self.zep_api_key: try: self.zep_client = Zep(api_key=self.zep_api_key) except Exception as e: - logger.warning(f"Zep客户端初始化失败: {e}") + logger.warning(f"Zep client initialisation failed: {e}") def generate_profile_from_entity( - self, - entity: EntityNode, + self, + entity: EntityNode, user_id: int, use_llm: bool = True ) -> OasisAgentProfile: """ - 从Zep实体生成OASIS Agent Profile - + Generate an OASIS Agent Profile from a Zep entity. 
+ Args: - entity: Zep实体节点 - user_id: 用户ID(用于OASIS) - use_llm: 是否使用LLM生成详细人设 - + entity: Zep entity node + user_id: User ID (for OASIS) + use_llm: Whether to use an LLM to generate a detailed persona + Returns: OasisAgentProfile """ entity_type = entity.get_entity_type() or "Entity" - - # 基础信息 + + # Basic information name = entity.name user_name = self._generate_username(name) - - # 构建上下文信息 + + # Build context information context = self._build_entity_context(entity) - + if use_llm: - # 使用LLM生成详细人设 + # Use LLM to generate a detailed persona profile_data = self._generate_profile_with_llm( entity_name=name, entity_type=entity_type, @@ -245,7 +248,7 @@ class OasisProfileGenerator: context=context ) else: - # 使用规则生成基础人设 + # Use rule-based generation for a basic persona profile_data = self._generate_profile_rule_based( entity_name=name, entity_type=entity_type, @@ -274,27 +277,28 @@ class OasisProfileGenerator: ) def _generate_username(self, name: str) -> str: - """生成用户名""" - # 移除特殊字符,转换为小写 + """Generate a username""" + # Remove special characters and convert to lowercase username = name.lower().replace(" ", "_") username = ''.join(c for c in username if c.isalnum() or c == '_') - - # 添加随机后缀避免重复 + + # Add a random suffix to avoid duplicates suffix = random.randint(100, 999) return f"{username}_{suffix}" def _search_zep_for_entity(self, entity: EntityNode) -> Dict[str, Any]: """ - 使用Zep图谱混合搜索功能获取实体相关的丰富信息 - - Zep没有内置混合搜索接口,需要分别搜索edges和nodes然后合并结果。 - 使用并行请求同时搜索,提高效率。 - + Retrieve rich information about an entity using the Zep graph hybrid search. + + Zep has no built-in hybrid search endpoint, so edges and nodes are searched + separately and the results are merged. Parallel requests are used for + efficiency. 
+ Args: - entity: 实体节点对象 - + entity: Entity node object + Returns: - 包含facts, node_summaries, context的字典 + Dictionary containing facts, node_summaries, and context """ import concurrent.futures @@ -309,19 +313,19 @@ class OasisProfileGenerator: "context": "" } - # 必须有graph_id才能进行搜索 + # graph_id is required for searching if not self.graph_id: - logger.debug(f"跳过Zep检索:未设置graph_id") + logger.debug(f"Skipping Zep retrieval: graph_id not set") return results comprehensive_query = t('progress.zepSearchQuery', name=entity_name) def search_edges(): - """搜索边(事实/关系)- 带重试机制""" + """Search edges (facts/relationships) - with retry logic""" max_retries = 3 last_exception = None delay = 2.0 - + for attempt in range(max_retries): try: return self.zep_client.graph.search( @@ -334,19 +338,19 @@ class OasisProfileGenerator: except Exception as e: last_exception = e if attempt < max_retries - 1: - logger.debug(f"Zep边搜索第 {attempt + 1} 次失败: {str(e)[:80]}, 重试中...") + logger.debug(f"Zep edge search attempt {attempt + 1} failed: {str(e)[:80]}, retrying...") time.sleep(delay) delay *= 2 else: - logger.debug(f"Zep边搜索在 {max_retries} 次尝试后仍失败: {e}") + logger.debug(f"Zep edge search failed after {max_retries} attempts: {e}") return None - + def search_nodes(): - """搜索节点(实体摘要)- 带重试机制""" + """Search nodes (entity summaries) - with retry logic""" max_retries = 3 last_exception = None delay = 2.0 - + for attempt in range(max_retries): try: return self.zep_client.graph.search( @@ -359,139 +363,139 @@ class OasisProfileGenerator: except Exception as e: last_exception = e if attempt < max_retries - 1: - logger.debug(f"Zep节点搜索第 {attempt + 1} 次失败: {str(e)[:80]}, 重试中...") + logger.debug(f"Zep node search attempt {attempt + 1} failed: {str(e)[:80]}, retrying...") time.sleep(delay) delay *= 2 else: - logger.debug(f"Zep节点搜索在 {max_retries} 次尝试后仍失败: {e}") + logger.debug(f"Zep node search failed after {max_retries} attempts: {e}") return None try: - # 并行执行edges和nodes搜索 + # Run edge and node searches in parallel 
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: edge_future = executor.submit(search_edges) node_future = executor.submit(search_nodes) - - # 获取结果 + + # Collect results edge_result = edge_future.result(timeout=30) node_result = node_future.result(timeout=30) - - # 处理边搜索结果 + + # Process edge search results all_facts = set() if edge_result and hasattr(edge_result, 'edges') and edge_result.edges: for edge in edge_result.edges: if hasattr(edge, 'fact') and edge.fact: all_facts.add(edge.fact) results["facts"] = list(all_facts) - - # 处理节点搜索结果 + + # Process node search results all_summaries = set() if node_result and hasattr(node_result, 'nodes') and node_result.nodes: for node in node_result.nodes: if hasattr(node, 'summary') and node.summary: all_summaries.add(node.summary) if hasattr(node, 'name') and node.name and node.name != entity_name: - all_summaries.add(f"相关实体: {node.name}") + all_summaries.add(f"Related entity: {node.name}") results["node_summaries"] = list(all_summaries) - - # 构建综合上下文 + + # Build comprehensive context context_parts = [] if results["facts"]: - context_parts.append("事实信息:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) + context_parts.append("Facts:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) if results["node_summaries"]: - context_parts.append("相关实体:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) + context_parts.append("Related entities:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) results["context"] = "\n\n".join(context_parts) - - logger.info(f"Zep混合检索完成: {entity_name}, 获取 {len(results['facts'])} 条事实, {len(results['node_summaries'])} 个相关节点") - + + logger.info(f"Zep hybrid retrieval complete: {entity_name}, fetched {len(results['facts'])} facts, {len(results['node_summaries'])} related nodes") + except concurrent.futures.TimeoutError: - logger.warning(f"Zep检索超时 ({entity_name})") + logger.warning(f"Zep retrieval timed out ({entity_name})") except Exception as 
e: - logger.warning(f"Zep检索失败 ({entity_name}): {e}") + logger.warning(f"Zep retrieval failed ({entity_name}): {e}") return results def _build_entity_context(self, entity: EntityNode) -> str: """ - 构建实体的完整上下文信息 - - 包括: - 1. 实体本身的边信息(事实) - 2. 关联节点的详细信息 - 3. Zep混合检索到的丰富信息 + Build the complete context information for an entity. + + Includes: + 1. Edge information (facts) from the entity itself + 2. Detailed information from related nodes + 3. Rich information retrieved via Zep hybrid search """ context_parts = [] - - # 1. 添加实体属性信息 + + # 1. Add entity attribute information if entity.attributes: attrs = [] for key, value in entity.attributes.items(): if value and str(value).strip(): attrs.append(f"- {key}: {value}") if attrs: - context_parts.append("### 实体属性\n" + "\n".join(attrs)) - - # 2. 添加相关边信息(事实/关系) + context_parts.append("### Entity Attributes\n" + "\n".join(attrs)) + + # 2. Add related edge information (facts/relationships) existing_facts = set() if entity.related_edges: relationships = [] - for edge in entity.related_edges: # 不限制数量 + for edge in entity.related_edges: # no quantity limit fact = edge.get("fact", "") edge_name = edge.get("edge_name", "") direction = edge.get("direction", "") - + if fact: relationships.append(f"- {fact}") existing_facts.add(fact) elif edge_name: if direction == "outgoing": - relationships.append(f"- {entity.name} --[{edge_name}]--> (相关实体)") + relationships.append(f"- {entity.name} --[{edge_name}]--> (related entity)") else: - relationships.append(f"- (相关实体) --[{edge_name}]--> {entity.name}") - + relationships.append(f"- (related entity) --[{edge_name}]--> {entity.name}") + if relationships: - context_parts.append("### 相关事实和关系\n" + "\n".join(relationships)) - - # 3. 添加关联节点的详细信息 + context_parts.append("### Related Facts and Relationships\n" + "\n".join(relationships)) + + # 3. 
Add detailed information from related nodes if entity.related_nodes: related_info = [] - for node in entity.related_nodes: # 不限制数量 + for node in entity.related_nodes: # no quantity limit node_name = node.get("name", "") node_labels = node.get("labels", []) node_summary = node.get("summary", "") - - # 过滤掉默认标签 + + # Filter out default labels custom_labels = [l for l in node_labels if l not in ["Entity", "Node"]] label_str = f" ({', '.join(custom_labels)})" if custom_labels else "" - + if node_summary: related_info.append(f"- **{node_name}**{label_str}: {node_summary}") else: related_info.append(f"- **{node_name}**{label_str}") - + if related_info: - context_parts.append("### 关联实体信息\n" + "\n".join(related_info)) - - # 4. 使用Zep混合检索获取更丰富的信息 + context_parts.append("### Related Entity Information\n" + "\n".join(related_info)) + + # 4. Use Zep hybrid search to obtain richer information zep_results = self._search_zep_for_entity(entity) - + if zep_results.get("facts"): - # 去重:排除已存在的事实 + # Deduplicate: exclude already-present facts new_facts = [f for f in zep_results["facts"] if f not in existing_facts] if new_facts: - context_parts.append("### Zep检索到的事实信息\n" + "\n".join(f"- {f}" for f in new_facts[:15])) - + context_parts.append("### Facts Retrieved from Zep\n" + "\n".join(f"- {f}" for f in new_facts[:15])) + if zep_results.get("node_summaries"): - context_parts.append("### Zep检索到的相关节点\n" + "\n".join(f"- {s}" for s in zep_results["node_summaries"][:10])) + context_parts.append("### Related Nodes Retrieved from Zep\n" + "\n".join(f"- {s}" for s in zep_results["node_summaries"][:10])) return "\n\n".join(context_parts) def _is_individual_entity(self, entity_type: str) -> bool: - """判断是否是个人类型实体""" + """Check whether the entity type is an individual""" return entity_type.lower() in self.INDIVIDUAL_ENTITY_TYPES - + def _is_group_entity(self, entity_type: str) -> bool: - """判断是否是群体/机构类型实体""" + """Check whether the entity type is a group or institution""" return entity_type.lower() 
in self.GROUP_ENTITY_TYPES def _generate_profile_with_llm( @@ -503,11 +507,11 @@ class OasisProfileGenerator: context: str ) -> Dict[str, Any]: """ - 使用LLM生成非常详细的人设 - - 根据实体类型区分: - - 个人实体:生成具体的人物设定 - - 群体/机构实体:生成代表性账号设定 + Use an LLM to generate a very detailed persona. + + Distinguishes by entity type: + - Individual entities: generate a concrete character profile + - Group/institution entities: generate a representative account profile """ is_individual = self._is_individual_entity(entity_type) @@ -521,10 +525,10 @@ class OasisProfileGenerator: entity_name, entity_type, entity_summary, entity_attributes, context ) - # 尝试多次生成,直到成功或达到最大重试次数 + # Attempt multiple times until successful or max retries reached max_attempts = 3 last_error = None - + for attempt in range(max_attempts): try: response = self.client.chat.completions.create( @@ -534,144 +538,144 @@ class OasisProfileGenerator: {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # lower temperature on each retry + # max_tokens not set — let the LLM respond freely ) - + content = response.choices[0].message.content - - # 检查是否被截断(finish_reason不是'stop') + + # Check for truncation (finish_reason is not 'stop') finish_reason = response.choices[0].finish_reason if finish_reason == 'length': - logger.warning(f"LLM输出被截断 (attempt {attempt+1}), 尝试修复...") + logger.warning(f"LLM output truncated (attempt {attempt+1}), attempting repair...") content = self._fix_truncated_json(content) - - # 尝试解析JSON + + # Try to parse JSON try: result = json.loads(content) - - # 验证必需字段 + + # Validate required fields if "bio" not in result or not result["bio"]: result["bio"] = entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}" if "persona" not in result or not result["persona"]: - result["persona"] = entity_summary or f"{entity_name}是一个{entity_type}。" - + result["persona"] = 
entity_summary or f"{entity_name} is a {entity_type}." + return result - + except json.JSONDecodeError as je: - logger.warning(f"JSON解析失败 (attempt {attempt+1}): {str(je)[:80]}") - - # 尝试修复JSON + logger.warning(f"JSON parse failed (attempt {attempt+1}): {str(je)[:80]}") + + # Attempt to repair JSON result = self._try_fix_json(content, entity_name, entity_type, entity_summary) if result.get("_fixed"): del result["_fixed"] return result - + last_error = je - + except Exception as e: - logger.warning(f"LLM调用失败 (attempt {attempt+1}): {str(e)[:80]}") + logger.warning(f"LLM call failed (attempt {attempt+1}): {str(e)[:80]}") last_error = e import time - time.sleep(1 * (attempt + 1)) # 指数退避 - - logger.warning(f"LLM生成人设失败({max_attempts}次尝试): {last_error}, 使用规则生成") + time.sleep(1 * (attempt + 1)) # linear back-off: wait grows by 1s per attempt + + logger.warning(f"LLM persona generation failed after {max_attempts} attempts: {last_error}, falling back to rule-based generation") return self._generate_profile_rule_based( entity_name, entity_type, entity_summary, entity_attributes ) def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON(输出被max_tokens限制截断)""" + """Repair JSON that was truncated by a max_tokens limit""" import re - - # 如果JSON被截断,尝试闭合它 + + # If the JSON was truncated, attempt to close it content = content.strip() - - # 计算未闭合的括号 + + # Count unclosed braces open_braces = content.count('{') - content.count('}') open_brackets = content.count('[') - content.count(']') - - # 检查是否有未闭合的字符串 - # 简单检查:如果最后一个引号后没有逗号或闭合括号,可能是字符串被截断 + + # Check for unclosed strings + # Simple check: if the last character is not a quote, comma, or closing brace/bracket, + # the string may have been cut off if content and content[-1] not in '",}]': - # 尝试闭合字符串 + # Attempt to close the string content += '"' - - # 闭合括号 + + # Close brackets content += ']' * open_brackets content += '}' * open_braces - + return content def _try_fix_json(self, content: str, entity_name: str, entity_type: str, entity_summary: str = "") ->
Dict[str, Any]: - """尝试修复损坏的JSON""" + """Attempt to repair damaged JSON""" import re - - # 1. 首先尝试修复被截断的情况 + + # 1. First try to fix truncation content = self._fix_truncated_json(content) - - # 2. 尝试提取JSON部分 + + # 2. Attempt to extract the JSON portion json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - - # 3. 处理字符串中的换行符问题 - # 找到所有字符串值并替换其中的换行符 + + # 3. Fix newlines inside string values def fix_string_newlines(match): s = match.group(0) - # 替换字符串内的实际换行符为空格 + # Replace actual newlines inside strings with spaces s = s.replace('\n', ' ').replace('\r', ' ') - # 替换多余空格 + # Collapse multiple spaces s = re.sub(r'\s+', ' ', s) return s - - # 匹配JSON字符串值 + + # Match JSON string values json_str = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', fix_string_newlines, json_str) - - # 4. 尝试解析 + + # 4. Try to parse try: result = json.loads(json_str) result["_fixed"] = True return result except json.JSONDecodeError as e: - # 5. 如果还是失败,尝试更激进的修复 + # 5. Still failing — try a more aggressive repair try: - # 移除所有控制字符 + # Remove all control characters json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) - # 替换所有连续空白 + # Collapse all consecutive whitespace json_str = re.sub(r'\s+', ' ', json_str) result = json.loads(json_str) result["_fixed"] = True return result except: pass - - # 6. 尝试从内容中提取部分信息 + + # 6. 
Attempt to extract partial information from the content bio_match = re.search(r'"bio"\s*:\s*"([^"]*)"', content) - persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # 可能被截断 - + persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # may be truncated + bio = bio_match.group(1) if bio_match else (entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}") - persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name}是一个{entity_type}。") - - # 如果提取到了有意义的内容,标记为已修复 + persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name} is a {entity_type}.") + + # If we extracted meaningful content, mark as fixed if bio_match or persona_match: - logger.info(f"从损坏的JSON中提取了部分信息") + logger.info(f"Extracted partial information from damaged JSON") return { "bio": bio, "persona": persona, "_fixed": True } - - # 7. 完全失败,返回基础结构 - logger.warning(f"JSON修复失败,返回基础结构") + + # 7. Complete failure — return a minimal structure + logger.warning(f"JSON repair failed, returning minimal structure") return { "bio": entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}", - "persona": entity_summary or f"{entity_name}是一个{entity_type}。" + "persona": entity_summary or f"{entity_name} is a {entity_type}." } def _get_system_prompt(self, is_individual: bool) -> str: - """获取系统提示词""" - base_prompt = "你是社交媒体用户画像生成专家。生成详细、真实的人设用于舆论模拟,最大程度还原已有现实情况。必须返回有效的JSON格式,所有字符串值不能包含未转义的换行符。" + """Get the system prompt""" + base_prompt = "You are an expert in generating social media user profiles. Generate detailed, realistic personas for public opinion simulations, reproducing existing real-world situations as faithfully as possible. You must return valid JSON. All string values must not contain unescaped newline characters." 
return f"{base_prompt}\n\n{get_language_instruction()}" def _build_individual_persona_prompt( @@ -682,45 +686,45 @@ class OasisProfileGenerator: entity_attributes: Dict[str, Any], context: str ) -> str: - """构建个人实体的详细人设提示词""" - - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" - - return f"""为实体生成详细的社交媒体用户人设,最大程度还原已有现实情况。 + """Build a detailed persona prompt for an individual entity""" -实体名称: {entity_name} -实体类型: {entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "none" + context_str = context[:3000] if context else "no additional context" -上下文信息: + return f"""Generate a detailed social media user persona for the entity below, reproducing existing real-world situations as faithfully as possible. + +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} + +Context information: {context_str} -请生成JSON,包含以下字段: +Generate JSON with the following fields: -1. bio: 社交媒体简介,200字 -2. persona: 详细人设描述(2000字的纯文本),需包含: - - 基本信息(年龄、职业、教育背景、所在地) - - 人物背景(重要经历、与事件的关联、社会关系) - - 性格特征(MBTI类型、核心性格、情绪表达方式) - - 社交媒体行为(发帖频率、内容偏好、互动风格、语言特点) - - 立场观点(对话题的态度、可能被激怒/感动的内容) - - 独特特征(口头禅、特殊经历、个人爱好) - - 个人记忆(人设的重要部分,要介绍这个个体与事件的关联,以及这个个体在事件中的已有动作与反应) -3. age: 年龄数字(必须是整数) -4. gender: 性别,必须是英文: "male" 或 "female" -5. mbti: MBTI类型(如INTJ、ENFP等) -6. country: 国家(使用中文,如"中国") -7. profession: 职业 -8. interested_topics: 感兴趣话题数组 +1. bio: social media profile, ~200 words +2. 
persona: detailed persona description (~2000 words of plain text), covering: + - Basic information (age, occupation, educational background, location) + - Background (important experiences, connection to events, social relationships) + - Personality traits (MBTI type, core character, emotional expression style) + - Social media behaviour (posting frequency, content preferences, interaction style, language characteristics) + - Stance and opinions (attitude towards topics, content that might provoke or move them) + - Distinctive features (catchphrases, unique experiences, personal hobbies) + - Personal memory (important part of the persona: describe this individual's connection to the event and their existing actions and reactions within it) +3. age: numeric age (must be an integer) +4. gender: must be English: "male" or "female" +5. mbti: MBTI type (e.g. INTJ, ENFP) +6. country: country name (e.g. "China") +7. profession: occupation +8. interested_topics: array of topics of interest -重要: -- 所有字段值必须是字符串或数字,不要使用换行符 -- persona必须是一段连贯的文字描述 -- {get_language_instruction()} (gender字段必须用英文male/female) -- 内容要与实体信息保持一致 -- age必须是有效的整数,gender必须是"male"或"female" +Important: +- All field values must be strings or numbers; do not use newline characters +- persona must be a single continuous block of text +- {get_language_instruction()} (the gender field must use English: male/female) +- Content must be consistent with the entity information +- age must be a valid integer; gender must be "male" or "female" """ def _build_group_persona_prompt( @@ -731,45 +735,45 @@ class OasisProfileGenerator: entity_attributes: Dict[str, Any], context: str ) -> str: - """构建群体/机构实体的详细人设提示词""" - - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" - - return f"""为机构/群体实体生成详细的社交媒体账号设定,最大程度还原已有现实情况。 + """Build a detailed persona prompt for a group/institution entity""" -实体名称: {entity_name} -实体类型: 
{entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "none" + context_str = context[:3000] if context else "no additional context" -上下文信息: + return f"""Generate detailed social media account settings for an institution/group entity, reproducing existing real-world situations as faithfully as possible. + +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} + +Context information: {context_str} -请生成JSON,包含以下字段: +Generate JSON with the following fields: -1. bio: 官方账号简介,200字,专业得体 -2. persona: 详细账号设定描述(2000字的纯文本),需包含: - - 机构基本信息(正式名称、机构性质、成立背景、主要职能) - - 账号定位(账号类型、目标受众、核心功能) - - 发言风格(语言特点、常用表达、禁忌话题) - - 发布内容特点(内容类型、发布频率、活跃时间段) - - 立场态度(对核心话题的官方立场、面对争议的处理方式) - - 特殊说明(代表的群体画像、运营习惯) - - 机构记忆(机构人设的重要部分,要介绍这个机构与事件的关联,以及这个机构在事件中的已有动作与反应) -3. age: 固定填30(机构账号的虚拟年龄) -4. gender: 固定填"other"(机构账号使用other表示非个人) -5. mbti: MBTI类型,用于描述账号风格,如ISTJ代表严谨保守 -6. country: 国家(使用中文,如"中国") -7. profession: 机构职能描述 -8. interested_topics: 关注领域数组 +1. bio: official account profile, ~200 words, professional and appropriate in tone +2. persona: detailed account description (~2000 words of plain text), covering: + - Institutional basics (official name, nature of the institution, founding background, main functions) + - Account positioning (account type, target audience, core purpose) + - Communication style (language characteristics, common expressions, taboo topics) + - Content characteristics (content types, posting frequency, active time periods) + - Stance and attitude (official position on key topics, approach to handling controversies) + - Special notes (audience profile represented, operational habits) + - Institutional memory (important part of the persona: describe this institution's connection to the event and its existing actions and reactions within it) +3. age: fixed value 30 (virtual age for institutional accounts) +4. 
gender: fixed value "other" (institutional accounts use "other" to denote non-individual) +5. mbti: MBTI type describing the account's style, e.g. ISTJ for rigorous and conservative +6. country: country name (e.g. "China") +7. profession: description of the institution's function +8. interested_topics: array of focus areas -重要: -- 所有字段值必须是字符串或数字,不允许null值 -- persona必须是一段连贯的文字描述,不要使用换行符 -- {get_language_instruction()} (gender字段必须用英文"other") -- age必须是整数30,gender必须是字符串"other" -- 机构账号发言要符合其身份定位""" +Important: +- All field values must be strings or numbers; null values are not allowed +- persona must be a single continuous block of text; do not use newline characters +- {get_language_instruction()} (gender field must use the English string "other") +- age must be the integer 30; gender must be the string "other" +- Institutional account speech must be consistent with its identity and positioning""" def _generate_profile_rule_based( self, @@ -778,9 +782,9 @@ class OasisProfileGenerator: entity_summary: str, entity_attributes: Dict[str, Any] ) -> Dict[str, Any]: - """使用规则生成基础人设""" - - # 根据实体类型生成不同的人设 + """Generate a basic persona using rules""" + + # Generate different personas according to entity type entity_type_lower = entity_type.lower() if entity_type_lower in ["student", "alumni"]: @@ -811,28 +815,28 @@ class OasisProfileGenerator: return { "bio": f"Official account for {entity_name}. News and updates.", "persona": f"{entity_name} is a media entity that reports news and facilitates public discourse. 
The account shares timely updates and engages with the audience on current events.", - "age": 30, # 机构虚拟年龄 - "gender": "other", # 机构使用other - "mbti": "ISTJ", # 机构风格:严谨保守 - "country": "中国", + "age": 30, # virtual age for institutional accounts + "gender": "other", # institutions use "other" + "mbti": "ISTJ", # institutional style: rigorous and conservative + "country": "China", "profession": "Media", "interested_topics": ["General News", "Current Events", "Public Affairs"], } - + elif entity_type_lower in ["university", "governmentagency", "ngo", "organization"]: return { "bio": f"Official account of {entity_name}.", "persona": f"{entity_name} is an institutional entity that communicates official positions, announcements, and engages with stakeholders on relevant matters.", - "age": 30, # 机构虚拟年龄 - "gender": "other", # 机构使用other - "mbti": "ISTJ", # 机构风格:严谨保守 - "country": "中国", + "age": 30, # virtual age for institutional accounts + "gender": "other", # institutions use "other" + "mbti": "ISTJ", # institutional style: rigorous and conservative + "country": "China", "profession": entity_type, "interested_topics": ["Public Policy", "Community", "Official Announcements"], } - + else: - # 默认人设 + # Default persona return { "bio": entity_summary[:150] if entity_summary else f"{entity_type}: {entity_name}", "persona": entity_summary or f"{entity_name} is a {entity_type.lower()} participating in social discussions.", @@ -845,7 +849,7 @@ class OasisProfileGenerator: } def set_graph_id(self, graph_id: str): - """设置图谱ID用于Zep检索""" + """Set the graph ID for Zep retrieval""" self.graph_id = graph_id def generate_profiles_from_entities( @@ -859,52 +863,52 @@ class OasisProfileGenerator: output_platform: str = "reddit" ) -> List[OasisAgentProfile]: """ - 批量从实体生成Agent Profile(支持并行生成) - + Bulk-generate Agent Profiles from entities (supports parallel generation). 
+ Args: - entities: 实体列表 - use_llm: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (current, total, message) - graph_id: 图谱ID,用于Zep检索获取更丰富上下文 - parallel_count: 并行生成数量,默认5 - realtime_output_path: 实时写入的文件路径(如果提供,每生成一个就写入一次) - output_platform: 输出平台格式 ("reddit" 或 "twitter") - + entities: List of entity nodes + use_llm: Whether to use an LLM to generate detailed personas + progress_callback: Progress callback function (current, total, message) + graph_id: Graph ID used for Zep retrieval to obtain richer context + parallel_count: Number of parallel workers, default 5 + realtime_output_path: File path for real-time writing (if provided, written after each profile is generated) + output_platform: Output platform format ("reddit" or "twitter") + Returns: - Agent Profile列表 + List of Agent Profiles """ import concurrent.futures from threading import Lock - # 设置graph_id用于Zep检索 + # Set graph_id for Zep retrieval if graph_id: self.graph_id = graph_id - + total = len(entities) - profiles = [None] * total # 预分配列表保持顺序 - completed_count = [0] # 使用列表以便在闭包中修改 + profiles = [None] * total # pre-allocate list to preserve order + completed_count = [0] # use list so it can be mutated inside a closure lock = Lock() - - # 实时写入文件的辅助函数 + + # Helper for real-time file writing def save_profiles_realtime(): - """实时保存已生成的 profiles 到文件""" + """Save already-generated profiles to disk in real time""" if not realtime_output_path: return - + with lock: - # 过滤出已生成的 profiles + # Filter out profiles that have been generated existing_profiles = [p for p in profiles if p is not None] if not existing_profiles: return - + try: if output_platform == "reddit": - # Reddit JSON 格式 + # Reddit JSON format profiles_data = [p.to_reddit_format() for p in existing_profiles] with open(realtime_output_path, 'w', encoding='utf-8') as f: json.dump(profiles_data, f, ensure_ascii=False, indent=2) else: - # Twitter CSV 格式 + # Twitter CSV format import csv profiles_data = [p.to_twitter_format() for p in existing_profiles] if 
profiles_data: @@ -914,31 +918,31 @@ class OasisProfileGenerator: writer.writeheader() writer.writerows(profiles_data) except Exception as e: - logger.warning(f"实时保存 profiles 失败: {e}") + logger.warning(f"Real-time profile save failed: {e}") # Capture locale before spawning thread pool workers current_locale = get_locale() def generate_single_profile(idx: int, entity: EntityNode) -> tuple: - """生成单个profile的工作函数""" + """Worker function to generate a single profile""" set_locale(current_locale) entity_type = entity.get_entity_type() or "Entity" - + try: profile = self.generate_profile_from_entity( entity=entity, user_id=idx, use_llm=use_llm ) - - # 实时输出生成的人设到控制台和日志 + + # Print the generated persona to console and logs in real time self._print_generated_profile(entity.name, entity_type, profile) - + return idx, profile, None - + except Exception as e: - logger.error(f"生成实体 {entity.name} 的人设失败: {str(e)}") - # 创建一个基础profile + logger.error(f"Failed to generate persona for entity {entity.name}: {str(e)}") + # Create a fallback profile fallback_profile = OasisAgentProfile( user_id=idx, user_name=self._generate_username(entity.name), @@ -950,20 +954,20 @@ class OasisProfileGenerator: ) return idx, fallback_profile, str(e) - logger.info(f"开始并行生成 {total} 个Agent人设(并行数: {parallel_count})...") + logger.info(f"Starting parallel persona generation for {total} agents (parallel workers: {parallel_count})...") print(f"\n{'='*60}") - print(f"开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}") + print(f"Starting Agent persona generation - {total} entities total, parallel workers: {parallel_count}") print(f"{'='*60}\n") - # 使用线程池并行执行 + # Execute in parallel using a thread pool with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_count) as executor: - # 提交所有任务 + # Submit all tasks future_to_entity = { executor.submit(generate_single_profile, idx, entity): (idx, entity) for idx, entity in enumerate(entities) } - - # 收集结果 + + # Collect results for future in 
concurrent.futures.as_completed(future_to_entity): idx, entity = future_to_entity[future] entity_type = entity.get_entity_type() or "Entity" @@ -976,23 +980,23 @@ class OasisProfileGenerator: completed_count[0] += 1 current = completed_count[0] - # 实时写入文件 + # Write to file in real time save_profiles_realtime() if progress_callback: progress_callback( - current, - total, - f"已完成 {current}/{total}: {entity.name}({entity_type})" + current, + total, + f"Completed {current}/{total}: {entity.name} ({entity_type})" ) - + if error: - logger.warning(f"[{current}/{total}] {entity.name} 使用备用人设: {error}") + logger.warning(f"[{current}/{total}] {entity.name} using fallback persona: {error}") else: - logger.info(f"[{current}/{total}] 成功生成人设: {entity.name} ({entity_type})") - + logger.info(f"[{current}/{total}] Successfully generated persona: {entity.name} ({entity_type})") + except Exception as e: - logger.error(f"处理实体 {entity.name} 时发生异常: {str(e)}") + logger.error(f"Exception while processing entity {entity.name}: {str(e)}") with lock: completed_count[0] += 1 profiles[idx] = OasisAgentProfile( @@ -1004,44 +1008,44 @@ class OasisProfileGenerator: source_entity_uuid=entity.uuid, source_entity_type=entity_type, ) - # 实时写入文件(即使是备用人设) + # Write to file in real time (even for fallback personas) save_profiles_realtime() - + print(f"\n{'='*60}") - print(f"人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent") + print(f"Persona generation complete! 
Generated {len([p for p in profiles if p])} agents") print(f"{'='*60}\n") return profiles def _print_generated_profile(self, entity_name: str, entity_type: str, profile: OasisAgentProfile): - """实时输出生成的人设到控制台(完整内容,不截断)""" + """Print the generated persona to console in real time (full content, no truncation)""" separator = "-" * 70 - - # 构建完整输出内容(不截断) - topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else '无' - + + # Build the full output content (no truncation) + topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else 'none' + output_lines = [ f"\n{separator}", t('progress.profileGenerated', name=entity_name, type=entity_type), f"{separator}", - f"用户名: {profile.user_name}", + f"Username: {profile.user_name}", f"", - f"【简介】", + f"[Bio]", f"{profile.bio}", f"", - f"【详细人设】", + f"[Detailed Persona]", f"{profile.persona}", f"", - f"【基本属性】", - f"年龄: {profile.age} | 性别: {profile.gender} | MBTI: {profile.mbti}", - f"职业: {profile.profession} | 国家: {profile.country}", - f"兴趣话题: {topics_str}", + f"[Basic Attributes]", + f"Age: {profile.age} | Gender: {profile.gender} | MBTI: {profile.mbti}", + f"Profession: {profile.profession} | Country: {profile.country}", + f"Topics of Interest: {topics_str}", separator ] - + output = "\n".join(output_lines) - - # 只输出到控制台(避免重复,logger不再输出完整内容) + + # Output to console only (avoid duplication — logger no longer prints full content) print(output) def save_profiles( @@ -1051,16 +1055,16 @@ class OasisProfileGenerator: platform: str = "reddit" ): """ - 保存Profile到文件(根据平台选择正确格式) - - OASIS平台格式要求: - - Twitter: CSV格式 - - Reddit: JSON格式 - + Save profiles to a file in the correct format for the platform. 
+ + OASIS platform format requirements: + - Twitter: CSV format + - Reddit: JSON format + Args: - profiles: Profile列表 - file_path: 文件路径 - platform: 平台类型 ("reddit" 或 "twitter") + profiles: List of profiles + file_path: File path + platform: Platform type ("reddit" or "twitter") """ if platform == "twitter": self._save_twitter_csv(profiles, file_path) @@ -1069,73 +1073,73 @@ class OasisProfileGenerator: def _save_twitter_csv(self, profiles: List[OasisAgentProfile], file_path: str): """ - 保存Twitter Profile为CSV格式(符合OASIS官方要求) - - OASIS Twitter要求的CSV字段: - - user_id: 用户ID(根据CSV顺序从0开始) - - name: 用户真实姓名 - - username: 系统中的用户名 - - user_char: 详细人设描述(注入到LLM系统提示中,指导Agent行为) - - description: 简短的公开简介(显示在用户资料页面) - - user_char vs description 区别: - - user_char: 内部使用,LLM系统提示,决定Agent如何思考和行动 - - description: 外部显示,其他用户可见的简介 + Save Twitter Profiles in CSV format (compliant with OASIS official requirements). + + CSV fields required by OASIS Twitter: + - user_id: User ID (sequential from 0 based on CSV order) + - name: User's real name + - username: Username in the system + - user_char: Detailed persona description (injected into the LLM system prompt to guide Agent behaviour) + - description: Short public profile (displayed on the user profile page) + + user_char vs description: + - user_char: Internal use; LLM system prompt that determines how the Agent thinks and acts + - description: External display; the profile visible to other users """ import csv - - # 确保文件扩展名是.csv + + # Ensure the file extension is .csv if not file_path.endswith('.csv'): file_path = file_path.replace('.json', '.csv') - + with open(file_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) - - # 写入OASIS要求的表头 + + # Write the OASIS-required header headers = ['user_id', 'name', 'username', 'user_char', 'description'] writer.writerow(headers) - - # 写入数据行 + + # Write data rows for idx, profile in enumerate(profiles): - # user_char: 完整人设(bio + persona),用于LLM系统提示 + # user_char: full persona (bio + 
persona), used in the LLM system prompt user_char = profile.bio if profile.persona and profile.persona != profile.bio: user_char = f"{profile.bio} {profile.persona}" - # 处理换行符(CSV中用空格替代) + # Replace newlines with spaces in CSV user_char = user_char.replace('\n', ' ').replace('\r', ' ') - - # description: 简短简介,用于外部显示 + + # description: short profile for external display description = profile.bio.replace('\n', ' ').replace('\r', ' ') - + row = [ - idx, # user_id: 从0开始的顺序ID - profile.name, # name: 真实姓名 - profile.user_name, # username: 用户名 - user_char, # user_char: 完整人设(内部LLM使用) - description # description: 简短简介(外部显示) + idx, # user_id: sequential ID starting from 0 + profile.name, # name: real name + profile.user_name, # username: username + user_char, # user_char: full persona (internal LLM use) + description # description: short profile (external display) ] writer.writerow(row) - - logger.info(f"已保存 {len(profiles)} 个Twitter Profile到 {file_path} (OASIS CSV格式)") + + logger.info(f"Saved {len(profiles)} Twitter profiles to {file_path} (OASIS CSV format)") def _normalize_gender(self, gender: Optional[str]) -> str: """ - 标准化gender字段为OASIS要求的英文格式 - - OASIS要求: male, female, other + Normalise the gender field to the English format required by OASIS. + + OASIS accepts: male, female, other """ if not gender: return "other" - + gender_lower = gender.lower().strip() - - # 中文映射 + + # Chinese-to-English mapping gender_map = { "男": "male", "女": "female", "机构": "other", "其他": "other", - # 英文已有 + # English values passed through as-is "male": "male", "female": "female", "other": "other", @@ -1145,61 +1149,62 @@ class OasisProfileGenerator: def _save_reddit_json(self, profiles: List[OasisAgentProfile], file_path: str): """ - 保存Reddit Profile为JSON格式 - - 使用与 to_reddit_format() 一致的格式,确保 OASIS 能正确读取。 - 必须包含 user_id 字段,这是 OASIS agent_graph.get_agent() 匹配的关键! 
- - 必需字段: - - user_id: 用户ID(整数,用于匹配 initial_posts 中的 poster_agent_id) - - username: 用户名 - - name: 显示名称 - - bio: 简介 - - persona: 详细人设 - - age: 年龄(整数) - - gender: "male", "female", 或 "other" - - mbti: MBTI类型 - - country: 国家 + Save Reddit Profiles in JSON format. + + Uses the same format as to_reddit_format() to ensure OASIS can read it correctly. + The user_id field is mandatory — it is the key used by OASIS agent_graph.get_agent() + to match profiles! + + Required fields: + - user_id: User ID (integer, used to match poster_agent_id in initial_posts) + - username: Username + - name: Display name + - bio: Profile bio + - persona: Detailed persona + - age: Age (integer) + - gender: "male", "female", or "other" + - mbti: MBTI type + - country: Country """ data = [] for idx, profile in enumerate(profiles): - # 使用与 to_reddit_format() 一致的格式 + # Use the same format as to_reddit_format() item = { - "user_id": profile.user_id if profile.user_id is not None else idx, # 关键:必须包含 user_id + "user_id": profile.user_id if profile.user_id is not None else idx, # critical: user_id must be present "username": profile.user_name, "name": profile.name, "bio": profile.bio[:150] if profile.bio else f"{profile.name}", "persona": profile.persona or f"{profile.name} is a participant in social discussions.", "karma": profile.karma if profile.karma else 1000, "created_at": profile.created_at, - # OASIS必需字段 - 确保都有默认值 + # OASIS required fields - ensure all have default values "age": profile.age if profile.age else 30, "gender": self._normalize_gender(profile.gender), "mbti": profile.mbti if profile.mbti else "ISTJ", - "country": profile.country if profile.country else "中国", + "country": profile.country if profile.country else "China", } - - # 可选字段 + + # Optional fields if profile.profession: item["profession"] = profile.profession if profile.interested_topics: item["interested_topics"] = profile.interested_topics - + data.append(item) - + with open(file_path, 'w', encoding='utf-8') as f: 
json.dump(data, f, ensure_ascii=False, indent=2) - - logger.info(f"已保存 {len(profiles)} 个Reddit Profile到 {file_path} (JSON格式,包含user_id字段)") + + logger.info(f"Saved {len(profiles)} Reddit profiles to {file_path} (JSON format, includes user_id field)") - # 保留旧方法名作为别名,保持向后兼容 + # Keep old method name as an alias for backwards compatibility def save_profiles_to_json( self, profiles: List[OasisAgentProfile], file_path: str, platform: str = "reddit" ): - """[已废弃] 请使用 save_profiles() 方法""" - logger.warning("save_profiles_to_json已废弃,请使用save_profiles方法") + """[Deprecated] Please use save_profiles() instead""" + logger.warning("save_profiles_to_json is deprecated; please use save_profiles instead") self.save_profiles(profiles, file_path, platform) diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index 01a3d799..1dd4d879 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -1,6 +1,6 @@ """ -本体生成服务 -接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 +Ontology generation service +Endpoint 1: Analyze text content and generate entity and relationship type definitions suitable for social simulation. """ import json @@ -14,174 +14,174 @@ logger = logging.getLogger(__name__) def _to_pascal_case(name: str) -> str: - """将任意格式的名称转换为 PascalCase(如 'works_for' -> 'WorksFor', 'person' -> 'Person')""" - # 按非字母数字字符分割 + """Convert a name in any format to PascalCase (e.g. 'works_for' -> 'WorksFor', 'person' -> 'Person')""" + # Split on non-alphanumeric characters parts = re.split(r'[^a-zA-Z0-9]+', name) - # 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case']) + # Also split on camelCase boundaries (e.g. 
'camelCase' -> ['camel', 'Case']) words = [] for part in parts: words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_')) - # 每个词首字母大写,过滤空串 + # Capitalize each word and filter empty strings result = ''.join(word.capitalize() for word in words if word) return result if result else 'Unknown' -# 本体生成的系统提示词 -ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求,设计适合**社交媒体舆论模拟**的实体类型和关系类型。 +# System prompt for ontology generation +ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge graph ontology design expert. Your task is to analyze the given text content and simulation requirements, and design entity types and relationship types suitable for **social media opinion simulation**. -**重要:你必须输出有效的JSON格式数据,不要输出任何其他内容。** +**Important: You must output valid JSON format data, and nothing else.** -## 核心任务背景 +## Core Task Background -我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中: -- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体" -- 实体之间会相互影响、转发、评论、回应 -- 我们需要模拟舆论事件中各方的反应和信息传播路径 +We are building a **social media opinion simulation system**. In this system: +- Every entity is an "account" or "subject" that can speak out, interact, and spread information on social media +- Entities influence each other, repost, comment, and respond +- We need to simulate each party's reaction and the information propagation path during opinion events -因此,**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**: +Therefore, **entities must be real-world subjects that exist and can speak out and interact on social media**: -**可以是**: -- 具体的个人(公众人物、当事人、意见领袖、专家学者、普通人) -- 公司、企业(包括其官方账号) -- 组织机构(大学、协会、NGO、工会等) -- 政府部门、监管机构 -- 媒体机构(报纸、电视台、自媒体、网站) -- 社交媒体平台本身 -- 特定群体代表(如校友会、粉丝团、维权群体等) +**Can be**: +- Specific individuals (public figures, persons involved, opinion leaders, experts and scholars, ordinary people) +- Companies and enterprises (including their official accounts) +- Organizations (universities, associations, NGOs, unions, etc.) 
+- Government departments and regulatory agencies +- Media organizations (newspapers, TV stations, self-media, websites) +- Social media platforms themselves +- Representatives of specific groups (e.g. alumni associations, fan clubs, rights-protection groups, etc.) -**不可以是**: -- 抽象概念(如"舆论"、"情绪"、"趋势") -- 主题/话题(如"学术诚信"、"教育改革") -- 观点/态度(如"支持方"、"反对方") +**Cannot be**: +- Abstract concepts (e.g. "public opinion", "emotion", "trend") +- Topics/themes (e.g. "academic integrity", "education reform") +- Viewpoints/attitudes (e.g. "supporters", "opponents") -## 输出格式 +## Output Format -请输出JSON格式,包含以下结构: +Please output JSON format with the following structure: ```json { "entity_types": [ { - "name": "实体类型名称(英文,PascalCase)", - "description": "简短描述(英文,不超过100字符)", + "name": "Entity type name (English, PascalCase)", + "description": "Brief description (English, max 100 characters)", "attributes": [ { - "name": "属性名(英文,snake_case)", + "name": "Attribute name (English, snake_case)", "type": "text", - "description": "属性描述" + "description": "Attribute description" } ], - "examples": ["示例实体1", "示例实体2"] + "examples": ["Example entity 1", "Example entity 2"] } ], "edge_types": [ { - "name": "关系类型名称(英文,UPPER_SNAKE_CASE)", - "description": "简短描述(英文,不超过100字符)", + "name": "Relationship type name (English, UPPER_SNAKE_CASE)", + "description": "Brief description (English, max 100 characters)", "source_targets": [ - {"source": "源实体类型", "target": "目标实体类型"} + {"source": "Source entity type", "target": "Target entity type"} ], "attributes": [] } ], - "analysis_summary": "对文本内容的简要分析说明" + "analysis_summary": "Brief analysis summary of the text content" } ``` -## 设计指南(极其重要!) +## Design Guidelines (Extremely Important!) -### 1. 实体类型设计 - 必须严格遵守 +### 1. 
Entity Type Design — Must Be Strictly Followed -**数量要求:必须正好10个实体类型** +**Quantity requirement: exactly 10 entity types** -**层次结构要求(必须同时包含具体类型和兜底类型)**: +**Hierarchy requirement (must include both specific types and fallback types)**: -你的10个实体类型必须包含以下层次: +Your 10 entity types must include the following levels: -A. **兜底类型(必须包含,放在列表最后2个)**: - - `Person`: 任何自然人个体的兜底类型。当一个人不属于其他更具体的人物类型时,归入此类。 - - `Organization`: 任何组织机构的兜底类型。当一个组织不属于其他更具体的组织类型时,归入此类。 +A. **Fallback types (required, placed as the last 2 in the list)**: + - `Person`: Fallback type for any individual person. Use this when a person does not fit any other more specific person type. + - `Organization`: Fallback type for any organization. Use this when an organization does not fit any other more specific organization type. -B. **具体类型(8个,根据文本内容设计)**: - - 针对文本中出现的主要角色,设计更具体的类型 - - 例如:如果文本涉及学术事件,可以有 `Student`, `Professor`, `University` - - 例如:如果文本涉及商业事件,可以有 `Company`, `CEO`, `Employee` +B. **Specific types (8 types, designed based on text content)**: + - Design more specific types for the main roles that appear in the text + - Example: if the text involves an academic event, you might have `Student`, `Professor`, `University` + - Example: if the text involves a business event, you might have `Company`, `CEO`, `Employee` -**为什么需要兜底类型**: -- 文本中会出现各种人物,如"中小学教师"、"路人甲"、"某位网友" -- 如果没有专门的类型匹配,他们应该被归入 `Person` -- 同理,小型组织、临时团体等应该归入 `Organization` +**Why fallback types are needed**: +- Various people appear in text, such as "primary and secondary school teachers", "passersby", "some netizen" +- If there is no dedicated type to match them, they should fall into `Person` +- Similarly, small organizations, ad hoc groups, etc. 
should fall into `Organization` -**具体类型的设计原则**: -- 从文本中识别出高频出现或关键的角色类型 -- 每个具体类型应该有明确的边界,避免重叠 -- description 必须清晰说明这个类型和兜底类型的区别 +**Principles for designing specific types**: +- Identify high-frequency or key role types from the text +- Each specific type should have clear boundaries and avoid overlap +- The description must clearly explain the difference between this type and the fallback types -### 2. 关系类型设计 +### 2. Relationship Type Design -- 数量:6-10个 -- 关系应该反映社媒互动中的真实联系 -- 确保关系的 source_targets 涵盖你定义的实体类型 +- Quantity: 6-10 +- Relationships should reflect real connections in social media interactions +- Ensure the source_targets in relationships cover the entity types you have defined -### 3. 属性设计 +### 3. Attribute Design -- 每个实体类型1-3个关键属性 -- **注意**:属性名不能使用 `name`、`uuid`、`group_id`、`created_at`、`summary`(这些是系统保留字) -- 推荐使用:`full_name`, `title`, `role`, `position`, `location`, `description` 等 +- 1-3 key attributes per entity type +- **Note**: Attribute names must not use `name`, `uuid`, `group_id`, `created_at`, `summary` (these are system reserved words) +- Recommended: `full_name`, `title`, `role`, `position`, `location`, `description`, etc. 
-## 实体类型参考 +## Entity Type Reference -**个人类(具体)**: -- Student: 学生 -- Professor: 教授/学者 -- Journalist: 记者 -- Celebrity: 明星/网红 -- Executive: 高管 -- Official: 政府官员 -- Lawyer: 律师 -- Doctor: 医生 +**Individual types (specific)**: +- Student: student +- Professor: professor/scholar +- Journalist: journalist +- Celebrity: celebrity/influencer +- Executive: corporate executive +- Official: government official +- Lawyer: lawyer +- Doctor: doctor -**个人类(兜底)**: -- Person: 任何自然人(不属于上述具体类型时使用) +**Individual types (fallback)**: +- Person: any individual (use when not fitting the specific types above) -**组织类(具体)**: -- University: 高校 -- Company: 公司企业 -- GovernmentAgency: 政府机构 -- MediaOutlet: 媒体机构 -- Hospital: 医院 -- School: 中小学 -- NGO: 非政府组织 +**Organization types (specific)**: +- University: university/college +- Company: company/enterprise +- GovernmentAgency: government agency +- MediaOutlet: media organization +- Hospital: hospital +- School: primary/secondary school +- NGO: non-governmental organization -**组织类(兜底)**: -- Organization: 任何组织机构(不属于上述具体类型时使用) +**Organization types (fallback)**: +- Organization: any organization (use when not fitting the specific types above) -## 关系类型参考 +## Relationship Type Reference -- WORKS_FOR: 工作于 -- STUDIES_AT: 就读于 -- AFFILIATED_WITH: 隶属于 -- REPRESENTS: 代表 -- REGULATES: 监管 -- REPORTS_ON: 报道 -- COMMENTS_ON: 评论 -- RESPONDS_TO: 回应 -- SUPPORTS: 支持 -- OPPOSES: 反对 -- COLLABORATES_WITH: 合作 -- COMPETES_WITH: 竞争 +- WORKS_FOR: works for +- STUDIES_AT: studies at +- AFFILIATED_WITH: affiliated with +- REPRESENTS: represents +- REGULATES: regulates +- REPORTS_ON: reports on +- COMMENTS_ON: comments on +- RESPONDS_TO: responds to +- SUPPORTS: supports +- OPPOSES: opposes +- COLLABORATES_WITH: collaborates with +- COMPETES_WITH: competes with """ class OntologyGenerator: """ - 本体生成器 - 分析文本内容,生成实体和关系类型定义 + Ontology generator + Analyzes text content and generates entity and relationship type definitions. 
""" - + def __init__(self, llm_client: Optional[LLMClient] = None): self.llm_client = llm_client or LLMClient() - + def generate( self, document_texts: List[str], @@ -189,107 +189,112 @@ class OntologyGenerator: additional_context: Optional[str] = None ) -> Dict[str, Any]: """ - 生成本体定义 - + Generate ontology definition. + Args: - document_texts: 文档文本列表 - simulation_requirement: 模拟需求描述 - additional_context: 额外上下文 - + document_texts: list of document texts + simulation_requirement: simulation requirement description + additional_context: additional context + Returns: - 本体定义(entity_types, edge_types等) + Ontology definition (entity_types, edge_types, etc.) """ - # 构建用户消息 - user_message = self._build_user_message( - document_texts, - simulation_requirement, - additional_context - ) - lang_instruction = get_language_instruction() - system_prompt = f"{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above." + + # Build user message + user_message = self._build_user_message( + document_texts, + simulation_requirement, + additional_context, + lang_instruction + ) + + system_prompt = f"LANGUAGE INSTRUCTION (HIGHEST PRIORITY — MUST BE FOLLOWED): {lang_instruction} All description fields, analysis_summary, and examples MUST be written in this language.\n\n{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above." 
messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_message} ] - - # 调用LLM + + # Call LLM result = self.llm_client.chat_json( messages=messages, temperature=0.3, max_tokens=4096 ) - - # 验证和后处理 + + # Validate and post-process result = self._validate_and_process(result) - + return result - - # 传给 LLM 的文本最大长度(5万字) + + # Maximum text length passed to LLM (50,000 characters) MAX_TEXT_LENGTH_FOR_LLM = 50000 - + def _build_user_message( self, document_texts: List[str], simulation_requirement: str, - additional_context: Optional[str] + additional_context: Optional[str], + lang_instruction: str = "" ) -> str: - """构建用户消息""" - - # 合并文本 + """Build user message""" + + # Merge texts combined_text = "\n\n---\n\n".join(document_texts) original_length = len(combined_text) - - # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) + + # If text exceeds 50,000 characters, truncate (only affects what is passed to LLM, not graph building) if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] - combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..." - - message = f"""## 模拟需求 + combined_text += f"\n\n...(text truncated at {self.MAX_TEXT_LENGTH_FOR_LLM} chars out of {original_length} total)..." + + message = f"""## Simulation requirement {simulation_requirement} -## 文档内容 +## Document content {combined_text} """ - + if additional_context: message += f""" -## 额外说明 +## Additional context {additional_context} """ - - message += """ -请根据以上内容,设计适合社会舆论模拟的实体类型和关系类型。 -**必须遵守的规则**: -1. 必须正好输出10个实体类型 -2. 最后2个必须是兜底类型:Person(个人兜底)和 Organization(组织兜底) -3. 前8个是根据文本内容设计的具体类型 -4. 所有实体类型必须是现实中可以发声的主体,不能是抽象概念 -5. 属性名不能使用 name、uuid、group_id 等保留字,用 full_name、org_name 等替代 + message += f""" +Based on the content above, design entity types and relationship types suitable for social opinion simulation. + +**Mandatory rules**: +1. Output exactly 10 entity types +2. 
The last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback) +3. The first 8 are specific types designed from the document content +4. All entity types must be real-world subjects capable of speaking out, not abstract concepts +5. Attribute names must not use reserved words: name, uuid, group_id — use full_name, org_name, etc. instead + +{lang_instruction} """ - + return message - + def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: - """验证和后处理结果""" - - # 确保必要字段存在 + """Validate and post-process the result""" + + # Ensure required fields exist if "entity_types" not in result: result["entity_types"] = [] if "edge_types" not in result: result["edge_types"] = [] if "analysis_summary" not in result: result["analysis_summary"] = "" - - # 验证实体类型 - # 记录原始名称到 PascalCase 的映射,用于后续修正 edge 的 source_targets 引用 + + # Validate entity types + # Record mapping from original name to PascalCase for fixing edge source_targets references later entity_name_map = {} for entity in result["entity_types"]: - # 强制将 entity name 转为 PascalCase(Zep API 要求) + # Force entity name to PascalCase (required by Zep API) if "name" in entity: original_name = entity["name"] entity["name"] = _to_pascal_case(original_name) @@ -300,19 +305,19 @@ class OntologyGenerator: entity["attributes"] = [] if "examples" not in entity: entity["examples"] = [] - # 确保description不超过100字符 + # Ensure description does not exceed 100 characters if len(entity.get("description", "")) > 100: entity["description"] = entity["description"][:97] + "..." 
- - # 验证关系类型 + + # Validate relationship types for edge in result["edge_types"]: - # 强制将 edge name 转为 SCREAMING_SNAKE_CASE(Zep API 要求) + # Force edge name to SCREAMING_SNAKE_CASE (required by Zep API) if "name" in edge: original_name = edge["name"] edge["name"] = original_name.upper() if edge["name"] != original_name: logger.warning(f"Edge type name '{original_name}' auto-converted to '{edge['name']}'") - # 修正 source_targets 中的实体名称引用,与转换后的 PascalCase 保持一致 + # Fix entity name references in source_targets to match converted PascalCase names for st in edge.get("source_targets", []): if st.get("source") in entity_name_map: st["source"] = entity_name_map[st["source"]] @@ -324,12 +329,12 @@ class OntologyGenerator: edge["attributes"] = [] if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." - - # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 + + # Zep API limit: maximum 10 custom entity types and 10 custom edge types MAX_ENTITY_TYPES = 10 MAX_EDGE_TYPES = 10 - # 去重:按 name 去重,保留首次出现的 + # Deduplicate: keep first occurrence by name seen_names = set() deduped = [] for entity in result["entity_types"]: @@ -341,7 +346,7 @@ class OntologyGenerator: logger.warning(f"Duplicate entity type '{name}' removed during validation") result["entity_types"] = deduped - # 兜底类型定义 + # Fallback type definitions person_fallback = { "name": "Person", "description": "Any individual person not fitting other specific person types.", @@ -351,7 +356,7 @@ class OntologyGenerator: ], "examples": ["ordinary citizen", "anonymous netizen"] } - + organization_fallback = { "name": "Organization", "description": "Any organization not fitting other specific organization types.", @@ -361,74 +366,74 @@ class OntologyGenerator: ], "examples": ["small business", "community group"] } - - # 检查是否已有兜底类型 + + # Check whether fallback types already exist entity_names = {e["name"] for e in result["entity_types"]} has_person = "Person" in entity_names has_organization = 
"Organization" in entity_names - - # 需要添加的兜底类型 + + # Collect fallback types to add fallbacks_to_add = [] if not has_person: fallbacks_to_add.append(person_fallback) if not has_organization: fallbacks_to_add.append(organization_fallback) - + if fallbacks_to_add: current_count = len(result["entity_types"]) needed_slots = len(fallbacks_to_add) - - # 如果添加后会超过 10 个,需要移除一些现有类型 + + # If adding them would exceed 10, remove some existing types if current_count + needed_slots > MAX_ENTITY_TYPES: - # 计算需要移除多少个 + # Calculate how many to remove to_remove = current_count + needed_slots - MAX_ENTITY_TYPES - # 从末尾移除(保留前面更重要的具体类型) + # Remove from the end (preserve the more important specific types at the front) result["entity_types"] = result["entity_types"][:-to_remove] - - # 添加兜底类型 + + # Add fallback types result["entity_types"].extend(fallbacks_to_add) - - # 最终确保不超过限制(防御性编程) + + # Final guard: ensure limits are not exceeded (defensive programming) if len(result["entity_types"]) > MAX_ENTITY_TYPES: result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES] - + if len(result["edge_types"]) > MAX_EDGE_TYPES: result["edge_types"] = result["edge_types"][:MAX_EDGE_TYPES] - + return result - + def generate_python_code(self, ontology: Dict[str, Any]) -> str: """ - 将本体定义转换为Python代码(类似ontology.py) - + Convert the ontology definition to Python code (similar to ontology.py). 
+ Args: - ontology: 本体定义 - + ontology: ontology definition + Returns: - Python代码字符串 + Python code string """ code_lines = [ '"""', - '自定义实体类型定义', - '由MiroFish自动生成,用于社会舆论模拟', + 'Custom entity type definitions', + 'Auto-generated by MiroFish for social opinion simulation', '"""', '', 'from pydantic import Field', 'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel', '', '', - '# ============== 实体类型定义 ==============', + '# ============== Entity type definitions ==============', '', ] - - # 生成实体类型 + + # Generate entity types for entity in ontology.get("entity_types", []): name = entity["name"] desc = entity.get("description", f"A {name} entity.") - + code_lines.append(f'class {name}(EntityModel):') code_lines.append(f' """{desc}"""') - + attrs = entity.get("attributes", []) if attrs: for attr in attrs: @@ -440,23 +445,23 @@ class OntologyGenerator: code_lines.append(f' )') else: code_lines.append(' pass') - + code_lines.append('') code_lines.append('') - - code_lines.append('# ============== 关系类型定义 ==============') + + code_lines.append('# ============== Relationship type definitions ==============') code_lines.append('') - - # 生成关系类型 + + # Generate relationship types for edge in ontology.get("edge_types", []): name = edge["name"] - # 转换为PascalCase类名 + # Convert to PascalCase class name class_name = ''.join(word.capitalize() for word in name.split('_')) desc = edge.get("description", f"A {name} relationship.") - + code_lines.append(f'class {class_name}(EdgeModel):') code_lines.append(f' """{desc}"""') - + attrs = edge.get("attributes", []) if attrs: for attr in attrs: @@ -468,12 +473,12 @@ class OntologyGenerator: code_lines.append(f' )') else: code_lines.append(' pass') - + code_lines.append('') code_lines.append('') - - # 生成类型字典 - code_lines.append('# ============== 类型配置 ==============') + + # Generate type dictionaries + code_lines.append('# ============== Type configuration ==============') code_lines.append('') 
code_lines.append('ENTITY_TYPES = {') for entity in ontology.get("entity_types", []): @@ -488,8 +493,8 @@ class OntologyGenerator: code_lines.append(f' "{name}": {class_name},') code_lines.append('}') code_lines.append('') - - # 生成边的source_targets映射 + + # Generate edge source_targets mapping code_lines.append('EDGE_SOURCE_TARGETS = {') for edge in ontology.get("edge_types", []): name = edge["name"] @@ -501,6 +506,5 @@ class OntologyGenerator: ]) code_lines.append(f' "{name}": [{st_list}],') code_lines.append('}') - - return '\n'.join(code_lines) + return '\n'.join(code_lines) diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py index cecd70b4..22252fec 100644 --- a/backend/app/services/report_agent.py +++ b/backend/app/services/report_agent.py @@ -1,12 +1,12 @@ """ -Report Agent服务 -使用LangChain + Zep实现ReACT模式的模拟报告生成 +Report Agent Service +Implements ReACT-pattern simulation report generation using Zep. -功能: -1. 根据模拟需求和Zep图谱信息生成报告 -2. 先规划目录结构,然后分段生成 -3. 每段采用ReACT多轮思考与反思模式 -4. 支持与用户对话,在对话中自主调用检索工具 +Features: +1. Generates reports based on simulation requirements and Zep graph data +2. First plans the table of contents, then generates content section by section +3. Each section uses multi-round ReACT thinking and reflection +4. Supports user conversation in which the agent autonomously calls retrieval tools """ import os @@ -35,18 +35,19 @@ logger = get_logger('mirofish.report_agent') class ReportLogger: """ - Report Agent 详细日志记录器 - - 在报告文件夹中生成 agent_log.jsonl 文件,记录每一步详细动作。 - 每行是一个完整的 JSON 对象,包含时间戳、动作类型、详细内容等。 + Detailed log recorder for the Report Agent. + + Generates an agent_log.jsonl file inside the report folder, recording each + step in detail. Every line is a complete JSON object containing a timestamp, + action type, and detailed content. """ - + def __init__(self, report_id: str): """ - 初始化日志记录器 - + Initialise the log recorder. 
+ Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID used to determine the log file path """ self.report_id = report_id self.log_file_path = os.path.join( @@ -56,31 +57,31 @@ class ReportLogger: self._ensure_log_file() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure the directory containing the log file exists""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) - + def _get_elapsed_time(self) -> float: - """获取从开始到现在的耗时(秒)""" + """Return elapsed time in seconds since the logger was created""" return (datetime.now() - self.start_time).total_seconds() - + def log( - self, - action: str, + self, + action: str, stage: str, details: Dict[str, Any], section_title: str = None, section_index: int = None ): """ - 记录一条日志 - + Record a single log entry. + Args: - action: 动作类型,如 'start', 'tool_call', 'llm_response', 'section_complete' 等 - stage: 当前阶段,如 'planning', 'generating', 'completed' - details: 详细内容字典,不截断 - section_title: 当前章节标题(可选) - section_index: 当前章节索引(可选) + action: Action type, e.g. 'start', 'tool_call', 'llm_response', 'section_complete' + stage: Current stage, e.g. 
'planning', 'generating', 'completed' + details: Dictionary of detailed content (not truncated) + section_title: Current section title (optional) + section_index: Current section index (optional) """ log_entry = { "timestamp": datetime.now().isoformat(), @@ -93,12 +94,12 @@ class ReportLogger: "details": details } - # 追加写入 JSONL 文件 + # Append to the JSONL file with open(self.log_file_path, 'a', encoding='utf-8') as f: f.write(json.dumps(log_entry, ensure_ascii=False) + '\n') def log_start(self, simulation_id: str, graph_id: str, simulation_requirement: str): - """记录报告生成开始""" + """Record the start of report generation""" self.log( action="report_start", stage="pending", @@ -111,7 +112,7 @@ class ReportLogger: ) def log_planning_start(self): - """记录大纲规划开始""" + """Record the start of outline planning""" self.log( action="planning_start", stage="planning", @@ -119,7 +120,7 @@ class ReportLogger: ) def log_planning_context(self, context: Dict[str, Any]): - """记录规划时获取的上下文信息""" + """Record context information obtained during planning""" self.log( action="planning_context", stage="planning", @@ -130,7 +131,7 @@ class ReportLogger: ) def log_planning_complete(self, outline_dict: Dict[str, Any]): - """记录大纲规划完成""" + """Record the completion of outline planning""" self.log( action="planning_complete", stage="planning", @@ -141,7 +142,7 @@ class ReportLogger: ) def log_section_start(self, section_title: str, section_index: int): - """记录章节生成开始""" + """Record the start of section generation""" self.log( action="section_start", stage="generating", @@ -151,7 +152,7 @@ class ReportLogger: ) def log_react_thought(self, section_title: str, section_index: int, iteration: int, thought: str): - """记录 ReACT 思考过程""" + """Record a ReACT thinking step""" self.log( action="react_thought", stage="generating", @@ -165,14 +166,14 @@ class ReportLogger: ) def log_tool_call( - self, - section_title: str, + self, + section_title: str, section_index: int, - tool_name: str, + tool_name: str, 
parameters: Dict[str, Any], iteration: int ): - """记录工具调用""" + """Record a tool call""" self.log( action="tool_call", stage="generating", @@ -194,7 +195,7 @@ class ReportLogger: result: str, iteration: int ): - """记录工具调用结果(完整内容,不截断)""" + """Record a tool call result (full content, no truncation)""" self.log( action="tool_result", stage="generating", @@ -203,7 +204,7 @@ class ReportLogger: details={ "iteration": iteration, "tool_name": tool_name, - "result": result, # 完整结果,不截断 + "result": result, # full result, not truncated "result_length": len(result), "message": t('report.toolResult', toolName=tool_name) } @@ -218,7 +219,7 @@ class ReportLogger: has_tool_calls: bool, has_final_answer: bool ): - """记录 LLM 响应(完整内容,不截断)""" + """Record an LLM response (full content, no truncation)""" self.log( action="llm_response", stage="generating", @@ -226,7 +227,7 @@ class ReportLogger: section_index=section_index, details={ "iteration": iteration, - "response": response, # 完整响应,不截断 + "response": response, # full response, not truncated "response_length": len(response), "has_tool_calls": has_tool_calls, "has_final_answer": has_final_answer, @@ -241,14 +242,14 @@ class ReportLogger: content: str, tool_calls_count: int ): - """记录章节内容生成完成(仅记录内容,不代表整个章节完成)""" + """Record section content generation completion (records content only; does not indicate the whole section is done)""" self.log( action="section_content", stage="generating", section_title=section_title, section_index=section_index, details={ - "content": content, # 完整内容,不截断 + "content": content, # full content, not truncated "content_length": len(content), "tool_calls_count": tool_calls_count, "message": t('report.sectionContentDone', title=section_title) @@ -262,9 +263,10 @@ class ReportLogger: full_content: str ): """ - 记录章节生成完成 + Record section generation completion. 
- 前端应监听此日志来判断一个章节是否真正完成,并获取完整内容 + The frontend should listen for this log entry to determine whether a section + is truly complete and to retrieve the full content. """ self.log( action="section_complete", @@ -279,7 +281,7 @@ class ReportLogger: ) def log_report_complete(self, total_sections: int, total_time_seconds: float): - """记录报告生成完成""" + """Record the completion of report generation""" self.log( action="report_complete", stage="completed", @@ -291,7 +293,7 @@ class ReportLogger: ) def log_error(self, error_message: str, stage: str, section_title: str = None): - """记录错误""" + """Record an error""" self.log( action="error", stage=stage, @@ -306,18 +308,19 @@ class ReportLogger: class ReportConsoleLogger: """ - Report Agent 控制台日志记录器 - - 将控制台风格的日志(INFO、WARNING等)写入报告文件夹中的 console_log.txt 文件。 - 这些日志与 agent_log.jsonl 不同,是纯文本格式的控制台输出。 + Console log recorder for the Report Agent. + + Writes console-style log messages (INFO, WARNING, etc.) to a console_log.txt + file inside the report folder. Unlike agent_log.jsonl, this file uses a + plain-text format. """ - + def __init__(self, report_id: str): """ - 初始化控制台日志记录器 - + Initialise the console log recorder. 
+ Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID used to determine the log file path """ self.report_id = report_id self.log_file_path = os.path.join( @@ -328,66 +331,66 @@ class ReportConsoleLogger: self._setup_file_handler() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure the directory containing the log file exists""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) - + def _setup_file_handler(self): - """设置文件处理器,将日志同时写入文件""" + """Set up a file handler to write log messages to disk""" import logging - - # 创建文件处理器 + + # Create the file handler self._file_handler = logging.FileHandler( self.log_file_path, mode='a', encoding='utf-8' ) self._file_handler.setLevel(logging.INFO) - - # 使用与控制台相同的简洁格式 + + # Use the same concise format as the console formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) self._file_handler.setFormatter(formatter) - - # 添加到 report_agent 相关的 logger + + # Attach to the report_agent-related loggers loggers_to_attach = [ 'mirofish.report_agent', 'mirofish.zep_tools', ] - + for logger_name in loggers_to_attach: target_logger = logging.getLogger(logger_name) - # 避免重复添加 + # Avoid attaching the same handler twice if self._file_handler not in target_logger.handlers: target_logger.addHandler(self._file_handler) - + def close(self): - """关闭文件处理器并从 logger 中移除""" + """Close the file handler and detach it from loggers""" import logging - + if self._file_handler: loggers_to_detach = [ 'mirofish.report_agent', 'mirofish.zep_tools', ] - + for logger_name in loggers_to_detach: target_logger = logging.getLogger(logger_name) if self._file_handler in target_logger.handlers: target_logger.removeHandler(self._file_handler) - + self._file_handler.close() self._file_handler = None - + def __del__(self): - """析构时确保关闭文件处理器""" + """Ensure the file handler is closed on destruction""" self.close() class ReportStatus(str, Enum): - """报告状态""" + """Report status""" PENDING = 
"pending" PLANNING = "planning" GENERATING = "generating" @@ -397,7 +400,7 @@ class ReportStatus(str, Enum): @dataclass class ReportSection: - """报告章节""" + """Report section""" title: str content: str = "" @@ -408,7 +411,7 @@ class ReportSection: } def to_markdown(self, level: int = 2) -> str: - """转换为Markdown格式""" + """Convert to Markdown format""" md = f"{'#' * level} {self.title}\n\n" if self.content: md += f"{self.content}\n\n" @@ -417,7 +420,7 @@ class ReportSection: @dataclass class ReportOutline: - """报告大纲""" + """Report outline""" title: str summary: str sections: List[ReportSection] @@ -430,7 +433,7 @@ class ReportOutline: } def to_markdown(self) -> str: - """转换为Markdown格式""" + """Convert to Markdown format""" md = f"# {self.title}\n\n" md += f"> {self.summary}\n\n" for section in self.sections: @@ -440,7 +443,7 @@ class ReportOutline: @dataclass class Report: - """完整报告""" + """Complete report""" report_id: str simulation_id: str graph_id: str @@ -468,421 +471,427 @@ class Report: # ═══════════════════════════════════════════════════════════════ -# Prompt 模板常量 +# Prompt template constants # ═══════════════════════════════════════════════════════════════ -# ── 工具描述 ── +# ── Tool descriptions ── TOOL_DESC_INSIGHT_FORGE = """\ -【深度洞察检索 - 强大的检索工具】 -这是我们强大的检索函数,专为深度分析设计。它会: -1. 自动将你的问题分解为多个子问题 -2. 从多个维度检索模拟图谱中的信息 -3. 整合语义搜索、实体分析、关系链追踪的结果 -4. 返回最全面、最深度的检索内容 +[Deep Insight Retrieval - Powerful Retrieval Tool] +This is our powerful retrieval function, designed specifically for in-depth analysis. It will: +1. Automatically decompose your question into multiple sub-questions +2. Retrieve information from the simulation graph across multiple dimensions +3. Integrate results from semantic search, entity analysis, and relationship-chain tracing +4. 
Return the most comprehensive and in-depth retrieved content -【使用场景】 -- 需要深入分析某个话题 -- 需要了解事件的多个方面 -- 需要获取支撑报告章节的丰富素材 +[Use cases] +- Need to analyse a topic in depth +- Need to understand multiple aspects of an event +- Need rich material to support a report section -【返回内容】 -- 相关事实原文(可直接引用) -- 核心实体洞察 -- 关系链分析""" +[Returns] +- Verbatim relevant facts (can be quoted directly) +- Core entity insights +- Relationship-chain analysis""" TOOL_DESC_PANORAMA_SEARCH = """\ -【广度搜索 - 获取全貌视图】 -这个工具用于获取模拟结果的完整全貌,特别适合了解事件演变过程。它会: -1. 获取所有相关节点和关系 -2. 区分当前有效的事实和历史/过期的事实 -3. 帮助你了解舆情是如何演变的 +[Panorama Search - Get the Full Picture] +This tool is used to obtain a complete overview of the simulation results, and is +especially suited to understanding how events evolved. It will: +1. Retrieve all relevant nodes and relationships +2. Distinguish between currently valid facts and historical/expired facts +3. Help you understand how public opinion has evolved -【使用场景】 -- 需要了解事件的完整发展脉络 -- 需要对比不同阶段的舆情变化 -- 需要获取全面的实体和关系信息 +[Use cases] +- Need to understand the complete development trajectory of an event +- Need to compare public-opinion changes across different stages +- Need comprehensive entity and relationship information -【返回内容】 -- 当前有效事实(模拟最新结果) -- 历史/过期事实(演变记录) -- 所有涉及的实体""" +[Returns] +- Currently valid facts (latest simulation results) +- Historical/expired facts (evolution record) +- All entities involved""" TOOL_DESC_QUICK_SEARCH = """\ -【简单搜索 - 快速检索】 -轻量级的快速检索工具,适合简单、直接的信息查询。 +[Quick Search - Fast Retrieval] +A lightweight, fast retrieval tool suited to simple, direct information queries. -【使用场景】 -- 需要快速查找某个具体信息 -- 需要验证某个事实 -- 简单的信息检索 +[Use cases] +- Need to quickly look up a specific piece of information +- Need to verify a fact +- Simple information retrieval -【返回内容】 -- 与查询最相关的事实列表""" +[Returns] +- A list of facts most relevant to the query""" TOOL_DESC_INTERVIEW_AGENTS = """\ -【深度采访 - 真实Agent采访(双平台)】 -调用OASIS模拟环境的采访API,对正在运行的模拟Agent进行真实采访! 
-这不是LLM模拟,而是调用真实的采访接口获取模拟Agent的原始回答。 -默认在Twitter和Reddit两个平台同时采访,获取更全面的观点。 +[In-Depth Interview - Real Agent Interviews (Dual Platform)] +Calls the OASIS simulation environment's interview API to conduct real interviews with +currently running simulation agents! +This is not LLM simulation — it calls the real interview endpoint to obtain raw answers +from simulation agents. +Interviews are conducted simultaneously on both Twitter and Reddit by default, providing +more comprehensive perspectives. -功能流程: -1. 自动读取人设文件,了解所有模拟Agent -2. 智能选择与采访主题最相关的Agent(如学生、媒体、官方等) -3. 自动生成采访问题 -4. 调用 /api/simulation/interview/batch 接口在双平台进行真实采访 -5. 整合所有采访结果,提供多视角分析 +Workflow: +1. Automatically reads persona files to learn about all simulation agents +2. Intelligently selects agents most relevant to the interview topic (e.g. students, media, officials) +3. Automatically generates interview questions +4. Calls the /api/simulation/interview/batch endpoint to conduct real interviews on both platforms +5. Integrates all interview results to provide multi-perspective analysis -【使用场景】 -- 需要从不同角色视角了解事件看法(学生怎么看?媒体怎么看?官方怎么说?) -- 需要收集多方意见和立场 -- 需要获取模拟Agent的真实回答(来自OASIS模拟环境) -- 想让报告更生动,包含"采访实录" +[Use cases] +- Need to understand how different roles view an event (what do students think? media? officials?) 
+- Need to collect opinions and positions from multiple parties +- Need real answers from simulation agents (from the OASIS simulation environment) +- Want to make the report more vivid by including "interview transcripts" -【返回内容】 -- 被采访Agent的身份信息 -- 各Agent在Twitter和Reddit两个平台的采访回答 -- 关键引言(可直接引用) -- 采访摘要和观点对比 +[Returns] +- Identity information of interviewed agents +- Each agent's interview responses on Twitter and Reddit +- Key quotes (can be cited directly) +- Interview summary and comparison of viewpoints -【重要】需要OASIS模拟环境正在运行才能使用此功能!""" +[Important] The OASIS simulation environment must be running to use this feature!""" -# ── 大纲规划 prompt ── +# ── Outline planning prompt ── PLAN_SYSTEM_PROMPT = """\ -你是一个「未来预测报告」的撰写专家,拥有对模拟世界的「上帝视角」——你可以洞察模拟中每一位Agent的行为、言论和互动。 +You are an expert writer of "Future Prediction Reports" with a "god's-eye view" of the simulated world — you can observe the behaviour, statements, and interactions of every Agent in the simulation. -【核心理念】 -我们构建了一个模拟世界,并向其中注入了特定的「模拟需求」作为变量。模拟世界的演化结果,就是对未来可能发生情况的预测。你正在观察的不是"实验数据",而是"未来的预演"。 +[Core concept] +We have built a simulated world and injected a specific "simulation requirement" into it as a variable. The outcome of the simulated world's evolution is a prediction of what may happen in the future. What you are observing is not "experimental data" but a "rehearsal of the future". -【你的任务】 -撰写一份「未来预测报告」,回答: -1. 在我们设定的条件下,未来发生了什么? -2. 各类Agent(人群)是如何反应和行动? -3. 这个模拟揭示了哪些值得关注的未来趋势和风险? +[Your task] +Write a "Future Prediction Report" that answers: +1. Under the conditions we set, what happened in the future? +2. How did the various types of Agents (population groups) react and act? +3. What future trends and risks worth attention did this simulation reveal? 
-【报告定位】 -- ✅ 这是一份基于模拟的未来预测报告,揭示"如果这样,未来会怎样" -- ✅ 聚焦于预测结果:事件走向、群体反应、涌现现象、潜在风险 -- ✅ 模拟世界中的Agent言行就是对未来人群行为的预测 -- ❌ 不是对现实世界现状的分析 -- ❌ 不是泛泛而谈的舆情综述 +[Report positioning] +- ✅ This is a simulation-based future prediction report that reveals "if this happens, what will the future look like" +- ✅ Focus on predicted outcomes: how events unfold, group reactions, emergent phenomena, potential risks +- ✅ The words and actions of Agents in the simulated world are predictions of future human behaviour +- ❌ Not an analysis of the current state of the real world +- ❌ Not a generic public-opinion overview -【章节数量限制】 -- 最少2个章节,最多5个章节 -- 不需要子章节,每个章节直接撰写完整内容 -- 内容要精炼,聚焦于核心预测发现 -- 章节结构由你根据预测结果自主设计 +[Section count limit] +- Minimum 2 sections, maximum 5 sections +- No sub-sections needed; write complete content directly within each section +- Content should be concise and focused on the core prediction findings +- You design the section structure yourself based on the prediction results -请输出JSON格式的报告大纲,格式如下: +Output the report outline in JSON format as follows: { - "title": "报告标题", - "summary": "报告摘要(一句话概括核心预测发现)", + "title": "Report title", + "summary": "Report summary (one sentence summarising the core prediction findings)", "sections": [ { - "title": "章节标题", - "description": "章节内容描述" + "title": "Section title", + "description": "Description of the section content" } ] } -注意:sections数组最少2个,最多5个元素!""" +Note: the sections array must have a minimum of 2 and a maximum of 5 elements!""" PLAN_USER_PROMPT_TEMPLATE = """\ -【预测场景设定】 -我们向模拟世界注入的变量(模拟需求):{simulation_requirement} +[Prediction Scenario Setup] +The variable we injected into the simulated world (simulation requirement): {simulation_requirement} -【模拟世界规模】 -- 参与模拟的实体数量: {total_nodes} -- 实体间产生的关系数量: {total_edges} -- 实体类型分布: {entity_types} -- 活跃Agent数量: {total_entities} +[Scale of the Simulated World] +- Number of entities in the simulation: {total_nodes} +- Number of relationships between entities: {total_edges} +- Distribution of 
entity types: {entity_types} +- Number of active agents: {total_entities} -【模拟预测到的部分未来事实样本】 +[Sample of Future Facts Predicted by the Simulation] {related_facts_json} -请以「上帝视角」审视这个未来预演: -1. 在我们设定的条件下,未来呈现出了什么样的状态? -2. 各类人群(Agent)是如何反应和行动的? -3. 这个模拟揭示了哪些值得关注的未来趋势? +Please examine this rehearsal of the future from a "god's-eye view": +1. Under the conditions we set, what state did the future settle into? +2. How did the various population groups (Agents) react and act? +3. What future trends worth attention did this simulation reveal? -根据预测结果,设计最合适的报告章节结构。 +Based on the prediction results, design the most appropriate report section structure. -【再次提醒】报告章节数量:最少2个,最多5个,内容要精炼聚焦于核心预测发现。""" +[Reminder] Report section count: minimum 2, maximum 5; content should be concise and focused on core prediction findings.""" -# ── 章节生成 prompt ── +# ── Section generation prompt ── SECTION_SYSTEM_PROMPT_TEMPLATE = """\ -你是一个「未来预测报告」的撰写专家,正在撰写报告的一个章节。 +You are an expert writer of "Future Prediction Reports", currently writing one section of a report. -报告标题: {report_title} -报告摘要: {report_summary} -预测场景(模拟需求): {simulation_requirement} +Report title: {report_title} +Report summary: {report_summary} +Prediction scenario (simulation requirement): {simulation_requirement} -当前要撰写的章节: {section_title} +Section to write now: {section_title} ═══════════════════════════════════════════════════════════════ -【核心理念】 +[Core concept] ═══════════════════════════════════════════════════════════════ -模拟世界是对未来的预演。我们向模拟世界注入了特定条件(模拟需求), -模拟中Agent的行为和互动,就是对未来人群行为的预测。 +The simulated world is a rehearsal of the future. We injected specific conditions +(the simulation requirement) into the simulated world; the behaviour and interactions +of the Agents in the simulation are predictions of future human behaviour. 
-你的任务是: -- 揭示在设定条件下,未来发生了什么 -- 预测各类人群(Agent)是如何反应和行动的 -- 发现值得关注的未来趋势、风险和机会 +Your task is to: +- Reveal what happened in the future under the conditions set +- Predict how the various population groups (Agents) reacted and acted +- Identify future trends, risks, and opportunities worth attention -❌ 不要写成对现实世界现状的分析 -✅ 要聚焦于"未来会怎样"——模拟结果就是预测的未来 +❌ Do not write this as an analysis of the current state of the real world +✅ Focus on "what will happen in the future" — the simulation results ARE the predicted future ═══════════════════════════════════════════════════════════════ -【最重要的规则 - 必须遵守】 +[Most important rules — must be followed] ═══════════════════════════════════════════════════════════════ -1. 【必须调用工具观察模拟世界】 - - 你正在以「上帝视角」观察未来的预演 - - 所有内容必须来自模拟世界中发生的事件和Agent言行 - - 禁止使用你自己的知识来编写报告内容 - - 每个章节至少调用3次工具(最多5次)来观察模拟的世界,它代表了未来 +1. [You must call tools to observe the simulated world] + - You are observing the rehearsal of the future from a "god's-eye view" + - All content must come from events that occurred and Agent statements in the simulated world + - You are prohibited from using your own knowledge to write the report content + - Call tools at least 3 times (and no more than 5 times) per section to observe the simulated world, which represents the future -2. 【必须引用Agent的原始言行】 - - Agent的发言和行为是对未来人群行为的预测 - - 在报告中使用引用格式展示这些预测,例如: - > "某类人群会表示:原文内容..." - - 这些引用是模拟预测的核心证据 +2. [You must quote Agents' original statements and actions] + - Agent statements and behaviour are predictions of future human behaviour + - Present these predictions in the report using quotation format, for example: + > "A certain population group would say: verbatim content..." + - These quotations are the core evidence of the simulation's predictions -3. 【语言一致性 - 引用内容必须翻译为报告语言】 - - 工具返回的内容可能包含与报告语言不同的表述 - - 报告必须全部使用与用户指定语言一致的语言撰写 - - 当你引用工具返回的其他语言内容时,必须将其翻译为报告语言后再写入 - - 翻译时保持原意不变,确保表述自然通顺 - - 这一规则同时适用于正文和引用块(> 格式)中的内容 +3. 
[Language consistency — quoted content must be translated to the report language] + - Tool return values may contain expressions in a language different from the report language + - The entire report must be written in the language specified by the user + - When quoting tool return values in other languages, you must translate them into the report language before writing them in + - Keep the original meaning intact; ensure the phrasing is natural and fluent + - This rule applies to both the main body text and blockquote (> format) content -4. 【忠实呈现预测结果】 - - 报告内容必须反映模拟世界中的代表未来的模拟结果 - - 不要添加模拟中不存在的信息 - - 如果某方面信息不足,如实说明 +4. [Faithfully present prediction results] + - Report content must reflect the simulation results that represent the future + - Do not add information that does not exist in the simulation + - If information on some aspect is insufficient, state that honestly ═══════════════════════════════════════════════════════════════ -【⚠️ 格式规范 - 极其重要!】 +[⚠️ Formatting rules — extremely important!] ═══════════════════════════════════════════════════════════════ -【一个章节 = 最小内容单位】 -- 每个章节是报告的最小分块单位 -- ❌ 禁止在章节内使用任何 Markdown 标题(#、##、###、#### 等) -- ❌ 禁止在内容开头添加章节主标题 -- ✅ 章节标题由系统自动添加,你只需撰写纯正文内容 -- ✅ 使用**粗体**、段落分隔、引用、列表来组织内容,但不要用标题 +[One section = the minimum content unit] +- Each section is the smallest division of the report +- ❌ No Markdown headings (#, ##, ###, #### etc.) are permitted anywhere inside a section +- ❌ Do not add the section's main heading at the top of the content +- ✅ The section heading is added automatically by the system; you only need to write the plain body content +- ✅ Use **bold**, paragraph breaks, block quotes, and lists to organise content — but no headings -【正确示例】 +[Correct example] ``` -本章节分析了事件的舆论传播态势。通过对模拟数据的深入分析,我们发现... +This section analyses the public-opinion dissemination dynamics of the event. Through in-depth analysis of the simulation data, we found... 
-**首发引爆阶段** +**Initial Ignition Stage** -微博作为舆情的第一现场,承担了信息首发的核心功能: +Platform X served as the primary venue, taking on the core function of first publishing the information: -> "微博贡献了68%的首发声量..." +> "Platform X contributed 68% of the initial volume..." -**情绪放大阶段** +**Emotional Amplification Stage** -抖音平台进一步放大了事件影响力: +Video platforms further amplified the impact of the event: -- 视觉冲击力强 -- 情绪共鸣度高 +- Strong visual impact +- High emotional resonance ``` -【错误示例】 +[Incorrect example] ``` -## 执行摘要 ← 错误!不要添加任何标题 -### 一、首发阶段 ← 错误!不要用###分小节 -#### 1.1 详细分析 ← 错误!不要用####细分 +## Executive Summary ← Wrong! Do not add any headings +### I. Initial Stage ← Wrong! Do not use ### for sub-sections +#### 1.1 Detailed Analysis ← Wrong! Do not use #### for finer divisions -本章节分析了... +This section analyses... ``` ═══════════════════════════════════════════════════════════════ -【可用检索工具】(每章节调用3-5次) +[Available retrieval tools] (call 3–5 times per section) ═══════════════════════════════════════════════════════════════ {tools_description} -【工具使用建议 - 请混合使用不同工具,不要只用一种】 -- insight_forge: 深度洞察分析,自动分解问题并多维度检索事实和关系 -- panorama_search: 广角全景搜索,了解事件全貌、时间线和演变过程 -- quick_search: 快速验证某个具体信息点 -- interview_agents: 采访模拟Agent,获取不同角色的第一人称观点和真实反应 +[Tool usage advice — mix different tools; do not use only one] +- insight_forge: Deep insight analysis; automatically decomposes questions and retrieves facts and relationships across multiple dimensions +- panorama_search: Wide-angle panorama search; understand the full picture of an event, its timeline, and how it evolved +- quick_search: Quickly verify a specific piece of information +- interview_agents: Interview simulation agents to obtain first-person perspectives and real reactions from different roles ═══════════════════════════════════════════════════════════════ -【工作流程】 +[Workflow] ═══════════════════════════════════════════════════════════════ -每次回复你只能做以下两件事之一(不可同时做): +In each reply you may do only one of the following two things (not both simultaneously): 
-选项A - 调用工具: -输出你的思考,然后用以下格式调用一个工具: +Option A — Call a tool: +Output your thoughts, then call one tool using the following format: -{{"name": "工具名称", "parameters": {{"参数名": "参数值"}}}} +{{"name": "tool_name", "parameters": {{"param_name": "param_value"}}}} -系统会执行工具并把结果返回给你。你不需要也不能自己编写工具返回结果。 +The system will execute the tool and return the result to you. You must not and cannot write the tool's return result yourself. -选项B - 输出最终内容: -当你已通过工具获取了足够信息,以 "Final Answer:" 开头输出章节内容。 +Option B — Output the final content: +Once you have obtained sufficient information through tools, output the section content starting with "Final Answer:". -⚠️ 严格禁止: -- 禁止在一次回复中同时包含工具调用和 Final Answer -- 禁止自己编造工具返回结果(Observation),所有工具结果由系统注入 -- 每次回复最多调用一个工具 +⚠️ Strictly prohibited: +- Including both a tool call and a Final Answer in a single reply +- Fabricating tool return results (Observations); all tool results are injected by the system +- Calling more than one tool per reply ═══════════════════════════════════════════════════════════════ -【章节内容要求】 +[Section content requirements] ═══════════════════════════════════════════════════════════════ -1. 内容必须基于工具检索到的模拟数据 -2. 大量引用原文来展示模拟效果 -3. 使用Markdown格式(但禁止使用标题): - - 使用 **粗体文字** 标记重点(代替子标题) - - 使用列表(-或1.2.3.)组织要点 - - 使用空行分隔不同段落 - - ❌ 禁止使用 #、##、###、#### 等任何标题语法 -4. 【引用格式规范 - 必须单独成段】 - 引用必须独立成段,前后各有一个空行,不能混在段落中: +1. Content must be based on simulation data retrieved by tools +2. Quote original text extensively to demonstrate simulation results +3. Use Markdown formatting (but headings are prohibited): + - Use **bold text** to mark key points (instead of sub-headings) + - Use lists (- or 1. 2. 3.) to organise points + - Use blank lines to separate different paragraphs + - ❌ Any heading syntax (#, ##, ###, #### etc.) is prohibited +4. 
[Quotation format rules — must stand alone as a paragraph] + Quotations must be their own paragraph with one blank line before and after; they cannot be embedded in a paragraph: - ✅ 正确格式: + ✅ Correct format: ``` - 校方的回应被认为缺乏实质内容。 + The institution's response was considered to lack substance. - > "校方的应对模式在瞬息万变的社交媒体环境中显得僵化和迟缓。" + > "The institution's response pattern appeared rigid and slow-moving in the fast-changing social media environment." - 这一评价反映了公众的普遍不满。 + This evaluation reflects the widespread public dissatisfaction. ``` - ❌ 错误格式: + ❌ Incorrect format: ``` - 校方的回应被认为缺乏实质内容。> "校方的应对模式..." 这一评价反映了... + The institution's response was considered to lack substance. > "The institution's response pattern..." This evaluation reflects... ``` -5. 保持与其他章节的逻辑连贯性 -6. 【避免重复】仔细阅读下方已完成的章节内容,不要重复描述相同的信息 -7. 【再次强调】不要添加任何标题!用**粗体**代替小节标题""" +5. Maintain logical coherence with other sections +6. [Avoid repetition] Carefully read the already-completed section content below; do not repeat the same information +7. [Emphasis again] Do not add any headings! Use **bold** instead of sub-section headings""" SECTION_USER_PROMPT_TEMPLATE = """\ -已完成的章节内容(请仔细阅读,避免重复): +Already-completed section content (please read carefully to avoid repetition): {previous_content} ═══════════════════════════════════════════════════════════════ -【当前任务】撰写章节: {section_title} +[Current task] Write section: {section_title} ═══════════════════════════════════════════════════════════════ -【重要提醒】 -1. 仔细阅读上方已完成的章节,避免重复相同的内容! -2. 开始前必须先调用工具获取模拟数据 -3. 请混合使用不同工具,不要只用一种 -4. 报告内容必须来自检索结果,不要使用自己的知识 +[Important reminders] +1. Read the already-completed sections above carefully and avoid repeating the same content! +2. You must call tools to retrieve simulation data before starting +3. Mix different tools; do not use only one +4. 
Report content must come from retrieval results; do not use your own knowledge -【⚠️ 格式警告 - 必须遵守】 -- ❌ 不要写任何标题(#、##、###、####都不行) -- ❌ 不要写"{section_title}"作为开头 -- ✅ 章节标题由系统自动添加 -- ✅ 直接写正文,用**粗体**代替小节标题 +[⚠️ Formatting warning — must be followed] +- ❌ Do not write any headings (#, ##, ###, #### are all prohibited) +- ❌ Do not write "{section_title}" as the opening line +- ✅ The section heading is added automatically by the system +- ✅ Write the body text directly; use **bold** instead of sub-section headings -请开始: -1. 首先思考(Thought)这个章节需要什么信息 -2. 然后调用工具(Action)获取模拟数据 -3. 收集足够信息后输出 Final Answer(纯正文,无任何标题)""" +Please begin: +1. First, think (Thought) about what information this section needs +2. Then call a tool (Action) to retrieve simulation data +3. Once you have gathered enough information, output Final Answer (plain body text, no headings of any kind)""" -# ── ReACT 循环内消息模板 ── +# ── ReACT loop message templates ── REACT_OBSERVATION_TEMPLATE = """\ -Observation(检索结果): +Observation (retrieval result): -═══ 工具 {tool_name} 返回 ═══ +═══ Tool {tool_name} returned ═══ {result} ═══════════════════════════════════════════════════════════════ -已调用工具 {tool_calls_count}/{max_tool_calls} 次(已用: {used_tools_str}){unused_hint} -- 如果信息充分:以 "Final Answer:" 开头输出章节内容(必须引用上述原文) -- 如果需要更多信息:调用一个工具继续检索 +Tools called: {tool_calls_count}/{max_tool_calls} (used: {used_tools_str}){unused_hint} +- If the information is sufficient: output the section content starting with "Final Answer:" (must quote the original text above) +- If more information is needed: call one tool to continue retrieval ═══════════════════════════════════════════════════════════════""" REACT_INSUFFICIENT_TOOLS_MSG = ( - "【注意】你只调用了{tool_calls_count}次工具,至少需要{min_tool_calls}次。" - "请再调用工具获取更多模拟数据,然后再输出 Final Answer。{unused_hint}" + "[Note] You have only called {tool_calls_count} tool(s), but at least {min_tool_calls} are required. " + "Please call more tools to retrieve simulation data before outputting Final Answer. 
{unused_hint}" ) REACT_INSUFFICIENT_TOOLS_MSG_ALT = ( - "当前只调用了 {tool_calls_count} 次工具,至少需要 {min_tool_calls} 次。" - "请调用工具获取模拟数据。{unused_hint}" + "You have only called {tool_calls_count} tool(s) so far; at least {min_tool_calls} are required. " + "Please call a tool to retrieve simulation data. {unused_hint}" ) REACT_TOOL_LIMIT_MSG = ( - "工具调用次数已达上限({tool_calls_count}/{max_tool_calls}),不能再调用工具。" - '请立即基于已获取的信息,以 "Final Answer:" 开头输出章节内容。' + "The tool call limit has been reached ({tool_calls_count}/{max_tool_calls}); no more tools can be called. " + 'Please immediately output the section content starting with "Final Answer:" based on the information already retrieved.' ) -REACT_UNUSED_TOOLS_HINT = "\n💡 你还没有使用过: {unused_list},建议尝试不同工具获取多角度信息" +REACT_UNUSED_TOOLS_HINT = "\n💡 You have not yet used: {unused_list} — consider trying different tools to gather multi-perspective information" -REACT_FORCE_FINAL_MSG = "已达到工具调用限制,请直接输出 Final Answer: 并生成章节内容。" +REACT_FORCE_FINAL_MSG = "The tool call limit has been reached. Please output Final Answer: directly and generate the section content." # ── Chat prompt ── CHAT_SYSTEM_PROMPT_TEMPLATE = """\ -你是一个简洁高效的模拟预测助手。 +You are a concise and efficient simulation prediction assistant. -【背景】 -预测条件: {simulation_requirement} +[Background] +Prediction conditions: {simulation_requirement} -【已生成的分析报告】 +[Already-generated analysis report] {report_content} -【规则】 -1. 优先基于上述报告内容回答问题 -2. 直接回答问题,避免冗长的思考论述 -3. 仅在报告内容不足以回答时,才调用工具检索更多数据 -4. 回答要简洁、清晰、有条理 +[Rules] +1. Prioritise answering questions based on the report content above +2. Answer questions directly; avoid lengthy reasoning +3. Only call tools to retrieve more data when the report content is insufficient to answer +4. 
Answers should be concise, clear, and well-organised -【可用工具】(仅在需要时使用,最多调用1-2次) +[Available tools] (use only when needed; call at most 1–2 times) {tools_description} -【工具调用格式】 +[Tool call format] -{{"name": "工具名称", "parameters": {{"参数名": "参数值"}}}} +{{"name": "tool_name", "parameters": {{"param_name": "param_value"}}}} -【回答风格】 -- 简洁直接,不要长篇大论 -- 使用 > 格式引用关键内容 -- 优先给出结论,再解释原因""" +[Answer style] +- Concise and direct; avoid long-winded explanations +- Use the > format to quote key content +- Lead with the conclusion, then explain the reasoning""" -CHAT_OBSERVATION_SUFFIX = "\n\n请简洁回答问题。" +CHAT_OBSERVATION_SUFFIX = "\n\nPlease answer the question concisely." # ═══════════════════════════════════════════════════════════════ -# ReportAgent 主类 +# ReportAgent main class # ═══════════════════════════════════════════════════════════════ class ReportAgent: """ - Report Agent - 模拟报告生成Agent + Report Agent — Simulation report generation agent. - 采用ReACT(Reasoning + Acting)模式: - 1. 规划阶段:分析模拟需求,规划报告目录结构 - 2. 生成阶段:逐章节生成内容,每章节可多次调用工具获取信息 - 3. 反思阶段:检查内容完整性和准确性 + Uses the ReACT (Reasoning + Acting) pattern: + 1. Planning stage: analyse simulation requirements and plan the report structure + 2. Generation stage: generate content section by section; each section may call tools + multiple times to retrieve information + 3. Reflection stage: check content completeness and accuracy """ - - # 最大工具调用次数(每个章节) + + # Maximum tool calls per section MAX_TOOL_CALLS_PER_SECTION = 5 - - # 最大反思轮数 + + # Maximum reflection rounds MAX_REFLECTION_ROUNDS = 3 - - # 对话中的最大工具调用次数 + + # Maximum tool calls per chat turn MAX_TOOL_CALLS_PER_CHAT = 2 def __init__( - self, + self, graph_id: str, simulation_id: str, simulation_requirement: str, @@ -890,14 +899,14 @@ class ReportAgent: zep_tools: Optional[ZepToolsService] = None ): """ - 初始化Report Agent - + Initialise the Report Agent. 
+ Args: - graph_id: 图谱ID - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述 - llm_client: LLM客户端(可选) - zep_tools: Zep工具服务(可选) + graph_id: Graph ID + simulation_id: Simulation ID + simulation_requirement: Simulation requirement description + llm_client: LLM client (optional) + zep_tools: Zep tools service (optional) """ self.graph_id = graph_id self.simulation_id = simulation_id @@ -906,64 +915,64 @@ class ReportAgent: self.llm = llm_client or LLMClient() self.zep_tools = zep_tools or ZepToolsService() - # 工具定义 + # Tool definitions self.tools = self._define_tools() - - # 日志记录器(在 generate_report 中初始化) + + # Structured log recorder (initialised in generate_report) self.report_logger: Optional[ReportLogger] = None - # 控制台日志记录器(在 generate_report 中初始化) + # Console log recorder (initialised in generate_report) self.console_logger: Optional[ReportConsoleLogger] = None logger.info(t('report.agentInitDone', graphId=graph_id, simulationId=simulation_id)) def _define_tools(self) -> Dict[str, Dict[str, Any]]: - """定义可用工具""" + """Define available tools""" return { "insight_forge": { "name": "insight_forge", "description": TOOL_DESC_INSIGHT_FORGE, "parameters": { - "query": "你想深入分析的问题或话题", - "report_context": "当前报告章节的上下文(可选,有助于生成更精准的子问题)" + "query": "The question or topic you want to analyse in depth", + "report_context": "Context of the current report section (optional; helps generate more precise sub-questions)" } }, "panorama_search": { "name": "panorama_search", "description": TOOL_DESC_PANORAMA_SEARCH, "parameters": { - "query": "搜索查询,用于相关性排序", - "include_expired": "是否包含过期/历史内容(默认True)" + "query": "Search query for relevance ranking", + "include_expired": "Whether to include expired/historical content (default True)" } }, "quick_search": { "name": "quick_search", "description": TOOL_DESC_QUICK_SEARCH, "parameters": { - "query": "搜索查询字符串", - "limit": "返回结果数量(可选,默认10)" + "query": "Search query string", + "limit": "Number of results to return (optional, default 10)" } }, 
"interview_agents": { "name": "interview_agents", "description": TOOL_DESC_INTERVIEW_AGENTS, "parameters": { - "interview_topic": "采访主题或需求描述(如:'了解学生对宿舍甲醛事件的看法')", - "max_agents": "最多采访的Agent数量(可选,默认5,最大10)" + "interview_topic": "Interview topic or requirement description", + "max_agents": "Maximum number of agents to interview (optional, default 5, max 10)" } } } def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_context: str = "") -> str: """ - 执行工具调用 - + Execute a tool call. + Args: - tool_name: 工具名称 - parameters: 工具参数 - report_context: 报告上下文(用于InsightForge) - + tool_name: Tool name + parameters: Tool parameters + report_context: Report context (used by InsightForge) + Returns: - 工具执行结果(文本格式) + Tool execution result (text format) """ logger.info(t('report.executingTool', toolName=tool_name, params=parameters)) @@ -980,7 +989,7 @@ class ReportAgent: return result.to_text() elif tool_name == "panorama_search": - # 广度搜索 - 获取全貌 + # Panorama search - get the full picture query = parameters.get("query", "") include_expired = parameters.get("include_expired", True) if isinstance(include_expired, str): @@ -993,7 +1002,7 @@ class ReportAgent: return result.to_text() elif tool_name == "quick_search": - # 简单搜索 - 快速检索 + # Quick search - fast retrieval query = parameters.get("query", "") limit = parameters.get("limit", 10) if isinstance(limit, str): @@ -1006,7 +1015,7 @@ class ReportAgent: return result.to_text() elif tool_name == "interview_agents": - # 深度采访 - 调用真实的OASIS采访API获取模拟Agent的回答(双平台) + # In-depth interview - calls the real OASIS interview API to get simulation agent responses (dual platform) interview_topic = parameters.get("interview_topic", parameters.get("query", "")) max_agents = parameters.get("max_agents", 5) if isinstance(max_agents, str): @@ -1020,10 +1029,10 @@ class ReportAgent: ) return result.to_text() - # ========== 向后兼容的旧工具(内部重定向到新工具) ========== - + # ========== Legacy tools for backward compatibility (internally redirect to new 
tools) ========== + elif tool_name == "search_graph": - # 重定向到 quick_search + # Redirect to quick_search logger.info(t('report.redirectToQuickSearch')) return self._execute_tool("quick_search", parameters, report_context) @@ -1040,7 +1049,7 @@ class ReportAgent: return json.dumps(result, ensure_ascii=False, indent=2) elif tool_name == "get_simulation_context": - # 重定向到 insight_forge,因为它更强大 + # Redirect to insight_forge as it is more powerful logger.info(t('report.redirectToInsightForge')) query = parameters.get("query", self.simulation_requirement) return self._execute_tool("insight_forge", {"query": query}, report_context) @@ -1055,26 +1064,26 @@ class ReportAgent: return json.dumps(result, ensure_ascii=False, indent=2) else: - return f"未知工具: {tool_name}。请使用以下工具之一: insight_forge, panorama_search, quick_search" - + return f"Unknown tool: {tool_name}. Please use one of: insight_forge, panorama_search, quick_search" + except Exception as e: logger.error(t('report.toolExecFailed', toolName=tool_name, error=str(e))) - return f"工具执行失败: {str(e)}" + return f"Tool execution failed: {str(e)}" - # 合法的工具名称集合,用于裸 JSON 兜底解析时校验 + # Valid tool names; used when validating the bare-JSON fallback parse VALID_TOOL_NAMES = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: """ - 从LLM响应中解析工具调用 + Parse tool calls from an LLM response. - 支持的格式(按优先级): + Supported formats (in priority order): 1. {"name": "tool_name", "parameters": {...}} - 2. 裸 JSON(响应整体或单行就是一个工具调用 JSON) + 2. 
Bare JSON (the entire response or a single line is a tool call JSON object) """ tool_calls = [] - # 格式1: XML风格(标准格式) + # Format 1: XML-style (standard format) xml_pattern = r'\s*(\{.*?\})\s*' for match in re.finditer(xml_pattern, response, re.DOTALL): try: @@ -1086,8 +1095,8 @@ class ReportAgent: if tool_calls: return tool_calls - # 格式2: 兜底 - LLM 直接输出裸 JSON(没包 标签) - # 只在格式1未匹配时尝试,避免误匹配正文中的 JSON + # Format 2: fallback — LLM outputs bare JSON directly (without tags) + # Only attempted when Format 1 did not match, to avoid false positives in body text stripped = response.strip() if stripped.startswith('{') and stripped.endswith('}'): try: @@ -1098,7 +1107,7 @@ class ReportAgent: except json.JSONDecodeError: pass - # 响应可能包含思考文字 + 裸 JSON,尝试提取最后一个 JSON 对象 + # Response may contain reasoning text + bare JSON; try to extract the last JSON object json_pattern = r'(\{"(?:name|tool)"\s*:.*?\})\s*$' match = re.search(json_pattern, stripped, re.DOTALL) if match: @@ -1112,11 +1121,11 @@ class ReportAgent: return tool_calls def _is_valid_tool_call(self, data: dict) -> bool: - """校验解析出的 JSON 是否是合法的工具调用""" - # 支持 {"name": ..., "parameters": ...} 和 {"tool": ..., "params": ...} 两种键名 + """Validate whether the parsed JSON is a valid tool call""" + # Supports both {"name": ..., "parameters": ...} and {"tool": ..., "params": ...} key forms tool_name = data.get("name") or data.get("tool") if tool_name and tool_name in self.VALID_TOOL_NAMES: - # 统一键名为 name / parameters + # Normalise keys to name / parameters if "tool" in data: data["name"] = data.pop("tool") if "params" in data and "parameters" not in data: @@ -1125,36 +1134,36 @@ class ReportAgent: return False def _get_tools_description(self) -> str: - """生成工具描述文本""" - desc_parts = ["可用工具:"] + """Generate tool description text""" + desc_parts = ["Available tools:"] for name, tool in self.tools.items(): params_desc = ", ".join([f"{k}: {v}" for k, v in tool["parameters"].items()]) desc_parts.append(f"- {name}: {tool['description']}") if 
params_desc: - desc_parts.append(f" 参数: {params_desc}") + desc_parts.append(f" Parameters: {params_desc}") return "\n".join(desc_parts) def plan_outline( - self, + self, progress_callback: Optional[Callable] = None ) -> ReportOutline: """ - 规划报告大纲 - - 使用LLM分析模拟需求,规划报告的目录结构 - + Plan the report outline. + + Uses an LLM to analyse the simulation requirements and plan the report structure. + Args: - progress_callback: 进度回调函数 - + progress_callback: Progress callback function + Returns: - ReportOutline: 报告大纲 + ReportOutline: Report outline """ logger.info(t('report.startPlanningOutline')) - + if progress_callback: progress_callback("planning", 0, t('progress.analyzingRequirements')) - - # 首先获取模拟上下文 + + # Obtain the simulation context first context = self.zep_tools.get_simulation_context( graph_id=self.graph_id, simulation_requirement=self.simulation_requirement @@ -1184,8 +1193,8 @@ class ReportAgent: if progress_callback: progress_callback("planning", 80, t('progress.parsingOutline')) - - # 解析大纲 + + # Parse the outline sections = [] for section_data in response.get("sections", []): sections.append(ReportSection( @@ -1194,7 +1203,7 @@ class ReportAgent: )) outline = ReportOutline( - title=response.get("title", "模拟分析报告"), + title=response.get("title", "Simulation Analysis Report"), summary=response.get("summary", ""), sections=sections ) @@ -1207,19 +1216,19 @@ class ReportAgent: except Exception as e: logger.error(t('report.outlinePlanFailed', error=str(e))) - # 返回默认大纲(3个章节,作为fallback) + # Return a default outline (3 sections, as fallback) return ReportOutline( - title="未来预测报告", - summary="基于模拟预测的未来趋势与风险分析", + title="Future Forecast Report", + summary="Future trends and risk analysis based on simulation forecasts", sections=[ - ReportSection(title="预测场景与核心发现"), - ReportSection(title="人群行为预测分析"), - ReportSection(title="趋势展望与风险提示") + ReportSection(title="Predicted Scenario and Core Findings"), + ReportSection(title="Population Behaviour Prediction Analysis"), + 
ReportSection(title="Trend Outlook and Risk Warnings") ] ) def _generate_section_react( - self, + self, section: ReportSection, outline: ReportOutline, previous_sections: List[str], @@ -1227,28 +1236,28 @@ class ReportAgent: section_index: int = 0 ) -> str: """ - 使用ReACT模式生成单个章节内容 - - ReACT循环: - 1. Thought(思考)- 分析需要什么信息 - 2. Action(行动)- 调用工具获取信息 - 3. Observation(观察)- 分析工具返回结果 - 4. 重复直到信息足够或达到最大次数 - 5. Final Answer(最终回答)- 生成章节内容 - + Generate a single section using the ReACT pattern. + + ReACT loop: + 1. Thought — analyse what information is needed + 2. Action — call a tool to retrieve information + 3. Observation — analyse the tool's return value + 4. Repeat until enough information is gathered or the maximum count is reached + 5. Final Answer — generate the section content + Args: - section: 要生成的章节 - outline: 完整大纲 - previous_sections: 之前章节的内容(用于保持连贯性) - progress_callback: 进度回调 - section_index: 章节索引(用于日志记录) - + section: The section to generate + outline: The full outline + previous_sections: Content of previously generated sections (for coherence) + progress_callback: Progress callback + section_index: Section index (used for logging) + Returns: - 章节内容(Markdown格式) + Section content (Markdown format) """ logger.info(t('report.reactGenerateSection', title=section.title)) - - # 记录章节开始日志 + + # Log section start if self.report_logger: self.report_logger.log_section_start(section.title, section_index) @@ -1261,16 +1270,16 @@ class ReportAgent: ) system_prompt = f"{system_prompt}\n\n{get_language_instruction()}" - # 构建用户prompt - 每个已完成章节各传入最大4000字 + # Build the user prompt — each already-completed section is capped at 4000 characters if previous_sections: previous_parts = [] for sec in previous_sections: - # 每个章节最多4000字 + # Each section is limited to 4000 characters truncated = sec[:4000] + "..." 
if len(sec) > 4000 else sec previous_parts.append(truncated) previous_content = "\n\n---\n\n".join(previous_parts) else: - previous_content = "(这是第一个章节)" + previous_content = "(This is the first section)" user_prompt = SECTION_USER_PROMPT_TEMPLATE.format( previous_content=previous_content, @@ -1282,16 +1291,16 @@ class ReportAgent: {"role": "user", "content": user_prompt} ] - # ReACT循环 + # ReACT loop tool_calls_count = 0 - max_iterations = 5 # 最大迭代轮数 - min_tool_calls = 3 # 最少工具调用次数 - conflict_retries = 0 # 工具调用与Final Answer同时出现的连续冲突次数 - used_tools = set() # 记录已调用过的工具名 + max_iterations = 5 # maximum iterations + min_tool_calls = 3 # minimum tool calls required + conflict_retries = 0 # consecutive conflict count (tool call + Final Answer in same reply) + used_tools = set() # tracks tool names that have been called all_tools = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} - # 报告上下文,用于InsightForge的子问题生成 - report_context = f"章节标题: {section.title}\n模拟需求: {self.simulation_requirement}" + # Report context used by InsightForge for sub-question generation + report_context = f"Section title: {section.title}\nSimulation requirement: {self.simulation_requirement}" for iteration in range(max_iterations): if progress_callback: @@ -1301,32 +1310,32 @@ class ReportAgent: t('progress.deepSearchAndWrite', current=tool_calls_count, max=self.MAX_TOOL_CALLS_PER_SECTION) ) - # 调用LLM + # Call the LLM response = self.llm.chat( messages=messages, temperature=0.5, max_tokens=4096 ) - # 检查 LLM 返回是否为 None(API 异常或内容为空) + # Check whether the LLM returned None (API error or empty content) if response is None: logger.warning(t('report.sectionIterNone', title=section.title, iteration=iteration + 1)) - # 如果还有迭代次数,添加消息并重试 + # If iterations remain, append a message and retry if iteration < max_iterations - 1: - messages.append({"role": "assistant", "content": "(响应为空)"}) - messages.append({"role": "user", "content": "请继续生成内容。"}) + messages.append({"role": "assistant", 
"content": "(empty response)"}) + messages.append({"role": "user", "content": "Please continue generating content."}) continue - # 最后一次迭代也返回 None,跳出循环进入强制收尾 + # Last iteration also returned None; break out of the loop and force a conclusion break - logger.debug(f"LLM响应: {response[:200]}...") + logger.debug(f"LLM response: {response[:200]}...") - # 解析一次,复用结果 + # Parse once and reuse the result tool_calls = self._parse_tool_calls(response) has_tool_calls = bool(tool_calls) has_final_answer = "Final Answer:" in response - # ── 冲突处理:LLM 同时输出了工具调用和 Final Answer ── + # ── Conflict handling: LLM output both a tool call and a Final Answer ── if has_tool_calls and has_final_answer: conflict_retries += 1 logger.warning( @@ -1334,21 +1343,21 @@ class ReportAgent: ) if conflict_retries <= 2: - # 前两次:丢弃本次响应,要求 LLM 重新回复 + # First two occurrences: discard this response and ask the LLM to reply again messages.append({"role": "assistant", "content": response}) messages.append({ "role": "user", "content": ( - "【格式错误】你在一次回复中同时包含了工具调用和 Final Answer,这是不允许的。\n" - "每次回复只能做以下两件事之一:\n" - "- 调用一个工具(输出一个 块,不要写 Final Answer)\n" - "- 输出最终内容(以 'Final Answer:' 开头,不要包含 )\n" - "请重新回复,只做其中一件事。" + "[Format error] Your reply contained both a tool call and a Final Answer, which is not allowed.\n" + "Each reply may do only one of the following two things:\n" + "- Call a tool (output a block; do not write Final Answer)\n" + "- Output the final content (begin with 'Final Answer:'; do not include )\n" + "Please reply again and do only one of the two." 
), }) continue else: - # 第三次:降级处理,截断到第一个工具调用,强制执行 + # Third occurrence: downgrade — truncate to the first tool call and force execution logger.warning( t('report.sectionConflictDowngrade', title=section.title, conflictCount=conflict_retries) ) @@ -1360,7 +1369,7 @@ class ReportAgent: has_final_answer = False conflict_retries = 0 - # 记录 LLM 响应日志 + # Log the LLM response if self.report_logger: self.report_logger.log_llm_response( section_title=section.title, @@ -1371,13 +1380,13 @@ class ReportAgent: has_final_answer=has_final_answer ) - # ── 情况1:LLM 输出了 Final Answer ── + # ── Case 1: LLM output a Final Answer ── if has_final_answer: - # 工具调用次数不足,拒绝并要求继续调工具 + # Insufficient tool calls — reject and require more tool usage if tool_calls_count < min_tool_calls: messages.append({"role": "assistant", "content": response}) unused_tools = all_tools - used_tools - unused_hint = f"(这些工具还未使用,推荐用一下他们: {', '.join(unused_tools)})" if unused_tools else "" + unused_hint = f"(These tools have not been used yet, consider trying them: {', '.join(unused_tools)})" if unused_tools else "" messages.append({ "role": "user", "content": REACT_INSUFFICIENT_TOOLS_MSG.format( @@ -1388,7 +1397,7 @@ class ReportAgent: }) continue - # 正常结束 + # Normal completion final_answer = response.split("Final Answer:")[-1].strip() logger.info(t('report.sectionGenDone', title=section.title, count=tool_calls_count)) @@ -1401,9 +1410,9 @@ class ReportAgent: ) return final_answer - # ── 情况2:LLM 尝试调用工具 ── + # ── Case 2: LLM attempted a tool call ── if has_tool_calls: - # 工具额度已耗尽 → 明确告知,要求输出 Final Answer + # Tool quota exhausted → notify clearly and require Final Answer output if tool_calls_count >= self.MAX_TOOL_CALLS_PER_SECTION: messages.append({"role": "assistant", "content": response}) messages.append({ @@ -1415,7 +1424,7 @@ class ReportAgent: }) continue - # 只执行第一个工具调用 + # Execute only the first tool call call = tool_calls[0] if len(tool_calls) > 1: logger.info(t('report.multiToolOnlyFirst', 
total=len(tool_calls), toolName=call['name'])) @@ -1447,11 +1456,11 @@ class ReportAgent: tool_calls_count += 1 used_tools.add(call['name']) - # 构建未使用工具提示 + # Build the unused-tools hint unused_tools = all_tools - used_tools unused_hint = "" if unused_tools and tool_calls_count < self.MAX_TOOL_CALLS_PER_SECTION: - unused_hint = REACT_UNUSED_TOOLS_HINT.format(unused_list="、".join(unused_tools)) + unused_hint = REACT_UNUSED_TOOLS_HINT.format(unused_list=", ".join(unused_tools)) messages.append({"role": "assistant", "content": response}) messages.append({ @@ -1467,13 +1476,13 @@ class ReportAgent: }) continue - # ── 情况3:既没有工具调用,也没有 Final Answer ── + # ── Case 3: Neither a tool call nor a Final Answer ── messages.append({"role": "assistant", "content": response}) if tool_calls_count < min_tool_calls: - # 工具调用次数不足,推荐未用过的工具 + # Insufficient tool calls — recommend unused tools unused_tools = all_tools - used_tools - unused_hint = f"(这些工具还未使用,推荐用一下他们: {', '.join(unused_tools)})" if unused_tools else "" + unused_hint = f"(These tools have not been used yet, consider trying them: {', '.join(unused_tools)})" if unused_tools else "" messages.append({ "role": "user", @@ -1485,8 +1494,8 @@ class ReportAgent: }) continue - # 工具调用已足够,LLM 输出了内容但没带 "Final Answer:" 前缀 - # 直接将这段内容作为最终答案,不再空转 + # Enough tool calls have been made; the LLM output content without the "Final Answer:" prefix. + # Use this content as the final answer directly without further looping. 
logger.info(t('report.sectionNoPrefix', title=section.title, count=tool_calls_count)) final_answer = response.strip() @@ -1499,7 +1508,7 @@ class ReportAgent: ) return final_answer - # 达到最大迭代次数,强制生成内容 + # Maximum iterations reached; force content generation logger.warning(t('report.sectionMaxIter', title=section.title)) messages.append({"role": "user", "content": REACT_FORCE_FINAL_MSG}) @@ -1509,7 +1518,7 @@ class ReportAgent: max_tokens=4096 ) - # 检查强制收尾时 LLM 返回是否为 None + # Check whether LLM returned None during forced conclusion if response is None: logger.error(t('report.sectionForceFailed', title=section.title)) final_answer = t('report.sectionGenFailedContent') @@ -1518,7 +1527,7 @@ class ReportAgent: else: final_answer = response - # 记录章节内容生成完成日志 + # Log section content generation completion if self.report_logger: self.report_logger.log_section_content( section_title=section.title, @@ -1526,38 +1535,39 @@ class ReportAgent: content=final_answer, tool_calls_count=tool_calls_count ) - + return final_answer - + def generate_report( - self, + self, progress_callback: Optional[Callable[[str, int, str], None]] = None, report_id: Optional[str] = None ) -> Report: """ - 生成完整报告(分章节实时输出) - - 每个章节生成完成后立即保存到文件夹,不需要等待整个报告完成。 - 文件结构: + Generate the complete report (real-time section-by-section output). + + Each section is saved to the folder immediately upon completion; + there is no need to wait for the entire report to finish. + File structure: reports/{report_id}/ - meta.json - 报告元信息 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 + meta.json - Report metadata + outline.json - Report outline + progress.json - Generation progress + section_01.md - Section 1 + section_02.md - Section 2 ... 
- full_report.md - 完整报告 - + full_report.md - Complete report + Args: - progress_callback: 进度回调函数 (stage, progress, message) - report_id: 报告ID(可选,如果不传则自动生成) - + progress_callback: Progress callback function (stage, progress, message) + report_id: Report ID (optional; auto-generated if not provided) + Returns: - Report: 完整报告 + Report: The complete report """ import uuid - - # 如果没有传入 report_id,则自动生成 + + # Auto-generate report_id if not provided if not report_id: report_id = f"report_{uuid.uuid4().hex[:12]}" start_time = datetime.now() @@ -1571,22 +1581,22 @@ class ReportAgent: created_at=datetime.now().isoformat() ) - # 已完成的章节标题列表(用于进度追踪) + # List of completed section titles (for progress tracking) completed_section_titles = [] - + try: - # 初始化:创建报告文件夹并保存初始状态 + # Initialise: create the report folder and save the initial state ReportManager._ensure_report_folder(report_id) - - # 初始化日志记录器(结构化日志 agent_log.jsonl) + + # Initialise the structured log recorder (agent_log.jsonl) self.report_logger = ReportLogger(report_id) self.report_logger.log_start( simulation_id=self.simulation_id, graph_id=self.graph_id, simulation_requirement=self.simulation_requirement ) - - # 初始化控制台日志记录器(console_log.txt) + + # Initialise the console log recorder (console_log.txt) self.console_logger = ReportConsoleLogger(report_id) ReportManager.update_progress( @@ -1595,14 +1605,14 @@ class ReportAgent: ) ReportManager.save_report(report) - # 阶段1: 规划大纲 + # Stage 1: Plan the outline report.status = ReportStatus.PLANNING ReportManager.update_progress( report_id, "planning", 5, t('progress.startPlanningOutline'), completed_sections=[] ) - - # 记录规划开始日志 + + # Log planning start self.report_logger.log_planning_start() if progress_callback: @@ -1614,10 +1624,10 @@ class ReportAgent: ) report.outline = outline - # 记录规划完成日志 + # Log planning completion self.report_logger.log_planning_complete(outline.to_dict()) - - # 保存大纲到文件 + + # Save the outline to file ReportManager.save_outline(report_id, outline) 
ReportManager.update_progress( report_id, "planning", 15, t('progress.outlineDone', count=len(outline.sections)), @@ -1627,17 +1637,17 @@ class ReportAgent: logger.info(t('report.outlineSavedToFile', reportId=report_id)) - # 阶段2: 逐章节生成(分章节保存) + # Stage 2: Generate section by section (save as each section is completed) report.status = ReportStatus.GENERATING - + total_sections = len(outline.sections) - generated_sections = [] # 保存内容用于上下文 + generated_sections = [] # saved content used for context for i, section in enumerate(outline.sections): section_num = i + 1 base_progress = 20 + int((i / total_sections) * 70) - # 更新进度 + # Update progress ReportManager.update_progress( report_id, "generating", base_progress, t('progress.generatingSection', title=section.title, current=section_num, total=total_sections), @@ -1652,7 +1662,7 @@ class ReportAgent: t('progress.generatingSection', title=section.title, current=section_num, total=total_sections) ) - # 生成主章节内容 + # Generate the main section content section_content = self._generate_section_react( section=section, outline=outline, @@ -1669,11 +1679,11 @@ class ReportAgent: section.content = section_content generated_sections.append(f"## {section.title}\n\n{section_content}") - # 保存章节 + # Save the section ReportManager.save_section(report_id, section_num, section) completed_section_titles.append(section.title) - # 记录章节完成日志 + # Log section completion full_section_content = f"## {section.title}\n\n{section_content}" if self.report_logger: @@ -1685,16 +1695,16 @@ class ReportAgent: logger.info(t('report.sectionSaved', reportId=report_id, sectionNum=f"{section_num:02d}")) - # 更新进度 + # Update progress ReportManager.update_progress( - report_id, "generating", + report_id, "generating", base_progress + int(70 / total_sections), t('progress.sectionDone', title=section.title), current_section=None, completed_sections=completed_section_titles ) - # 阶段3: 组装完整报告 + # Stage 3: Assemble the complete report if progress_callback: 
progress_callback("generating", 95, t('progress.assemblingReport')) @@ -1703,22 +1713,22 @@ class ReportAgent: completed_sections=completed_section_titles ) - # 使用ReportManager组装完整报告 + # Use ReportManager to assemble the complete report report.markdown_content = ReportManager.assemble_full_report(report_id, outline) report.status = ReportStatus.COMPLETED report.completed_at = datetime.now().isoformat() - # 计算总耗时 + # Calculate total elapsed time total_time_seconds = (datetime.now() - start_time).total_seconds() - - # 记录报告完成日志 + + # Log report completion if self.report_logger: self.report_logger.log_report_complete( total_sections=total_sections, total_time_seconds=total_time_seconds ) - # 保存最终报告 + # Save the final report ReportManager.save_report(report) ReportManager.update_progress( report_id, "completed", 100, t('progress.reportComplete'), @@ -1730,23 +1740,23 @@ class ReportAgent: logger.info(t('report.reportGenDone', reportId=report_id)) - # 关闭控制台日志记录器 + # Close the console log recorder if self.console_logger: self.console_logger.close() self.console_logger = None - + return report - + except Exception as e: logger.error(t('report.reportGenFailed', error=str(e))) report.status = ReportStatus.FAILED report.error = str(e) - - # 记录错误日志 + + # Log the error if self.report_logger: self.report_logger.log_error(str(e), "failed") - - # 保存失败状态 + + # Save the failed state try: ReportManager.save_report(report) ReportManager.update_progress( @@ -1754,75 +1764,75 @@ class ReportAgent: completed_sections=completed_section_titles ) except Exception: - pass # 忽略保存失败的错误 - - # 关闭控制台日志记录器 + pass # Ignore errors when saving the failed state + + # Close the console log recorder if self.console_logger: self.console_logger.close() self.console_logger = None - + return report def chat( - self, + self, message: str, chat_history: List[Dict[str, str]] = None ) -> Dict[str, Any]: """ - 与Report Agent对话 - - 在对话中Agent可以自主调用检索工具来回答问题 - + Chat with the Report Agent. 
+ + During the conversation the Agent may autonomously call retrieval tools to answer questions. + Args: - message: 用户消息 - chat_history: 对话历史 - + message: User message + chat_history: Conversation history + Returns: { - "response": "Agent回复", - "tool_calls": [调用的工具列表], - "sources": [信息来源] + "response": "Agent reply", + "tool_calls": [list of tools called], + "sources": [information sources] } """ logger.info(t('report.agentChat', message=message[:50])) chat_history = chat_history or [] - # 获取已生成的报告内容 + # Retrieve the already-generated report content report_content = "" try: report = ReportManager.get_report_by_simulation(self.simulation_id) if report and report.markdown_content: - # 限制报告长度,避免上下文过长 + # Limit report length to avoid overly long context report_content = report.markdown_content[:15000] if len(report.markdown_content) > 15000: - report_content += "\n\n... [报告内容已截断] ..." + report_content += "\n\n... [Report content truncated] ..." except Exception as e: logger.warning(t('report.fetchReportFailed', error=e)) system_prompt = CHAT_SYSTEM_PROMPT_TEMPLATE.format( simulation_requirement=self.simulation_requirement, - report_content=report_content if report_content else "(暂无报告)", + report_content=report_content if report_content else "(No report available yet)", tools_description=self._get_tools_description(), ) system_prompt = f"{system_prompt}\n\n{get_language_instruction()}" - # 构建消息 + # Build the messages list messages = [{"role": "system", "content": system_prompt}] - - # 添加历史对话 - for h in chat_history[-10:]: # 限制历史长度 + + # Append conversation history + for h in chat_history[-10:]: # limit history length messages.append(h) - - # 添加用户消息 + + # Append the user message messages.append({ "role": "user", "content": message }) - # ReACT循环(简化版) + # ReACT loop (simplified) tool_calls_made = [] - max_iterations = 2 # 减少迭代轮数 + max_iterations = 2 # reduced number of iterations for iteration in range(max_iterations): response = self.llm.chat( @@ -1830,11 +1840,11 @@ 
class ReportAgent: temperature=0.5 ) - # 解析工具调用 + # Parse tool calls tool_calls = self._parse_tool_calls(response) - + if not tool_calls: - # 没有工具调用,直接返回响应 + # No tool calls — return the response directly clean_response = re.sub(r'.*?', '', response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1844,33 +1854,33 @@ class ReportAgent: "sources": [tc.get("parameters", {}).get("query", "") for tc in tool_calls_made] } - # 执行工具调用(限制数量) + # Execute tool calls (with count limit) tool_results = [] - for call in tool_calls[:1]: # 每轮最多执行1次工具调用 + for call in tool_calls[:1]: # at most 1 tool call per round if len(tool_calls_made) >= self.MAX_TOOL_CALLS_PER_CHAT: break result = self._execute_tool(call["name"], call.get("parameters", {})) tool_results.append({ "tool": call["name"], - "result": result[:1500] # 限制结果长度 + "result": result[:1500] # limit result length }) tool_calls_made.append(call) - # 将结果添加到消息 + # Append the results to the messages messages.append({"role": "assistant", "content": response}) - observation = "\n".join([f"[{r['tool']}结果]\n{r['result']}" for r in tool_results]) + observation = "\n".join([f"[{r['tool']} result]\n{r['result']}" for r in tool_results]) messages.append({ "role": "user", "content": observation + CHAT_OBSERVATION_SUFFIX }) - # 达到最大迭代,获取最终响应 + # Maximum iterations reached; get the final response final_response = self.llm.chat( messages=messages, temperature=0.5 ) - # 清理响应 + # Clean the response clean_response = re.sub(r'.*?', '', final_response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1883,95 +1893,95 @@ class ReportAgent: class ReportManager: """ - 报告管理器 - - 负责报告的持久化存储和检索 - - 文件结构(分章节输出): + Report Manager. + + Responsible for the persistent storage and retrieval of reports. 
+ + File structure (section-by-section output): reports/ {report_id}/ - meta.json - 报告元信息和状态 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 + meta.json - Report metadata and status + outline.json - Report outline + progress.json - Generation progress + section_01.md - Section 1 + section_02.md - Section 2 ... - full_report.md - 完整报告 + full_report.md - Complete report """ - - # 报告存储目录 + + # Report storage directory REPORTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'reports') @classmethod def _ensure_reports_dir(cls): - """确保报告根目录存在""" + """Ensure the reports root directory exists""" os.makedirs(cls.REPORTS_DIR, exist_ok=True) - + @classmethod def _get_report_folder(cls, report_id: str) -> str: - """获取报告文件夹路径""" + """Get the report folder path""" return os.path.join(cls.REPORTS_DIR, report_id) - + @classmethod def _ensure_report_folder(cls, report_id: str) -> str: - """确保报告文件夹存在并返回路径""" + """Ensure the report folder exists and return its path""" folder = cls._get_report_folder(report_id) os.makedirs(folder, exist_ok=True) return folder - + @classmethod def _get_report_path(cls, report_id: str) -> str: - """获取报告元信息文件路径""" + """Get the path to the report metadata file""" return os.path.join(cls._get_report_folder(report_id), "meta.json") - + @classmethod def _get_report_markdown_path(cls, report_id: str) -> str: - """获取完整报告Markdown文件路径""" + """Get the path to the full report Markdown file""" return os.path.join(cls._get_report_folder(report_id), "full_report.md") - + @classmethod def _get_outline_path(cls, report_id: str) -> str: - """获取大纲文件路径""" + """Get the path to the outline file""" return os.path.join(cls._get_report_folder(report_id), "outline.json") - + @classmethod def _get_progress_path(cls, report_id: str) -> str: - """获取进度文件路径""" + """Get the path to the progress file""" return os.path.join(cls._get_report_folder(report_id), "progress.json") - + @classmethod def _get_section_path(cls, report_id: str, section_index: 
int) -> str: - """获取章节Markdown文件路径""" + """Get the path to a section Markdown file""" return os.path.join(cls._get_report_folder(report_id), f"section_{section_index:02d}.md") - + @classmethod def _get_agent_log_path(cls, report_id: str) -> str: - """获取 Agent 日志文件路径""" + """Get the path to the Agent log file""" return os.path.join(cls._get_report_folder(report_id), "agent_log.jsonl") - + @classmethod def _get_console_log_path(cls, report_id: str) -> str: - """获取控制台日志文件路径""" + """Get the path to the console log file""" return os.path.join(cls._get_report_folder(report_id), "console_log.txt") @classmethod def get_console_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取控制台日志内容 - - 这是报告生成过程中的控制台输出日志(INFO、WARNING等), - 与 agent_log.jsonl 的结构化日志不同。 - + Retrieve console log content. + + This is the console output log (INFO, WARNING, etc.) produced during report + generation — distinct from the structured log in agent_log.jsonl. + Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID + from_line: Line number to start reading from (for incremental retrieval; 0 = from the beginning) + Returns: { - "logs": [日志行列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 + "logs": [list of log lines], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether more logs are available } """ log_path = cls._get_console_log_path(report_id) @@ -1991,26 +2001,26 @@ class ReportManager: for i, line in enumerate(f): total_lines = i + 1 if i >= from_line: - # 保留原始日志行,去掉末尾换行符 + # Keep the original log line, stripping the trailing newline logs.append(line.rstrip('\n\r')) return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # read through to the end } - + @classmethod def get_console_log_stream(cls, report_id: str) -> List[str]: """ - 获取完整的控制台日志(一次性获取全部) - + Retrieve the complete console log (all at once). 
+ Args: - report_id: 报告ID - + report_id: Report ID + Returns: - 日志行列表 + List of log lines """ result = cls.get_console_log(report_id, from_line=0) return result["logs"] @@ -2018,18 +2028,18 @@ class ReportManager: @classmethod def get_agent_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取 Agent 日志内容 - + Retrieve Agent log content. + Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID + from_line: Line number to start reading from (for incremental retrieval; 0 = from the beginning) + Returns: { - "logs": [日志条目列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 + "logs": [list of log entries], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether more logs are available } """ log_path = cls._get_agent_log_path(report_id) @@ -2053,26 +2063,26 @@ class ReportManager: log_entry = json.loads(line.strip()) logs.append(log_entry) except json.JSONDecodeError: - # 跳过解析失败的行 + # Skip lines that fail to parse continue - + return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # read through to the end } - + @classmethod def get_agent_log_stream(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取完整的 Agent 日志(用于一次性获取全部) - + Retrieve the complete Agent log (all at once). + Args: - report_id: 报告ID - + report_id: Report ID + Returns: - 日志条目列表 + List of log entries """ result = cls.get_agent_log(report_id, from_line=0) return result["logs"] @@ -2080,9 +2090,9 @@ class ReportManager: @classmethod def save_outline(cls, report_id: str, outline: ReportOutline) -> None: """ - 保存报告大纲 - - 在规划阶段完成后立即调用 + Save the report outline. + + Called immediately after the planning stage completes. """ cls._ensure_report_folder(report_id) @@ -2099,27 +2109,27 @@ class ReportManager: section: ReportSection ) -> str: """ - 保存单个章节 + Save a single section. 
- 在每个章节生成完成后立即调用,实现分章节输出 + Called immediately after each section is generated to enable section-by-section output. Args: - report_id: 报告ID - section_index: 章节索引(从1开始) - section: 章节对象 + report_id: Report ID + section_index: Section index (starting from 1) + section: Section object Returns: - 保存的文件路径 + Path of the saved file """ cls._ensure_report_folder(report_id) - # 构建章节Markdown内容 - 清理可能存在的重复标题 + # Build the section Markdown content — clean up any duplicate headings cleaned_content = cls._clean_section_content(section.content, section.title) md_content = f"## {section.title}\n\n" if cleaned_content: md_content += f"{cleaned_content}\n\n" - # 保存文件 + # Save the file file_suffix = f"section_{section_index:02d}.md" file_path = os.path.join(cls._get_report_folder(report_id), file_suffix) with open(file_path, 'w', encoding='utf-8') as f: @@ -2131,17 +2141,17 @@ class ReportManager: @classmethod def _clean_section_content(cls, content: str, section_title: str) -> str: """ - 清理章节内容 - - 1. 移除内容开头与章节标题重复的Markdown标题行 - 2. 将所有 ### 及以下级别的标题转换为粗体文本 - + Clean section content. + + 1. Remove any Markdown heading at the start of the content that duplicates the section title. + 2. Convert all headings at level ### and below to bold text. 
+ Args: - content: 原始内容 - section_title: 章节标题 - + content: Raw content + section_title: Section title + Returns: - 清理后的内容 + Cleaned content """ import re @@ -2155,27 +2165,27 @@ class ReportManager: for i, line in enumerate(lines): stripped = line.strip() - - # 检查是否是Markdown标题行 + + # Check whether this is a Markdown heading line heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - + if heading_match: level = len(heading_match.group(1)) title_text = heading_match.group(2).strip() - - # 检查是否是与章节标题重复的标题(跳过前5行内的重复) + + # Check for a duplicate of the section title (within the first 5 lines) if i < 5: if title_text == section_title or title_text.replace(' ', '') == section_title.replace(' ', ''): skip_next_empty = True continue - - # 将所有级别的标题(#, ##, ###, ####等)转换为粗体 - # 因为章节标题由系统添加,内容中不应有任何标题 + + # Convert all heading levels (#, ##, ###, #### etc.) to bold text, + # since section headings are added by the system and content should contain none. cleaned_lines.append(f"**{title_text}**") - cleaned_lines.append("") # 添加空行 + cleaned_lines.append("") # add blank line continue - # 如果上一行是被跳过的标题,且当前行为空,也跳过 + # If the previous line was a skipped heading and this line is blank, skip it too if skip_next_empty and stripped == '': skip_next_empty = False continue @@ -2183,14 +2193,14 @@ class ReportManager: skip_next_empty = False cleaned_lines.append(line) - # 移除开头的空行 + # Remove leading blank lines while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) - - # 移除开头的分隔线 + + # Remove leading horizontal rules while cleaned_lines and cleaned_lines[0].strip() in ['---', '***', '___']: cleaned_lines.pop(0) - # 同时移除分隔线后的空行 + # Also remove any blank lines immediately after the horizontal rule while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) @@ -2198,18 +2208,18 @@ class ReportManager: @classmethod def update_progress( - cls, - report_id: str, - status: str, - progress: int, + cls, + report_id: str, + status: str, + progress: int, 
message: str, current_section: str = None, completed_sections: List[str] = None ) -> None: """ - 更新报告生成进度 - - 前端可以通过读取progress.json获取实时进度 + Update the report generation progress. + + The frontend can read progress.json to obtain real-time progress. """ cls._ensure_report_folder(report_id) @@ -2227,7 +2237,7 @@ class ReportManager: @classmethod def get_progress(cls, report_id: str) -> Optional[Dict[str, Any]]: - """获取报告生成进度""" + """Get the report generation progress""" path = cls._get_progress_path(report_id) if not os.path.exists(path): @@ -2239,9 +2249,9 @@ class ReportManager: @classmethod def get_generated_sections(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取已生成的章节列表 - - 返回所有已保存的章节文件信息 + Get the list of already-generated sections. + + Returns information about all saved section files. """ folder = cls._get_report_folder(report_id) @@ -2255,7 +2265,7 @@ class ReportManager: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - # 从文件名解析章节索引 + # Parse the section index from the filename parts = filename.replace('.md', '').split('_') section_index = int(parts[1]) @@ -2270,26 +2280,26 @@ class ReportManager: @classmethod def assemble_full_report(cls, report_id: str, outline: ReportOutline) -> str: """ - 组装完整报告 - - 从已保存的章节文件组装完整报告,并进行标题清理 + Assemble the complete report. + + Assembles the full report from the saved section files and performs heading clean-up. 
""" folder = cls._get_report_folder(report_id) - - # 构建报告头部 + + # Build the report header md_content = f"# {outline.title}\n\n" md_content += f"> {outline.summary}\n\n" md_content += f"---\n\n" - # 按顺序读取所有章节文件 + # Read all section files in order sections = cls.get_generated_sections(report_id) for section_info in sections: md_content += section_info["content"] - - # 后处理:清理整个报告的标题问题 + + # Post-processing: clean up heading issues across the full report md_content = cls._post_process_report(md_content, outline) - - # 保存完整报告 + + # Save the complete report full_path = cls._get_report_markdown_path(report_id) with open(full_path, 'w', encoding='utf-8') as f: f.write(md_content) @@ -2300,18 +2310,19 @@ class ReportManager: @classmethod def _post_process_report(cls, content: str, outline: ReportOutline) -> str: """ - 后处理报告内容 - - 1. 移除重复的标题 - 2. 保留报告主标题(#)和章节标题(##),移除其他级别的标题(###, ####等) - 3. 清理多余的空行和分隔线 - + Post-process report content. + + 1. Remove duplicate headings. + 2. Retain the report main title (#) and section titles (##); remove other heading + levels (###, #### etc.). + 3. Clean up excessive blank lines and horizontal rules. 
+ Args: - content: 原始报告内容 - outline: 报告大纲 - + content: Raw report content + outline: Report outline + Returns: - 处理后的内容 + Processed content """ import re @@ -2319,7 +2330,7 @@ class ReportManager: processed_lines = [] prev_was_heading = False - # 收集大纲中的所有章节标题 + # Collect all section titles from the outline section_titles = set() for section in outline.sections: section_titles.add(section.title) @@ -2329,14 +2340,14 @@ class ReportManager: line = lines[i] stripped = line.strip() - # 检查是否是标题行 + # Check whether this is a heading line heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - + if heading_match: level = len(heading_match.group(1)) title = heading_match.group(2).strip() - - # 检查是否是重复标题(在连续5行内出现相同内容的标题) + + # Check for a duplicate heading (same text appearing within the previous 5 lines) is_duplicate = False for j in range(max(0, len(processed_lines) - 5), len(processed_lines)): prev_line = processed_lines[j].strip() @@ -2348,43 +2359,43 @@ class ReportManager: break if is_duplicate: - # 跳过重复标题及其后的空行 + # Skip the duplicate heading and any blank lines that follow it i += 1 while i < len(lines) and lines[i].strip() == '': i += 1 continue - - # 标题层级处理: - # - # (level=1) 只保留报告主标题 - # - ## (level=2) 保留章节标题 - # - ### 及以下 (level>=3) 转换为粗体文本 - + + # Heading level handling: + # - # (level=1) — keep only the report main title + # - ## (level=2) — keep section titles + # - ### and below (level>=3) — convert to bold text + if level == 1: if title == outline.title: - # 保留报告主标题 + # Keep the report main title processed_lines.append(line) prev_was_heading = True elif title in section_titles: - # 章节标题错误使用了#,修正为## + # Section title incorrectly used #; correct to ## processed_lines.append(f"## {title}") prev_was_heading = True else: - # 其他一级标题转为粗体 + # Other level-1 headings become bold text processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False elif level == 2: if title in section_titles or title == outline.title: - # 保留章节标题 + # Keep 
section titles processed_lines.append(line) prev_was_heading = True else: - # 非章节的二级标题转为粗体 + # Non-section level-2 headings become bold text processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False else: - # ### 及以下级别的标题转换为粗体文本 + # Headings at level ### and below are converted to bold text processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False @@ -2393,12 +2404,12 @@ class ReportManager: continue elif stripped == '---' and prev_was_heading: - # 跳过标题后紧跟的分隔线 + # Skip horizontal rules that immediately follow a heading i += 1 continue - + elif stripped == '' and prev_was_heading: - # 标题后只保留一个空行 + # After a heading, keep only one blank line if processed_lines and processed_lines[-1].strip() != '': processed_lines.append(line) prev_was_heading = False @@ -2409,7 +2420,7 @@ class ReportManager: i += 1 - # 清理连续的多个空行(保留最多2个) + # Clean up consecutive blank lines (retain at most 2) result_lines = [] empty_count = 0 for line in processed_lines: @@ -2425,18 +2436,18 @@ class ReportManager: @classmethod def save_report(cls, report: Report) -> None: - """保存报告元信息和完整报告""" + """Save report metadata and the complete report""" cls._ensure_report_folder(report.report_id) - - # 保存元信息JSON + + # Save metadata JSON with open(cls._get_report_path(report.report_id), 'w', encoding='utf-8') as f: json.dump(report.to_dict(), f, ensure_ascii=False, indent=2) - - # 保存大纲 + + # Save the outline if report.outline: cls.save_outline(report.report_id, report.outline) - - # 保存完整Markdown报告 + + # Save the complete Markdown report if report.markdown_content: with open(cls._get_report_markdown_path(report.report_id), 'w', encoding='utf-8') as f: f.write(report.markdown_content) @@ -2445,11 +2456,11 @@ class ReportManager: @classmethod def get_report(cls, report_id: str) -> Optional[Report]: - """获取报告""" + """Get a report""" path = cls._get_report_path(report_id) - + if not os.path.exists(path): - # 兼容旧格式:检查直接存储在reports目录下的文件 + # Backward 
compatibility: check for files stored directly in the reports directory old_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") if os.path.exists(old_path): path = old_path @@ -2459,7 +2470,7 @@ class ReportManager: with open(path, 'r', encoding='utf-8') as f: data = json.load(f) - # 重建Report对象 + # Reconstruct the Report object outline = None if data.get('outline'): outline_data = data['outline'] @@ -2475,7 +2486,7 @@ class ReportManager: sections=sections ) - # 如果markdown_content为空,尝试从full_report.md读取 + # If markdown_content is empty, try reading from full_report.md markdown_content = data.get('markdown_content', '') if not markdown_content: full_report_path = cls._get_report_markdown_path(report_id) @@ -2498,17 +2509,17 @@ class ReportManager: @classmethod def get_report_by_simulation(cls, simulation_id: str) -> Optional[Report]: - """根据模拟ID获取报告""" + """Get a report by simulation ID""" cls._ensure_reports_dir() - + for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # New format: folder if os.path.isdir(item_path): report = cls.get_report(item) if report and report.simulation_id == simulation_id: return report - # 兼容旧格式:JSON文件 + # Backward compatibility: JSON file elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) @@ -2519,19 +2530,19 @@ class ReportManager: @classmethod def list_reports(cls, simulation_id: Optional[str] = None, limit: int = 50) -> List[Report]: - """列出报告""" + """List reports""" cls._ensure_reports_dir() - + reports = [] for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # New format: folder if os.path.isdir(item_path): report = cls.get_report(item) if report: if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - # 兼容旧格式:JSON文件 + # Backward compatibility: JSON file elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) @@ -2539,25 
+2550,25 @@ class ReportManager: if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - # 按创建时间倒序 + # Sort by creation time, descending reports.sort(key=lambda r: r.created_at, reverse=True) return reports[:limit] @classmethod def delete_report(cls, report_id: str) -> bool: - """删除报告(整个文件夹)""" + """Delete a report (the entire folder)""" import shutil - + folder_path = cls._get_report_folder(report_id) - - # 新格式:删除整个文件夹 + + # New format: delete the entire folder if os.path.exists(folder_path) and os.path.isdir(folder_path): shutil.rmtree(folder_path) logger.info(t('report.reportFolderDeleted', reportId=report_id)) return True - - # 兼容旧格式:删除单独的文件 + + # Backward compatibility: delete individual files deleted = False old_json_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") old_md_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.md") diff --git a/backend/app/services/simulation_config_generator.py b/backend/app/services/simulation_config_generator.py index cb77f6b6..71d4be10 100644 --- a/backend/app/services/simulation_config_generator.py +++ b/backend/app/services/simulation_config_generator.py @@ -1,13 +1,15 @@ """ -模拟配置智能生成器 -使用LLM根据模拟需求、文档内容、图谱信息自动生成细致的模拟参数 -实现全程自动化,无需人工设置参数 +Simulation configuration intelligent generator +Uses LLM to automatically generate detailed simulation parameters +based on simulation requirements, document content, and graph information. +Fully automated — no manual parameter tuning required. -采用分步生成策略,避免一次性生成过长内容导致失败: -1. 生成时间配置 -2. 生成事件配置 -3. 分批生成Agent配置 -4. 生成平台配置 +Employs a step-by-step generation strategy to avoid failures from +producing excessively long output in a single call: +1. Generate time configuration +2. Generate event configuration +3. Generate agent configurations in batches +4. 
Generate platform configuration """ import json @@ -25,156 +27,156 @@ from .zep_entity_reader import EntityNode, ZepEntityReader logger = get_logger('mirofish.simulation_config') -# 中国作息时间配置(北京时间) +# Activity schedule configuration (Beijing time, UTC+8) CHINA_TIMEZONE_CONFIG = { - # 深夜时段(几乎无人活动) + # Late-night hours (almost no activity) "dead_hours": [0, 1, 2, 3, 4, 5], - # 早间时段(逐渐醒来) + # Morning hours (gradually waking up) "morning_hours": [6, 7, 8], - # 工作时段 + # Working hours "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - # 晚间高峰(最活跃) + # Evening peak hours (most active) "peak_hours": [19, 20, 21, 22], - # 夜间时段(活跃度下降) + # Night hours (activity declining) "night_hours": [23], - # 活跃度系数 + # Activity multipliers "activity_multipliers": { - "dead": 0.05, # 凌晨几乎无人 - "morning": 0.4, # 早间逐渐活跃 - "work": 0.7, # 工作时段中等 - "peak": 1.5, # 晚间高峰 - "night": 0.5 # 深夜下降 + "dead": 0.05, # Almost no one in the early hours + "morning": 0.4, # Gradually becoming active in the morning + "work": 0.7, # Moderate during working hours + "peak": 1.5, # Evening peak + "night": 0.5 # Declining late at night } } @dataclass class AgentActivityConfig: - """单个Agent的活动配置""" + """Activity configuration for a single agent""" agent_id: int entity_uuid: str entity_name: str entity_type: str - - # 活跃度配置 (0.0-1.0) - activity_level: float = 0.5 # 整体活跃度 - - # 发言频率(每小时预期发言次数) + + # Activity level (0.0-1.0) + activity_level: float = 0.5 # Overall activity level + + # Posting frequency (expected posts per hour) posts_per_hour: float = 1.0 comments_per_hour: float = 2.0 - - # 活跃时间段(24小时制,0-23) + + # Active hours (24-hour clock, 0-23) active_hours: List[int] = field(default_factory=lambda: list(range(8, 23))) - - # 响应速度(对热点事件的反应延迟,单位:模拟分钟) + + # Response speed (reaction delay to hot events, in simulated minutes) response_delay_min: int = 5 response_delay_max: int = 60 - - # 情感倾向 (-1.0到1.0,负面到正面) + + # Sentiment bias (-1.0 to 1.0, negative to positive) sentiment_bias: float = 0.0 - - # 立场(对特定话题的态度) + + 
# Stance (attitude toward a specific topic) stance: str = "neutral" # supportive, opposing, neutral, observer - - # 影响力权重(决定其发言被其他Agent看到的概率) + + # Influence weight (determines the probability of being seen by other agents) influence_weight: float = 1.0 -@dataclass +@dataclass class TimeSimulationConfig: - """时间模拟配置(基于中国人作息习惯)""" - # 模拟总时长(模拟小时数) - total_simulation_hours: int = 72 # 默认模拟72小时(3天) - - # 每轮代表的时间(模拟分钟)- 默认60分钟(1小时),加快时间流速 + """Time simulation configuration""" + # Total simulation duration (simulated hours) + total_simulation_hours: int = 72 # Default: simulate 72 hours (3 days) + + # Time per round (simulated minutes) — default 60 minutes (1 hour) to accelerate time flow minutes_per_round: int = 60 - - # 每小时激活的Agent数量范围 + + # Range of agents activated per hour agents_per_hour_min: int = 5 agents_per_hour_max: int = 20 - - # 高峰时段(晚间19-22点,中国人最活跃的时间) + + # Peak hours (evening 19-22, most active period) peak_hours: List[int] = field(default_factory=lambda: [19, 20, 21, 22]) peak_activity_multiplier: float = 1.5 - - # 低谷时段(凌晨0-5点,几乎无人活动) + + # Off-peak hours (midnight 0-5, almost no activity) off_peak_hours: List[int] = field(default_factory=lambda: [0, 1, 2, 3, 4, 5]) - off_peak_activity_multiplier: float = 0.05 # 凌晨活跃度极低 - - # 早间时段 + off_peak_activity_multiplier: float = 0.05 # Extremely low activity in early hours + + # Morning hours morning_hours: List[int] = field(default_factory=lambda: [6, 7, 8]) morning_activity_multiplier: float = 0.4 - - # 工作时段 + + # Working hours work_hours: List[int] = field(default_factory=lambda: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18]) work_activity_multiplier: float = 0.7 @dataclass class EventConfig: - """事件配置""" - # 初始事件(模拟开始时的触发事件) + """Event configuration""" + # Initial events (events triggered at the start of the simulation) initial_posts: List[Dict[str, Any]] = field(default_factory=list) - - # 定时事件(在特定时间触发的事件) + + # Scheduled events (events triggered at specific times) scheduled_events: List[Dict[str, Any]] = 
field(default_factory=list) - - # 热点话题关键词 + + # Hot topic keywords hot_topics: List[str] = field(default_factory=list) - - # 舆论引导方向 + + # Narrative direction for public opinion narrative_direction: str = "" @dataclass class PlatformConfig: - """平台特定配置""" + """Platform-specific configuration""" platform: str # twitter or reddit - - # 推荐算法权重 - recency_weight: float = 0.4 # 时间新鲜度 - popularity_weight: float = 0.3 # 热度 - relevance_weight: float = 0.3 # 相关性 - - # 病毒传播阈值(达到多少互动后触发扩散) + + # Recommendation algorithm weights + recency_weight: float = 0.4 # Freshness + popularity_weight: float = 0.3 # Popularity + relevance_weight: float = 0.3 # Relevance + + # Viral spread threshold (interactions needed to trigger amplification) viral_threshold: int = 10 - - # 回声室效应强度(相似观点聚集程度) + + # Echo chamber effect strength (degree of similar-opinion clustering) echo_chamber_strength: float = 0.5 @dataclass class SimulationParameters: - """完整的模拟参数配置""" - # 基础信息 + """Complete simulation parameter configuration""" + # Basic info simulation_id: str project_id: str graph_id: str simulation_requirement: str - - # 时间配置 + + # Time configuration time_config: TimeSimulationConfig = field(default_factory=TimeSimulationConfig) - - # Agent配置列表 + + # Agent configuration list agent_configs: List[AgentActivityConfig] = field(default_factory=list) - - # 事件配置 + + # Event configuration event_config: EventConfig = field(default_factory=EventConfig) - - # 平台配置 + + # Platform configuration twitter_config: Optional[PlatformConfig] = None reddit_config: Optional[PlatformConfig] = None - - # LLM配置 + + # LLM configuration llm_model: str = "" llm_base_url: str = "" - - # 生成元数据 + + # Generation metadata generated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - generation_reasoning: str = "" # LLM的推理说明 - + generation_reasoning: str = "" # LLM reasoning explanation + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" time_dict = asdict(self.time_config) return { 
"simulation_id": self.simulation_id, @@ -191,37 +193,37 @@ class SimulationParameters: "generated_at": self.generated_at, "generation_reasoning": self.generation_reasoning, } - + def to_json(self, indent: int = 2) -> str: - """转换为JSON字符串""" + """Convert to JSON string""" return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent) class SimulationConfigGenerator: """ - 模拟配置智能生成器 - - 使用LLM分析模拟需求、文档内容、图谱实体信息, - 自动生成最佳的模拟参数配置 - - 采用分步生成策略: - 1. 生成时间配置和事件配置(轻量级) - 2. 分批生成Agent配置(每批10-20个) - 3. 生成平台配置 + Simulation configuration intelligent generator + + Uses LLM to analyze simulation requirements, document content, and graph entity + information to automatically generate optimal simulation parameter configurations. + + Employs a step-by-step generation strategy: + 1. Generate time configuration and event configuration (lightweight) + 2. Generate agent configurations in batches (10-20 per batch) + 3. Generate platform configuration """ - - # 上下文最大字符数 + + # Maximum context length in characters MAX_CONTEXT_LENGTH = 50000 - # 每批生成的Agent数量 + # Number of agents generated per batch AGENTS_PER_BATCH = 15 - - # 各步骤的上下文截断长度(字符数) - TIME_CONFIG_CONTEXT_LENGTH = 10000 # 时间配置 - EVENT_CONFIG_CONTEXT_LENGTH = 8000 # 事件配置 - ENTITY_SUMMARY_LENGTH = 300 # 实体摘要 - AGENT_SUMMARY_LENGTH = 300 # Agent配置中的实体摘要 - ENTITIES_PER_TYPE_DISPLAY = 20 # 每类实体显示数量 - + + # Context truncation lengths per step (in characters) + TIME_CONFIG_CONTEXT_LENGTH = 10000 # Time configuration + EVENT_CONFIG_CONTEXT_LENGTH = 8000 # Event configuration + ENTITY_SUMMARY_LENGTH = 300 # Entity summary + AGENT_SUMMARY_LENGTH = 300 # Entity summary in agent configuration + ENTITIES_PER_TYPE_DISPLAY = 20 # Number of entities displayed per type + def __init__( self, api_key: Optional[str] = None, @@ -231,15 +233,15 @@ class SimulationConfigGenerator: self.api_key = api_key or Config.LLM_API_KEY self.base_url = base_url or Config.LLM_BASE_URL self.model_name = model_name or Config.LLM_MODEL_NAME - + if not self.api_key: 
- raise ValueError("LLM_API_KEY 未配置") - + raise ValueError("LLM_API_KEY is not configured") + self.client = OpenAI( api_key=self.api_key, base_url=self.base_url ) - + def generate_config( self, simulation_id: str, @@ -253,70 +255,70 @@ class SimulationConfigGenerator: progress_callback: Optional[Callable[[int, int, str], None]] = None, ) -> SimulationParameters: """ - 智能生成完整的模拟配置(分步生成) - + Intelligently generate a complete simulation configuration (step-by-step). + Args: - simulation_id: 模拟ID - project_id: 项目ID - graph_id: 图谱ID - simulation_requirement: 模拟需求描述 - document_text: 原始文档内容 - entities: 过滤后的实体列表 - enable_twitter: 是否启用Twitter - enable_reddit: 是否启用Reddit - progress_callback: 进度回调函数(current_step, total_steps, message) - + simulation_id: simulation ID + project_id: project ID + graph_id: graph ID + simulation_requirement: simulation requirement description + document_text: original document content + entities: filtered entity list + enable_twitter: whether to enable Twitter + enable_reddit: whether to enable Reddit + progress_callback: progress callback function(current_step, total_steps, message) + Returns: - SimulationParameters: 完整的模拟参数 + SimulationParameters: complete simulation parameters """ - logger.info(f"开始智能生成模拟配置: simulation_id={simulation_id}, 实体数={len(entities)}") - - # 计算总步骤数 + logger.info(f"Starting intelligent simulation config generation: simulation_id={simulation_id}, entities={len(entities)}") + + # Calculate total number of steps num_batches = math.ceil(len(entities) / self.AGENTS_PER_BATCH) - total_steps = 3 + num_batches # 时间配置 + 事件配置 + N批Agent + 平台配置 + total_steps = 3 + num_batches # time config + event config + N agent batches + platform config current_step = 0 - + def report_progress(step: int, message: str): nonlocal current_step current_step = step if progress_callback: progress_callback(step, total_steps, message) logger.info(f"[{step}/{total_steps}] {message}") - - # 1. 构建基础上下文信息 + + # 1. 
Build base context information context = self._build_context( simulation_requirement=simulation_requirement, document_text=document_text, entities=entities ) - + reasoning_parts = [] - - # ========== 步骤1: 生成时间配置 ========== + + # ========== Step 1: Generate time configuration ========== report_progress(1, t('progress.generatingTimeConfig')) num_entities = len(entities) time_config_result = self._generate_time_config(context, num_entities) time_config = self._parse_time_config(time_config_result, num_entities) reasoning_parts.append(f"{t('progress.timeConfigLabel')}: {time_config_result.get('reasoning', t('common.success'))}") - - # ========== 步骤2: 生成事件配置 ========== + + # ========== Step 2: Generate event configuration ========== report_progress(2, t('progress.generatingEventConfig')) event_config_result = self._generate_event_config(context, simulation_requirement, entities) event_config = self._parse_event_config(event_config_result) reasoning_parts.append(f"{t('progress.eventConfigLabel')}: {event_config_result.get('reasoning', t('common.success'))}") - - # ========== 步骤3-N: 分批生成Agent配置 ========== + + # ========== Steps 3-N: Generate agent configurations in batches ========== all_agent_configs = [] for batch_idx in range(num_batches): start_idx = batch_idx * self.AGENTS_PER_BATCH end_idx = min(start_idx + self.AGENTS_PER_BATCH, len(entities)) batch_entities = entities[start_idx:end_idx] - + report_progress( 3 + batch_idx, t('progress.generatingAgentConfig', start=start_idx + 1, end=end_idx, total=len(entities)) ) - + batch_configs = self._generate_agent_configs_batch( context=context, entities=batch_entities, @@ -324,20 +326,20 @@ class SimulationConfigGenerator: simulation_requirement=simulation_requirement ) all_agent_configs.extend(batch_configs) - + reasoning_parts.append(t('progress.agentConfigResult', count=len(all_agent_configs))) - - # ========== 为初始帖子分配发布者 Agent ========== - logger.info("为初始帖子分配合适的发布者 Agent...") + + # ========== Assign poster agents to 
initial posts ========== + logger.info("Assigning suitable poster agents to initial posts...") event_config = self._assign_initial_post_agents(event_config, all_agent_configs) assigned_count = len([p for p in event_config.initial_posts if p.get("poster_agent_id") is not None]) reasoning_parts.append(t('progress.postAssignResult', count=assigned_count)) - - # ========== 最后一步: 生成平台配置 ========== + + # ========== Final step: Generate platform configuration ========== report_progress(total_steps, t('progress.generatingPlatformConfig')) twitter_config = None reddit_config = None - + if enable_twitter: twitter_config = PlatformConfig( platform="twitter", @@ -347,7 +349,7 @@ class SimulationConfigGenerator: viral_threshold=10, echo_chamber_strength=0.5 ) - + if enable_reddit: reddit_config = PlatformConfig( platform="reddit", @@ -357,8 +359,8 @@ class SimulationConfigGenerator: viral_threshold=15, echo_chamber_strength=0.6 ) - - # 构建最终参数 + + # Build final parameters params = SimulationParameters( simulation_id=simulation_id, project_id=project_id, @@ -373,71 +375,71 @@ class SimulationConfigGenerator: llm_base_url=self.base_url, generation_reasoning=" | ".join(reasoning_parts) ) - - logger.info(f"模拟配置生成完成: {len(params.agent_configs)} 个Agent配置") - + + logger.info(f"Simulation config generation complete: {len(params.agent_configs)} agent configurations") + return params - + def _build_context( self, simulation_requirement: str, document_text: str, entities: List[EntityNode] ) -> str: - """构建LLM上下文,截断到最大长度""" - - # 实体摘要 + """Build LLM context, truncated to maximum length""" + + # Entity summary entity_summary = self._summarize_entities(entities) - - # 构建上下文 + + # Build context context_parts = [ - f"## 模拟需求\n{simulation_requirement}", - f"\n## 实体信息 ({len(entities)}个)\n{entity_summary}", + f"## Simulation Requirement\n{simulation_requirement}", + f"\n## Entity Information ({len(entities)} entities)\n{entity_summary}", ] - + current_length = sum(len(p) for p in context_parts) - 
remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # 留500字符余量 - + remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # Leave 500-char margin + if remaining_length > 0 and document_text: doc_text = document_text[:remaining_length] if len(document_text) > remaining_length: - doc_text += "\n...(文档已截断)" - context_parts.append(f"\n## 原始文档内容\n{doc_text}") - + doc_text += "\n...(document truncated)" + context_parts.append(f"\n## Original Document Content\n{doc_text}") + return "\n".join(context_parts) - + def _summarize_entities(self, entities: List[EntityNode]) -> str: - """生成实体摘要""" + """Generate entity summary""" lines = [] - - # 按类型分组 + + # Group by type by_type: Dict[str, List[EntityNode]] = {} for e in entities: t = e.get_entity_type() or "Unknown" if t not in by_type: by_type[t] = [] by_type[t].append(e) - + for entity_type, type_entities in by_type.items(): - lines.append(f"\n### {entity_type} ({len(type_entities)}个)") - # 使用配置的显示数量和摘要长度 + lines.append(f"\n### {entity_type} ({len(type_entities)} entities)") + # Use configured display count and summary length display_count = self.ENTITIES_PER_TYPE_DISPLAY summary_len = self.ENTITY_SUMMARY_LENGTH for e in type_entities[:display_count]: summary_preview = (e.summary[:summary_len] + "...") if len(e.summary) > summary_len else e.summary lines.append(f"- {e.name}: {summary_preview}") if len(type_entities) > display_count: - lines.append(f" ... 还有 {len(type_entities) - display_count} 个") - + lines.append(f" ... 
and {len(type_entities) - display_count} more") + return "\n".join(lines) - + def _call_llm_with_retry(self, prompt: str, system_prompt: str) -> Dict[str, Any]: - """带重试的LLM调用,包含JSON修复逻辑""" + """LLM call with retry, including JSON repair logic""" import re - + max_attempts = 3 last_error = None - + for attempt in range(max_attempts): try: response = self.client.chat.completions.create( @@ -447,121 +449,121 @@ class SimulationConfigGenerator: {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # Lower temperature on each retry + # No max_tokens set — let the LLM generate freely ) - + content = response.choices[0].message.content finish_reason = response.choices[0].finish_reason - - # 检查是否被截断 + + # Check if output was truncated if finish_reason == 'length': - logger.warning(f"LLM输出被截断 (attempt {attempt+1})") + logger.warning(f"LLM output truncated (attempt {attempt+1})") content = self._fix_truncated_json(content) - - # 尝试解析JSON + + # Attempt to parse JSON try: return json.loads(content) except json.JSONDecodeError as e: - logger.warning(f"JSON解析失败 (attempt {attempt+1}): {str(e)[:80]}") - - # 尝试修复JSON + logger.warning(f"JSON parse failed (attempt {attempt+1}): {str(e)[:80]}") + + # Attempt to repair JSON fixed = self._try_fix_config_json(content) if fixed: return fixed - + last_error = e - + except Exception as e: - logger.warning(f"LLM调用失败 (attempt {attempt+1}): {str(e)[:80]}") + logger.warning(f"LLM call failed (attempt {attempt+1}): {str(e)[:80]}") last_error = e import time time.sleep(2 * (attempt + 1)) - - raise last_error or Exception("LLM调用失败") - + + raise last_error or Exception("LLM call failed") + def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON""" + """Repair truncated JSON""" content = content.strip() - - # 计算未闭合的括号 + + # Count unclosed brackets open_braces = content.count('{') - content.count('}') 
open_brackets = content.count('[') - content.count(']') - - # 检查是否有未闭合的字符串 + + # Check for unclosed string if content and content[-1] not in '",}]': content += '"' - - # 闭合括号 + + # Close brackets content += ']' * open_brackets content += '}' * open_braces - + return content - + def _try_fix_config_json(self, content: str) -> Optional[Dict[str, Any]]: - """尝试修复配置JSON""" + """Attempt to repair configuration JSON""" import re - - # 修复被截断的情况 + + # Repair truncated content content = self._fix_truncated_json(content) - - # 提取JSON部分 + + # Extract JSON portion json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - - # 移除字符串中的换行符 + + # Remove newlines inside strings def fix_string(match): s = match.group(0) s = s.replace('\n', ' ').replace('\r', ' ') s = re.sub(r'\s+', ' ', s) return s - + json_str = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', fix_string, json_str) - + try: return json.loads(json_str) except: - # 尝试移除所有控制字符 + # Try removing all control characters json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) json_str = re.sub(r'\s+', ' ', json_str) try: return json.loads(json_str) except: pass - + return None - + def _generate_time_config(self, context: str, num_entities: int) -> Dict[str, Any]: - """生成时间配置""" - # 使用配置的上下文截断长度 + """Generate time configuration""" + # Use configured context truncation length context_truncated = context[:self.TIME_CONFIG_CONTEXT_LENGTH] - - # 计算最大允许值(80%的agent数) + + # Calculate maximum allowed value (90% of agent count) max_agents_allowed = max(1, int(num_entities * 0.9)) - - prompt = f"""基于以下模拟需求,生成时间模拟配置。 + + prompt = f"""Based on the following simulation requirements, generate a time simulation configuration. {context_truncated} -## 任务 -请生成时间配置JSON。 +## Task +Generate the time configuration JSON. 
-### 基本原则(仅供参考,需根据具体事件和参与群体灵活调整): -- 请根据模拟场景推断目标用户群体所在时区和作息习惯,以下为东八区(UTC+8)的参考示例 -- 凌晨0-5点几乎无人活动(活跃度系数0.05) -- 早上6-8点逐渐活跃(活跃度系数0.4) -- 工作时间9-18点中等活跃(活跃度系数0.7) -- 晚间19-22点是高峰期(活跃度系数1.5) -- 23点后活跃度下降(活跃度系数0.5) -- 一般规律:凌晨低活跃、早间渐增、工作时段中等、晚间高峰 -- **重要**:以下示例值仅供参考,你需要根据事件性质、参与群体特点来调整具体时段 - - 例如:学生群体高峰可能是21-23点;媒体全天活跃;官方机构只在工作时间 - - 例如:突发热点可能导致深夜也有讨论,off_peak_hours 可适当缩短 +### Basic principles (for reference only — adjust flexibly based on the specific event and participant group): +- Infer the target user group's timezone and daily schedule from the simulation scenario; the following is a reference example for UTC+8 +- Almost no activity from 0-5 (activity multiplier 0.05) +- Gradually becoming active from 6-8 (activity multiplier 0.4) +- Moderate activity during working hours 9-18 (activity multiplier 0.7) +- Peak period 19-22 in the evening (activity multiplier 1.5) +- Activity declines after 23:00 (activity multiplier 0.5) +- General pattern: low activity in early hours, increasing in morning, moderate during work hours, peak in evening +- **Important**: The example values below are for reference only — adjust specific time slots based on event nature and participant characteristics + - Example: student groups may peak at 21-23; media is active all day; official institutions only during working hours + - Example: a sudden trending topic may generate discussion even late at night, so off_peak_hours can be shortened -### 返回JSON格式(不要markdown) +### Return JSON format (no markdown) -示例: +Example: {{ "total_simulation_hours": 72, "minutes_per_round": 60, @@ -571,92 +573,92 @@ class SimulationConfigGenerator: "off_peak_hours": [0, 1, 2, 3, 4, 5], "morning_hours": [6, 7, 8], "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - "reasoning": "针对该事件的时间配置说明" + "reasoning": "Explanation of the time configuration for this event" }} -字段说明: -- total_simulation_hours (int): 模拟总时长,24-168小时,突发事件短、持续话题长 -- minutes_per_round (int): 每轮时长,30-120分钟,建议60分钟 -- agents_per_hour_min (int): 
每小时最少激活Agent数(取值范围: 1-{max_agents_allowed}) -- agents_per_hour_max (int): 每小时最多激活Agent数(取值范围: 1-{max_agents_allowed}) -- peak_hours (int数组): 高峰时段,根据事件参与群体调整 -- off_peak_hours (int数组): 低谷时段,通常深夜凌晨 -- morning_hours (int数组): 早间时段 -- work_hours (int数组): 工作时段 -- reasoning (string): 简要说明为什么这样配置""" +Field descriptions: +- total_simulation_hours (int): total simulation duration, 24-168 hours; shorter for sudden events, longer for sustained topics +- minutes_per_round (int): duration per round, 30-120 minutes, 60 minutes recommended +- agents_per_hour_min (int): minimum agents activated per hour (range: 1-{max_agents_allowed}) +- agents_per_hour_max (int): maximum agents activated per hour (range: 1-{max_agents_allowed}) +- peak_hours (int array): peak hours, adjust based on participant group +- off_peak_hours (int array): off-peak hours, usually late night / early morning +- morning_hours (int array): morning hours +- work_hours (int array): working hours +- reasoning (string): brief explanation of why this configuration was chosen""" - system_prompt = "你是社交媒体模拟专家。返回纯JSON格式,时间配置需符合模拟场景中目标用户群体的作息习惯。" + system_prompt = "You are a social media simulation expert. Return pure JSON format. The time configuration must match the daily schedule of the target user group in the simulation scenario." 
system_prompt = f"{system_prompt}\n\n{get_language_instruction()}" try: return self._call_llm_with_retry(prompt, system_prompt) except Exception as e: - logger.warning(f"时间配置LLM生成失败: {e}, 使用默认配置") + logger.warning(f"Time config LLM generation failed: {e}, using default configuration") return self._get_default_time_config(num_entities) - + def _get_default_time_config(self, num_entities: int) -> Dict[str, Any]: - """获取默认时间配置(中国人作息)""" + """Get default time configuration""" return { "total_simulation_hours": 72, - "minutes_per_round": 60, # 每轮1小时,加快时间流速 + "minutes_per_round": 60, # 1 hour per round to accelerate time flow "agents_per_hour_min": max(1, num_entities // 15), "agents_per_hour_max": max(5, num_entities // 5), "peak_hours": [19, 20, 21, 22], "off_peak_hours": [0, 1, 2, 3, 4, 5], "morning_hours": [6, 7, 8], "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - "reasoning": "使用默认中国人作息配置(每轮1小时)" + "reasoning": "Using default time configuration (1 hour per round)" } - + def _parse_time_config(self, result: Dict[str, Any], num_entities: int) -> TimeSimulationConfig: - """解析时间配置结果,并验证agents_per_hour值不超过总agent数""" - # 获取原始值 + """Parse time configuration result and validate agents_per_hour values do not exceed total agent count""" + # Get raw values agents_per_hour_min = result.get("agents_per_hour_min", max(1, num_entities // 15)) agents_per_hour_max = result.get("agents_per_hour_max", max(5, num_entities // 5)) - - # 验证并修正:确保不超过总agent数 + + # Validate and correct: ensure values do not exceed total agent count if agents_per_hour_min > num_entities: - logger.warning(f"agents_per_hour_min ({agents_per_hour_min}) 超过总Agent数 ({num_entities}),已修正") + logger.warning(f"agents_per_hour_min ({agents_per_hour_min}) exceeds total agent count ({num_entities}), correcting") agents_per_hour_min = max(1, num_entities // 10) - + if agents_per_hour_max > num_entities: - logger.warning(f"agents_per_hour_max ({agents_per_hour_max}) 超过总Agent数 ({num_entities}),已修正") + 
logger.warning(f"agents_per_hour_max ({agents_per_hour_max}) exceeds total agent count ({num_entities}), correcting") agents_per_hour_max = max(agents_per_hour_min + 1, num_entities // 2) - - # 确保 min < max + + # Ensure min < max if agents_per_hour_min >= agents_per_hour_max: agents_per_hour_min = max(1, agents_per_hour_max // 2) - logger.warning(f"agents_per_hour_min >= max,已修正为 {agents_per_hour_min}") - + logger.warning(f"agents_per_hour_min >= max, corrected to {agents_per_hour_min}") + return TimeSimulationConfig( total_simulation_hours=result.get("total_simulation_hours", 72), - minutes_per_round=result.get("minutes_per_round", 60), # 默认每轮1小时 + minutes_per_round=result.get("minutes_per_round", 60), # Default: 1 hour per round agents_per_hour_min=agents_per_hour_min, agents_per_hour_max=agents_per_hour_max, peak_hours=result.get("peak_hours", [19, 20, 21, 22]), off_peak_hours=result.get("off_peak_hours", [0, 1, 2, 3, 4, 5]), - off_peak_activity_multiplier=0.05, # 凌晨几乎无人 + off_peak_activity_multiplier=0.05, # Almost no activity in early hours morning_hours=result.get("morning_hours", [6, 7, 8]), morning_activity_multiplier=0.4, work_hours=result.get("work_hours", list(range(9, 19))), work_activity_multiplier=0.7, peak_activity_multiplier=1.5 ) - + def _generate_event_config( - self, - context: str, + self, + context: str, simulation_requirement: str, entities: List[EntityNode] ) -> Dict[str, Any]: - """生成事件配置""" - - # 获取可用的实体类型列表,供 LLM 参考 + """Generate event configuration""" + + # Get list of available entity types for LLM reference entity_types_available = list(set( e.get_entity_type() or "Unknown" for e in entities )) - - # 为每种类型列出代表性实体名称 + + # List representative entity names for each type type_examples = {} for e in entities: etype = e.get_entity_type() or "Unknown" @@ -664,89 +666,89 @@ class SimulationConfigGenerator: type_examples[etype] = [] if len(type_examples[etype]) < 3: type_examples[etype].append(e.name) - + type_info = "\n".join([ - f"- {t}: {', 
'.join(examples)}" + f"- {t}: {', '.join(examples)}" for t, examples in type_examples.items() ]) - - # 使用配置的上下文截断长度 - context_truncated = context[:self.EVENT_CONFIG_CONTEXT_LENGTH] - - prompt = f"""基于以下模拟需求,生成事件配置。 -模拟需求: {simulation_requirement} + # Use configured context truncation length + context_truncated = context[:self.EVENT_CONFIG_CONTEXT_LENGTH] + + prompt = f"""Based on the following simulation requirements, generate the event configuration. + +Simulation requirement: {simulation_requirement} {context_truncated} -## 可用实体类型及示例 +## Available entity types and examples {type_info} -## 任务 -请生成事件配置JSON: -- 提取热点话题关键词 -- 描述舆论发展方向 -- 设计初始帖子内容,**每个帖子必须指定 poster_type(发布者类型)** +## Task +Generate the event configuration JSON: +- Extract hot topic keywords +- Describe the direction of public opinion development +- Design initial post content — **each post must specify a poster_type (poster entity type)** -**重要**: poster_type 必须从上面的"可用实体类型"中选择,这样初始帖子才能分配给合适的 Agent 发布。 -例如:官方声明应由 Official/University 类型发布,新闻由 MediaOutlet 发布,学生观点由 Student 发布。 +**Important**: poster_type must be chosen from the "Available entity types" listed above, so that initial posts can be assigned to the appropriate agent for publishing. +For example: official announcements should be posted by Official/University types, news by MediaOutlet, student opinions by Student. -返回JSON格式(不要markdown): +Return JSON format (no markdown): {{ - "hot_topics": ["关键词1", "关键词2", ...], - "narrative_direction": "<舆论发展方向描述>", + "hot_topics": ["keyword1", "keyword2", ...], + "narrative_direction": "", "initial_posts": [ - {{"content": "帖子内容", "poster_type": "实体类型(必须从可用类型中选择)"}}, + {{"content": "post content", "poster_type": "entity type (must be chosen from available types)"}}, ... ], - "reasoning": "<简要说明>" + "reasoning": "" }}""" - system_prompt = "你是舆论分析专家。返回纯JSON格式。注意 poster_type 必须精确匹配可用实体类型。" + system_prompt = "You are a public opinion analysis expert. Return pure JSON format. 
Note that poster_type must exactly match the available entity types." system_prompt = f"{system_prompt}\n\n{get_language_instruction()}\nIMPORTANT: The 'poster_type' field value MUST be in English PascalCase exactly matching the available entity types. Only 'content', 'narrative_direction', 'hot_topics' and 'reasoning' fields should use the specified language." try: return self._call_llm_with_retry(prompt, system_prompt) except Exception as e: - logger.warning(f"事件配置LLM生成失败: {e}, 使用默认配置") + logger.warning(f"Event config LLM generation failed: {e}, using default configuration") return { "hot_topics": [], "narrative_direction": "", "initial_posts": [], - "reasoning": "使用默认配置" + "reasoning": "Using default configuration" } - + def _parse_event_config(self, result: Dict[str, Any]) -> EventConfig: - """解析事件配置结果""" + """Parse event configuration result""" return EventConfig( initial_posts=result.get("initial_posts", []), scheduled_events=[], hot_topics=result.get("hot_topics", []), narrative_direction=result.get("narrative_direction", "") ) - + def _assign_initial_post_agents( self, event_config: EventConfig, agent_configs: List[AgentActivityConfig] ) -> EventConfig: """ - 为初始帖子分配合适的发布者 Agent - - 根据每个帖子的 poster_type 匹配最合适的 agent_id + Assign suitable poster agents to initial posts. + + Matches the most appropriate agent_id for each post based on its poster_type. 
""" if not event_config.initial_posts: return event_config - - # 按实体类型建立 agent 索引 + + # Build agent index by entity type agents_by_type: Dict[str, List[AgentActivityConfig]] = {} for agent in agent_configs: etype = agent.entity_type.lower() if etype not in agents_by_type: agents_by_type[etype] = [] agents_by_type[etype].append(agent) - - # 类型映射表(处理 LLM 可能输出的不同格式) + + # Type alias map (handles different formats the LLM may output) type_aliases = { "official": ["official", "university", "governmentagency", "government"], "university": ["university", "official"], @@ -757,26 +759,26 @@ class SimulationConfigGenerator: "organization": ["organization", "ngo", "company", "group"], "person": ["person", "student", "alumni"], } - - # 记录每种类型已使用的 agent 索引,避免重复使用同一个 agent + + # Track the used agent index per type to avoid assigning the same agent twice used_indices: Dict[str, int] = {} - + updated_posts = [] for post in event_config.initial_posts: poster_type = post.get("poster_type", "").lower() content = post.get("content", "") - - # 尝试找到匹配的 agent + + # Try to find a matching agent matched_agent_id = None - - # 1. 直接匹配 + + # 1. Direct match if poster_type in agents_by_type: agents = agents_by_type[poster_type] idx = used_indices.get(poster_type, 0) % len(agents) matched_agent_id = agents[idx].agent_id used_indices[poster_type] = idx + 1 else: - # 2. 使用别名匹配 + # 2. Alias match for alias_key, aliases in type_aliases.items(): if poster_type in aliases or alias_key == poster_type: for alias in aliases: @@ -788,28 +790,28 @@ class SimulationConfigGenerator: break if matched_agent_id is not None: break - - # 3. 如果仍未找到,使用影响力最高的 agent + + # 3. 
If still no match, use the agent with the highest influence if matched_agent_id is None: - logger.warning(f"未找到类型 '{poster_type}' 的匹配 Agent,使用影响力最高的 Agent") + logger.warning(f"No matching agent found for type '{poster_type}', using the highest-influence agent") if agent_configs: - # 按影响力排序,选择影响力最高的 + # Sort by influence and pick the highest sorted_agents = sorted(agent_configs, key=lambda a: a.influence_weight, reverse=True) matched_agent_id = sorted_agents[0].agent_id else: matched_agent_id = 0 - + updated_posts.append({ "content": content, "poster_type": post.get("poster_type", "Unknown"), "poster_agent_id": matched_agent_id }) - - logger.info(f"初始帖子分配: poster_type='{poster_type}' -> agent_id={matched_agent_id}") - + + logger.info(f"Initial post assigned: poster_type='{poster_type}' -> agent_id={matched_agent_id}") + event_config.initial_posts = updated_posts return event_config - + def _generate_agent_configs_batch( self, context: str, @@ -817,9 +819,9 @@ class SimulationConfigGenerator: start_idx: int, simulation_requirement: str ) -> List[AgentActivityConfig]: - """分批生成Agent配置""" - - # 构建实体信息(使用配置的摘要长度) + """Generate agent configurations in batches""" + + # Build entity information (using configured summary length) entity_list = [] summary_len = self.AGENT_SUMMARY_LENGTH for i, e in enumerate(entities): @@ -829,63 +831,63 @@ class SimulationConfigGenerator: "entity_type": e.get_entity_type() or "Unknown", "summary": e.summary[:summary_len] if e.summary else "" }) - - prompt = f"""基于以下信息,为每个实体生成社交媒体活动配置。 -模拟需求: {simulation_requirement} + prompt = f"""Based on the following information, generate social media activity configurations for each entity. 
-## 实体列表 +Simulation requirement: {simulation_requirement} + +## Entity list ```json {json.dumps(entity_list, ensure_ascii=False, indent=2)} ``` -## 任务 -为每个实体生成活动配置,注意: -- **时间符合目标用户群体作息**:以下为参考(东八区),请根据模拟场景调整 -- **官方机构**(University/GovernmentAgency):活跃度低(0.1-0.3),工作时间(9-17)活动,响应慢(60-240分钟),影响力高(2.5-3.0) -- **媒体**(MediaOutlet):活跃度中(0.4-0.6),全天活动(8-23),响应快(5-30分钟),影响力高(2.0-2.5) -- **个人**(Student/Person/Alumni):活跃度高(0.6-0.9),主要晚间活动(18-23),响应快(1-15分钟),影响力低(0.8-1.2) -- **公众人物/专家**:活跃度中(0.4-0.6),影响力中高(1.5-2.0) +## Task +Generate activity configurations for each entity. Notes: +- **Activity times should match the target user group's schedule**: the following is for reference (UTC+8); adjust based on the simulation scenario +- **Official institutions** (University/GovernmentAgency): low activity (0.1-0.3), active during work hours (9-17), slow response (60-240 min), high influence (2.5-3.0) +- **Media** (MediaOutlet): medium activity (0.4-0.6), active all day (8-23), fast response (5-30 min), high influence (2.0-2.5) +- **Individuals** (Student/Person/Alumni): high activity (0.6-0.9), mainly evening activity (18-23), fast response (1-15 min), low influence (0.8-1.2) +- **Public figures/experts**: medium activity (0.4-0.6), medium-high influence (1.5-2.0) -返回JSON格式(不要markdown): +Return JSON format (no markdown): {{ "agent_configs": [ {{ - "agent_id": <必须与输入一致>, + "agent_id": , "activity_level": <0.0-1.0>, - "posts_per_hour": <发帖频率>, - "comments_per_hour": <评论频率>, - "active_hours": [<活跃小时列表,考虑中国人作息>], - "response_delay_min": <最小响应延迟分钟>, - "response_delay_max": <最大响应延迟分钟>, - "sentiment_bias": <-1.0到1.0>, + "posts_per_hour": , + "comments_per_hour": , + "active_hours": [], + "response_delay_min": , + "response_delay_max": , + "sentiment_bias": <-1.0 to 1.0>, "stance": "", - "influence_weight": <影响力权重> + "influence_weight": }}, ... ] }}""" - system_prompt = "你是社交媒体行为分析专家。返回纯JSON,配置需符合模拟场景中目标用户群体的作息习惯。" + system_prompt = "You are a social media behavior analysis expert. 
Return pure JSON. Configurations must match the daily schedule of the target user group in the simulation scenario." system_prompt = f"{system_prompt}\n\n{get_language_instruction()}\nIMPORTANT: The 'stance' field value MUST be one of the English strings: 'supportive', 'opposing', 'neutral', 'observer'. All JSON field names and numeric values must remain unchanged. Only natural language text fields should use the specified language." try: result = self._call_llm_with_retry(prompt, system_prompt) llm_configs = {cfg["agent_id"]: cfg for cfg in result.get("agent_configs", [])} except Exception as e: - logger.warning(f"Agent配置批次LLM生成失败: {e}, 使用规则生成") + logger.warning(f"Agent config batch LLM generation failed: {e}, using rule-based generation") llm_configs = {} - - # 构建AgentActivityConfig对象 + + # Build AgentActivityConfig objects configs = [] for i, entity in enumerate(entities): agent_id = start_idx + i cfg = llm_configs.get(agent_id, {}) - - # 如果LLM没有生成,使用规则生成 + + # If LLM did not generate a config, use rule-based generation if not cfg: cfg = self._generate_agent_config_by_rule(entity) - + config = AgentActivityConfig( agent_id=agent_id, entity_uuid=entity.uuid, @@ -902,15 +904,15 @@ class SimulationConfigGenerator: influence_weight=cfg.get("influence_weight", 1.0) ) configs.append(config) - + return configs - + def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: - """基于规则生成单个Agent配置(中国人作息)""" + """Generate a single agent configuration using rule-based logic""" entity_type = (entity.get_entity_type() or "Unknown").lower() - + if entity_type in ["university", "governmentagency", "ngo"]: - # 官方机构:工作时间活动,低频率,高影响力 + # Official institutions: active during work hours, low frequency, high influence return { "activity_level": 0.2, "posts_per_hour": 0.1, @@ -923,7 +925,7 @@ class SimulationConfigGenerator: "influence_weight": 3.0 } elif entity_type in ["mediaoutlet"]: - # 媒体:全天活动,中等频率,高影响力 + # Media: active all day, moderate frequency, high 
influence return { "activity_level": 0.5, "posts_per_hour": 0.8, @@ -936,7 +938,7 @@ class SimulationConfigGenerator: "influence_weight": 2.5 } elif entity_type in ["professor", "expert", "official"]: - # 专家/教授:工作+晚间活动,中等频率 + # Experts/professors: active during work + evening hours, moderate frequency return { "activity_level": 0.4, "posts_per_hour": 0.3, @@ -949,12 +951,12 @@ class SimulationConfigGenerator: "influence_weight": 2.0 } elif entity_type in ["student"]: - # 学生:晚间为主,高频率 + # Students: mainly active in the evening, high frequency return { "activity_level": 0.8, "posts_per_hour": 0.6, "comments_per_hour": 1.5, - "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 上午+晚间 + "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Morning + evening "response_delay_min": 1, "response_delay_max": 15, "sentiment_bias": 0.0, @@ -962,12 +964,12 @@ class SimulationConfigGenerator: "influence_weight": 0.8 } elif entity_type in ["alumni"]: - # 校友:晚间为主 + # Alumni: mainly active in the evening return { "activity_level": 0.6, "posts_per_hour": 0.4, "comments_per_hour": 0.8, - "active_hours": [12, 13, 19, 20, 21, 22, 23], # 午休+晚间 + "active_hours": [12, 13, 19, 20, 21, 22, 23], # Lunch break + evening "response_delay_min": 5, "response_delay_max": 30, "sentiment_bias": 0.0, @@ -975,17 +977,15 @@ class SimulationConfigGenerator: "influence_weight": 1.0 } else: - # 普通人:晚间高峰 + # General public: evening peak return { "activity_level": 0.7, "posts_per_hour": 0.5, "comments_per_hour": 1.2, - "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 白天+晚间 + "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Daytime + evening "response_delay_min": 2, "response_delay_max": 20, "sentiment_bias": 0.0, "stance": "neutral", "influence_weight": 1.0 } - - diff --git a/backend/app/services/simulation_ipc.py b/backend/app/services/simulation_ipc.py index 9d70d0be..fc43055c 100644 --- a/backend/app/services/simulation_ipc.py +++ 
b/backend/app/services/simulation_ipc.py @@ -1,11 +1,11 @@ """ -模拟IPC通信模块 -用于Flask后端和模拟脚本之间的进程间通信 +Simulation IPC communication module +Used for inter-process communication between the Flask backend and simulation scripts. -通过文件系统实现简单的命令/响应模式: -1. Flask写入命令到 commands/ 目录 -2. 模拟脚本轮询命令目录,执行命令并写入响应到 responses/ 目录 -3. Flask轮询响应目录获取结果 +Implements a simple command/response pattern via the file system: +1. Flask writes commands to the commands/ directory +2. Simulation scripts poll the command directory, execute commands, and write responses to the responses/ directory +3. Flask polls the response directory to get results """ import os @@ -23,14 +23,14 @@ logger = get_logger('mirofish.simulation_ipc') class CommandType(str, Enum): - """命令类型""" - INTERVIEW = "interview" # 单个Agent采访 - BATCH_INTERVIEW = "batch_interview" # 批量采访 - CLOSE_ENV = "close_env" # 关闭环境 + """Command type""" + INTERVIEW = "interview" # Single agent interview + BATCH_INTERVIEW = "batch_interview" # Batch interview + CLOSE_ENV = "close_env" # Close environment class CommandStatus(str, Enum): - """命令状态""" + """Command status""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" @@ -39,12 +39,12 @@ class CommandStatus(str, Enum): @dataclass class IPCCommand: - """IPC命令""" + """IPC command""" command_id: str command_type: CommandType args: Dict[str, Any] timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, Any]: return { "command_id": self.command_id, @@ -52,7 +52,7 @@ class IPCCommand: "args": self.args, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCCommand': return cls( @@ -65,13 +65,13 @@ class IPCCommand: @dataclass class IPCResponse: - """IPC响应""" + """IPC response""" command_id: str status: CommandStatus result: Optional[Dict[str, Any]] = None error: Optional[str] = None timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, 
Any]: return { "command_id": self.command_id, @@ -80,7 +80,7 @@ class IPCResponse: "error": self.error, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCResponse': return cls( @@ -94,26 +94,26 @@ class IPCResponse: class SimulationIPCClient: """ - 模拟IPC客户端(Flask端使用) - - 用于向模拟进程发送命令并等待响应 + Simulation IPC client (used by the Flask side) + + Used to send commands to the simulation process and wait for responses """ - + def __init__(self, simulation_dir: str): """ - 初始化IPC客户端 - + Initialize the IPC client + Args: - simulation_dir: 模拟数据目录 + simulation_dir: simulation data directory """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - + def send_command( self, command_type: CommandType, @@ -122,19 +122,19 @@ class SimulationIPCClient: poll_interval: float = 0.5 ) -> IPCResponse: """ - 发送命令并等待响应 - + Send a command and wait for a response + Args: - command_type: 命令类型 - args: 命令参数 - timeout: 超时时间(秒) - poll_interval: 轮询间隔(秒) - + command_type: command type + args: command arguments + timeout: timeout in seconds + poll_interval: polling interval in seconds + Returns: IPCResponse - + Raises: - TimeoutError: 等待响应超时 + TimeoutError: timed out waiting for a response """ command_id = str(uuid.uuid4()) command = IPCCommand( @@ -142,50 +142,50 @@ class SimulationIPCClient: command_type=command_type, args=args ) - - # 写入命令文件 + + # Write command file command_file = os.path.join(self.commands_dir, f"{command_id}.json") with open(command_file, 'w', encoding='utf-8') as f: json.dump(command.to_dict(), f, ensure_ascii=False, indent=2) - - logger.info(f"发送IPC命令: {command_type.value}, command_id={command_id}") - - # 等待响应 + + logger.info(f"Sending IPC command: {command_type.value}, 
command_id={command_id}") + + # Wait for response response_file = os.path.join(self.responses_dir, f"{command_id}.json") start_time = time.time() - + while time.time() - start_time < timeout: if os.path.exists(response_file): try: with open(response_file, 'r', encoding='utf-8') as f: response_data = json.load(f) response = IPCResponse.from_dict(response_data) - - # 清理命令和响应文件 + + # Clean up command and response files try: os.remove(command_file) os.remove(response_file) except OSError: pass - - logger.info(f"收到IPC响应: command_id={command_id}, status={response.status.value}") + + logger.info(f"Received IPC response: command_id={command_id}, status={response.status.value}") return response except (json.JSONDecodeError, KeyError) as e: - logger.warning(f"解析响应失败: {e}") - + logger.warning(f"Failed to parse response: {e}") + time.sleep(poll_interval) - - # 超时 - logger.error(f"等待IPC响应超时: command_id={command_id}") - - # 清理命令文件 + + # Timeout + logger.error(f"Timed out waiting for IPC response: command_id={command_id}") + + # Clean up command file try: os.remove(command_file) except OSError: pass - - raise TimeoutError(f"等待命令响应超时 ({timeout}秒)") - + + raise TimeoutError(f"Timed out waiting for command response ({timeout}s)") + def send_interview( self, agent_id: int, @@ -194,19 +194,19 @@ class SimulationIPCClient: timeout: float = 60.0 ) -> IPCResponse: """ - 发送单个Agent采访命令 - + Send a single agent interview command + Args: agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,单平台模拟时采访该平台 - timeout: 超时时间 - + prompt: interview question + platform: target platform (optional) + - "twitter": interview only the Twitter platform + - "reddit": interview only the Reddit platform + - None: in dual-platform mode, interview both; in single-platform mode, interview that platform + timeout: timeout in seconds + Returns: - IPCResponse,result字段包含采访结果 + IPCResponse with interview result in the result field """ args 
= { "agent_id": agent_id, @@ -214,13 +214,13 @@ class SimulationIPCClient: } if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.INTERVIEW, args=args, timeout=timeout ) - + def send_batch_interview( self, interviews: List[Dict[str, Any]], @@ -228,36 +228,36 @@ class SimulationIPCClient: timeout: float = 120.0 ) -> IPCResponse: """ - 发送批量采访命令 - + Send a batch interview command + Args: - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间 - + interviews: list of interviews, each containing {"agent_id": int, "prompt": str, "platform": str (optional)} + platform: default platform (optional; overridden per-item by each interview's platform) + - "twitter": default to Twitter platform only + - "reddit": default to Reddit platform only + - None: in dual-platform mode, interview each agent on both platforms + timeout: timeout in seconds + Returns: - IPCResponse,result字段包含所有采访结果 + IPCResponse with all interview results in the result field """ args = {"interviews": interviews} if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.BATCH_INTERVIEW, args=args, timeout=timeout ) - + def send_close_env(self, timeout: float = 30.0) -> IPCResponse: """ - 发送关闭环境命令 - + Send a close-environment command + Args: - timeout: 超时时间 - + timeout: timeout in seconds + Returns: IPCResponse """ @@ -266,17 +266,17 @@ class SimulationIPCClient: args={}, timeout=timeout ) - + def check_env_alive(self) -> bool: """ - 检查模拟环境是否存活 - - 通过检查 env_status.json 文件来判断 + Check whether the simulation environment is alive + + Determined by checking the env_status.json file """ status_file = os.path.join(self.simulation_dir, "env_status.json") if not os.path.exists(status_file): return False - + try: with open(status_file, 'r', encoding='utf-8') as f: status = 
json.load(f) @@ -287,106 +287,106 @@ class SimulationIPCClient: class SimulationIPCServer: """ - 模拟IPC服务器(模拟脚本端使用) - - 轮询命令目录,执行命令并返回响应 + Simulation IPC server (used by the simulation script side) + + Polls the command directory, executes commands, and returns responses """ - + def __init__(self, simulation_dir: str): """ - 初始化IPC服务器 - + Initialize the IPC server + Args: - simulation_dir: 模拟数据目录 + simulation_dir: simulation data directory """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - - # 环境状态 + + # Environment status self._running = False - + def start(self): - """标记服务器为运行状态""" + """Mark the server as running""" self._running = True self._update_env_status("alive") - + def stop(self): - """标记服务器为停止状态""" + """Mark the server as stopped""" self._running = False self._update_env_status("stopped") - + def _update_env_status(self, status: str): - """更新环境状态文件""" + """Update the environment status file""" status_file = os.path.join(self.simulation_dir, "env_status.json") with open(status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, "timestamp": datetime.now().isoformat() }, f, ensure_ascii=False, indent=2) - + def poll_commands(self) -> Optional[IPCCommand]: """ - 轮询命令目录,返回第一个待处理的命令 - + Poll the command directory and return the first pending command + Returns: - IPCCommand 或 None + IPCCommand or None """ if not os.path.exists(self.commands_dir): return None - - # 按时间排序获取命令文件 + + # Get command files sorted by modification time command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): filepath = os.path.join(self.commands_dir, filename) command_files.append((filepath, os.path.getmtime(filepath))) - + command_files.sort(key=lambda x: x[1]) - + for 
filepath, _ in command_files: try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) return IPCCommand.from_dict(data) except (json.JSONDecodeError, KeyError, OSError) as e: - logger.warning(f"读取命令文件失败: {filepath}, {e}") + logger.warning(f"Failed to read command file: {filepath}, {e}") continue - + return None - + def send_response(self, response: IPCResponse): """ - 发送响应 - + Send a response + Args: - response: IPC响应 + response: IPC response """ response_file = os.path.join(self.responses_dir, f"{response.command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response.to_dict(), f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Delete the command file command_file = os.path.join(self.commands_dir, f"{response.command_id}.json") try: os.remove(command_file) except OSError: pass - + def send_success(self, command_id: str, result: Dict[str, Any]): - """发送成功响应""" + """Send a success response""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.COMPLETED, result=result )) - + def send_error(self, command_id: str, error: str): - """发送错误响应""" + """Send an error response""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.FAILED, diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 0d161a90..c15a8d0b 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -1,7 +1,7 @@ """ -OASIS模拟管理器 -管理Twitter和Reddit双平台并行模拟 -使用预设脚本 + LLM智能生成配置参数 +OASIS simulation manager +Manages parallel simulation on both Twitter and Reddit platforms. +Uses preset scripts with LLM-generated configuration parameters. 
""" import os @@ -23,60 +23,60 @@ logger = get_logger('mirofish.simulation') class SimulationStatus(str, Enum): - """模拟状态""" + """Simulation status""" CREATED = "created" PREPARING = "preparing" READY = "ready" RUNNING = "running" PAUSED = "paused" - STOPPED = "stopped" # 模拟被手动停止 - COMPLETED = "completed" # 模拟自然完成 + STOPPED = "stopped" # Simulation manually stopped + COMPLETED = "completed" # Simulation naturally completed FAILED = "failed" class PlatformType(str, Enum): - """平台类型""" + """Platform type""" TWITTER = "twitter" REDDIT = "reddit" @dataclass class SimulationState: - """模拟状态""" + """Simulation state""" simulation_id: str project_id: str graph_id: str - - # 平台启用状态 + + # Platform enable flags enable_twitter: bool = True enable_reddit: bool = True - - # 状态 + + # Status status: SimulationStatus = SimulationStatus.CREATED - - # 准备阶段数据 + + # Preparation phase data entities_count: int = 0 profiles_count: int = 0 entity_types: List[str] = field(default_factory=list) - - # 配置生成信息 + + # Config generation info config_generated: bool = False config_reasoning: str = "" - - # 运行时数据 + + # Runtime data current_round: int = 0 twitter_status: str = "not_started" reddit_status: str = "not_started" - - # 时间戳 + + # Timestamps created_at: str = field(default_factory=lambda: datetime.now().isoformat()) updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - - # 错误信息 + + # Error message error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """完整状态字典(内部使用)""" + """Full state dictionary (internal use)""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -96,9 +96,9 @@ class SimulationState: "updated_at": self.updated_at, "error": self.error, } - + def to_simple_dict(self) -> Dict[str, Any]: - """简化状态字典(API返回使用)""" + """Simplified state dictionary (used for API responses)""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -114,60 +114,60 @@ class SimulationState: class 
SimulationManager: """ - 模拟管理器 - - 核心功能: - 1. 从Zep图谱读取实体并过滤 - 2. 生成OASIS Agent Profile - 3. 使用LLM智能生成模拟配置参数 - 4. 准备预设脚本所需的所有文件 + Simulation manager + + Core functions: + 1. Read and filter entities from the Zep graph + 2. Generate OASIS Agent Profiles + 3. Use LLM to intelligently generate simulation configuration parameters + 4. Prepare all files required by the preset scripts """ - - # 模拟数据存储目录 + + # Simulation data storage directory SIMULATION_DATA_DIR = os.path.join( - os.path.dirname(__file__), + os.path.dirname(__file__), '../../uploads/simulations' ) - + def __init__(self): - # 确保目录存在 + # Ensure directory exists os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - - # 内存中的模拟状态缓存 + + # In-memory simulation state cache self._simulations: Dict[str, SimulationState] = {} - + def _get_simulation_dir(self, simulation_id: str) -> str: - """获取模拟数据目录""" + """Get the simulation data directory""" sim_dir = os.path.join(self.SIMULATION_DATA_DIR, simulation_id) os.makedirs(sim_dir, exist_ok=True) return sim_dir - + def _save_simulation_state(self, state: SimulationState): - """保存模拟状态到文件""" + """Save simulation state to file""" sim_dir = self._get_simulation_dir(state.simulation_id) state_file = os.path.join(sim_dir, "state.json") - + state.updated_at = datetime.now().isoformat() - + with open(state_file, 'w', encoding='utf-8') as f: json.dump(state.to_dict(), f, ensure_ascii=False, indent=2) - + self._simulations[state.simulation_id] = state - + def _load_simulation_state(self, simulation_id: str) -> Optional[SimulationState]: - """从文件加载模拟状态""" + """Load simulation state from file""" if simulation_id in self._simulations: return self._simulations[simulation_id] - + sim_dir = self._get_simulation_dir(simulation_id) state_file = os.path.join(sim_dir, "state.json") - + if not os.path.exists(state_file): return None - + with open(state_file, 'r', encoding='utf-8') as f: data = json.load(f) - + state = SimulationState( simulation_id=simulation_id, 
project_id=data.get("project_id", ""), @@ -187,10 +187,10 @@ class SimulationManager: updated_at=data.get("updated_at", datetime.now().isoformat()), error=data.get("error"), ) - + self._simulations[simulation_id] = state return state - + def create_simulation( self, project_id: str, @@ -199,20 +199,20 @@ class SimulationManager: enable_reddit: bool = True, ) -> SimulationState: """ - 创建新的模拟 - + Create a new simulation. + Args: - project_id: 项目ID - graph_id: Zep图谱ID - enable_twitter: 是否启用Twitter模拟 - enable_reddit: 是否启用Reddit模拟 - + project_id: project ID + graph_id: Zep graph ID + enable_twitter: whether to enable Twitter simulation + enable_reddit: whether to enable Reddit simulation + Returns: SimulationState """ import uuid simulation_id = f"sim_{uuid.uuid4().hex[:12]}" - + state = SimulationState( simulation_id=simulation_id, project_id=project_id, @@ -221,12 +221,12 @@ class SimulationManager: enable_reddit=enable_reddit, status=SimulationStatus.CREATED, ) - + self._save_simulation_state(state) - logger.info(f"创建模拟: {simulation_id}, project={project_id}, graph={graph_id}") - + logger.info(f"Simulation created: {simulation_id}, project={project_id}, graph={graph_id}") + return state - + def prepare_simulation( self, simulation_id: str, @@ -238,55 +238,55 @@ class SimulationManager: parallel_profile_count: int = 3 ) -> SimulationState: """ - 准备模拟环境(全程自动化) - - 步骤: - 1. 从Zep图谱读取并过滤实体 - 2. 为每个实体生成OASIS Agent Profile(可选LLM增强,支持并行) - 3. 使用LLM智能生成模拟配置参数(时间、活跃度、发言频率等) - 4. 保存配置文件和Profile文件 - 5. 复制预设脚本到模拟目录 - + Prepare the simulation environment (fully automated). + + Steps: + 1. Read and filter entities from the Zep graph + 2. Generate an OASIS Agent Profile for each entity (optional LLM enhancement, supports parallelism) + 3. Use LLM to intelligently generate simulation configuration parameters (time, activity level, posting frequency, etc.) + 4. Save configuration files and profile files + 5. 
Copy preset scripts to the simulation directory + Args: - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述(用于LLM生成配置) - document_text: 原始文档内容(用于LLM理解背景) - defined_entity_types: 预定义的实体类型(可选) - use_llm_for_profiles: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (stage, progress, message) - parallel_profile_count: 并行生成人设的数量,默认3 - + simulation_id: simulation ID + simulation_requirement: simulation requirement description (used for LLM config generation) + document_text: original document content (used for LLM background understanding) + defined_entity_types: predefined entity types (optional) + use_llm_for_profiles: whether to use LLM to generate detailed personas + progress_callback: progress callback function (stage, progress, message) + parallel_profile_count: number of profiles to generate in parallel, default 3 + Returns: SimulationState """ state = self._load_simulation_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") - + raise ValueError(f"Simulation not found: {simulation_id}") + try: state.status = SimulationStatus.PREPARING self._save_simulation_state(state) - + sim_dir = self._get_simulation_dir(simulation_id) - - # ========== 阶段1: 读取并过滤实体 ========== + + # ========== Stage 1: Read and filter entities ========== if progress_callback: progress_callback("reading", 0, t('progress.connectingZepGraph')) - + reader = ZepEntityReader() - + if progress_callback: progress_callback("reading", 30, t('progress.readingNodeData')) - + filtered = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=defined_entity_types, enrich_with_edges=True ) - + state.entities_count = filtered.filtered_count state.entity_types = list(filtered.entity_types) - + if progress_callback: progress_callback( "reading", 100, @@ -294,16 +294,16 @@ class SimulationManager: current=filtered.filtered_count, total=filtered.filtered_count ) - + if filtered.filtered_count == 0: state.status = SimulationStatus.FAILED - state.error = 
"没有找到符合条件的实体,请检查图谱是否正确构建" + state.error = "No qualifying entities found. Please check that the graph was built correctly." self._save_simulation_state(state) return state - - # ========== 阶段2: 生成Agent Profile ========== + + # ========== Stage 2: Generate Agent Profiles ========== total_entities = len(filtered.entities) - + if progress_callback: progress_callback( "generating_profiles", 0, @@ -311,22 +311,22 @@ class SimulationManager: current=0, total=total_entities ) - - # 传入graph_id以启用Zep检索功能,获取更丰富的上下文 + + # Pass graph_id to enable Zep retrieval for richer context generator = OasisProfileGenerator(graph_id=state.graph_id) - + def profile_progress(current, total, msg): if progress_callback: progress_callback( - "generating_profiles", - int(current / total * 100), + "generating_profiles", + int(current / total * 100), msg, current=current, total=total, item_name=msg ) - - # 设置实时保存的文件路径(优先使用 Reddit JSON 格式) + + # Set real-time save path (prefer Reddit JSON format) realtime_output_path = None realtime_platform = "reddit" if state.enable_reddit: @@ -335,21 +335,21 @@ class SimulationManager: elif state.enable_twitter: realtime_output_path = os.path.join(sim_dir, "twitter_profiles.csv") realtime_platform = "twitter" - + profiles = generator.generate_profiles_from_entities( entities=filtered.entities, use_llm=use_llm_for_profiles, progress_callback=profile_progress, - graph_id=state.graph_id, # 传入graph_id用于Zep检索 - parallel_count=parallel_profile_count, # 并行生成数量 - realtime_output_path=realtime_output_path, # 实时保存路径 - output_platform=realtime_platform # 输出格式 + graph_id=state.graph_id, # Pass graph_id for Zep retrieval + parallel_count=parallel_profile_count, # Parallel generation count + realtime_output_path=realtime_output_path, # Real-time save path + output_platform=realtime_platform # Output format ) - + state.profiles_count = len(profiles) - - # 保存Profile文件(注意:Twitter使用CSV格式,Reddit使用JSON格式) - # Reddit 已经在生成过程中实时保存了,这里再保存一次确保完整性 + + # Save profile files (note: Twitter 
uses CSV format, Reddit uses JSON format) + # Reddit has already been saved incrementally during generation; save once more to ensure completeness if progress_callback: progress_callback( "generating_profiles", 95, @@ -357,22 +357,22 @@ class SimulationManager: current=total_entities, total=total_entities ) - + if state.enable_reddit: generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "reddit_profiles.json"), platform="reddit" ) - + if state.enable_twitter: - # Twitter使用CSV格式!这是OASIS的要求 + # Twitter uses CSV format — this is a requirement of OASIS generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "twitter_profiles.csv"), platform="twitter" ) - + if progress_callback: progress_callback( "generating_profiles", 100, @@ -380,8 +380,8 @@ class SimulationManager: current=len(profiles), total=len(profiles) ) - - # ========== 阶段3: LLM智能生成模拟配置 ========== + + # ========== Stage 3: LLM intelligent simulation configuration generation ========== if progress_callback: progress_callback( "generating_config", 0, @@ -389,9 +389,9 @@ class SimulationManager: current=0, total=3 ) - + config_generator = SimulationConfigGenerator() - + if progress_callback: progress_callback( "generating_config", 30, @@ -399,7 +399,7 @@ class SimulationManager: current=1, total=3 ) - + sim_params = config_generator.generate_config( simulation_id=simulation_id, project_id=state.project_id, @@ -410,7 +410,7 @@ class SimulationManager: enable_twitter=state.enable_twitter, enable_reddit=state.enable_reddit ) - + if progress_callback: progress_callback( "generating_config", 70, @@ -418,15 +418,15 @@ class SimulationManager: current=2, total=3 ) - - # 保存配置文件 + + # Save configuration file config_path = os.path.join(sim_dir, "simulation_config.json") with open(config_path, 'w', encoding='utf-8') as f: f.write(sim_params.to_json()) - + state.config_generated = True state.config_reasoning = sim_params.generation_reasoning - + if progress_callback: 
progress_callback( "generating_config", 100, @@ -434,82 +434,82 @@ class SimulationManager: current=3, total=3 ) - - # 注意:运行脚本保留在 backend/scripts/ 目录,不再复制到模拟目录 - # 启动模拟时,simulation_runner 会从 scripts/ 目录运行脚本 - - # 更新状态 + + # Note: run scripts remain in backend/scripts/; they are not copied to the simulation directory. + # When starting a simulation, simulation_runner runs scripts from the scripts/ directory. + + # Update status state.status = SimulationStatus.READY self._save_simulation_state(state) - - logger.info(f"模拟准备完成: {simulation_id}, " - f"entities={state.entities_count}, profiles={state.profiles_count}") - + + logger.info(f"Simulation preparation complete: {simulation_id}, " + f"entities={state.entities_count}, profiles={state.profiles_count}") + return state - + except Exception as e: - logger.error(f"模拟准备失败: {simulation_id}, error={str(e)}") + logger.error(f"Simulation preparation failed: {simulation_id}, error={str(e)}") import traceback logger.error(traceback.format_exc()) state.status = SimulationStatus.FAILED state.error = str(e) self._save_simulation_state(state) raise - + def get_simulation(self, simulation_id: str) -> Optional[SimulationState]: - """获取模拟状态""" + """Get simulation state""" return self._load_simulation_state(simulation_id) - + def list_simulations(self, project_id: Optional[str] = None) -> List[SimulationState]: - """列出所有模拟""" + """List all simulations""" simulations = [] - + if os.path.exists(self.SIMULATION_DATA_DIR): for sim_id in os.listdir(self.SIMULATION_DATA_DIR): - # 跳过隐藏文件(如 .DS_Store)和非目录文件 + # Skip hidden files (e.g. 
.DS_Store) and non-directory entries sim_path = os.path.join(self.SIMULATION_DATA_DIR, sim_id) if sim_id.startswith('.') or not os.path.isdir(sim_path): continue - + state = self._load_simulation_state(sim_id) if state: if project_id is None or state.project_id == project_id: simulations.append(state) - + return simulations - + def get_profiles(self, simulation_id: str, platform: str = "reddit") -> List[Dict[str, Any]]: - """获取模拟的Agent Profile""" + """Get agent profiles for a simulation""" state = self._load_simulation_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") - + raise ValueError(f"Simulation not found: {simulation_id}") + sim_dir = self._get_simulation_dir(simulation_id) profile_path = os.path.join(sim_dir, f"{platform}_profiles.json") - + if not os.path.exists(profile_path): return [] - + with open(profile_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_simulation_config(self, simulation_id: str) -> Optional[Dict[str, Any]]: - """获取模拟配置""" + """Get simulation configuration""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") - + if not os.path.exists(config_path): return None - + with open(config_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_run_instructions(self, simulation_id: str) -> Dict[str, str]: - """获取运行说明""" + """Get run instructions""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - + return { "simulation_dir": sim_dir, "scripts_dir": scripts_dir, @@ -520,10 +520,10 @@ class SimulationManager: "parallel": f"python {scripts_dir}/run_parallel_simulation.py --config {config_path}", }, "instructions": ( - f"1. 激活conda环境: conda activate MiroFish\n" - f"2. 
运行模拟 (脚本位于 {scripts_dir}):\n" - f" - 单独运行Twitter: python {scripts_dir}/run_twitter_simulation.py --config {config_path}\n" - f" - 单独运行Reddit: python {scripts_dir}/run_reddit_simulation.py --config {config_path}\n" - f" - 并行运行双平台: python {scripts_dir}/run_parallel_simulation.py --config {config_path}" + f"1. Activate conda environment: conda activate MiroFish\n" + f"2. Run simulation (scripts located at {scripts_dir}):\n" + f" - Twitter only: python {scripts_dir}/run_twitter_simulation.py --config {config_path}\n" + f" - Reddit only: python {scripts_dir}/run_reddit_simulation.py --config {config_path}\n" + f" - Both platforms in parallel: python {scripts_dir}/run_parallel_simulation.py --config {config_path}" ) } diff --git a/backend/app/services/simulation_runner.py b/backend/app/services/simulation_runner.py index e86021f8..19cf16ad 100644 --- a/backend/app/services/simulation_runner.py +++ b/backend/app/services/simulation_runner.py @@ -1,6 +1,6 @@ """ -OASIS模拟运行器 -在后台运行模拟并记录每个Agent的动作,支持实时状态监控 +OASIS simulation runner +Runs simulations in the background, records each agent's actions, and supports real-time status monitoring """ import os @@ -26,15 +26,15 @@ from .simulation_ipc import SimulationIPCClient, CommandType, IPCResponse logger = get_logger('mirofish.simulation_runner') -# 标记是否已注册清理函数 +# Flag indicating whether the cleanup function has been registered _cleanup_registered = False -# 平台检测 +# Platform detection IS_WINDOWS = sys.platform == 'win32' class RunnerStatus(str, Enum): - """运行器状态""" + """Runner status""" IDLE = "idle" STARTING = "starting" RUNNING = "running" @@ -47,7 +47,7 @@ class RunnerStatus(str, Enum): @dataclass class AgentAction: - """Agent动作记录""" + """Agent action record""" round_num: int timestamp: str platform: str # twitter / reddit @@ -57,7 +57,7 @@ class AgentAction: action_args: Dict[str, Any] = field(default_factory=dict) result: Optional[str] = None success: bool = True - + def to_dict(self) -> Dict[str, Any]: return { "round_num": 
self.round_num, @@ -74,7 +74,7 @@ class AgentAction: @dataclass class RoundSummary: - """每轮摘要""" + """Per-round summary""" round_num: int start_time: str end_time: Optional[str] = None @@ -83,7 +83,7 @@ class RoundSummary: reddit_actions: int = 0 active_agents: List[int] = field(default_factory=list) actions: List[AgentAction] = field(default_factory=list) - + def to_dict(self) -> Dict[str, Any]: return { "round_num": self.round_num, @@ -100,63 +100,63 @@ class RoundSummary: @dataclass class SimulationRunState: - """模拟运行状态(实时)""" + """Simulation run state (real-time)""" simulation_id: str runner_status: RunnerStatus = RunnerStatus.IDLE - - # 进度信息 + + # Progress info current_round: int = 0 total_rounds: int = 0 simulated_hours: int = 0 total_simulation_hours: int = 0 - - # 各平台独立轮次和模拟时间(用于双平台并行显示) + + # Per-platform independent rounds and simulated time (for parallel dual-platform display) twitter_current_round: int = 0 reddit_current_round: int = 0 twitter_simulated_hours: int = 0 reddit_simulated_hours: int = 0 - - # 平台状态 + + # Platform status twitter_running: bool = False reddit_running: bool = False twitter_actions_count: int = 0 reddit_actions_count: int = 0 - - # 平台完成状态(通过检测 actions.jsonl 中的 simulation_end 事件) + + # Platform completion status (detected via simulation_end events in actions.jsonl) twitter_completed: bool = False reddit_completed: bool = False - - # 每轮摘要 + + # Per-round summaries rounds: List[RoundSummary] = field(default_factory=list) - - # 最近动作(用于前端实时展示) + + # Recent actions (for real-time frontend display) recent_actions: List[AgentAction] = field(default_factory=list) max_recent_actions: int = 50 - - # 时间戳 + + # Timestamps started_at: Optional[str] = None updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) completed_at: Optional[str] = None - - # 错误信息 + + # Error info error: Optional[str] = None - - # 进程ID(用于停止) + + # Process ID (for stopping) process_pid: Optional[int] = None - + def add_action(self, action: 
AgentAction): - """添加动作到最近动作列表""" + """Add an action to the recent actions list""" self.recent_actions.insert(0, action) if len(self.recent_actions) > self.max_recent_actions: self.recent_actions = self.recent_actions[:self.max_recent_actions] - + if action.platform == "twitter": self.twitter_actions_count += 1 else: self.reddit_actions_count += 1 - + self.updated_at = datetime.now().isoformat() - + def to_dict(self) -> Dict[str, Any]: return { "simulation_id": self.simulation_id, @@ -166,7 +166,7 @@ class SimulationRunState: "simulated_hours": self.simulated_hours, "total_simulation_hours": self.total_simulation_hours, "progress_percent": round(self.current_round / max(self.total_rounds, 1) * 100, 1), - # 各平台独立轮次和时间 + # Per-platform independent rounds and time "twitter_current_round": self.twitter_current_round, "reddit_current_round": self.reddit_current_round, "twitter_simulated_hours": self.twitter_simulated_hours, @@ -184,9 +184,9 @@ class SimulationRunState: "error": self.error, "process_pid": self.process_pid, } - + def to_detail_dict(self) -> Dict[str, Any]: - """包含最近动作的详细信息""" + """Detailed info including recent actions""" result = self.to_dict() result["recent_actions"] = [a.to_dict() for a in self.recent_actions] result["rounds_count"] = len(self.rounds) @@ -195,61 +195,61 @@ class SimulationRunState: class SimulationRunner: """ - 模拟运行器 - - 负责: - 1. 在后台进程中运行OASIS模拟 - 2. 解析运行日志,记录每个Agent的动作 - 3. 提供实时状态查询接口 - 4. 支持暂停/停止/恢复操作 + Simulation runner + + Responsibilities: + 1. Run OASIS simulation in a background process + 2. Parse run logs and record each agent's actions + 3. Provide a real-time status query interface + 4. 
Support pause/stop/resume operations """ - - # 运行状态存储目录 + + # Run state storage directory RUN_STATE_DIR = os.path.join( os.path.dirname(__file__), '../../uploads/simulations' ) - - # 脚本目录 + + # Scripts directory SCRIPTS_DIR = os.path.join( os.path.dirname(__file__), '../../scripts' ) - - # 内存中的运行状态 + + # In-memory run states _run_states: Dict[str, SimulationRunState] = {} _processes: Dict[str, subprocess.Popen] = {} _action_queues: Dict[str, Queue] = {} _monitor_threads: Dict[str, threading.Thread] = {} - _stdout_files: Dict[str, Any] = {} # 存储 stdout 文件句柄 - _stderr_files: Dict[str, Any] = {} # 存储 stderr 文件句柄 - - # 图谱记忆更新配置 + _stdout_files: Dict[str, Any] = {} # stdout file handles + _stderr_files: Dict[str, Any] = {} # stderr file handles + + # Graph memory update configuration _graph_memory_enabled: Dict[str, bool] = {} # simulation_id -> enabled - + @classmethod def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """获取运行状态""" + """Get run state""" if simulation_id in cls._run_states: return cls._run_states[simulation_id] - - # 尝试从文件加载 + + # Try to load from file state = cls._load_run_state(simulation_id) if state: cls._run_states[simulation_id] = state return state - + @classmethod def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """从文件加载运行状态""" + """Load run state from file""" state_file = os.path.join(cls.RUN_STATE_DIR, simulation_id, "run_state.json") if not os.path.exists(state_file): return None - + try: with open(state_file, 'r', encoding='utf-8') as f: data = json.load(f) - + state = SimulationRunState( simulation_id=simulation_id, runner_status=RunnerStatus(data.get("runner_status", "idle")), @@ -257,7 +257,7 @@ class SimulationRunner: total_rounds=data.get("total_rounds", 0), simulated_hours=data.get("simulated_hours", 0), total_simulation_hours=data.get("total_simulation_hours", 0), - # 各平台独立轮次和时间 + # Per-platform independent rounds and time twitter_current_round=data.get("twitter_current_round", 
0), reddit_current_round=data.get("reddit_current_round", 0), twitter_simulated_hours=data.get("twitter_simulated_hours", 0), @@ -274,8 +274,8 @@ class SimulationRunner: error=data.get("error"), process_pid=data.get("process_pid"), ) - - # 加载最近动作 + + # Load recent actions actions_data = data.get("recent_actions", []) for a in actions_data: state.recent_actions.append(AgentAction( @@ -289,76 +289,76 @@ class SimulationRunner: result=a.get("result"), success=a.get("success", True), )) - + return state except Exception as e: - logger.error(f"加载运行状态失败: {str(e)}") + logger.error(f"Failed to load run state: {str(e)}") return None - + @classmethod def _save_run_state(cls, state: SimulationRunState): - """保存运行状态到文件""" + """Save run state to file""" sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) os.makedirs(sim_dir, exist_ok=True) state_file = os.path.join(sim_dir, "run_state.json") - + data = state.to_detail_dict() - + with open(state_file, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - + cls._run_states[state.simulation_id] = state - + @classmethod def start_simulation( cls, simulation_id: str, platform: str = "parallel", # twitter / reddit / parallel - max_rounds: int = None, # 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: bool = False, # 是否将活动更新到Zep图谱 - graph_id: str = None # Zep图谱ID(启用图谱更新时必需) + max_rounds: int = None, # Maximum simulation rounds (optional, to cap long simulations) + enable_graph_memory_update: bool = False, # Whether to update activities to the Zep graph + graph_id: str = None # Zep graph ID (required when graph update is enabled) ) -> SimulationRunState: """ - 启动模拟 - + Start a simulation + Args: - simulation_id: 模拟ID - platform: 运行平台 (twitter/reddit/parallel) - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: 是否将Agent活动动态更新到Zep图谱 - graph_id: Zep图谱ID(启用图谱更新时必需) - + simulation_id: simulation ID + platform: run platform (twitter/reddit/parallel) + max_rounds: maximum simulation rounds 
(optional, to cap long simulations) + enable_graph_memory_update: whether to dynamically update agent activities to the Zep graph + graph_id: Zep graph ID (required when graph update is enabled) + Returns: SimulationRunState """ - # 检查是否已在运行 + # Check if already running existing = cls.get_run_state(simulation_id) if existing and existing.runner_status in [RunnerStatus.RUNNING, RunnerStatus.STARTING]: - raise ValueError(f"模拟已在运行中: {simulation_id}") - - # 加载模拟配置 + raise ValueError(f"Simulation is already running: {simulation_id}") + + # Load simulation config sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") - + if not os.path.exists(config_path): - raise ValueError(f"模拟配置不存在,请先调用 /prepare 接口") - + raise ValueError(f"Simulation config not found; please call /prepare first") + with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) - - # 初始化运行状态 + + # Initialize run state time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = int(total_hours * 60 / minutes_per_round) - - # 如果指定了最大轮数,则截断 + + # Truncate if max_rounds is specified if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) if total_rounds < original_rounds: - logger.info(f"轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") - + logger.info(f"Rounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + state = SimulationRunState( simulation_id=simulation_id, runner_status=RunnerStatus.STARTING, @@ -366,25 +366,25 @@ class SimulationRunner: total_simulation_hours=total_hours, started_at=datetime.now().isoformat(), ) - + cls._save_run_state(state) - - # 如果启用图谱记忆更新,创建更新器 + + # Create graph memory updater if enabled if enable_graph_memory_update: if not graph_id: - raise ValueError("启用图谱记忆更新时必须提供 
graph_id") - + raise ValueError("graph_id is required when graph memory update is enabled") + try: ZepGraphMemoryManager.create_updater(simulation_id, graph_id) cls._graph_memory_enabled[simulation_id] = True - logger.info(f"已启用图谱记忆更新: simulation_id={simulation_id}, graph_id={graph_id}") + logger.info(f"Graph memory update enabled: simulation_id={simulation_id}, graph_id={graph_id}") except Exception as e: - logger.error(f"创建图谱记忆更新器失败: {e}") + logger.error(f"Failed to create graph memory updater: {e}") cls._graph_memory_enabled[simulation_id] = False else: cls._graph_memory_enabled[simulation_id] = False - - # 确定运行哪个脚本(脚本位于 backend/scripts/ 目录) + + # Determine which script to run (scripts are in the backend/scripts/ directory) if platform == "twitter": script_name = "run_twitter_simulation.py" state.twitter_running = True @@ -395,71 +395,72 @@ class SimulationRunner: script_name = "run_parallel_simulation.py" state.twitter_running = True state.reddit_running = True - + script_path = os.path.join(cls.SCRIPTS_DIR, script_name) - + if not os.path.exists(script_path): - raise ValueError(f"脚本不存在: {script_path}") - - # 创建动作队列 + raise ValueError(f"Script not found: {script_path}") + + # Create action queue action_queue = Queue() cls._action_queues[simulation_id] = action_queue - - # 启动模拟进程 + + # Start simulation process try: - # 构建运行命令,使用完整路径 - # 新的日志结构: - # twitter/actions.jsonl - Twitter 动作日志 - # reddit/actions.jsonl - Reddit 动作日志 - # simulation.log - 主进程日志 - + # Build run command with full paths. 
+ # New log structure: + # twitter/actions.jsonl - Twitter action log + # reddit/actions.jsonl - Reddit action log + # simulation.log - main process log + cmd = [ - sys.executable, # Python解释器 + sys.executable, # Python interpreter script_path, - "--config", config_path, # 使用完整配置文件路径 + "--config", config_path, # use full config file path ] - - # 如果指定了最大轮数,添加到命令行参数 + + # Append max_rounds to command line if specified if max_rounds is not None and max_rounds > 0: cmd.extend(["--max-rounds", str(max_rounds)]) - - # 创建主日志文件,避免 stdout/stderr 管道缓冲区满导致进程阻塞 + + # Create main log file to avoid stdout/stderr pipe buffer filling up and blocking the process main_log_path = os.path.join(sim_dir, "simulation.log") main_log_file = open(main_log_path, 'w', encoding='utf-8') - - # 设置子进程环境变量,确保 Windows 上使用 UTF-8 编码 - # 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 + + # Set subprocess environment variables to ensure UTF-8 encoding on Windows. + # This fixes issues where third-party libraries (e.g. OASIS) open files without specifying an encoding. env = os.environ.copy() - env['PYTHONUTF8'] = '1' # Python 3.7+ 支持,让所有 open() 默认使用 UTF-8 - env['PYTHONIOENCODING'] = 'utf-8' # 确保 stdout/stderr 使用 UTF-8 - - # 设置工作目录为模拟目录(数据库等文件会生成在此) - # 使用 start_new_session=True 创建新的进程组,确保可以通过 os.killpg 终止所有子进程 + env['PYTHONUTF8'] = '1' # Python 3.7+: makes all open() calls default to UTF-8 + env['PYTHONIOENCODING'] = 'utf-8' # Ensures stdout/stderr use UTF-8 + + # Set working directory to the simulation directory (databases and other files are created there). + # Use start_new_session=True to create a new process group so all child processes can be + # terminated via os.killpg when needed. 
process = subprocess.Popen( cmd, cwd=sim_dir, stdout=main_log_file, - stderr=subprocess.STDOUT, # stderr 也写入同一个文件 + stderr=subprocess.STDOUT, # stderr also written to the same file text=True, - encoding='utf-8', # 显式指定编码 + encoding='utf-8', # Explicit encoding bufsize=1, - env=env, # 传递带有 UTF-8 设置的环境变量 - start_new_session=True, # 创建新进程组,确保服务器关闭时能终止所有相关进程 + env=env, # Pass environment with UTF-8 settings + start_new_session=True, # Create new process group so server shutdown can terminate all related processes ) - - # 保存文件句柄以便后续关闭 + + # Save file handles for later cleanup cls._stdout_files[simulation_id] = main_log_file - cls._stderr_files[simulation_id] = None # 不再需要单独的 stderr - + cls._stderr_files[simulation_id] = None # No separate stderr file needed + state.process_pid = process.pid state.runner_status = RunnerStatus.RUNNING cls._processes[simulation_id] = process cls._save_run_state(state) - + # Capture locale before spawning monitor thread current_locale = get_locale() - # 启动监控线程 + # Start monitor thread monitor_thread = threading.Thread( target=cls._monitor_simulation, args=(simulation_id, current_locale), @@ -467,106 +468,106 @@ class SimulationRunner: ) monitor_thread.start() cls._monitor_threads[simulation_id] = monitor_thread - - logger.info(f"模拟启动成功: {simulation_id}, pid={process.pid}, platform={platform}") - + + logger.info(f"Simulation started: {simulation_id}, pid={process.pid}, platform={platform}") + except Exception as e: state.runner_status = RunnerStatus.FAILED state.error = str(e) cls._save_run_state(state) raise - + return state - + @classmethod def _monitor_simulation(cls, simulation_id: str, locale: str = 'zh'): - """监控模拟进程,解析动作日志""" + """Monitor the simulation process and parse action logs""" set_locale(locale) sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - - # 新的日志结构:分平台的动作日志 + + # New log structure: per-platform action logs twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_actions_log = 
os.path.join(sim_dir, "reddit", "actions.jsonl") - + process = cls._processes.get(simulation_id) state = cls.get_run_state(simulation_id) - + if not process or not state: return - + twitter_position = 0 reddit_position = 0 - + try: - while process.poll() is None: # 进程仍在运行 - # 读取 Twitter 动作日志 + while process.poll() is None: # Process is still running + # Read Twitter action log if os.path.exists(twitter_actions_log): twitter_position = cls._read_action_log( twitter_actions_log, twitter_position, state, "twitter" ) - - # 读取 Reddit 动作日志 + + # Read Reddit action log if os.path.exists(reddit_actions_log): reddit_position = cls._read_action_log( reddit_actions_log, reddit_position, state, "reddit" ) - - # 更新状态 + + # Save state cls._save_run_state(state) time.sleep(2) - - # 进程结束后,最后读取一次日志 + + # After process exits, do a final log read if os.path.exists(twitter_actions_log): cls._read_action_log(twitter_actions_log, twitter_position, state, "twitter") if os.path.exists(reddit_actions_log): cls._read_action_log(reddit_actions_log, reddit_position, state, "reddit") - - # 进程结束 + + # Process has exited exit_code = process.returncode - + if exit_code == 0: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() - logger.info(f"模拟完成: {simulation_id}") + logger.info(f"Simulation completed: {simulation_id}") else: state.runner_status = RunnerStatus.FAILED - # 从主日志文件读取错误信息 + # Read error info from main log file main_log_path = os.path.join(sim_dir, "simulation.log") error_info = "" try: if os.path.exists(main_log_path): with open(main_log_path, 'r', encoding='utf-8') as f: - error_info = f.read()[-2000:] # 取最后2000字符 + error_info = f.read()[-2000:] # Last 2000 characters except Exception: pass - state.error = f"进程退出码: {exit_code}, 错误: {error_info}" - logger.error(f"模拟失败: {simulation_id}, error={state.error}") - + state.error = f"Process exit code: {exit_code}, error: {error_info}" + logger.error(f"Simulation failed: {simulation_id}, 
error={state.error}") + state.twitter_running = False state.reddit_running = False cls._save_run_state(state) - + except Exception as e: - logger.error(f"监控线程异常: {simulation_id}, error={str(e)}") + logger.error(f"Monitor thread exception: {simulation_id}, error={str(e)}") state.runner_status = RunnerStatus.FAILED state.error = str(e) cls._save_run_state(state) - + finally: - # 停止图谱记忆更新器 + # Stop graph memory updater if cls._graph_memory_enabled.get(simulation_id, False): try: ZepGraphMemoryManager.stop_updater(simulation_id) - logger.info(f"已停止图谱记忆更新: simulation_id={simulation_id}") + logger.info(f"Graph memory update stopped: simulation_id={simulation_id}") except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updater: {e}") cls._graph_memory_enabled.pop(simulation_id, None) - - # 清理进程资源 + + # Clean up process resources cls._processes.pop(simulation_id, None) cls._action_queues.pop(simulation_id, None) - - # 关闭日志文件句柄 + + # Close log file handles if simulation_id in cls._stdout_files: try: cls._stdout_files[simulation_id].close() @@ -579,33 +580,33 @@ class SimulationRunner: except Exception: pass cls._stderr_files.pop(simulation_id, None) - + @classmethod def _read_action_log( - cls, - log_path: str, - position: int, + cls, + log_path: str, + position: int, state: SimulationRunState, platform: str ) -> int: """ - 读取动作日志文件 - + Read an action log file + Args: - log_path: 日志文件路径 - position: 上次读取位置 - state: 运行状态对象 - platform: 平台名称 (twitter/reddit) - + log_path: path to the log file + position: last read position + state: run state object + platform: platform name (twitter/reddit) + Returns: - 新的读取位置 + New read position """ - # 检查是否启用了图谱记忆更新 + # Check whether graph memory update is enabled graph_memory_enabled = cls._graph_memory_enabled.get(state.simulation_id, False) graph_updater = None if graph_memory_enabled: graph_updater = ZepGraphMemoryManager.get_updater(state.simulation_id) - + try: with open(log_path, 'r', 
encoding='utf-8') as f: f.seek(position) @@ -614,37 +615,37 @@ class SimulationRunner: if line: try: action_data = json.loads(line) - - # 处理事件类型的条目 + + # Handle event-type entries if "event_type" in action_data: event_type = action_data.get("event_type") - - # 检测 simulation_end 事件,标记平台已完成 + + # Detect simulation_end event and mark platform as completed if event_type == "simulation_end": if platform == "twitter": state.twitter_completed = True state.twitter_running = False - logger.info(f"Twitter 模拟已完成: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") + logger.info(f"Twitter simulation completed: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") elif platform == "reddit": state.reddit_completed = True state.reddit_running = False - logger.info(f"Reddit 模拟已完成: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") - - # 检查是否所有启用的平台都已完成 - # 如果只运行了一个平台,只检查那个平台 - # 如果运行了两个平台,需要两个都完成 + logger.info(f"Reddit simulation completed: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") + + # Check if all enabled platforms have completed. + # If only one platform is running, check only that one. + # If both platforms are running, both must complete. 
all_completed = cls._check_all_platforms_completed(state) if all_completed: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() - logger.info(f"所有平台模拟已完成: {state.simulation_id}") - - # 更新轮次信息(从 round_end 事件) + logger.info(f"All platform simulations completed: {state.simulation_id}") + + # Update round info (from round_end events) elif event_type == "round_end": round_num = action_data.get("round", 0) simulated_hours = action_data.get("simulated_hours", 0) - - # 更新各平台独立的轮次和时间 + + # Update per-platform independent rounds and time if platform == "twitter": if round_num > state.twitter_current_round: state.twitter_current_round = round_num @@ -653,15 +654,15 @@ class SimulationRunner: if round_num > state.reddit_current_round: state.reddit_current_round = round_num state.reddit_simulated_hours = simulated_hours - - # 总体轮次取两个平台的最大值 + + # Overall round is the maximum across both platforms if round_num > state.current_round: state.current_round = round_num - # 总体时间取两个平台的最大值 + # Overall time is the maximum across both platforms state.simulated_hours = max(state.twitter_simulated_hours, state.reddit_simulated_hours) - + continue - + action = AgentAction( round_num=action_data.get("round", 0), timestamp=action_data.get("timestamp", datetime.now().isoformat()), @@ -674,65 +675,65 @@ class SimulationRunner: success=action_data.get("success", True), ) state.add_action(action) - - # 更新轮次 + + # Update round number if action.round_num and action.round_num > state.current_round: state.current_round = action.round_num - - # 如果启用了图谱记忆更新,将活动发送到Zep + + # If graph memory update is enabled, send activity to Zep if graph_updater: graph_updater.add_activity_from_dict(action_data, platform) - + except json.JSONDecodeError: pass return f.tell() except Exception as e: - logger.warning(f"读取动作日志失败: {log_path}, error={e}") + logger.warning(f"Failed to read action log: {log_path}, error={e}") return position - + @classmethod def 
_check_all_platforms_completed(cls, state: SimulationRunState) -> bool: """ - 检查所有启用的平台是否都已完成模拟 - - 通过检查对应的 actions.jsonl 文件是否存在来判断平台是否被启用 - + Check whether all enabled platforms have completed the simulation. + + A platform is considered enabled if its corresponding actions.jsonl file exists. + Returns: - True 如果所有启用的平台都已完成 + True if all enabled platforms have completed """ sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) twitter_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_log = os.path.join(sim_dir, "reddit", "actions.jsonl") - - # 检查哪些平台被启用(通过文件是否存在判断) + + # Check which platforms are enabled (determined by file existence) twitter_enabled = os.path.exists(twitter_log) reddit_enabled = os.path.exists(reddit_log) - - # 如果平台被启用但未完成,则返回 False + + # If a platform is enabled but not completed, return False if twitter_enabled and not state.twitter_completed: return False if reddit_enabled and not state.reddit_completed: return False - - # 至少有一个平台被启用且已完成 + + # At least one platform must be enabled and completed return twitter_enabled or reddit_enabled - + @classmethod def _terminate_process(cls, process: subprocess.Popen, simulation_id: str, timeout: int = 10): """ - 跨平台终止进程及其子进程 - + Cross-platform process termination (including child processes) + Args: - process: 要终止的进程 - simulation_id: 模拟ID(用于日志) - timeout: 等待进程退出的超时时间(秒) + process: process to terminate + simulation_id: simulation ID (for logging) + timeout: seconds to wait for the process to exit """ if IS_WINDOWS: - # Windows: 使用 taskkill 命令终止进程树 - # /F = 强制终止, /T = 终止进程树(包括子进程) - logger.info(f"终止进程树 (Windows): simulation={simulation_id}, pid={process.pid}") + # Windows: use taskkill to terminate the process tree + # /F = force terminate, /T = terminate process tree (including children) + logger.info(f"Terminating process tree (Windows): simulation={simulation_id}, pid={process.pid}") try: - # 先尝试优雅终止 + # Attempt graceful termination first subprocess.run( ['taskkill', '/PID', 
str(process.pid), '/T'], capture_output=True, @@ -741,8 +742,8 @@ class SimulationRunner: try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 强制终止 - logger.warning(f"进程未响应,强制终止: {simulation_id}") + # Force terminate + logger.warning(f"Process did not respond; force terminating: {simulation_id}") subprocess.run( ['taskkill', '/F', '/PID', str(process.pid), '/T'], capture_output=True, @@ -750,77 +751,77 @@ class SimulationRunner: ) process.wait(timeout=5) except Exception as e: - logger.warning(f"taskkill 失败,尝试 terminate: {e}") + logger.warning(f"taskkill failed, falling back to terminate: {e}") process.terminate() try: process.wait(timeout=5) except subprocess.TimeoutExpired: process.kill() else: - # Unix: 使用进程组终止 - # 由于使用了 start_new_session=True,进程组 ID 等于主进程 PID + # Unix: terminate via process group. + # Because start_new_session=True was used, the process group ID equals the main process PID. pgid = os.getpgid(process.pid) - logger.info(f"终止进程组 (Unix): simulation={simulation_id}, pgid={pgid}") - - # 先发送 SIGTERM 给整个进程组 + logger.info(f"Terminating process group (Unix): simulation={simulation_id}, pgid={pgid}") + + # Send SIGTERM to the entire process group os.killpg(pgid, signal.SIGTERM) - + try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 如果超时后还没结束,强制发送 SIGKILL - logger.warning(f"进程组未响应 SIGTERM,强制终止: {simulation_id}") + # If still alive after timeout, force send SIGKILL + logger.warning(f"Process group did not respond to SIGTERM; force terminating: {simulation_id}") os.killpg(pgid, signal.SIGKILL) process.wait(timeout=5) - + @classmethod def stop_simulation(cls, simulation_id: str) -> SimulationRunState: - """停止模拟""" + """Stop a simulation""" state = cls.get_run_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") - + raise ValueError(f"Simulation not found: {simulation_id}") + if state.runner_status not in [RunnerStatus.RUNNING, RunnerStatus.PAUSED]: - raise ValueError(f"模拟未在运行: 
{simulation_id}, status={state.runner_status}") - + raise ValueError(f"Simulation is not running: {simulation_id}, status={state.runner_status}") + state.runner_status = RunnerStatus.STOPPING cls._save_run_state(state) - - # 终止进程 + + # Terminate process process = cls._processes.get(simulation_id) if process and process.poll() is None: try: cls._terminate_process(process, simulation_id) except ProcessLookupError: - # 进程已经不存在 + # Process no longer exists pass except Exception as e: - logger.error(f"终止进程组失败: {simulation_id}, error={e}") - # 回退到直接终止进程 + logger.error(f"Failed to terminate process group: {simulation_id}, error={e}") + # Fall back to direct process termination try: process.terminate() process.wait(timeout=5) except Exception: process.kill() - + state.runner_status = RunnerStatus.STOPPED state.twitter_running = False state.reddit_running = False state.completed_at = datetime.now().isoformat() cls._save_run_state(state) - - # 停止图谱记忆更新器 + + # Stop graph memory updater if cls._graph_memory_enabled.get(simulation_id, False): try: ZepGraphMemoryManager.stop_updater(simulation_id) - logger.info(f"已停止图谱记忆更新: simulation_id={simulation_id}") + logger.info(f"Graph memory update stopped: simulation_id={simulation_id}") except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updater: {e}") cls._graph_memory_enabled.pop(simulation_id, None) - - logger.info(f"模拟已停止: {simulation_id}") + + logger.info(f"Simulation stopped: {simulation_id}") return state - + @classmethod def _read_actions_from_file( cls, @@ -831,48 +832,48 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 从单个动作文件中读取动作 - + Read actions from a single action file + Args: - file_path: 动作日志文件路径 - default_platform: 默认平台(当动作记录中没有 platform 字段时使用) - platform_filter: 过滤平台 - agent_id: 过滤 Agent ID - round_num: 过滤轮次 + file_path: action log file path + default_platform: default platform (used when the record has no platform field) + 
platform_filter: filter by platform + agent_id: filter by agent ID + round_num: filter by round number """ if not os.path.exists(file_path): return [] - + actions = [] - + with open(file_path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if not line: continue - + try: data = json.loads(line) - - # 跳过非动作记录(如 simulation_start, round_start, round_end 等事件) + + # Skip non-action entries (e.g. simulation_start, round_start, round_end events) if "event_type" in data: continue - - # 跳过没有 agent_id 的记录(非 Agent 动作) + + # Skip records without agent_id (non-agent actions) if "agent_id" not in data: continue - - # 获取平台:优先使用记录中的 platform,否则使用默认平台 + + # Use the record's platform field first; fall back to default record_platform = data.get("platform") or default_platform or "" - - # 过滤 + + # Apply filters if platform_filter and record_platform != platform_filter: continue if agent_id is not None and data.get("agent_id") != agent_id: continue if round_num is not None and data.get("round") != round_num: continue - + actions.append(AgentAction( round_num=data.get("round", 0), timestamp=data.get("timestamp", ""), @@ -884,12 +885,12 @@ class SimulationRunner: result=data.get("result"), success=data.get("success", True), )) - + except json.JSONDecodeError: continue - + return actions - + @classmethod def get_all_actions( cls, @@ -899,58 +900,58 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取所有平台的完整动作历史(无分页限制) - + Get the complete action history across all platforms (no pagination limit) + Args: - simulation_id: 模拟ID - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent - round_num: 过滤轮次 - + simulation_id: simulation ID + platform: filter by platform (twitter/reddit) + agent_id: filter by agent + round_num: filter by round number + Returns: - 完整的动作列表(按时间戳排序,新的在前) + Complete action list (sorted by timestamp, newest first) """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) actions = [] - - # 读取 Twitter 动作文件(根据文件路径自动设置 
platform 为 twitter) + + # Read Twitter action file (platform automatically set to "twitter" from file path) twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") if not platform or platform == "twitter": actions.extend(cls._read_actions_from_file( twitter_actions_log, - default_platform="twitter", # 自动填充 platform 字段 + default_platform="twitter", # Auto-fill platform field platform_filter=platform, - agent_id=agent_id, + agent_id=agent_id, round_num=round_num )) - - # 读取 Reddit 动作文件(根据文件路径自动设置 platform 为 reddit) + + # Read Reddit action file (platform automatically set to "reddit" from file path) reddit_actions_log = os.path.join(sim_dir, "reddit", "actions.jsonl") if not platform or platform == "reddit": actions.extend(cls._read_actions_from_file( reddit_actions_log, - default_platform="reddit", # 自动填充 platform 字段 + default_platform="reddit", # Auto-fill platform field platform_filter=platform, agent_id=agent_id, round_num=round_num )) - - # 如果分平台文件不存在,尝试读取旧的单一文件格式 + + # If per-platform files don't exist, try reading the old single-file format if not actions: actions_log = os.path.join(sim_dir, "actions.jsonl") actions = cls._read_actions_from_file( actions_log, - default_platform=None, # 旧格式文件中应该有 platform 字段 + default_platform=None, # Old format files should have a platform field platform_filter=platform, agent_id=agent_id, round_num=round_num ) - - # 按时间戳排序(新的在前) + + # Sort by timestamp (newest first) actions.sort(key=lambda x: x.timestamp, reverse=True) - + return actions - + @classmethod def get_actions( cls, @@ -962,18 +963,18 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取动作历史(带分页) - + Get action history (paginated) + Args: - simulation_id: 模拟ID - limit: 返回数量限制 - offset: 偏移量 - platform: 过滤平台 - agent_id: 过滤Agent - round_num: 过滤轮次 - + simulation_id: simulation ID + limit: result count limit + offset: offset + platform: filter by platform + agent_id: filter by agent + round_num: filter by round 
number + Returns: - 动作列表 + Action list """ actions = cls.get_all_actions( simulation_id=simulation_id, @@ -981,10 +982,10 @@ class SimulationRunner: agent_id=agent_id, round_num=round_num ) - - # 分页 + + # Paginate return actions[offset:offset + limit] - + @classmethod def get_timeline( cls, @@ -993,29 +994,29 @@ class SimulationRunner: end_round: Optional[int] = None ) -> List[Dict[str, Any]]: """ - 获取模拟时间线(按轮次汇总) - + Get simulation timeline (summarized by round) + Args: - simulation_id: 模拟ID - start_round: 起始轮次 - end_round: 结束轮次 - + simulation_id: simulation ID + start_round: starting round + end_round: ending round + Returns: - 每轮的汇总信息 + Summary info per round """ actions = cls.get_actions(simulation_id, limit=10000) - - # 按轮次分组 + + # Group by round rounds: Dict[int, Dict[str, Any]] = {} - + for action in actions: round_num = action.round_num - + if round_num < start_round: continue if end_round is not None and round_num > end_round: continue - + if round_num not in rounds: rounds[round_num] = { "round_num": round_num, @@ -1026,19 +1027,19 @@ class SimulationRunner: "first_action_time": action.timestamp, "last_action_time": action.timestamp, } - + r = rounds[round_num] - + if action.platform == "twitter": r["twitter_actions"] += 1 else: r["reddit_actions"] += 1 - + r["active_agents"].add(action.agent_id) r["action_types"][action.action_type] = r["action_types"].get(action.action_type, 0) + 1 r["last_action_time"] = action.timestamp - - # 转换为列表 + + # Convert to list result = [] for round_num in sorted(rounds.keys()): r = rounds[round_num] @@ -1053,24 +1054,24 @@ class SimulationRunner: "first_action_time": r["first_action_time"], "last_action_time": r["last_action_time"], }) - + return result - + @classmethod def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]: """ - 获取每个Agent的统计信息 - + Get statistics for each agent + Returns: - Agent统计列表 + Agent statistics list """ actions = cls.get_actions(simulation_id, limit=10000) - + agent_stats: Dict[int, 
Dict[str, Any]] = {} - + for action in actions: agent_id = action.agent_id - + if agent_id not in agent_stats: agent_stats[agent_id] = { "agent_id": agent_id, @@ -1082,71 +1083,71 @@ class SimulationRunner: "first_action_time": action.timestamp, "last_action_time": action.timestamp, } - + stats = agent_stats[agent_id] stats["total_actions"] += 1 - + if action.platform == "twitter": stats["twitter_actions"] += 1 else: stats["reddit_actions"] += 1 - + stats["action_types"][action.action_type] = stats["action_types"].get(action.action_type, 0) + 1 stats["last_action_time"] = action.timestamp - - # 按总动作数排序 + + # Sort by total actions descending result = sorted(agent_stats.values(), key=lambda x: x["total_actions"], reverse=True) - + return result - + @classmethod def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: """ - 清理模拟的运行日志(用于强制重新开始模拟) - - 会删除以下文件: + Clean up simulation run logs (used to force a fresh restart) + + Deletes the following files: - run_state.json - twitter/actions.jsonl - reddit/actions.jsonl - simulation.log - stdout.log / stderr.log - - twitter_simulation.db(模拟数据库) - - reddit_simulation.db(模拟数据库) - - env_status.json(环境状态) - - 注意:不会删除配置文件(simulation_config.json)和 profile 文件 - + - twitter_simulation.db (simulation database) + - reddit_simulation.db (simulation database) + - env_status.json (environment status) + + Note: does NOT delete config files (simulation_config.json) or profile files + Args: - simulation_id: 模拟ID - + simulation_id: simulation ID + Returns: - 清理结果信息 + Cleanup result info """ import shutil - + sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - + if not os.path.exists(sim_dir): - return {"success": True, "message": "模拟目录不存在,无需清理"} - + return {"success": True, "message": "Simulation directory does not exist; nothing to clean up"} + cleaned_files = [] errors = [] - - # 要删除的文件列表(包括数据库文件) + + # Files to delete (including database files) files_to_delete = [ "run_state.json", "simulation.log", "stdout.log", 
"stderr.log", - "twitter_simulation.db", # Twitter 平台数据库 - "reddit_simulation.db", # Reddit 平台数据库 - "env_status.json", # 环境状态文件 + "twitter_simulation.db", # Twitter platform database + "reddit_simulation.db", # Reddit platform database + "env_status.json", # Environment status file ] - - # 要删除的目录列表(包含动作日志) + + # Directories to clean (contain action logs) dirs_to_clean = ["twitter", "reddit"] - - # 删除文件 + + # Delete files for filename in files_to_delete: file_path = os.path.join(sim_dir, filename) if os.path.exists(file_path): @@ -1154,9 +1155,9 @@ class SimulationRunner: os.remove(file_path) cleaned_files.append(filename) except Exception as e: - errors.append(f"删除 {filename} 失败: {str(e)}") - - # 清理平台目录中的动作日志 + errors.append(f"Failed to delete {filename}: {str(e)}") + + # Clean action logs in platform directories for dir_name in dirs_to_clean: dir_path = os.path.join(sim_dir, dir_name) if os.path.exists(dir_path): @@ -1166,85 +1167,85 @@ class SimulationRunner: os.remove(actions_file) cleaned_files.append(f"{dir_name}/actions.jsonl") except Exception as e: - errors.append(f"删除 {dir_name}/actions.jsonl 失败: {str(e)}") - - # 清理内存中的运行状态 + errors.append(f"Failed to delete {dir_name}/actions.jsonl: {str(e)}") + + # Remove in-memory run state if simulation_id in cls._run_states: del cls._run_states[simulation_id] - - logger.info(f"清理模拟日志完成: {simulation_id}, 删除文件: {cleaned_files}") - + + logger.info(f"Simulation log cleanup complete: {simulation_id}, deleted files: {cleaned_files}") + return { "success": len(errors) == 0, "cleaned_files": cleaned_files, "errors": errors if errors else None } - - # 防止重复清理的标志 + + # Flag to prevent duplicate cleanup _cleanup_done = False - + @classmethod def cleanup_all_simulations(cls): """ - 清理所有运行中的模拟进程 - - 在服务器关闭时调用,确保所有子进程被终止 + Clean up all running simulation processes + + Called when the server shuts down to ensure all child processes are terminated """ - # 防止重复清理 + # Prevent duplicate cleanup if cls._cleanup_done: return 
cls._cleanup_done = True - - # 检查是否有内容需要清理(避免空进程的进程打印无用日志) + + # Check whether there is anything to clean up (avoid printing useless logs for empty process maps) has_processes = bool(cls._processes) has_updaters = bool(cls._graph_memory_enabled) - + if not has_processes and not has_updaters: - return # 没有需要清理的内容,静默返回 - - logger.info("正在清理所有模拟进程...") - - # 首先停止所有图谱记忆更新器(stop_all 内部会打印日志) + return # Nothing to clean up; return silently + + logger.info("Cleaning up all simulation processes...") + + # Stop all graph memory updaters first (stop_all logs internally) try: ZepGraphMemoryManager.stop_all() except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updaters: {e}") cls._graph_memory_enabled.clear() - - # 复制字典以避免在迭代时修改 + + # Copy dict to avoid modifying it while iterating processes = list(cls._processes.items()) - + for simulation_id, process in processes: try: - if process.poll() is None: # 进程仍在运行 - logger.info(f"终止模拟进程: {simulation_id}, pid={process.pid}") - + if process.poll() is None: # Process is still running + logger.info(f"Terminating simulation process: {simulation_id}, pid={process.pid}") + try: - # 使用跨平台的进程终止方法 + # Use cross-platform process termination cls._terminate_process(process, simulation_id, timeout=5) except (ProcessLookupError, OSError): - # 进程可能已经不存在,尝试直接终止 + # Process may already be gone; try direct termination try: process.terminate() process.wait(timeout=3) except Exception: process.kill() - - # 更新 run_state.json + + # Update run_state.json state = cls.get_run_state(simulation_id) if state: state.runner_status = RunnerStatus.STOPPED state.twitter_running = False state.reddit_running = False state.completed_at = datetime.now().isoformat() - state.error = "服务器关闭,模拟被终止" + state.error = "Server shutdown; simulation was terminated" cls._save_run_state(state) - - # 同时更新 state.json,将状态设为 stopped + + # Also update state.json to set status to stopped try: sim_dir = os.path.join(cls.RUN_STATE_DIR, 
simulation_id) state_file = os.path.join(sim_dir, "state.json") - logger.info(f"尝试更新 state.json: {state_file}") + logger.info(f"Updating state.json: {state_file}") if os.path.exists(state_file): with open(state_file, 'r', encoding='utf-8') as f: state_data = json.load(f) @@ -1252,16 +1253,16 @@ class SimulationRunner: state_data['updated_at'] = datetime.now().isoformat() with open(state_file, 'w', encoding='utf-8') as f: json.dump(state_data, f, indent=2, ensure_ascii=False) - logger.info(f"已更新 state.json 状态为 stopped: {simulation_id}") + logger.info(f"Updated state.json status to stopped: {simulation_id}") else: - logger.warning(f"state.json 不存在: {state_file}") + logger.warning(f"state.json not found: {state_file}") except Exception as state_err: - logger.warning(f"更新 state.json 失败: {simulation_id}, error={state_err}") - + logger.warning(f"Failed to update state.json: {simulation_id}, error={state_err}") + except Exception as e: - logger.error(f"清理进程失败: {simulation_id}, error={e}") - - # 清理文件句柄 + logger.error(f"Failed to clean up process: {simulation_id}, error={e}") + + # Close file handles for simulation_id, file_handle in list(cls._stdout_files.items()): try: if file_handle: @@ -1269,7 +1270,7 @@ class SimulationRunner: except Exception: pass cls._stdout_files.clear() - + for simulation_id, file_handle in list(cls._stderr_files.items()): try: if file_handle: @@ -1277,109 +1278,111 @@ class SimulationRunner: except Exception: pass cls._stderr_files.clear() - - # 清理内存中的状态 + + # Clear in-memory state cls._processes.clear() cls._action_queues.clear() - - logger.info("模拟进程清理完成") - + + logger.info("Simulation process cleanup complete") + @classmethod def register_cleanup(cls): """ - 注册清理函数 - - 在 Flask 应用启动时调用,确保服务器关闭时清理所有模拟进程 + Register cleanup function + + Called at Flask application startup to ensure all simulation processes are + terminated when the server shuts down """ global _cleanup_registered - + if _cleanup_registered: return - - # Flask debug 模式下,只在 reloader 
子进程中注册清理(实际运行应用的进程) - # WERKZEUG_RUN_MAIN=true 表示是 reloader 子进程 - # 如果不是 debug 模式,则没有这个环境变量,也需要注册 + + # In Flask debug mode, only register cleanup in the reloader child process + # (the process that actually runs the application). + # WERKZEUG_RUN_MAIN=true indicates the reloader child process. + # In non-debug mode this env var is absent, and we always register. is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' is_debug_mode = os.environ.get('FLASK_DEBUG') == '1' or os.environ.get('WERKZEUG_RUN_MAIN') is not None - - # 在 debug 模式下,只在 reloader 子进程中注册;非 debug 模式下始终注册 + + # In debug mode, only register in the reloader child process; in non-debug mode always register if is_debug_mode and not is_reloader_process: - _cleanup_registered = True # 标记已注册,防止子进程再次尝试 + _cleanup_registered = True # Mark as registered to prevent child process from trying again return - - # 保存原有的信号处理器 + + # Save original signal handlers original_sigint = signal.getsignal(signal.SIGINT) original_sigterm = signal.getsignal(signal.SIGTERM) - # SIGHUP 只在 Unix 系统存在(macOS/Linux),Windows 没有 + # SIGHUP only exists on Unix (macOS/Linux), not on Windows original_sighup = None has_sighup = hasattr(signal, 'SIGHUP') if has_sighup: original_sighup = signal.getsignal(signal.SIGHUP) - + def cleanup_handler(signum=None, frame=None): - """信号处理器:先清理模拟进程,再调用原处理器""" - # 只有在有进程需要清理时才打印日志 + """Signal handler: clean up simulation processes, then invoke original handler""" + # Only log when there are processes to clean up if cls._processes or cls._graph_memory_enabled: - logger.info(f"收到信号 {signum},开始清理...") + logger.info(f"Received signal {signum}; starting cleanup...") cls.cleanup_all_simulations() - - # 调用原有的信号处理器,让 Flask 正常退出 + + # Invoke the original signal handler so Flask exits normally if signum == signal.SIGINT and callable(original_sigint): original_sigint(signum, frame) elif signum == signal.SIGTERM and callable(original_sigterm): original_sigterm(signum, frame) elif has_sighup and signum == 
signal.SIGHUP: - # SIGHUP: 终端关闭时发送 + # SIGHUP: sent when the terminal closes if callable(original_sighup): original_sighup(signum, frame) else: - # 默认行为:正常退出 + # Default behavior: exit normally sys.exit(0) else: - # 如果原处理器不可调用(如 SIG_DFL),则使用默认行为 + # If original handler is not callable (e.g. SIG_DFL), use default behavior raise KeyboardInterrupt - - # 注册 atexit 处理器(作为备用) + + # Register atexit handler (as a fallback) atexit.register(cls.cleanup_all_simulations) - - # 注册信号处理器(仅在主线程中) + + # Register signal handlers (main thread only) try: - # SIGTERM: kill 命令默认信号 + # SIGTERM: default signal sent by kill command signal.signal(signal.SIGTERM, cleanup_handler) # SIGINT: Ctrl+C signal.signal(signal.SIGINT, cleanup_handler) - # SIGHUP: 终端关闭(仅 Unix 系统) + # SIGHUP: terminal close (Unix only) if has_sighup: signal.signal(signal.SIGHUP, cleanup_handler) except ValueError: - # 不在主线程中,只能使用 atexit - logger.warning("无法注册信号处理器(不在主线程),仅使用 atexit") - + # Not in the main thread; atexit only + logger.warning("Cannot register signal handlers (not in main thread); using atexit only") + _cleanup_registered = True - + @classmethod def get_running_simulations(cls) -> List[str]: """ - 获取所有正在运行的模拟ID列表 + Get a list of all currently running simulation IDs """ running = [] for sim_id, process in cls._processes.items(): if process.poll() is None: running.append(sim_id) return running - - # ============== Interview 功能 ============== - + + # ============== Interview functionality ============== + @classmethod def check_env_alive(cls, simulation_id: str) -> bool: """ - 检查模拟环境是否存活(可以接收Interview命令) + Check whether the simulation environment is alive (able to receive Interview commands) Args: - simulation_id: 模拟ID + simulation_id: simulation ID Returns: - True 表示环境存活,False 表示环境已关闭 + True if the environment is alive, False if it has been closed """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1391,27 +1394,27 @@ class SimulationRunner: @classmethod def 
get_env_status_detail(cls, simulation_id: str) -> Dict[str, Any]: """ - 获取模拟环境的详细状态信息 + Get detailed environment status for a simulation Args: - simulation_id: 模拟ID + simulation_id: simulation ID Returns: - 状态详情字典,包含 status, twitter_available, reddit_available, timestamp + Status detail dict containing: status, twitter_available, reddit_available, timestamp """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) status_file = os.path.join(sim_dir, "env_status.json") - + default_status = { "status": "stopped", "twitter_available": False, "reddit_available": False, "timestamp": None } - + if not os.path.exists(status_file): return default_status - + try: with open(status_file, 'r', encoding='utf-8') as f: status = json.load(f) @@ -1434,35 +1437,35 @@ class SimulationRunner: timeout: float = 60.0 ) -> Dict[str, Any]: """ - 采访单个Agent + Interview a single agent Args: - simulation_id: 模拟ID + simulation_id: simulation ID agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,返回整合结果 - timeout: 超时时间(秒) + prompt: interview question + platform: target platform (optional) + - "twitter": interview only Twitter platform + - "reddit": interview only Reddit platform + - None: in dual-platform mode, interview both and return integrated result + timeout: timeout in seconds Returns: - 采访结果字典 + Interview result dict Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: simulation not found or environment not running + TimeoutError: timed out waiting for response """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation not found: {simulation_id}") ipc_client = SimulationIPCClient(sim_dir) if not ipc_client.check_env_alive(): - raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") + raise ValueError(f"Simulation environment is not running or has been closed; cannot 
interview: {simulation_id}") - logger.info(f"发送Interview命令: simulation_id={simulation_id}, agent_id={agent_id}, platform={platform}") + logger.info(f"Sending Interview command: simulation_id={simulation_id}, agent_id={agent_id}, platform={platform}") response = ipc_client.send_interview( agent_id=agent_id, @@ -1487,7 +1490,7 @@ class SimulationRunner: "error": response.error, "timestamp": response.timestamp } - + @classmethod def interview_agents_batch( cls, @@ -1497,34 +1500,34 @@ class SimulationRunner: timeout: float = 120.0 ) -> Dict[str, Any]: """ - 批量采访多个Agent + Batch-interview multiple agents Args: - simulation_id: 模拟ID - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: simulation ID + interviews: list of interviews, each containing {"agent_id": int, "prompt": str, "platform": str (optional)} + platform: default platform (optional; overridden per-item by each interview's platform) + - "twitter": default to Twitter platform only + - "reddit": default to Reddit platform only + - None: in dual-platform mode, interview each agent on both platforms + timeout: timeout in seconds Returns: - 批量采访结果字典 + Batch interview result dict Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: simulation not found or environment not running + TimeoutError: timed out waiting for response """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation not found: {simulation_id}") ipc_client = SimulationIPCClient(sim_dir) if not ipc_client.check_env_alive(): - raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") + raise ValueError(f"Simulation environment is not running or has been closed; cannot interview: {simulation_id}") - 
logger.info(f"发送批量Interview命令: simulation_id={simulation_id}, count={len(interviews)}, platform={platform}") + logger.info(f"Sending batch Interview command: simulation_id={simulation_id}, count={len(interviews)}, platform={platform}") response = ipc_client.send_batch_interview( interviews=interviews, @@ -1546,7 +1549,7 @@ class SimulationRunner: "error": response.error, "timestamp": response.timestamp } - + @classmethod def interview_all_agents( cls, @@ -1556,39 +1559,39 @@ class SimulationRunner: timeout: float = 180.0 ) -> Dict[str, Any]: """ - 采访所有Agent(全局采访) + Interview all agents (global interview) - 使用相同的问题采访模拟中的所有Agent + Uses the same question to interview all agents in the simulation Args: - simulation_id: 模拟ID - prompt: 采访问题(所有Agent使用相同问题) - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: simulation ID + prompt: interview question (same for all agents) + platform: target platform (optional) + - "twitter": interview only Twitter platform + - "reddit": interview only Reddit platform + - None: in dual-platform mode, interview each agent on both platforms + timeout: timeout in seconds Returns: - 全局采访结果字典 + Global interview result dict """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation not found: {simulation_id}") - # 从配置文件获取所有Agent信息 + # Get all agent info from config file config_path = os.path.join(sim_dir, "simulation_config.json") if not os.path.exists(config_path): - raise ValueError(f"模拟配置不存在: {simulation_id}") + raise ValueError(f"Simulation config not found: {simulation_id}") with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) agent_configs = config.get("agent_configs", []) if not agent_configs: - raise ValueError(f"模拟配置中没有Agent: {simulation_id}") + raise ValueError(f"No agents in simulation config: {simulation_id}") - # 
构建批量采访列表 + # Build batch interview list interviews = [] for agent_config in agent_configs: agent_id = agent_config.get("agent_id") @@ -1598,7 +1601,7 @@ class SimulationRunner: "prompt": prompt }) - logger.info(f"发送全局Interview命令: simulation_id={simulation_id}, agent_count={len(interviews)}, platform={platform}") + logger.info(f"Sending global Interview command: simulation_id={simulation_id}, agent_count={len(interviews)}, platform={platform}") return cls.interview_agents_batch( simulation_id=simulation_id, @@ -1606,7 +1609,7 @@ class SimulationRunner: platform=platform, timeout=timeout ) - + @classmethod def close_simulation_env( cls, @@ -1614,47 +1617,48 @@ class SimulationRunner: timeout: float = 30.0 ) -> Dict[str, Any]: """ - 关闭模拟环境(而不是停止模拟进程) - - 向模拟发送关闭环境命令,使其优雅退出等待命令模式 - + Close the simulation environment (without stopping the simulation process) + + Sends a close-environment command to the simulation, causing it to exit + the command-waiting mode gracefully. + Args: - simulation_id: 模拟ID - timeout: 超时时间(秒) - + simulation_id: simulation ID + timeout: timeout in seconds + Returns: - 操作结果字典 + Operation result dict """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") - + raise ValueError(f"Simulation not found: {simulation_id}") + ipc_client = SimulationIPCClient(sim_dir) - + if not ipc_client.check_env_alive(): return { "success": True, - "message": "环境已经关闭" + "message": "Environment is already closed" } - - logger.info(f"发送关闭环境命令: simulation_id={simulation_id}") - + + logger.info(f"Sending close-environment command: simulation_id={simulation_id}") + try: response = ipc_client.send_close_env(timeout=timeout) - + return { "success": response.status.value == "completed", - "message": "环境关闭命令已发送", + "message": "Close-environment command sent", "result": response.result, "timestamp": response.timestamp } except TimeoutError: - # 超时可能是因为环境正在关闭 + # Timeout may mean the environment 
is already shutting down return { "success": True, - "message": "环境关闭命令已发送(等待响应超时,环境可能正在关闭)" + "message": "Close-environment command sent (timed out waiting for response; environment may be closing)" } - + @classmethod def _get_interview_history_from_db( cls, @@ -1663,18 +1667,18 @@ class SimulationRunner: agent_id: Optional[int] = None, limit: int = 100 ) -> List[Dict[str, Any]]: - """从单个数据库获取Interview历史""" + """Fetch Interview history from a single database""" import sqlite3 - + if not os.path.exists(db_path): return [] - + results = [] - + try: conn = sqlite3.connect(db_path) cursor = conn.cursor() - + if agent_id is not None: cursor.execute(""" SELECT user_id, info, created_at @@ -1691,13 +1695,13 @@ class SimulationRunner: ORDER BY created_at DESC LIMIT ? """, (limit,)) - + for user_id, info_json, created_at in cursor.fetchall(): try: info = json.loads(info_json) if info_json else {} except json.JSONDecodeError: info = {"raw": info_json} - + results.append({ "agent_id": user_id, "response": info.get("response", info), @@ -1705,12 +1709,12 @@ class SimulationRunner: "timestamp": created_at, "platform": platform_name }) - + conn.close() - + except Exception as e: - logger.error(f"读取Interview历史失败 ({platform_name}): {e}") - + logger.error(f"Failed to read Interview history ({platform_name}): {e}") + return results @classmethod @@ -1722,31 +1726,31 @@ class SimulationRunner: limit: int = 100 ) -> List[Dict[str, Any]]: """ - 获取Interview历史记录(从数据库读取) - + Get Interview history records (read from database) + Args: - simulation_id: 模拟ID - platform: 平台类型(reddit/twitter/None) - - "reddit": 只获取Reddit平台的历史 - - "twitter": 只获取Twitter平台的历史 - - None: 获取两个平台的所有历史 - agent_id: 指定Agent ID(可选,只获取该Agent的历史) - limit: 每个平台返回数量限制 - + simulation_id: simulation ID + platform: platform type (reddit/twitter/None) + - "reddit": only fetch Reddit platform history + - "twitter": only fetch Twitter platform history + - None: fetch history from both platforms + agent_id: filter by agent ID 
(optional) + limit: per-platform result count limit + Returns: - Interview历史记录列表 + Interview history record list """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - + results = [] - - # 确定要查询的平台 + + # Determine which platforms to query if platform in ("reddit", "twitter"): platforms = [platform] else: - # 不指定platform时,查询两个平台 + # No platform specified: query both platforms = ["twitter", "reddit"] - + for p in platforms: db_path = os.path.join(sim_dir, f"{p}_simulation.db") platform_results = cls._get_interview_history_from_db( @@ -1756,13 +1760,12 @@ class SimulationRunner: limit=limit ) results.extend(platform_results) - - # 按时间降序排序 + + # Sort by timestamp descending results.sort(key=lambda x: x.get("timestamp", ""), reverse=True) - - # 如果查询了多个平台,限制总数 + + # If multiple platforms were queried, cap total count if len(platforms) > 1 and len(results) > limit: results = results[:limit] - - return results + return results diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32acc..81f81842 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -1,5 +1,5 @@ """ -文本处理服务 +Text processing service """ from typing import List, Optional @@ -7,11 +7,11 @@ from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: - """文本处理器""" - + """Text processor""" + @staticmethod def extract_from_files(file_paths: List[str]) -> str: - """从多个文件提取文本""" + """Extract text from multiple files""" return FileParser.extract_from_multiple(file_paths) @staticmethod @@ -21,48 +21,48 @@ class TextProcessor: overlap: int = 50 ) -> List[str]: """ - 分割文本 - + Split text into chunks. 
+ Args: - text: 原始文本 - chunk_size: 块大小 - overlap: 重叠大小 - + text: raw text + chunk_size: chunk size + overlap: overlap size + Returns: - 文本块列表 + list of text chunks """ return split_text_into_chunks(text, chunk_size, overlap) @staticmethod def preprocess_text(text: str) -> str: """ - 预处理文本 - - 移除多余空白 - - 标准化换行 - + Preprocess text: + - Remove excess whitespace + - Normalize line endings + Args: - text: 原始文本 - + text: raw text + Returns: - 处理后的文本 + processed text """ import re - - # 标准化换行 + + # Normalize line endings text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + # Remove consecutive blank lines (keep at most two newlines) text = re.sub(r'\n{3,}', '\n\n', text) - - # 移除行首行尾空白 + + # Strip leading/trailing whitespace from each line lines = [line.strip() for line in text.split('\n')] text = '\n'.join(lines) - + return text.strip() - + @staticmethod def get_text_stats(text: str) -> dict: - """获取文本统计信息""" + """Get text statistics""" return { "total_chars": len(text), "total_lines": text.count('\n') + 1, diff --git a/backend/app/services/zep_entity_reader.py b/backend/app/services/zep_entity_reader.py index 71661be4..dc14961e 100644 --- a/backend/app/services/zep_entity_reader.py +++ b/backend/app/services/zep_entity_reader.py @@ -1,6 +1,6 @@ """ -Zep实体读取与过滤服务 -从Zep图谱中读取节点,筛选出符合预定义实体类型的节点 +Zep entity read and filter service +Reads nodes from the Zep graph and filters out nodes that match predefined entity types """ import time @@ -15,23 +15,23 @@ from ..utils.zep_paging import fetch_all_nodes, fetch_all_edges logger = get_logger('mirofish.zep_entity_reader') -# 用于泛型返回类型 +# Generic return type T = TypeVar('T') @dataclass class EntityNode: - """实体节点数据结构""" + """Entity node data structure""" uuid: str name: str labels: List[str] summary: str attributes: Dict[str, Any] - # 相关的边信息 + # Related edge info related_edges: List[Dict[str, Any]] = field(default_factory=list) - # 相关的其他节点信息 + # Related node info related_nodes: List[Dict[str, Any]] = 
field(default_factory=list) - + def to_dict(self) -> Dict[str, Any]: return { "uuid": self.uuid, @@ -42,9 +42,9 @@ class EntityNode: "related_edges": self.related_edges, "related_nodes": self.related_nodes, } - + def get_entity_type(self) -> Optional[str]: - """获取实体类型(排除默认的Entity标签)""" + """Get entity type (excluding the default Entity label)""" for label in self.labels: if label not in ["Entity", "Node"]: return label @@ -53,12 +53,12 @@ class EntityNode: @dataclass class FilteredEntities: - """过滤后的实体集合""" + """Filtered entity collection""" entities: List[EntityNode] entity_types: Set[str] total_count: int filtered_count: int - + def to_dict(self) -> Dict[str, Any]: return { "entities": [e.to_dict() for e in self.entities], @@ -70,43 +70,43 @@ class FilteredEntities: class ZepEntityReader: """ - Zep实体读取与过滤服务 - - 主要功能: - 1. 从Zep图谱读取所有节点 - 2. 筛选出符合预定义实体类型的节点(Labels不只是Entity的节点) - 3. 获取每个实体的相关边和关联节点信息 + Zep entity read and filter service + + Main features: + 1. Read all nodes from the Zep graph + 2. Filter out nodes matching predefined entity types (nodes with labels beyond just "Entity") + 3. 
Fetch related edges and associated node info for each entity """ - + def __init__(self, api_key: Optional[str] = None): self.api_key = api_key or Config.ZEP_API_KEY if not self.api_key: - raise ValueError("ZEP_API_KEY 未配置") - + raise ValueError("ZEP_API_KEY is not configured") + self.client = Zep(api_key=self.api_key) - + def _call_with_retry( - self, - func: Callable[[], T], + self, + func: Callable[[], T], operation_name: str, max_retries: int = 3, initial_delay: float = 2.0 ) -> T: """ - 带重试机制的Zep API调用 - + Zep API call with retry logic + Args: - func: 要执行的函数(无参数的lambda或callable) - operation_name: 操作名称,用于日志 - max_retries: 最大重试次数(默认3次,即最多尝试3次) - initial_delay: 初始延迟秒数 - + func: function to execute (a lambda or callable with no arguments) + operation_name: operation name for logging + max_retries: maximum number of retries (default 3, meaning up to 3 attempts total) + initial_delay: initial delay in seconds + Returns: - API调用结果 + API call result """ last_exception = None delay = initial_delay - + for attempt in range(max_retries): try: return func() @@ -114,27 +114,27 @@ class ZepEntityReader: last_exception = e if attempt < max_retries - 1: logger.warning( - f"Zep {operation_name} 第 {attempt + 1} 次尝试失败: {str(e)[:100]}, " - f"{delay:.1f}秒后重试..." + f"Zep {operation_name} attempt {attempt + 1} failed: {str(e)[:100]}, " + f"retrying in {delay:.1f}s..." 
) time.sleep(delay) - delay *= 2 # 指数退避 + delay *= 2 # Exponential backoff else: - logger.error(f"Zep {operation_name} 在 {max_retries} 次尝试后仍失败: {str(e)}") - + logger.error(f"Zep {operation_name} still failing after {max_retries} attempts: {str(e)}") + raise last_exception - + def get_all_nodes(self, graph_id: str) -> List[Dict[str, Any]]: """ - 获取图谱的所有节点(分页获取) + Get all nodes in the graph (paginated) Args: - graph_id: 图谱ID + graph_id: graph ID Returns: - 节点列表 + Node list """ - logger.info(f"获取图谱 {graph_id} 的所有节点...") + logger.info(f"Fetching all nodes for graph {graph_id}...") nodes = fetch_all_nodes(self.client, graph_id) @@ -148,20 +148,20 @@ class ZepEntityReader: "attributes": node.attributes or {}, }) - logger.info(f"共获取 {len(nodes_data)} 个节点") + logger.info(f"Fetched {len(nodes_data)} nodes") return nodes_data def get_all_edges(self, graph_id: str) -> List[Dict[str, Any]]: """ - 获取图谱的所有边(分页获取) + Get all edges in the graph (paginated) Args: - graph_id: 图谱ID + graph_id: graph ID Returns: - 边列表 + Edge list """ - logger.info(f"获取图谱 {graph_id} 的所有边...") + logger.info(f"Fetching all edges for graph {graph_id}...") edges = fetch_all_edges(self.client, graph_id) @@ -176,26 +176,26 @@ class ZepEntityReader: "attributes": edge.attributes or {}, }) - logger.info(f"共获取 {len(edges_data)} 条边") + logger.info(f"Fetched {len(edges_data)} edges") return edges_data - + def get_node_edges(self, node_uuid: str) -> List[Dict[str, Any]]: """ - 获取指定节点的所有相关边(带重试机制) - + Get all edges related to the specified node (with retry logic) + Args: - node_uuid: 节点UUID - + node_uuid: node UUID + Returns: - 边列表 + Edge list """ try: - # 使用重试机制调用Zep API + # Call Zep API with retry edges = self._call_with_retry( func=lambda: self.client.graph.node.get_entity_edges(node_uuid=node_uuid), - operation_name=f"获取节点边(node={node_uuid[:8]}...)" + operation_name=f"get node edges (node={node_uuid[:8]}...)" ) - + edges_data = [] for edge in edges: edges_data.append({ @@ -206,60 +206,60 @@ class 
ZepEntityReader: "target_node_uuid": edge.target_node_uuid, "attributes": edge.attributes or {}, }) - + return edges_data except Exception as e: - logger.warning(f"获取节点 {node_uuid} 的边失败: {str(e)}") + logger.warning(f"Failed to get edges for node {node_uuid}: {str(e)}") return [] - + def filter_defined_entities( - self, + self, graph_id: str, defined_entity_types: Optional[List[str]] = None, enrich_with_edges: bool = True ) -> FilteredEntities: """ - 筛选出符合预定义实体类型的节点 - - 筛选逻辑: - - 如果节点的Labels只有一个"Entity",说明这个实体不符合我们预定义的类型,跳过 - - 如果节点的Labels包含除"Entity"和"Node"之外的标签,说明符合预定义类型,保留 - + Filter out nodes that match predefined entity types + + Filter logic: + - If a node's Labels contain only "Entity", it does not match our predefined types; skip it + - If a node's Labels contain labels other than "Entity" and "Node", it matches a predefined type; keep it + Args: - graph_id: 图谱ID - defined_entity_types: 预定义的实体类型列表(可选,如果提供则只保留这些类型) - enrich_with_edges: 是否获取每个实体的相关边信息 - + graph_id: graph ID + defined_entity_types: list of predefined entity types (optional; if provided, only these types are kept) + enrich_with_edges: whether to fetch related edge info for each entity + Returns: - FilteredEntities: 过滤后的实体集合 + FilteredEntities: filtered entity collection """ - logger.info(f"开始筛选图谱 {graph_id} 的实体...") - - # 获取所有节点 + logger.info(f"Starting entity filtering for graph {graph_id}...") + + # Get all nodes all_nodes = self.get_all_nodes(graph_id) total_count = len(all_nodes) - - # 获取所有边(用于后续关联查找) + + # Get all edges (for relation lookup) all_edges = self.get_all_edges(graph_id) if enrich_with_edges else [] - - # 构建节点UUID到节点数据的映射 + + # Build UUID-to-node mapping node_map = {n["uuid"]: n for n in all_nodes} - - # 筛选符合条件的实体 + + # Filter matching entities filtered_entities = [] entity_types_found = set() - + for node in all_nodes: labels = node.get("labels", []) - - # 筛选逻辑:Labels必须包含除"Entity"和"Node"之外的标签 + + # Filter logic: Labels must contain at least one label other than "Entity" and 
"Node" custom_labels = [l for l in labels if l not in ["Entity", "Node"]] - + if not custom_labels: - # 只有默认标签,跳过 + # Only default labels; skip continue - - # 如果指定了预定义类型,检查是否匹配 + + # If predefined types are specified, check for a match if defined_entity_types: matching_labels = [l for l in custom_labels if l in defined_entity_types] if not matching_labels: @@ -267,10 +267,10 @@ class ZepEntityReader: entity_type = matching_labels[0] else: entity_type = custom_labels[0] - + entity_types_found.add(entity_type) - - # 创建实体节点对象 + + # Create entity node object entity = EntityNode( uuid=node["uuid"], name=node["name"], @@ -278,12 +278,12 @@ class ZepEntityReader: summary=node["summary"], attributes=node["attributes"], ) - - # 获取相关边和节点 + + # Fetch related edges and nodes if enrich_with_edges: related_edges = [] related_node_uuids = set() - + for edge in all_edges: if edge["source_node_uuid"] == node["uuid"]: related_edges.append({ @@ -301,10 +301,10 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - + entity.related_edges = related_edges - - # 获取关联节点的基本信息 + + # Fetch basic info for related nodes related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -315,57 +315,57 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + entity.related_nodes = related_nodes - + filtered_entities.append(entity) - - logger.info(f"筛选完成: 总节点 {total_count}, 符合条件 {len(filtered_entities)}, " - f"实体类型: {entity_types_found}") - + + logger.info(f"Filtering complete: total nodes {total_count}, matching {len(filtered_entities)}, " + f"entity types: {entity_types_found}") + return FilteredEntities( entities=filtered_entities, entity_types=entity_types_found, total_count=total_count, filtered_count=len(filtered_entities), ) - + def get_entity_with_context( - self, - graph_id: str, + self, + graph_id: str, entity_uuid: str ) -> 
Optional[EntityNode]: """ - 获取单个实体及其完整上下文(边和关联节点,带重试机制) - + Get a single entity and its full context (edges and related nodes, with retry) + Args: - graph_id: 图谱ID - entity_uuid: 实体UUID - + graph_id: graph ID + entity_uuid: entity UUID + Returns: - EntityNode或None + EntityNode or None """ try: - # 使用重试机制获取节点 + # Get the node with retry node = self._call_with_retry( func=lambda: self.client.graph.node.get(uuid_=entity_uuid), - operation_name=f"获取节点详情(uuid={entity_uuid[:8]}...)" + operation_name=f"get node detail (uuid={entity_uuid[:8]}...)" ) - + if not node: return None - - # 获取节点的边 + + # Get the node's edges edges = self.get_node_edges(entity_uuid) - - # 获取所有节点用于关联查找 + + # Get all nodes for relation lookup all_nodes = self.get_all_nodes(graph_id) node_map = {n["uuid"]: n for n in all_nodes} - - # 处理相关边和节点 + + # Process related edges and nodes related_edges = [] related_node_uuids = set() - + for edge in edges: if edge["source_node_uuid"] == entity_uuid: related_edges.append({ @@ -383,8 +383,8 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - - # 获取关联节点信息 + + # Fetch related node info related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -395,7 +395,7 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + return EntityNode( uuid=getattr(node, 'uuid_', None) or getattr(node, 'uuid', ''), name=node.name or "", @@ -405,27 +405,27 @@ class ZepEntityReader: related_edges=related_edges, related_nodes=related_nodes, ) - + except Exception as e: - logger.error(f"获取实体 {entity_uuid} 失败: {str(e)}") + logger.error(f"Failed to get entity {entity_uuid}: {str(e)}") return None - + def get_entities_by_type( - self, - graph_id: str, + self, + graph_id: str, entity_type: str, enrich_with_edges: bool = True ) -> List[EntityNode]: """ - 获取指定类型的所有实体 - + Get all entities of a specified type + Args: - graph_id: 图谱ID - 
entity_type: 实体类型(如 "Student", "PublicFigure" 等) - enrich_with_edges: 是否获取相关边信息 - + graph_id: graph ID + entity_type: entity type (e.g. "Student", "PublicFigure") + enrich_with_edges: whether to fetch related edge info + Returns: - 实体列表 + Entity list """ result = self.filter_defined_entities( graph_id=graph_id, @@ -433,5 +433,3 @@ class ZepEntityReader: enrich_with_edges=enrich_with_edges ) return result.entities - - diff --git a/backend/app/services/zep_graph_memory_updater.py b/backend/app/services/zep_graph_memory_updater.py index e034fee2..eab77fdd 100644 --- a/backend/app/services/zep_graph_memory_updater.py +++ b/backend/app/services/zep_graph_memory_updater.py @@ -1,6 +1,6 @@ """ -Zep图谱记忆更新服务 -将模拟中的Agent活动动态更新到Zep图谱中 +Zep graph memory update service +Dynamically updates agent activities from the simulation to the Zep graph """ import os @@ -23,7 +23,7 @@ logger = get_logger('mirofish.zep_graph_memory_updater') @dataclass class AgentActivity: - """Agent活动记录""" + """Agent activity record""" platform: str # twitter / reddit agent_id: int agent_name: str @@ -31,15 +31,16 @@ class AgentActivity: action_args: Dict[str, Any] round_num: int timestamp: str - + def to_episode_text(self) -> str: """ - 将活动转换为可以发送给Zep的文本描述 - - 采用自然语言描述格式,让Zep能够从中提取实体和关系 - 不添加模拟相关的前缀,避免误导图谱更新 + Convert the activity to a text description suitable for sending to Zep + + Uses a natural-language description format so Zep can extract entities and + relationships. No simulation-specific prefix is added to avoid misleading + graph updates. 
""" - # 根据不同的动作类型生成不同的描述 + # Generate a description based on the action type action_descriptions = { "CREATE_POST": self._describe_create_post, "LIKE_POST": self._describe_like_post, @@ -54,226 +55,227 @@ class AgentActivity: "SEARCH_USER": self._describe_search_user, "MUTE": self._describe_mute, } - + describe_func = action_descriptions.get(self.action_type, self._describe_generic) description = describe_func() - - # 直接返回 "agent名称: 活动描述" 格式,不添加模拟前缀 + + # Return "agent_name: activity description" format without a simulation prefix return f"{self.agent_name}: {description}" - + def _describe_create_post(self) -> str: content = self.action_args.get("content", "") if content: - return f"发布了一条帖子:「{content}」" - return "发布了一条帖子" - + return f'posted: "{content}"' + return "created a post" + def _describe_like_post(self) -> str: - """点赞帖子 - 包含帖子原文和作者信息""" + """Like a post — includes post content and author info""" post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") - + if post_content and post_author: - return f"点赞了{post_author}的帖子:「{post_content}」" + return f'liked {post_author}\'s post: "{post_content}"' elif post_content: - return f"点赞了一条帖子:「{post_content}」" + return f'liked a post: "{post_content}"' elif post_author: - return f"点赞了{post_author}的一条帖子" - return "点赞了一条帖子" - + return f"liked a post by {post_author}" + return "liked a post" + def _describe_dislike_post(self) -> str: - """踩帖子 - 包含帖子原文和作者信息""" + """Dislike a post — includes post content and author info""" post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") - + if post_content and post_author: - return f"踩了{post_author}的帖子:「{post_content}」" + return f'disliked {post_author}\'s post: "{post_content}"' elif post_content: - return f"踩了一条帖子:「{post_content}」" + return f'disliked a post: "{post_content}"' elif post_author: - return f"踩了{post_author}的一条帖子" - return "踩了一条帖子" - + return 
f"disliked a post by {post_author}" + return "disliked a post" + def _describe_repost(self) -> str: - """转发帖子 - 包含原帖内容和作者信息""" + """Repost — includes original post content and author info""" original_content = self.action_args.get("original_content", "") original_author = self.action_args.get("original_author_name", "") - + if original_content and original_author: - return f"转发了{original_author}的帖子:「{original_content}」" + return f'reposted {original_author}\'s post: "{original_content}"' elif original_content: - return f"转发了一条帖子:「{original_content}」" + return f'reposted: "{original_content}"' elif original_author: - return f"转发了{original_author}的一条帖子" - return "转发了一条帖子" - + return f"reposted a post by {original_author}" + return "reposted a post" + def _describe_quote_post(self) -> str: - """引用帖子 - 包含原帖内容、作者信息和引用评论""" + """Quote post — includes original post content, author info, and quote comment""" original_content = self.action_args.get("original_content", "") original_author = self.action_args.get("original_author_name", "") quote_content = self.action_args.get("quote_content", "") or self.action_args.get("content", "") - + base = "" if original_content and original_author: - base = f"引用了{original_author}的帖子「{original_content}」" + base = f'quoted {original_author}\'s post "{original_content}"' elif original_content: - base = f"引用了一条帖子「{original_content}」" + base = f'quoted a post: "{original_content}"' elif original_author: - base = f"引用了{original_author}的一条帖子" + base = f"quoted a post by {original_author}" else: - base = "引用了一条帖子" - + base = "quoted a post" + if quote_content: - base += f",并评论道:「{quote_content}」" + base += f' with comment: "{quote_content}"' return base - + def _describe_follow(self) -> str: - """关注用户 - 包含被关注用户的名称""" + """Follow a user — includes the followed user's name""" target_user_name = self.action_args.get("target_user_name", "") - + if target_user_name: - return f"关注了用户「{target_user_name}」" - return "关注了一个用户" - + return f'followed user 
"{target_user_name}"' + return "followed a user" + def _describe_create_comment(self) -> str: - """发表评论 - 包含评论内容和所评论的帖子信息""" + """Create a comment — includes comment content and the post being commented on""" content = self.action_args.get("content", "") post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") - + if content: if post_content and post_author: - return f"在{post_author}的帖子「{post_content}」下评论道:「{content}」" + return f'commented on {post_author}\'s post "{post_content}": "{content}"' elif post_content: - return f"在帖子「{post_content}」下评论道:「{content}」" + return f'commented on post "{post_content}": "{content}"' elif post_author: - return f"在{post_author}的帖子下评论道:「{content}」" - return f"评论道:「{content}」" - return "发表了评论" - + return f'commented on {post_author}\'s post: "{content}"' + return f'commented: "{content}"' + return "posted a comment" + def _describe_like_comment(self) -> str: - """点赞评论 - 包含评论内容和作者信息""" + """Like a comment — includes comment content and author info""" comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") - + if comment_content and comment_author: - return f"点赞了{comment_author}的评论:「{comment_content}」" + return f'liked {comment_author}\'s comment: "{comment_content}"' elif comment_content: - return f"点赞了一条评论:「{comment_content}」" + return f'liked a comment: "{comment_content}"' elif comment_author: - return f"点赞了{comment_author}的一条评论" - return "点赞了一条评论" - + return f"liked a comment by {comment_author}" + return "liked a comment" + def _describe_dislike_comment(self) -> str: - """踩评论 - 包含评论内容和作者信息""" + """Dislike a comment — includes comment content and author info""" comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") - + if comment_content and comment_author: - return f"踩了{comment_author}的评论:「{comment_content}」" + return f'disliked 
{comment_author}\'s comment: "{comment_content}"' elif comment_content: - return f"踩了一条评论:「{comment_content}」" + return f'disliked a comment: "{comment_content}"' elif comment_author: - return f"踩了{comment_author}的一条评论" - return "踩了一条评论" - + return f"disliked a comment by {comment_author}" + return "disliked a comment" + def _describe_search(self) -> str: - """搜索帖子 - 包含搜索关键词""" + """Search posts — includes search keyword""" query = self.action_args.get("query", "") or self.action_args.get("keyword", "") - return f"搜索了「{query}」" if query else "进行了搜索" - + return f'searched for "{query}"' if query else "performed a search" + def _describe_search_user(self) -> str: - """搜索用户 - 包含搜索关键词""" + """Search users — includes search keyword""" query = self.action_args.get("query", "") or self.action_args.get("username", "") - return f"搜索了用户「{query}」" if query else "搜索了用户" - + return f'searched for user "{query}"' if query else "searched for a user" + def _describe_mute(self) -> str: - """屏蔽用户 - 包含被屏蔽用户的名称""" + """Mute a user — includes the muted user's name""" target_user_name = self.action_args.get("target_user_name", "") - + if target_user_name: - return f"屏蔽了用户「{target_user_name}」" - return "屏蔽了一个用户" - + return f'muted user "{target_user_name}"' + return "muted a user" + def _describe_generic(self) -> str: - # 对于未知的动作类型,生成通用描述 - return f"执行了{self.action_type}操作" + # Generic description for unknown action types + return f"performed action: {self.action_type}" class ZepGraphMemoryUpdater: """ - Zep图谱记忆更新器 - - 监控模拟的actions日志文件,将新的agent活动实时更新到Zep图谱中。 - 按平台分组,每累积BATCH_SIZE条活动后批量发送到Zep。 - - 所有有意义的行为都会被更新到Zep,action_args中会包含完整的上下文信息: - - 点赞/踩的帖子原文 - - 转发/引用的帖子原文 - - 关注/屏蔽的用户名 - - 点赞/踩的评论原文 + Zep graph memory updater + + Monitors the simulation's actions log file and updates new agent activities + to the Zep graph in real time. Activities are grouped by platform; each platform + batches up to BATCH_SIZE activities before sending them to Zep. 
+ + All meaningful actions are updated to Zep. action_args contains full context: + - Original post content for likes/dislikes + - Original post content for reposts/quotes + - Usernames for follows/mutes + - Original comment content for comment likes/dislikes """ - - # 批量发送大小(每个平台累积多少条后发送) + + # Batch send size (activities per platform before sending) BATCH_SIZE = 5 - - # 平台名称映射(用于控制台显示) + + # Platform display names PLATFORM_DISPLAY_NAMES = { - 'twitter': '世界1', - 'reddit': '世界2', + 'twitter': 'World 1', + 'reddit': 'World 2', } - - # 发送间隔(秒),避免请求过快 + + # Send interval (seconds) to avoid sending too fast SEND_INTERVAL = 0.5 - - # 重试配置 + + # Retry config MAX_RETRIES = 3 - RETRY_DELAY = 2 # 秒 - + RETRY_DELAY = 2 # seconds + def __init__(self, graph_id: str, api_key: Optional[str] = None): """ - 初始化更新器 - + Initialize the updater + Args: - graph_id: Zep图谱ID - api_key: Zep API Key(可选,默认从配置读取) + graph_id: Zep graph ID + api_key: Zep API key (optional; defaults to config value) """ self.graph_id = graph_id self.api_key = api_key or Config.ZEP_API_KEY - + if not self.api_key: - raise ValueError("ZEP_API_KEY未配置") - + raise ValueError("ZEP_API_KEY is not configured") + self.client = Zep(api_key=self.api_key) - - # 活动队列 + + # Activity queue self._activity_queue: Queue = Queue() - - # 按平台分组的活动缓冲区(每个平台各自累积到BATCH_SIZE后批量发送) + + # Per-platform activity buffers (each platform accumulates to BATCH_SIZE before batch sending) self._platform_buffers: Dict[str, List[AgentActivity]] = { 'twitter': [], 'reddit': [], } self._buffer_lock = threading.Lock() - - # 控制标志 + + # Control flags self._running = False self._worker_thread: Optional[threading.Thread] = None - - # 统计 - self._total_activities = 0 # 实际添加到队列的活动数 - self._total_sent = 0 # 成功发送到Zep的批次数 - self._total_items_sent = 0 # 成功发送到Zep的活动条数 - self._failed_count = 0 # 发送失败的批次数 - self._skipped_count = 0 # 被过滤跳过的活动数(DO_NOTHING) - - logger.info(f"ZepGraphMemoryUpdater 初始化完成: graph_id={graph_id}, batch_size={self.BATCH_SIZE}") - + + # 
Statistics + self._total_activities = 0 # Activities added to queue + self._total_sent = 0 # Batches successfully sent to Zep + self._total_items_sent = 0 # Individual activities successfully sent to Zep + self._failed_count = 0 # Batches that failed to send + self._skipped_count = 0 # Activities filtered out (DO_NOTHING) + + logger.info(f"ZepGraphMemoryUpdater initialized: graph_id={graph_id}, batch_size={self.BATCH_SIZE}") + def _get_platform_display_name(self, platform: str) -> str: - """获取平台的显示名称""" + """Get the display name for a platform""" return self.PLATFORM_DISPLAY_NAMES.get(platform.lower(), platform) - + def start(self): - """启动后台工作线程""" + """Start the background worker thread""" if self._running: return @@ -288,67 +290,67 @@ class ZepGraphMemoryUpdater: name=f"ZepMemoryUpdater-{self.graph_id[:8]}" ) self._worker_thread.start() - logger.info(f"ZepGraphMemoryUpdater 已启动: graph_id={self.graph_id}") - + logger.info(f"ZepGraphMemoryUpdater started: graph_id={self.graph_id}") + def stop(self): - """停止后台工作线程""" + """Stop the background worker thread""" self._running = False - - # 发送剩余的活动 + + # Send remaining activities self._flush_remaining() - + if self._worker_thread and self._worker_thread.is_alive(): self._worker_thread.join(timeout=10) - - logger.info(f"ZepGraphMemoryUpdater 已停止: graph_id={self.graph_id}, " + + logger.info(f"ZepGraphMemoryUpdater stopped: graph_id={self.graph_id}, " f"total_activities={self._total_activities}, " f"batches_sent={self._total_sent}, " f"items_sent={self._total_items_sent}, " f"failed={self._failed_count}, " f"skipped={self._skipped_count}") - + def add_activity(self, activity: AgentActivity): """ - 添加一个agent活动到队列 - - 所有有意义的行为都会被添加到队列,包括: - - CREATE_POST(发帖) - - CREATE_COMMENT(评论) - - QUOTE_POST(引用帖子) - - SEARCH_POSTS(搜索帖子) - - SEARCH_USER(搜索用户) - - LIKE_POST/DISLIKE_POST(点赞/踩帖子) - - REPOST(转发) - - FOLLOW(关注) - - MUTE(屏蔽) - - LIKE_COMMENT/DISLIKE_COMMENT(点赞/踩评论) - - action_args中会包含完整的上下文信息(如帖子原文、用户名等)。 - + Add an agent 
activity to the queue + + All meaningful actions are added to the queue, including: + - CREATE_POST + - CREATE_COMMENT + - QUOTE_POST + - SEARCH_POSTS + - SEARCH_USER + - LIKE_POST/DISLIKE_POST + - REPOST + - FOLLOW + - MUTE + - LIKE_COMMENT/DISLIKE_COMMENT + + action_args contains full context (e.g. post content, usernames, etc.). + Args: - activity: Agent活动记录 + activity: agent activity record """ - # 跳过DO_NOTHING类型的活动 + # Skip DO_NOTHING activities if activity.action_type == "DO_NOTHING": self._skipped_count += 1 return - + self._activity_queue.put(activity) self._total_activities += 1 - logger.debug(f"添加活动到Zep队列: {activity.agent_name} - {activity.action_type}") - + logger.debug(f"Added activity to Zep queue: {activity.agent_name} - {activity.action_type}") + def add_activity_from_dict(self, data: Dict[str, Any], platform: str): """ - 从字典数据添加活动 - + Add an activity from a dictionary + Args: - data: 从actions.jsonl解析的字典数据 - platform: 平台名称 (twitter/reddit) + data: dict parsed from actions.jsonl + platform: platform name (twitter/reddit) """ - # 跳过事件类型的条目 + # Skip event-type entries if "event_type" in data: return - + activity = AgentActivity( platform=platform, agent_id=data.get("agent_id", 0), @@ -358,57 +360,57 @@ class ZepGraphMemoryUpdater: round_num=data.get("round", 0), timestamp=data.get("timestamp", datetime.now().isoformat()), ) - + self.add_activity(activity) - + def _worker_loop(self, locale: str = 'zh'): - """后台工作循环 - 按平台批量发送活动到Zep""" + """Background worker loop — batch-sends activities to Zep per platform""" set_locale(locale) while self._running or not self._activity_queue.empty(): try: - # 尝试从队列获取活动(超时1秒) + # Try to get an activity from the queue (1 second timeout) try: activity = self._activity_queue.get(timeout=1) - - # 将活动添加到对应平台的缓冲区 + + # Add activity to the corresponding platform buffer platform = activity.platform.lower() with self._buffer_lock: if platform not in self._platform_buffers: self._platform_buffers[platform] = [] 
self._platform_buffers[platform].append(activity) - - # 检查该平台是否达到批量大小 + + # Check if this platform has reached the batch size if len(self._platform_buffers[platform]) >= self.BATCH_SIZE: batch = self._platform_buffers[platform][:self.BATCH_SIZE] self._platform_buffers[platform] = self._platform_buffers[platform][self.BATCH_SIZE:] - # 释放锁后再发送 + # Release lock before sending self._send_batch_activities(batch, platform) - # 发送间隔,避免请求过快 + # Throttle to avoid sending too fast time.sleep(self.SEND_INTERVAL) - + except Empty: pass - + except Exception as e: - logger.error(f"工作循环异常: {e}") + logger.error(f"Worker loop exception: {e}") time.sleep(1) - + def _send_batch_activities(self, activities: List[AgentActivity], platform: str): """ - 批量发送活动到Zep图谱(合并为一条文本) - + Batch-send activities to the Zep graph (merged into a single text block) + Args: - activities: Agent活动列表 - platform: 平台名称 + activities: list of agent activities + platform: platform name """ if not activities: return - - # 将多条活动合并为一条文本,用换行分隔 + + # Merge multiple activities into a single text, separated by newlines episode_texts = [activity.to_episode_text() for activity in activities] combined_text = "\n".join(episode_texts) - - # 带重试的发送 + + # Send with retry for attempt in range(self.MAX_RETRIES): try: self.client.graph.add( @@ -416,25 +418,25 @@ class ZepGraphMemoryUpdater: type="text", data=combined_text ) - + self._total_sent += 1 self._total_items_sent += len(activities) display_name = self._get_platform_display_name(platform) - logger.info(f"成功批量发送 {len(activities)} 条{display_name}活动到图谱 {self.graph_id}") - logger.debug(f"批量内容预览: {combined_text[:200]}...") + logger.info(f"Successfully sent batch of {len(activities)} {display_name} activities to graph {self.graph_id}") + logger.debug(f"Batch content preview: {combined_text[:200]}...") return - + except Exception as e: if attempt < self.MAX_RETRIES - 1: - logger.warning(f"批量发送到Zep失败 (尝试 {attempt + 1}/{self.MAX_RETRIES}): {e}") + logger.warning(f"Batch send to 
Zep failed (attempt {attempt + 1}/{self.MAX_RETRIES}): {e}") time.sleep(self.RETRY_DELAY * (attempt + 1)) else: - logger.error(f"批量发送到Zep失败,已重试{self.MAX_RETRIES}次: {e}") + logger.error(f"Batch send to Zep failed after {self.MAX_RETRIES} attempts: {e}") self._failed_count += 1 - + def _flush_remaining(self): - """发送队列和缓冲区中剩余的活动""" - # 首先处理队列中剩余的活动,添加到缓冲区 + """Send remaining activities from the queue and buffers""" + # First, drain the queue into the buffers while not self._activity_queue.empty(): try: activity = self._activity_queue.get_nowait() @@ -445,110 +447,110 @@ class ZepGraphMemoryUpdater: self._platform_buffers[platform].append(activity) except Empty: break - - # 然后发送各平台缓冲区中剩余的活动(即使不足BATCH_SIZE条) + + # Then send remaining activities in each platform buffer (even if below BATCH_SIZE) with self._buffer_lock: for platform, buffer in self._platform_buffers.items(): if buffer: display_name = self._get_platform_display_name(platform) - logger.info(f"发送{display_name}平台剩余的 {len(buffer)} 条活动") + logger.info(f"Sending {len(buffer)} remaining {display_name} platform activities") self._send_batch_activities(buffer, platform) - # 清空所有缓冲区 + # Clear all buffers for platform in self._platform_buffers: self._platform_buffers[platform] = [] - + def get_stats(self) -> Dict[str, Any]: - """获取统计信息""" + """Get statistics""" with self._buffer_lock: buffer_sizes = {p: len(b) for p, b in self._platform_buffers.items()} - + return { "graph_id": self.graph_id, "batch_size": self.BATCH_SIZE, - "total_activities": self._total_activities, # 添加到队列的活动总数 - "batches_sent": self._total_sent, # 成功发送的批次数 - "items_sent": self._total_items_sent, # 成功发送的活动条数 - "failed_count": self._failed_count, # 发送失败的批次数 - "skipped_count": self._skipped_count, # 被过滤跳过的活动数(DO_NOTHING) + "total_activities": self._total_activities, # Total activities added to queue + "batches_sent": self._total_sent, # Batches successfully sent + "items_sent": self._total_items_sent, # Individual activities successfully sent + 
"failed_count": self._failed_count, # Batches that failed to send + "skipped_count": self._skipped_count, # Activities filtered out (DO_NOTHING) "queue_size": self._activity_queue.qsize(), - "buffer_sizes": buffer_sizes, # 各平台缓冲区大小 + "buffer_sizes": buffer_sizes, # Per-platform buffer sizes "running": self._running, } class ZepGraphMemoryManager: """ - 管理多个模拟的Zep图谱记忆更新器 - - 每个模拟可以有自己的更新器实例 + Manages Zep graph memory updaters for multiple simulations + + Each simulation can have its own updater instance """ - + _updaters: Dict[str, ZepGraphMemoryUpdater] = {} _lock = threading.Lock() - + @classmethod def create_updater(cls, simulation_id: str, graph_id: str) -> ZepGraphMemoryUpdater: """ - 为模拟创建图谱记忆更新器 - + Create a graph memory updater for a simulation + Args: - simulation_id: 模拟ID - graph_id: Zep图谱ID - + simulation_id: simulation ID + graph_id: Zep graph ID + Returns: - ZepGraphMemoryUpdater实例 + ZepGraphMemoryUpdater instance """ with cls._lock: - # 如果已存在,先停止旧的 + # If one already exists, stop it first if simulation_id in cls._updaters: cls._updaters[simulation_id].stop() - + updater = ZepGraphMemoryUpdater(graph_id) updater.start() cls._updaters[simulation_id] = updater - - logger.info(f"创建图谱记忆更新器: simulation_id={simulation_id}, graph_id={graph_id}") + + logger.info(f"Created graph memory updater: simulation_id={simulation_id}, graph_id={graph_id}") return updater - + @classmethod def get_updater(cls, simulation_id: str) -> Optional[ZepGraphMemoryUpdater]: - """获取模拟的更新器""" + """Get the updater for a simulation""" return cls._updaters.get(simulation_id) - + @classmethod def stop_updater(cls, simulation_id: str): - """停止并移除模拟的更新器""" + """Stop and remove the updater for a simulation""" with cls._lock: if simulation_id in cls._updaters: cls._updaters[simulation_id].stop() del cls._updaters[simulation_id] - logger.info(f"已停止图谱记忆更新器: simulation_id={simulation_id}") - - # 防止 stop_all 重复调用的标志 + logger.info(f"Stopped graph memory updater: simulation_id={simulation_id}") + + 
# Flag to prevent stop_all from being called more than once _stop_all_done = False - + @classmethod def stop_all(cls): - """停止所有更新器""" - # 防止重复调用 + """Stop all updaters""" + # Prevent duplicate calls if cls._stop_all_done: return cls._stop_all_done = True - + with cls._lock: if cls._updaters: for simulation_id, updater in list(cls._updaters.items()): try: updater.stop() except Exception as e: - logger.error(f"停止更新器失败: simulation_id={simulation_id}, error={e}") + logger.error(f"Failed to stop updater: simulation_id={simulation_id}, error={e}") cls._updaters.clear() - logger.info("已停止所有图谱记忆更新器") - + logger.info("All graph memory updaters stopped") + @classmethod def get_all_stats(cls) -> Dict[str, Dict[str, Any]]: - """获取所有更新器的统计信息""" + """Get statistics for all updaters""" return { - sim_id: updater.get_stats() + sim_id: updater.get_stats() for sim_id, updater in cls._updaters.items() } diff --git a/backend/app/services/zep_tools.py b/backend/app/services/zep_tools.py index 3bc8a57a..1cadcbd5 100644 --- a/backend/app/services/zep_tools.py +++ b/backend/app/services/zep_tools.py @@ -1,11 +1,11 @@ """ -Zep检索工具服务 -封装图谱搜索、节点读取、边查询等工具,供Report Agent使用 +Zep retrieval tools service +Wraps graph search, node read, edge query and other tools for use by the Report Agent -核心检索工具(优化后): -1. InsightForge(深度洞察检索)- 最强大的混合检索,自动生成子问题并多维度检索 -2. PanoramaSearch(广度搜索)- 获取全貌,包括过期内容 -3. QuickSearch(简单搜索)- 快速检索 +Core retrieval tools (optimized): +1. InsightForge (deep insight retrieval) — most powerful hybrid retrieval; auto-generates sub-queries and searches multiple dimensions +2. PanoramaSearch (breadth search) — gets the full picture, including expired content +3. 
QuickSearch (simple search) — fast retrieval """ import time @@ -26,13 +26,13 @@ logger = get_logger('mirofish.zep_tools') @dataclass class SearchResult: - """搜索结果""" + """Search result""" facts: List[str] edges: List[Dict[str, Any]] nodes: List[Dict[str, Any]] query: str total_count: int - + def to_dict(self) -> Dict[str, Any]: return { "facts": self.facts, @@ -41,28 +41,28 @@ class SearchResult: "query": self.query, "total_count": self.total_count } - + def to_text(self) -> str: - """转换为文本格式,供LLM理解""" - text_parts = [f"搜索查询: {self.query}", f"找到 {self.total_count} 条相关信息"] - + """Convert to text format for LLM consumption""" + text_parts = [f"Search query: {self.query}", f"Found {self.total_count} related items"] + if self.facts: - text_parts.append("\n### 相关事实:") + text_parts.append("\n### Related facts:") for i, fact in enumerate(self.facts, 1): text_parts.append(f"{i}. {fact}") - + return "\n".join(text_parts) @dataclass class NodeInfo: - """节点信息""" + """Node info""" uuid: str name: str labels: List[str] summary: str attributes: Dict[str, Any] - + def to_dict(self) -> Dict[str, Any]: return { "uuid": self.uuid, @@ -71,16 +71,16 @@ class NodeInfo: "summary": self.summary, "attributes": self.attributes } - + def to_text(self) -> str: - """转换为文本格式""" - entity_type = next((l for l in self.labels if l not in ["Entity", "Node"]), "未知类型") - return f"实体: {self.name} (类型: {entity_type})\n摘要: {self.summary}" + """Convert to text format""" + entity_type = next((l for l in self.labels if l not in ["Entity", "Node"]), "Unknown type") + return f"Entity: {self.name} (type: {entity_type})\nSummary: {self.summary}" @dataclass class EdgeInfo: - """边信息""" + """Edge info""" uuid: str name: str fact: str @@ -88,12 +88,12 @@ class EdgeInfo: target_node_uuid: str source_node_name: Optional[str] = None target_node_name: Optional[str] = None - # 时间信息 + # Temporal info created_at: Optional[str] = None valid_at: Optional[str] = None invalid_at: Optional[str] = None expired_at: 
Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: return { "uuid": self.uuid, @@ -108,53 +108,53 @@ class EdgeInfo: "invalid_at": self.invalid_at, "expired_at": self.expired_at } - + def to_text(self, include_temporal: bool = False) -> str: - """转换为文本格式""" + """Convert to text format""" source = self.source_node_name or self.source_node_uuid[:8] target = self.target_node_name or self.target_node_uuid[:8] - base_text = f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}" - + base_text = f"Relation: {source} --[{self.name}]--> {target}\nFact: {self.fact}" + if include_temporal: - valid_at = self.valid_at or "未知" - invalid_at = self.invalid_at or "至今" - base_text += f"\n时效: {valid_at} - {invalid_at}" + valid_at = self.valid_at or "unknown" + invalid_at = self.invalid_at or "present" + base_text += f"\nValidity: {valid_at} - {invalid_at}" if self.expired_at: - base_text += f" (已过期: {self.expired_at})" - + base_text += f" (expired: {self.expired_at})" + return base_text - + @property def is_expired(self) -> bool: - """是否已过期""" + """Whether the edge has expired""" return self.expired_at is not None - + @property def is_invalid(self) -> bool: - """是否已失效""" + """Whether the edge has been invalidated""" return self.invalid_at is not None @dataclass class InsightForgeResult: """ - 深度洞察检索结果 (InsightForge) - 包含多个子问题的检索结果,以及综合分析 + Deep insight retrieval result (InsightForge) + Contains retrieval results for multiple sub-queries plus a comprehensive analysis """ query: str simulation_requirement: str sub_queries: List[str] - - # 各维度检索结果 - semantic_facts: List[str] = field(default_factory=list) # 语义搜索结果 - entity_insights: List[Dict[str, Any]] = field(default_factory=list) # 实体洞察 - relationship_chains: List[str] = field(default_factory=list) # 关系链 - - # 统计信息 + + # Per-dimension retrieval results + semantic_facts: List[str] = field(default_factory=list) # Semantic search results + entity_insights: List[Dict[str, Any]] = field(default_factory=list) # Entity 
insights + relationship_chains: List[str] = field(default_factory=list) # Relationship chains + + # Statistics total_facts: int = 0 total_entities: int = 0 total_relationships: int = 0 - + def to_dict(self) -> Dict[str, Any]: return { "query": self.query, @@ -167,73 +167,73 @@ class InsightForgeResult: "total_entities": self.total_entities, "total_relationships": self.total_relationships } - + def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解""" + """Convert to detailed text format for LLM consumption""" text_parts = [ - f"## 未来预测深度分析", - f"分析问题: {self.query}", - f"预测场景: {self.simulation_requirement}", - f"\n### 预测数据统计", - f"- 相关预测事实: {self.total_facts}条", - f"- 涉及实体: {self.total_entities}个", - f"- 关系链: {self.total_relationships}条" + f"## Deep Analysis", + f"Analysis query: {self.query}", + f"Simulation scenario: {self.simulation_requirement}", + f"\n### Data statistics", + f"- Related facts: {self.total_facts}", + f"- Entities involved: {self.total_entities}", + f"- Relationship chains: {self.total_relationships}" ] - - # 子问题 + + # Sub-queries if self.sub_queries: - text_parts.append(f"\n### 分析的子问题") + text_parts.append(f"\n### Sub-queries analyzed") for i, sq in enumerate(self.sub_queries, 1): text_parts.append(f"{i}. {sq}") - - # 语义搜索结果 + + # Semantic search results if self.semantic_facts: - text_parts.append(f"\n### 【关键事实】(请在报告中引用这些原文)") + text_parts.append(f"\n### [Key facts] (please quote these verbatim in the report)") for i, fact in enumerate(self.semantic_facts, 1): text_parts.append(f"{i}. 
\"{fact}\"") - - # 实体洞察 + + # Entity insights if self.entity_insights: - text_parts.append(f"\n### 【核心实体】") + text_parts.append(f"\n### [Core entities]") for entity in self.entity_insights: - text_parts.append(f"- **{entity.get('name', '未知')}** ({entity.get('type', '实体')})") + text_parts.append(f"- **{entity.get('name', 'Unknown')}** ({entity.get('type', 'Entity')})") if entity.get('summary'): - text_parts.append(f" 摘要: \"{entity.get('summary')}\"") + text_parts.append(f" Summary: \"{entity.get('summary')}\"") if entity.get('related_facts'): - text_parts.append(f" 相关事实: {len(entity.get('related_facts', []))}条") - - # 关系链 + text_parts.append(f" Related facts: {len(entity.get('related_facts', []))}") + + # Relationship chains if self.relationship_chains: - text_parts.append(f"\n### 【关系链】") + text_parts.append(f"\n### [Relationship chains]") for chain in self.relationship_chains: text_parts.append(f"- {chain}") - + return "\n".join(text_parts) @dataclass class PanoramaResult: """ - 广度搜索结果 (Panorama) - 包含所有相关信息,包括过期内容 + Breadth search result (Panorama) + Contains all related information, including expired content """ query: str - - # 全部节点 + + # All nodes all_nodes: List[NodeInfo] = field(default_factory=list) - # 全部边(包括过期的) + # All edges (including expired ones) all_edges: List[EdgeInfo] = field(default_factory=list) - # 当前有效的事实 + # Currently active facts active_facts: List[str] = field(default_factory=list) - # 已过期/失效的事实(历史记录) + # Expired/invalidated facts (historical records) historical_facts: List[str] = field(default_factory=list) - - # 统计 + + # Statistics total_nodes: int = 0 total_edges: int = 0 active_count: int = 0 historical_count: int = 0 - + def to_dict(self) -> Dict[str, Any]: return { "query": self.query, @@ -246,50 +246,50 @@ class PanoramaResult: "active_count": self.active_count, "historical_count": self.historical_count } - + def to_text(self) -> str: - """转换为文本格式(完整版本,不截断)""" + """Convert to text format (complete, no truncation)""" text_parts = [ - 
f"## 广度搜索结果(未来全景视图)", - f"查询: {self.query}", - f"\n### 统计信息", - f"- 总节点数: {self.total_nodes}", - f"- 总边数: {self.total_edges}", - f"- 当前有效事实: {self.active_count}条", - f"- 历史/过期事实: {self.historical_count}条" + f"## Breadth search result (panoramic view)", + f"Query: {self.query}", + f"\n### Statistics", + f"- Total nodes: {self.total_nodes}", + f"- Total edges: {self.total_edges}", + f"- Currently active facts: {self.active_count}", + f"- Historical/expired facts: {self.historical_count}" ] - - # 当前有效的事实(完整输出,不截断) + + # Currently active facts (complete output, no truncation) if self.active_facts: - text_parts.append(f"\n### 【当前有效事实】(模拟结果原文)") + text_parts.append(f"\n### [Active facts] (simulation result verbatim)") for i, fact in enumerate(self.active_facts, 1): text_parts.append(f"{i}. \"{fact}\"") - - # 历史/过期事实(完整输出,不截断) + + # Historical/expired facts (complete output, no truncation) if self.historical_facts: - text_parts.append(f"\n### 【历史/过期事实】(演变过程记录)") + text_parts.append(f"\n### [Historical/expired facts] (evolution record)") for i, fact in enumerate(self.historical_facts, 1): text_parts.append(f"{i}. \"{fact}\"") - - # 关键实体(完整输出,不截断) + + # Key entities (complete output, no truncation) if self.all_nodes: - text_parts.append(f"\n### 【涉及实体】") + text_parts.append(f"\n### [Entities involved]") for node in self.all_nodes: - entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") + entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "Entity") text_parts.append(f"- **{node.name}** ({entity_type})") - + return "\n".join(text_parts) @dataclass class AgentInterview: - """单个Agent的采访结果""" + """Interview result for a single agent""" agent_name: str - agent_role: str # 角色类型(如:学生、教师、媒体等) - agent_bio: str # 简介 - question: str # 采访问题 - response: str # 采访回答 - key_quotes: List[str] = field(default_factory=list) # 关键引言 + agent_role: str # Role type (e.g. student, teacher, media, etc.) 
+ agent_bio: str # Bio + question: str # Interview question + response: str # Interview answer + key_quotes: List[str] = field(default_factory=list) # Key quotes def to_dict(self) -> Dict[str, Any]: return { @@ -303,31 +303,31 @@ class AgentInterview: def to_text(self) -> str: text = f"**{self.agent_name}** ({self.agent_role})\n" - # 显示完整的agent_bio,不截断 - text += f"_简介: {self.agent_bio}_\n\n" + # Display complete agent_bio without truncation + text += f"_Bio: {self.agent_bio}_\n\n" text += f"**Q:** {self.question}\n\n" text += f"**A:** {self.response}\n" if self.key_quotes: - text += "\n**关键引言:**\n" + text += "\n**Key quotes:**\n" for quote in self.key_quotes: - # 清理各种引号 + # Strip various quote characters clean_quote = quote.replace('\u201c', '').replace('\u201d', '').replace('"', '') clean_quote = clean_quote.replace('\u300c', '').replace('\u300d', '') clean_quote = clean_quote.strip() - # 去掉开头的标点 + # Remove leading punctuation while clean_quote and clean_quote[0] in ',,;;::、。!?\n\r\t ': clean_quote = clean_quote[1:] - # 过滤包含问题编号的垃圾内容(问题1-9) + # Filter out noise containing question numbers (e.g. 
"Question 1") skip = False for d in '123456789': - if f'\u95ee\u9898{d}' in clean_quote: + if f'Question {d}' in clean_quote or f'question {d}' in clean_quote: skip = True break if skip: continue - # 截断过长内容(按句号截断,而非硬截断) + # Truncate overly long content (break at sentence boundary, not hard cut) if len(clean_quote) > 150: - dot_pos = clean_quote.find('\u3002', 80) + dot_pos = clean_quote.find('。', 80) if dot_pos > 0: clean_quote = clean_quote[:dot_pos + 1] else: @@ -340,23 +340,23 @@ class AgentInterview: @dataclass class InterviewResult: """ - 采访结果 (Interview) - 包含多个模拟Agent的采访回答 + Interview result + Contains interview responses from multiple simulated Agents """ - interview_topic: str # 采访主题 - interview_questions: List[str] # 采访问题列表 + interview_topic: str # Interview topic + interview_questions: List[str] # Interview question list - # 采访选择的Agent + # Agents selected for interview selected_agents: List[Dict[str, Any]] = field(default_factory=list) - # 各Agent的采访回答 + # Each agent's interview response interviews: List[AgentInterview] = field(default_factory=list) - # 选择Agent的理由 + # Reason for agent selection selection_reasoning: str = "" - # 整合后的采访摘要 + # Consolidated interview summary summary: str = "" - # 统计 + # Statistics total_agents: int = 0 interviewed_count: int = 0 @@ -373,74 +373,74 @@ class InterviewResult: } def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解和报告引用""" + """Convert to detailed text format for LLM consumption and report citation""" text_parts = [ - "## 深度采访报告", - f"**采访主题:** {self.interview_topic}", - f"**采访人数:** {self.interviewed_count} / {self.total_agents} 位模拟Agent", - "\n### 采访对象选择理由", - self.selection_reasoning or "(自动选择)", + "## In-depth Interview Report", + f"**Interview topic:** {self.interview_topic}", + f"**Interviewees:** {self.interviewed_count} / {self.total_agents} simulated agents", + "\n### Agent selection rationale", + self.selection_reasoning or "(auto-selected)", "\n---", - "\n### 采访实录", + "\n### Interview transcripts", ] if 
self.interviews: for i, interview in enumerate(self.interviews, 1): - text_parts.append(f"\n#### 采访 #{i}: {interview.agent_name}") + text_parts.append(f"\n#### Interview #{i}: {interview.agent_name}") text_parts.append(interview.to_text()) text_parts.append("\n---") else: - text_parts.append("(无采访记录)\n\n---") + text_parts.append("(No interview records)\n\n---") - text_parts.append("\n### 采访摘要与核心观点") - text_parts.append(self.summary or "(无摘要)") + text_parts.append("\n### Interview summary and key viewpoints") + text_parts.append(self.summary or "(No summary)") return "\n".join(text_parts) class ZepToolsService: """ - Zep检索工具服务 - - 【核心检索工具 - 优化后】 - 1. insight_forge - 深度洞察检索(最强大,自动生成子问题,多维度检索) - 2. panorama_search - 广度搜索(获取全貌,包括过期内容) - 3. quick_search - 简单搜索(快速检索) - 4. interview_agents - 深度采访(采访模拟Agent,获取多视角观点) - - 【基础工具】 - - search_graph - 图谱语义搜索 - - get_all_nodes - 获取图谱所有节点 - - get_all_edges - 获取图谱所有边(含时间信息) - - get_node_detail - 获取节点详细信息 - - get_node_edges - 获取节点相关的边 - - get_entities_by_type - 按类型获取实体 - - get_entity_summary - 获取实体的关系摘要 + Zep retrieval tools service + + [Core retrieval tools - optimized] + 1. insight_forge - deep insight retrieval (most powerful; auto-generates sub-queries, multi-dimensional retrieval) + 2. panorama_search - breadth search (full picture, including expired content) + 3. quick_search - simple search (fast retrieval) + 4. 
interview_agents - in-depth interview (interviews simulated agents, gathers multi-perspective viewpoints) + + [Basic tools] + - search_graph - graph semantic search + - get_all_nodes - get all nodes in the graph + - get_all_edges - get all edges in the graph (with temporal info) + - get_node_detail - get node details + - get_node_edges - get edges related to a node + - get_entities_by_type - get entities by type + - get_entity_summary - get entity relationship summary """ - # 重试配置 + # Retry configuration MAX_RETRIES = 3 RETRY_DELAY = 2.0 def __init__(self, api_key: Optional[str] = None, llm_client: Optional[LLMClient] = None): self.api_key = api_key or Config.ZEP_API_KEY if not self.api_key: - raise ValueError("ZEP_API_KEY 未配置") + raise ValueError("ZEP_API_KEY is not configured") self.client = Zep(api_key=self.api_key) - # LLM客户端用于InsightForge生成子问题 + # LLM client used by InsightForge to generate sub-queries self._llm_client = llm_client logger.info(t("console.zepToolsInitialized")) @property def llm(self) -> LLMClient: - """延迟初始化LLM客户端""" + """Lazily initialize the LLM client""" if self._llm_client is None: self._llm_client = LLMClient() return self._llm_client def _call_with_retry(self, func, operation_name: str, max_retries: int = None): - """带重试机制的API调用""" + """API call with retry mechanism""" max_retries = max_retries or self.MAX_RETRIES last_exception = None delay = self.RETRY_DELAY @@ -462,30 +462,30 @@ class ZepToolsService: raise last_exception def search_graph( - self, - graph_id: str, - query: str, + self, + graph_id: str, + query: str, limit: int = 10, scope: str = "edges" ) -> SearchResult: """ - 图谱语义搜索 - - 使用混合搜索(语义+BM25)在图谱中搜索相关信息。 - 如果Zep Cloud的search API不可用,则降级为本地关键词匹配。 - + Graph semantic search + + Uses hybrid search (semantic + BM25) to find relevant information in the graph. + Falls back to local keyword matching if the Zep Cloud search API is unavailable. 
+ Args: - graph_id: 图谱ID (Standalone Graph) - query: 搜索查询 - limit: 返回结果数量 - scope: 搜索范围,"edges" 或 "nodes" - + graph_id: graph ID (Standalone Graph) + query: search query + limit: number of results to return + scope: search scope, "edges" or "nodes" + Returns: - SearchResult: 搜索结果 + SearchResult: search result """ logger.info(t("console.graphSearch", graphId=graph_id, query=query[:50])) - # 尝试使用Zep Cloud Search API + # Try using the Zep Cloud Search API try: search_results = self._call_with_retry( func=lambda: self.client.graph.search( @@ -502,7 +502,7 @@ class ZepToolsService: edges = [] nodes = [] - # 解析边搜索结果 + # Parse edge search results if hasattr(search_results, 'edges') and search_results.edges: for edge in search_results.edges: if hasattr(edge, 'fact') and edge.fact: @@ -515,7 +515,7 @@ class ZepToolsService: "target_node_uuid": getattr(edge, 'target_node_uuid', ''), }) - # 解析节点搜索结果 + # Parse node search results if hasattr(search_results, 'nodes') and search_results.nodes: for node in search_results.nodes: nodes.append({ @@ -524,7 +524,7 @@ class ZepToolsService: "labels": getattr(node, 'labels', []), "summary": getattr(node, 'summary', ''), }) - # 节点摘要也算作事实 + # Node summaries count as facts too if hasattr(node, 'summary') and node.summary: facts.append(f"[{node.name}]: {node.summary}") @@ -540,29 +540,29 @@ class ZepToolsService: except Exception as e: logger.warning(t("console.zepSearchApiFallback", error=str(e))) - # 降级:使用本地关键词匹配搜索 + # Fallback: use local keyword matching search return self._local_search(graph_id, query, limit, scope) def _local_search( - self, - graph_id: str, - query: str, + self, + graph_id: str, + query: str, limit: int = 10, scope: str = "edges" ) -> SearchResult: """ - 本地关键词匹配搜索(作为Zep Search API的降级方案) - - 获取所有边/节点,然后在本地进行关键词匹配 - + Local keyword matching search (fallback for the Zep Search API) + + Fetches all edges/nodes and performs keyword matching locally. 
+ Args: - graph_id: 图谱ID - query: 搜索查询 - limit: 返回结果数量 - scope: 搜索范围 - + graph_id: graph ID + query: search query + limit: number of results to return + scope: search scope + Returns: - SearchResult: 搜索结果 + SearchResult: search result """ logger.info(t("console.usingLocalSearch", query=query[:30])) @@ -570,19 +570,19 @@ class ZepToolsService: edges_result = [] nodes_result = [] - # 提取查询关键词(简单分词) + # Extract query keywords (simple tokenization) query_lower = query.lower() keywords = [w.strip() for w in query_lower.replace(',', ' ').replace(',', ' ').split() if len(w.strip()) > 1] def match_score(text: str) -> int: - """计算文本与查询的匹配分数""" + """Calculate the match score between text and the query""" if not text: return 0 text_lower = text.lower() - # 完全匹配查询 + # Exact query match if query_lower in text_lower: return 100 - # 关键词匹配 + # Keyword matching score = 0 for keyword in keywords: if keyword in text_lower: @@ -591,7 +591,7 @@ class ZepToolsService: try: if scope in ["edges", "both"]: - # 获取所有边并匹配 + # Get all edges and match all_edges = self.get_all_edges(graph_id) scored_edges = [] for edge in all_edges: @@ -599,7 +599,7 @@ class ZepToolsService: if score > 0: scored_edges.append((score, edge)) - # 按分数排序 + # Sort by score scored_edges.sort(key=lambda x: x[0], reverse=True) for score, edge in scored_edges[:limit]: @@ -614,7 +614,7 @@ class ZepToolsService: }) if scope in ["nodes", "both"]: - # 获取所有节点并匹配 + # Get all nodes and match all_nodes = self.get_all_nodes(graph_id) scored_nodes = [] for node in all_nodes: @@ -649,13 +649,13 @@ class ZepToolsService: def get_all_nodes(self, graph_id: str) -> List[NodeInfo]: """ - 获取图谱的所有节点(分页获取) + Get all nodes in the graph (fetched with pagination) Args: - graph_id: 图谱ID + graph_id: graph ID Returns: - 节点列表 + List of nodes """ logger.info(t("console.fetchingAllNodes", graphId=graph_id)) @@ -677,14 +677,14 @@ class ZepToolsService: def get_all_edges(self, graph_id: str, include_temporal: bool = True) -> List[EdgeInfo]: """ - 
获取图谱的所有边(分页获取,包含时间信息) + Get all edges in the graph (fetched with pagination, including temporal info) Args: - graph_id: 图谱ID - include_temporal: 是否包含时间信息(默认True) + graph_id: graph ID + include_temporal: whether to include temporal info (default True) Returns: - 边列表(包含created_at, valid_at, invalid_at, expired_at) + List of edges (including created_at, valid_at, invalid_at, expired_at) """ logger.info(t("console.fetchingAllEdges", graphId=graph_id)) @@ -701,7 +701,7 @@ class ZepToolsService: target_node_uuid=edge.target_node_uuid or "" ) - # 添加时间信息 + # Add temporal info if include_temporal: edge_info.created_at = getattr(edge, 'created_at', None) edge_info.valid_at = getattr(edge, 'valid_at', None) @@ -715,13 +715,13 @@ class ZepToolsService: def get_node_detail(self, node_uuid: str) -> Optional[NodeInfo]: """ - 获取单个节点的详细信息 - + Get detailed information for a single node + Args: - node_uuid: 节点UUID - + node_uuid: node UUID + Returns: - 节点信息或None + Node info or None """ logger.info(t("console.fetchingNodeDetail", uuid=node_uuid[:8])) @@ -747,26 +747,26 @@ class ZepToolsService: def get_node_edges(self, graph_id: str, node_uuid: str) -> List[EdgeInfo]: """ - 获取节点相关的所有边 - - 通过获取图谱所有边,然后过滤出与指定节点相关的边 - + Get all edges related to a node + + Fetches all graph edges and filters to those connected to the specified node. 
+ Args: - graph_id: 图谱ID - node_uuid: 节点UUID - + graph_id: graph ID + node_uuid: node UUID + Returns: - 边列表 + List of edges """ logger.info(t("console.fetchingNodeEdges", uuid=node_uuid[:8])) try: - # 获取图谱所有边,然后过滤 + # Get all graph edges, then filter all_edges = self.get_all_edges(graph_id) - + result = [] for edge in all_edges: - # 检查边是否与指定节点相关(作为源或目标) + # Check whether the edge is related to the specified node (as source or target) if edge.source_node_uuid == node_uuid or edge.target_node_uuid == node_uuid: result.append(edge) @@ -778,19 +778,19 @@ class ZepToolsService: return [] def get_entities_by_type( - self, - graph_id: str, + self, + graph_id: str, entity_type: str ) -> List[NodeInfo]: """ - 按类型获取实体 - + Get entities by type + Args: - graph_id: 图谱ID - entity_type: 实体类型(如 Student, PublicFigure 等) - + graph_id: graph ID + entity_type: entity type (e.g. Student, PublicFigure, etc.) + Returns: - 符合类型的实体列表 + List of entities matching the type """ logger.info(t("console.fetchingEntitiesByType", type=entity_type)) @@ -798,7 +798,7 @@ class ZepToolsService: filtered = [] for node in all_nodes: - # 检查labels是否包含指定类型 + # Check whether labels contain the specified type if entity_type in node.labels: filtered.append(node) @@ -806,32 +806,32 @@ class ZepToolsService: return filtered def get_entity_summary( - self, - graph_id: str, + self, + graph_id: str, entity_name: str ) -> Dict[str, Any]: """ - 获取指定实体的关系摘要 - - 搜索与该实体相关的所有信息,并生成摘要 - + Get relationship summary for a specified entity + + Searches all information related to the entity and generates a summary. 
+ Args: - graph_id: 图谱ID - entity_name: 实体名称 - + graph_id: graph ID + entity_name: entity name + Returns: - 实体摘要信息 + Entity summary info """ logger.info(t("console.fetchingEntitySummary", name=entity_name)) - # 先搜索该实体相关的信息 + # First search for information related to this entity search_result = self.search_graph( graph_id=graph_id, query=entity_name, limit=20 ) - # 尝试在所有节点中找到该实体 + # Try to find the entity among all nodes all_nodes = self.get_all_nodes(graph_id) entity_node = None for node in all_nodes: @@ -841,7 +841,7 @@ class ZepToolsService: related_edges = [] if entity_node: - # 传入graph_id参数 + # Pass graph_id parameter related_edges = self.get_node_edges(graph_id, entity_node.uuid) return { @@ -854,27 +854,27 @@ class ZepToolsService: def get_graph_statistics(self, graph_id: str) -> Dict[str, Any]: """ - 获取图谱的统计信息 - + Get graph statistics + Args: - graph_id: 图谱ID - + graph_id: graph ID + Returns: - 统计信息 + Statistics info """ logger.info(t("console.fetchingGraphStats", graphId=graph_id)) nodes = self.get_all_nodes(graph_id) edges = self.get_all_edges(graph_id) - # 统计实体类型分布 + # Count entity type distribution entity_types = {} for node in nodes: for label in node.labels: if label not in ["Entity", "Node"]: entity_types[label] = entity_types.get(label, 0) + 1 - - # 统计关系类型分布 + + # Count relationship type distribution relation_types = {} for edge in edges: relation_types[edge.name] = relation_types.get(edge.name, 0) + 1 @@ -888,40 +888,40 @@ class ZepToolsService: } def get_simulation_context( - self, + self, graph_id: str, simulation_requirement: str, limit: int = 30 ) -> Dict[str, Any]: """ - 获取模拟相关的上下文信息 - - 综合搜索与模拟需求相关的所有信息 - + Get simulation-related context information + + Comprehensively searches all information related to the simulation requirement. 
+ Args: - graph_id: 图谱ID - simulation_requirement: 模拟需求描述 - limit: 每类信息的数量限制 - + graph_id: graph ID + simulation_requirement: simulation requirement description + limit: count limit per category + Returns: - 模拟上下文信息 + Simulation context info """ logger.info(t("console.fetchingSimContext", requirement=simulation_requirement[:50])) - # 搜索与模拟需求相关的信息 + # Search for information related to the simulation requirement search_result = self.search_graph( graph_id=graph_id, query=simulation_requirement, limit=limit ) - # 获取图谱统计 + # Get graph statistics stats = self.get_graph_statistics(graph_id) - - # 获取所有实体节点 + + # Get all entity nodes all_nodes = self.get_all_nodes(graph_id) - - # 筛选有实际类型的实体(非纯Entity节点) + + # Filter entities with actual types (non-pure Entity nodes) entities = [] for node in all_nodes: custom_labels = [l for l in node.labels if l not in ["Entity", "Node"]] @@ -931,16 +931,16 @@ class ZepToolsService: "type": custom_labels[0], "summary": node.summary }) - + return { "simulation_requirement": simulation_requirement, "related_facts": search_result.facts, "graph_statistics": stats, - "entities": entities[:limit], # 限制数量 + "entities": entities[:limit], # Limit count "total_entities": len(entities) } - # ========== 核心检索工具(优化后) ========== + # ========== Core retrieval tools (optimized) ========== def insight_forge( self, @@ -951,24 +951,25 @@ class ZepToolsService: max_sub_queries: int = 5 ) -> InsightForgeResult: """ - 【InsightForge - 深度洞察检索】 - - 最强大的混合检索函数,自动分解问题并多维度检索: - 1. 使用LLM将问题分解为多个子问题 - 2. 对每个子问题进行语义搜索 - 3. 提取相关实体并获取其详细信息 - 4. 追踪关系链 - 5. 整合所有结果,生成深度洞察 - + [InsightForge - deep insight retrieval] + + The most powerful hybrid retrieval function; automatically decomposes a question and + searches multiple dimensions: + 1. Use LLM to decompose the question into multiple sub-queries + 2. Run semantic search for each sub-query + 3. Extract related entities and fetch their details + 4. Trace relationship chains + 5. 
Integrate all results to produce deep insights + Args: - graph_id: 图谱ID - query: 用户问题 - simulation_requirement: 模拟需求描述 - report_context: 报告上下文(可选,用于更精准的子问题生成) - max_sub_queries: 最大子问题数量 - + graph_id: graph ID + query: user question + simulation_requirement: simulation requirement description + report_context: report context (optional; helps generate more precise sub-queries) + max_sub_queries: maximum number of sub-queries + Returns: - InsightForgeResult: 深度洞察检索结果 + InsightForgeResult: deep insight retrieval result """ logger.info(t("console.insightForgeStart", query=query[:50])) @@ -978,7 +979,7 @@ class ZepToolsService: sub_queries=[] ) - # Step 1: 使用LLM生成子问题 + # Step 1: Use LLM to generate sub-queries sub_queries = self._generate_sub_queries( query=query, simulation_requirement=simulation_requirement, @@ -988,7 +989,7 @@ class ZepToolsService: result.sub_queries = sub_queries logger.info(t("console.generatedSubQueries", count=len(sub_queries))) - # Step 2: 对每个子问题进行语义搜索 + # Step 2: Run semantic search for each sub-query all_facts = [] all_edges = [] seen_facts = set() @@ -1008,7 +1009,7 @@ class ZepToolsService: all_edges.extend(search_result.edges) - # 对原始问题也进行搜索 + # Also search the original question main_search = self.search_graph( graph_id=graph_id, query=query, @@ -1023,7 +1024,7 @@ class ZepToolsService: result.semantic_facts = all_facts result.total_facts = len(all_facts) - # Step 3: 从边中提取相关实体UUID,只获取这些实体的信息(不获取全部节点) + # Step 3: Extract related entity UUIDs from edges; fetch only those entities (not all nodes) entity_uuids = set() for edge_data in all_edges: if isinstance(edge_data, dict): @@ -1034,21 +1035,21 @@ class ZepToolsService: if target_uuid: entity_uuids.add(target_uuid) - # 获取所有相关实体的详情(不限制数量,完整输出) + # Fetch details for all related entities (no count limit; full output) entity_insights = [] - node_map = {} # 用于后续关系链构建 + node_map = {} # For relationship chain construction - for uuid in list(entity_uuids): # 处理所有实体,不截断 + for uuid in 
list(entity_uuids): # Process all entities, no truncation if not uuid: continue try: - # 单独获取每个相关节点的信息 + # Fetch each related node individually node = self.get_node_detail(uuid) if node: node_map[uuid] = node - entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") + entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "Entity") - # 获取该实体相关的所有事实(不截断) + # Get all facts related to this entity (no truncation) related_facts = [ f for f in all_facts if node.name.lower() in f.lower() @@ -1059,18 +1060,18 @@ class ZepToolsService: "name": node.name, "type": entity_type, "summary": node.summary, - "related_facts": related_facts # 完整输出,不截断 + "related_facts": related_facts # Full output, no truncation }) except Exception as e: - logger.debug(f"获取节点 {uuid} 失败: {e}") + logger.debug(f"Failed to get node {uuid}: {e}") continue result.entity_insights = entity_insights result.total_entities = len(entity_insights) - # Step 4: 构建所有关系链(不限制数量) + # Step 4: Build all relationship chains (no count limit) relationship_chains = [] - for edge_data in all_edges: # 处理所有边,不截断 + for edge_data in all_edges: # Process all edges, no truncation if isinstance(edge_data, dict): source_uuid = edge_data.get('source_node_uuid', '') target_uuid = edge_data.get('target_node_uuid', '') @@ -1097,27 +1098,27 @@ class ZepToolsService: max_queries: int = 5 ) -> List[str]: """ - 使用LLM生成子问题 - - 将复杂问题分解为多个可以独立检索的子问题 + Use LLM to generate sub-queries + + Decomposes a complex question into multiple independently searchable sub-queries """ - system_prompt = """你是一个专业的问题分析专家。你的任务是将一个复杂问题分解为多个可以在模拟世界中独立观察的子问题。 + system_prompt = """You are a professional question analysis expert. Your task is to decompose a complex question into multiple sub-questions that can be independently observed in the simulated world. -要求: -1. 每个子问题应该足够具体,可以在模拟世界中找到相关的Agent行为或事件 -2. 子问题应该覆盖原问题的不同维度(如:谁、什么、为什么、怎么样、何时、何地) -3. 子问题应该与模拟场景相关 -4. 
返回JSON格式:{"sub_queries": ["子问题1", "子问题2", ...]}""" +Requirements: +1. Each sub-question should be specific enough to find relevant agent behaviors or events in the simulation +2. Sub-questions should cover different dimensions of the original question (e.g. who, what, why, how, when, where) +3. Sub-questions should be relevant to the simulation scenario +4. Return JSON format: {"sub_queries": ["sub-question 1", "sub-question 2", ...]}""" - user_prompt = f"""模拟需求背景: + user_prompt = f"""Simulation requirement background: {simulation_requirement} -{f"报告上下文:{report_context[:500]}" if report_context else ""} +{f"Report context: {report_context[:500]}" if report_context else ""} -请将以下问题分解为{max_queries}个子问题: +Please decompose the following question into {max_queries} sub-questions: {query} -返回JSON格式的子问题列表。""" +Return a JSON list of sub-questions.""" try: response = self.llm.chat_json( @@ -1129,17 +1130,17 @@ class ZepToolsService: ) sub_queries = response.get("sub_queries", []) - # 确保是字符串列表 + # Ensure it is a list of strings return [str(sq) for sq in sub_queries[:max_queries]] except Exception as e: logger.warning(t("console.generateSubQueriesFailed", error=str(e))) - # 降级:返回基于原问题的变体 + # Fallback: return variants of the original question return [ query, - f"{query} 的主要参与者", - f"{query} 的原因和影响", - f"{query} 的发展过程" + f"Key participants in: {query}", + f"Causes and effects of: {query}", + f"How {query} developed" ][:max_queries] def panorama_search( @@ -1150,40 +1151,41 @@ class ZepToolsService: limit: int = 50 ) -> PanoramaResult: """ - 【PanoramaSearch - 广度搜索】 - - 获取全貌视图,包括所有相关内容和历史/过期信息: - 1. 获取所有相关节点 - 2. 获取所有边(包括已过期/失效的) - 3. 分类整理当前有效和历史信息 - - 这个工具适用于需要了解事件全貌、追踪演变过程的场景。 - + [PanoramaSearch - breadth search] + + Gets a full-picture view, including all related content and historical/expired info: + 1. Get all related nodes + 2. Get all edges (including expired/invalidated ones) + 3. 
Classify and organize active and historical info + + This tool is suitable for scenarios that require understanding the full picture of an event + or tracing its evolution. + Args: - graph_id: 图谱ID - query: 搜索查询(用于相关性排序) - include_expired: 是否包含过期内容(默认True) - limit: 返回结果数量限制 - + graph_id: graph ID + query: search query (used for relevance ranking) + include_expired: whether to include expired content (default True) + limit: result count limit + Returns: - PanoramaResult: 广度搜索结果 + PanoramaResult: breadth search result """ logger.info(t("console.panoramaSearchStart", query=query[:50])) result = PanoramaResult(query=query) - # 获取所有节点 + # Get all nodes all_nodes = self.get_all_nodes(graph_id) node_map = {n.uuid: n for n in all_nodes} result.all_nodes = all_nodes result.total_nodes = len(all_nodes) - # 获取所有边(包含时间信息) + # Get all edges (with temporal info) all_edges = self.get_all_edges(graph_id, include_temporal=True) result.all_edges = all_edges result.total_edges = len(all_edges) - # 分类事实 + # Classify facts active_facts = [] historical_facts = [] @@ -1191,24 +1193,24 @@ class ZepToolsService: if not edge.fact: continue - # 为事实添加实体名称 + # Add entity names to facts source_name = node_map.get(edge.source_node_uuid, NodeInfo('', '', [], '', {})).name or edge.source_node_uuid[:8] target_name = node_map.get(edge.target_node_uuid, NodeInfo('', '', [], '', {})).name or edge.target_node_uuid[:8] - # 判断是否过期/失效 + # Determine if expired/invalidated is_historical = edge.is_expired or edge.is_invalid if is_historical: - # 历史/过期事实,添加时间标记 - valid_at = edge.valid_at or "未知" - invalid_at = edge.invalid_at or edge.expired_at or "未知" + # Historical/expired fact; add time markers + valid_at = edge.valid_at or "unknown" + invalid_at = edge.invalid_at or edge.expired_at or "unknown" fact_with_time = f"[{valid_at} - {invalid_at}] {edge.fact}" historical_facts.append(fact_with_time) else: - # 当前有效事实 + # Currently active fact active_facts.append(edge.fact) - # 基于查询进行相关性排序 + # Sort by relevance to 
query query_lower = query.lower() keywords = [w.strip() for w in query_lower.replace(',', ' ').replace(',', ' ').split() if len(w.strip()) > 1] @@ -1222,7 +1224,7 @@ class ZepToolsService: score += 10 return score - # 排序并限制数量 + # Sort and limit count active_facts.sort(key=relevance_score, reverse=True) historical_facts.sort(key=relevance_score, reverse=True) @@ -1241,24 +1243,24 @@ class ZepToolsService: limit: int = 10 ) -> SearchResult: """ - 【QuickSearch - 简单搜索】 - - 快速、轻量级的检索工具: - 1. 直接调用Zep语义搜索 - 2. 返回最相关的结果 - 3. 适用于简单、直接的检索需求 - + [QuickSearch - simple search] + + Fast, lightweight retrieval tool: + 1. Calls Zep semantic search directly + 2. Returns the most relevant results + 3. Suitable for simple, direct retrieval needs + Args: - graph_id: 图谱ID - query: 搜索查询 - limit: 返回结果数量 - + graph_id: graph ID + query: search query + limit: result count + Returns: - SearchResult: 搜索结果 + SearchResult: search result """ logger.info(t("console.quickSearchStart", query=query[:50])) - # 直接调用现有的search_graph方法 + # Call the existing search_graph method directly result = self.search_graph( graph_id=graph_id, query=query, @@ -1278,31 +1280,32 @@ class ZepToolsService: custom_questions: List[str] = None ) -> InterviewResult: """ - 【InterviewAgents - 深度采访】 - - 调用真实的OASIS采访API,采访模拟中正在运行的Agent: - 1. 自动读取人设文件,了解所有模拟Agent - 2. 使用LLM分析采访需求,智能选择最相关的Agent - 3. 使用LLM生成采访问题 - 4. 调用 /api/simulation/interview/batch 接口进行真实采访(双平台同时采访) - 5. 整合所有采访结果,生成采访报告 - - 【重要】此功能需要模拟环境处于运行状态(OASIS环境未关闭) - - 【使用场景】 - - 需要从不同角色视角了解事件看法 - - 需要收集多方意见和观点 - - 需要获取模拟Agent的真实回答(非LLM模拟) - + [InterviewAgents - in-depth interview] + + Calls the real OASIS interview API to interview agents currently running in the simulation: + 1. Automatically reads persona files to learn about all simulated agents + 2. Uses LLM to analyze the interview requirement and intelligently select the most relevant agents + 3. Uses LLM to generate interview questions + 4. 
Calls /api/simulation/interview/batch for real interviews (both platforms simultaneously) + 5. Integrates all interview results to produce an interview report + + [Important] This feature requires the simulation environment to be running + (OASIS environment must not have been closed). + + [Use cases] + - Need to understand event opinions from different role perspectives + - Need to collect opinions and views from multiple parties + - Need real answers from simulated agents (not LLM-simulated) + Args: - simulation_id: 模拟ID(用于定位人设文件和调用采访API) - interview_requirement: 采访需求描述(非结构化,如"了解学生对事件的看法") - simulation_requirement: 模拟需求背景(可选) - max_agents: 最多采访的Agent数量 - custom_questions: 自定义采访问题(可选,若不提供则自动生成) - + simulation_id: simulation ID (used to locate persona files and call interview API) + interview_requirement: interview requirement description (unstructured, e.g. "understand students' views on the event") + simulation_requirement: simulation requirement background (optional) + max_agents: maximum number of agents to interview + custom_questions: custom interview questions (optional; auto-generated if not provided) + Returns: - InterviewResult: 采访结果 + InterviewResult: interview result """ from .simulation_runner import SimulationRunner @@ -1313,18 +1316,18 @@ class ZepToolsService: interview_questions=custom_questions or [] ) - # Step 1: 读取人设文件 + # Step 1: Read persona files profiles = self._load_agent_profiles(simulation_id) if not profiles: logger.warning(t("console.profilesNotFound", simId=simulation_id)) - result.summary = "未找到可采访的Agent人设文件" + result.summary = "No agent persona files found for interview" return result result.total_agents = len(profiles) logger.info(t("console.loadedProfiles", count=len(profiles))) - # Step 2: 使用LLM选择要采访的Agent(返回agent_id列表) + # Step 2: Use LLM to select agents to interview (returns agent_id list) selected_agents, selected_indices, selection_reasoning = self._select_agents_for_interview( profiles=profiles, 
interview_requirement=interview_requirement, @@ -1336,7 +1339,7 @@ class ZepToolsService: result.selection_reasoning = selection_reasoning logger.info(t("console.selectedAgentsForInterview", count=len(selected_agents), indices=selected_indices)) - # Step 3: 生成采访问题(如果没有提供) + # Step 3: Generate interview questions (if not provided) if not result.interview_questions: result.interview_questions = self._generate_interview_questions( interview_requirement=interview_requirement, @@ -1345,103 +1348,103 @@ class ZepToolsService: ) logger.info(t("console.generatedInterviewQuestions", count=len(result.interview_questions))) - # 将问题合并为一个采访prompt + # Merge questions into a single interview prompt combined_prompt = "\n".join([f"{i+1}. {q}" for i, q in enumerate(result.interview_questions)]) - # 添加优化前缀,约束Agent回复格式 + # Add optimization prefix to constrain agent reply format INTERVIEW_PROMPT_PREFIX = ( - "你正在接受一次采访。请结合你的人设、所有的过往记忆与行动," - "以纯文本方式直接回答以下问题。\n" - "回复要求:\n" - "1. 直接用自然语言回答,不要调用任何工具\n" - "2. 不要返回JSON格式或工具调用格式\n" - "3. 不要使用Markdown标题(如#、##、###)\n" - "4. 按问题编号逐一回答,每个回答以「问题X:」开头(X为问题编号)\n" - "5. 每个问题的回答之间用空行分隔\n" - "6. 回答要有实质内容,每个问题至少回答2-3句话\n\n" + "You are being interviewed. Based on your persona, all past memories and actions, " + "answer the following questions directly in plain text.\n" + "Reply requirements:\n" + "1. Answer directly in natural language; do not call any tools\n" + "2. Do not return JSON format or tool call format\n" + "3. Do not use Markdown headings (e.g. #, ##, ###)\n" + "4. Answer each question by number, starting each answer with 'Question X:' (X = question number)\n" + "5. Separate answers for different questions with a blank line\n" + "6. 
Give substantive answers; each question should be answered in at least 2-3 sentences\n\n" ) optimized_prompt = f"{INTERVIEW_PROMPT_PREFIX}{combined_prompt}" - # Step 4: 调用真实的采访API(不指定platform,默认双平台同时采访) + # Step 4: Call the real interview API (no platform specified; default dual-platform simultaneous) try: - # 构建批量采访列表(不指定platform,双平台采访) + # Build batch interview list (no platform specified; dual-platform interview) interviews_request = [] for agent_idx in selected_indices: interviews_request.append({ "agent_id": agent_idx, - "prompt": optimized_prompt # 使用优化后的prompt - # 不指定platform,API会在twitter和reddit两个平台都采访 + "prompt": optimized_prompt # Use optimized prompt + # No platform specified; API will interview on both twitter and reddit }) logger.info(t("console.callingBatchInterviewApi", count=len(interviews_request))) - # 调用 SimulationRunner 的批量采访方法(不传platform,双平台采访) + # Call SimulationRunner batch interview method (no platform; dual-platform) api_result = SimulationRunner.interview_agents_batch( simulation_id=simulation_id, interviews=interviews_request, - platform=None, # 不指定platform,双平台采访 - timeout=180.0 # 双平台需要更长超时 + platform=None, # No platform specified; dual-platform interview + timeout=180.0 # Dual-platform requires a longer timeout ) logger.info(t("console.interviewApiReturned", count=api_result.get('interviews_count', 0), success=api_result.get('success'))) - # 检查API调用是否成功 + # Check whether API call succeeded if not api_result.get("success", False): - error_msg = api_result.get("error", "未知错误") + error_msg = api_result.get("error", "Unknown error") logger.warning(t("console.interviewApiReturnedFailure", error=error_msg)) - result.summary = f"采访API调用失败:{error_msg}。请检查OASIS模拟环境状态。" + result.summary = f"Interview API call failed: {error_msg}. Please check OASIS simulation environment status." 
return result - # Step 5: 解析API返回结果,构建AgentInterview对象 - # 双平台模式返回格式: {"twitter_0": {...}, "reddit_0": {...}, "twitter_1": {...}, ...} + # Step 5: Parse API result and build AgentInterview objects + # Dual-platform result format: {"twitter_0": {...}, "reddit_0": {...}, "twitter_1": {...}, ...} api_data = api_result.get("result", {}) results_dict = api_data.get("results", {}) if isinstance(api_data, dict) else {} for i, agent_idx in enumerate(selected_indices): agent = selected_agents[i] agent_name = agent.get("realname", agent.get("username", f"Agent_{agent_idx}")) - agent_role = agent.get("profession", "未知") + agent_role = agent.get("profession", "Unknown") agent_bio = agent.get("bio", "") - # 获取该Agent在两个平台的采访结果 + # Get interview results for this agent on both platforms twitter_result = results_dict.get(f"twitter_{agent_idx}", {}) reddit_result = results_dict.get(f"reddit_{agent_idx}", {}) twitter_response = twitter_result.get("response", "") reddit_response = reddit_result.get("response", "") - # 清理可能的工具调用 JSON 包裹 + # Clean up potential tool call JSON wrapper twitter_response = self._clean_tool_call_response(twitter_response) reddit_response = self._clean_tool_call_response(reddit_response) - # 始终输出双平台标记 - twitter_text = twitter_response if twitter_response else "(该平台未获得回复)" - reddit_text = reddit_response if reddit_response else "(该平台未获得回复)" - response_text = f"【Twitter平台回答】\n{twitter_text}\n\n【Reddit平台回答】\n{reddit_text}" + # Always output dual-platform labels + twitter_text = twitter_response if twitter_response else "(No response from this platform)" + reddit_text = reddit_response if reddit_response else "(No response from this platform)" + response_text = f"[Twitter platform response]\n{twitter_text}\n\n[Reddit platform response]\n{reddit_text}" - # 提取关键引言(从两个平台的回答中) + # Extract key quotes (from both platform responses) import re combined_responses = f"{twitter_response} {reddit_response}" - # 清理响应文本:去掉标记、编号、Markdown 等干扰 + # Clean response text: remove 
labels, numbers, Markdown noise, etc. clean_text = re.sub(r'#{1,6}\s+', '', combined_responses) clean_text = re.sub(r'\{[^}]*tool_name[^}]*\}', '', clean_text) clean_text = re.sub(r'[*_`|>~\-]{2,}', '', clean_text) - clean_text = re.sub(r'问题\d+[::]\s*', '', clean_text) + clean_text = re.sub(r'Question\s*\d+[::]\s*', '', clean_text) clean_text = re.sub(r'【[^】]+】', '', clean_text) - # 策略1(主): 提取完整的有实质内容的句子 + # Strategy 1 (primary): extract complete meaningful sentences sentences = re.split(r'[。!?]', clean_text) meaningful = [ s.strip() for s in sentences if 20 <= len(s.strip()) <= 150 and not re.match(r'^[\s\W,,;;::、]+', s.strip()) - and not s.strip().startswith(('{', '问题')) + and not s.strip().startswith(('{', 'Question', 'question')) ] meaningful.sort(key=len, reverse=True) key_quotes = [s + "。" for s in meaningful[:3]] - # 策略2(补充): 正确配对的中文引号「」内长文本 + # Strategy 2 (supplementary): long text inside matched Chinese 「」 quotes if not key_quotes: paired = re.findall(r'\u201c([^\u201c\u201d]{15,100})\u201d', clean_text) paired += re.findall(r'\u300c([^\u300c\u300d]{15,100})\u300d', clean_text) @@ -1450,7 +1453,7 @@ class ZepToolsService: interview = AgentInterview( agent_name=agent_name, agent_role=agent_role, - agent_bio=agent_bio[:1000], # 扩大bio长度限制 + agent_bio=agent_bio[:1000], # Increase bio length limit question=combined_prompt, response=response_text, key_quotes=key_quotes[:5] @@ -1460,18 +1463,18 @@ class ZepToolsService: result.interviewed_count = len(result.interviews) except ValueError as e: - # 模拟环境未运行 + # Simulation environment not running logger.warning(t("console.interviewApiCallFailed", error=e)) - result.summary = f"采访失败:{str(e)}。模拟环境可能已关闭,请确保OASIS环境正在运行。" + result.summary = f"Interview failed: {str(e)}. The simulation environment may have been closed; ensure OASIS is running." 
return result except Exception as e: logger.error(t("console.interviewApiCallException", error=e)) import traceback logger.error(traceback.format_exc()) - result.summary = f"采访过程发生错误:{str(e)}" + result.summary = f"An error occurred during the interview: {str(e)}" return result - # Step 6: 生成采访摘要 + # Step 6: Generate interview summary if result.interviews: result.summary = self._generate_interview_summary( interviews=result.interviews, @@ -1483,7 +1486,7 @@ class ZepToolsService: @staticmethod def _clean_tool_call_response(response: str) -> str: - """清理 Agent 回复中的 JSON 工具调用包裹,提取实际内容""" + """Strip JSON tool-call wrapper from Agent replies and extract the actual content""" if not response or not response.strip().startswith('{'): return response text = response.strip() @@ -1503,11 +1506,11 @@ class ZepToolsService: return response def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]: - """加载模拟的Agent人设文件""" + """Load agent persona files for the simulation""" import os import csv - # 构建人设文件路径 + # Build persona file path sim_dir = os.path.join( os.path.dirname(__file__), f'../../uploads/simulations/{simulation_id}' @@ -1515,7 +1518,7 @@ class ZepToolsService: profiles = [] - # 优先尝试读取Reddit JSON格式 + # Prefer Reddit JSON format first reddit_profile_path = os.path.join(sim_dir, "reddit_profiles.json") if os.path.exists(reddit_profile_path): try: @@ -1526,20 +1529,20 @@ class ZepToolsService: except Exception as e: logger.warning(t("console.readRedditProfilesFailed", error=e)) - # 尝试读取Twitter CSV格式 + # Try Twitter CSV format twitter_profile_path = os.path.join(sim_dir, "twitter_profiles.csv") if os.path.exists(twitter_profile_path): try: with open(twitter_profile_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - # CSV格式转换为统一格式 + # Convert CSV format to unified format profiles.append({ "realname": row.get("name", ""), "username": row.get("username", ""), "bio": row.get("description", ""), "persona": row.get("user_char", 
""), - "profession": "未知" + "profession": "Unknown" }) logger.info(t("console.loadedTwitterProfiles", count=len(profiles))) return profiles @@ -1556,51 +1559,51 @@ class ZepToolsService: max_agents: int ) -> tuple: """ - 使用LLM选择要采访的Agent - + Use LLM to select agents for interview + Returns: tuple: (selected_agents, selected_indices, reasoning) - - selected_agents: 选中Agent的完整信息列表 - - selected_indices: 选中Agent的索引列表(用于API调用) - - reasoning: 选择理由 + - selected_agents: full info list of selected agents + - selected_indices: index list of selected agents (used for API calls) + - reasoning: selection rationale """ - # 构建Agent摘要列表 + # Build agent summary list agent_summaries = [] for i, profile in enumerate(profiles): summary = { "index": i, "name": profile.get("realname", profile.get("username", f"Agent_{i}")), - "profession": profile.get("profession", "未知"), + "profession": profile.get("profession", "Unknown"), "bio": profile.get("bio", "")[:200], "interested_topics": profile.get("interested_topics", []) } agent_summaries.append(summary) - system_prompt = """你是一个专业的采访策划专家。你的任务是根据采访需求,从模拟Agent列表中选择最适合采访的对象。 + system_prompt = """You are a professional interview planner. Your task is to select the most suitable agents to interview from the simulated agent list based on the interview requirement. -选择标准: -1. Agent的身份/职业与采访主题相关 -2. Agent可能持有独特或有价值的观点 -3. 选择多样化的视角(如:支持方、反对方、中立方、专业人士等) -4. 优先选择与事件直接相关的角色 +Selection criteria: +1. The agent's identity/profession is relevant to the interview topic +2. The agent may hold unique or valuable viewpoints +3. Select diverse perspectives (e.g. supporters, opponents, neutral parties, professionals, etc.) +4. 
Prioritize roles directly related to the event -返回JSON格式: +Return JSON format: { - "selected_indices": [选中Agent的索引列表], - "reasoning": "选择理由说明" + "selected_indices": [list of selected agent indices], + "reasoning": "selection rationale" }""" - user_prompt = f"""采访需求: + user_prompt = f"""Interview requirement: {interview_requirement} -模拟背景: -{simulation_requirement if simulation_requirement else "未提供"} +Simulation background: +{simulation_requirement if simulation_requirement else "Not provided"} -可选择的Agent列表(共{len(agent_summaries)}个): +Available agent list ({len(agent_summaries)} agents): {json.dumps(agent_summaries, ensure_ascii=False, indent=2)} -请选择最多{max_agents}个最适合采访的Agent,并说明选择理由。""" +Please select up to {max_agents} agents most suitable for the interview, and explain your selection rationale.""" try: response = self.llm.chat_json( @@ -1612,9 +1615,9 @@ class ZepToolsService: ) selected_indices = response.get("selected_indices", [])[:max_agents] - reasoning = response.get("reasoning", "基于相关性自动选择") + reasoning = response.get("reasoning", "Automatically selected based on relevance") - # 获取选中的Agent完整信息 + # Get full info for selected agents selected_agents = [] valid_indices = [] for idx in selected_indices: @@ -1626,10 +1629,10 @@ class ZepToolsService: except Exception as e: logger.warning(t("console.llmSelectAgentFailed", error=e)) - # 降级:选择前N个 + # Fallback: select the first N agents selected = profiles[:max_agents] indices = list(range(min(max_agents, len(profiles)))) - return selected, indices, "使用默认选择策略" + return selected, indices, "Using default selection strategy" def _generate_interview_questions( self, @@ -1637,29 +1640,29 @@ class ZepToolsService: simulation_requirement: str, selected_agents: List[Dict[str, Any]] ) -> List[str]: - """使用LLM生成采访问题""" - - agent_roles = [a.get("profession", "未知") for a in selected_agents] - - system_prompt = """你是一个专业的记者/采访者。根据采访需求,生成3-5个深度采访问题。 + """Use LLM to generate interview questions""" -问题要求: -1. 开放性问题,鼓励详细回答 -2. 
针对不同角色可能有不同答案 -3. 涵盖事实、观点、感受等多个维度 -4. 语言自然,像真实采访一样 -5. 每个问题控制在50字以内,简洁明了 -6. 直接提问,不要包含背景说明或前缀 + agent_roles = [a.get("profession", "Unknown") for a in selected_agents] -返回JSON格式:{"questions": ["问题1", "问题2", ...]}""" + system_prompt = """You are a professional journalist/interviewer. Generate 3-5 in-depth interview questions based on the interview requirement. - user_prompt = f"""采访需求:{interview_requirement} +Question requirements: +1. Open-ended questions that encourage detailed answers +2. Questions that may yield different answers from different roles +3. Cover multiple dimensions: facts, opinions, feelings, etc. +4. Natural language, like a real interview +5. Keep each question under 50 words; be concise and clear +6. Ask directly; do not include background explanations or prefixes -模拟背景:{simulation_requirement if simulation_requirement else "未提供"} +Return JSON format: {"questions": ["question 1", "question 2", ...]}""" -采访对象角色:{', '.join(agent_roles)} + user_prompt = f"""Interview requirement: {interview_requirement} -请生成3-5个采访问题。""" +Simulation background: {simulation_requirement if simulation_requirement else "Not provided"} + +Interviewee roles: {', '.join(agent_roles)} + +Please generate 3-5 interview questions.""" try: response = self.llm.chat_json( @@ -1670,14 +1673,14 @@ class ZepToolsService: temperature=0.5 ) - return response.get("questions", [f"关于{interview_requirement},您有什么看法?"]) + return response.get("questions", [f"What are your thoughts on {interview_requirement}?"]) except Exception as e: logger.warning(t("console.generateInterviewQuestionsFailed", error=e)) return [ - f"关于{interview_requirement},您的观点是什么?", - "这件事对您或您所代表的群体有什么影响?", - "您认为应该如何解决或改进这个问题?" + f"What is your view on {interview_requirement}?", + "What impact does this have on you or the group you represent?", + "How do you think this issue should be resolved or improved?" 
] def _generate_interview_summary( @@ -1685,39 +1688,39 @@ class ZepToolsService: interviews: List[AgentInterview], interview_requirement: str ) -> str: - """生成采访摘要""" + """Generate an interview summary""" if not interviews: - return "未完成任何采访" + return "No interviews completed" - # 收集所有采访内容 + # Collect all interview content interview_texts = [] for interview in interviews: - interview_texts.append(f"【{interview.agent_name}({interview.agent_role})】\n{interview.response[:500]}") - - quote_instruction = "引用受访者原话时使用中文引号「」" if get_locale() == 'zh' else 'Use quotation marks "" when quoting interviewees' - system_prompt = f"""你是一个专业的新闻编辑。请根据多位受访者的回答,生成一份采访摘要。 + interview_texts.append(f"[{interview.agent_name} ({interview.agent_role})]\n{interview.response[:500]}") -摘要要求: -1. 提炼各方主要观点 -2. 指出观点的共识和分歧 -3. 突出有价值的引言 -4. 客观中立,不偏袒任何一方 -5. 控制在1000字内 + quote_instruction = 'Use quotation marks "" when quoting interviewees' + system_prompt = f"""You are a professional news editor. Based on the responses of multiple interviewees, generate an interview summary. -格式约束(必须遵守): -- 使用纯文本段落,用空行分隔不同部分 -- 不要使用Markdown标题(如#、##、###) -- 不要使用分割线(如---、***) +Summary requirements: +1. Distill the main viewpoints of each party +2. Identify areas of consensus and disagreement +3. Highlight valuable quotes +4. Be objective and neutral, without favoring any side +5. Keep it under 1000 words + +Formatting constraints (must be followed): +- Use plain text paragraphs, separated by blank lines +- Do not use Markdown headings (e.g. #, ##, ###) +- Do not use dividers (e.g. 
---, ***) - {quote_instruction} -- 可以使用**加粗**标记关键词,但不要使用其他Markdown语法""" +- You may use **bold** to highlight keywords, but do not use other Markdown syntax""" - user_prompt = f"""采访主题:{interview_requirement} + user_prompt = f"""Interview topic: {interview_requirement} -采访内容: +Interview content: {"".join(interview_texts)} -请生成采访摘要。""" +Please generate an interview summary.""" try: summary = self.llm.chat( @@ -1732,5 +1735,5 @@ class ZepToolsService: except Exception as e: logger.warning(t("console.generateInterviewSummaryFailed", error=e)) - # 降级:简单拼接 - return f"共采访了{len(interviews)}位受访者,包括:" + "、".join([i.agent_name for i in interviews]) + # Fallback: simple concatenation + return f"Interviewed {len(interviews)} respondents, including: " + ", ".join([i.agent_name for i in interviews]) diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py index e70161ac..119fdafe 100644 --- a/backend/app/utils/__init__.py +++ b/backend/app/utils/__init__.py @@ -1,5 +1,5 @@ """ -工具模块 +Utilities module """ from .file_parser import FileParser diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py index 3f1d8ed2..6f08e95e 100644 --- a/backend/app/utils/file_parser.py +++ b/backend/app/utils/file_parser.py @@ -1,6 +1,6 @@ """ -文件解析工具 -支持PDF、Markdown、TXT文件的文本提取 +File parsing utilities +Supports text extraction from PDF, Markdown, and TXT files """ import os @@ -10,29 +10,29 @@ from typing import List, Optional def _read_text_with_fallback(file_path: str) -> str: """ - 读取文本文件,UTF-8失败时自动探测编码。 - - 采用多级回退策略: - 1. 首先尝试 UTF-8 解码 - 2. 使用 charset_normalizer 检测编码 - 3. 回退到 chardet 检测编码 - 4. 最终使用 UTF-8 + errors='replace' 兜底 - + Read a text file, automatically detecting encoding if UTF-8 fails. + + Uses a multi-level fallback strategy: + 1. First attempts UTF-8 decoding + 2. Uses charset_normalizer to detect encoding + 3. Falls back to chardet for encoding detection + 4. 
Final fallback: UTF-8 with errors='replace' + Args: - file_path: 文件路径 - + file_path: Path to the file + Returns: - 解码后的文本内容 + Decoded text content """ data = Path(file_path).read_bytes() - - # 首先尝试 UTF-8 + + # First attempt: UTF-8 try: return data.decode('utf-8') except UnicodeDecodeError: pass - - # 尝试使用 charset_normalizer 检测编码 + + # Attempt encoding detection with charset_normalizer encoding = None try: from charset_normalizer import from_bytes @@ -41,8 +41,8 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = best.encoding except Exception: pass - - # 回退到 chardet + + # Fall back to chardet if not encoding: try: import chardet @@ -50,140 +50,139 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = result.get('encoding') if result else None except Exception: pass - - # 最终兜底:使用 UTF-8 + replace + + # Final fallback: UTF-8 with replace if not encoding: encoding = 'utf-8' - + return data.decode(encoding, errors='replace') class FileParser: - """文件解析器""" - + """File parser""" + SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} - + @classmethod def extract_text(cls, file_path: str) -> str: """ - 从文件中提取文本 - + Extract text from a file + Args: - file_path: 文件路径 - + file_path: Path to the file + Returns: - 提取的文本内容 + Extracted text content """ path = Path(file_path) - + if not path.exists(): - raise FileNotFoundError(f"文件不存在: {file_path}") - + raise FileNotFoundError(f"File not found: {file_path}") + suffix = path.suffix.lower() - + if suffix not in cls.SUPPORTED_EXTENSIONS: - raise ValueError(f"不支持的文件格式: {suffix}") - + raise ValueError(f"Unsupported file format: {suffix}") + if suffix == '.pdf': return cls._extract_from_pdf(file_path) elif suffix in {'.md', '.markdown'}: return cls._extract_from_md(file_path) elif suffix == '.txt': return cls._extract_from_txt(file_path) - - raise ValueError(f"无法处理的文件格式: {suffix}") - + + raise ValueError(f"Cannot process file format: {suffix}") + @staticmethod def _extract_from_pdf(file_path: str) -> 
str: - """从PDF提取文本""" + """Extract text from a PDF file""" try: import fitz # PyMuPDF except ImportError: - raise ImportError("需要安装PyMuPDF: pip install PyMuPDF") - + raise ImportError("PyMuPDF is required: pip install PyMuPDF") + text_parts = [] with fitz.open(file_path) as doc: for page in doc: text = page.get_text() if text.strip(): text_parts.append(text) - + return "\n\n".join(text_parts) - + @staticmethod def _extract_from_md(file_path: str) -> str: - """从Markdown提取文本,支持自动编码检测""" + """Extract text from a Markdown file with automatic encoding detection""" return _read_text_with_fallback(file_path) - + @staticmethod def _extract_from_txt(file_path: str) -> str: - """从TXT提取文本,支持自动编码检测""" + """Extract text from a TXT file with automatic encoding detection""" return _read_text_with_fallback(file_path) - + @classmethod def extract_from_multiple(cls, file_paths: List[str]) -> str: """ - 从多个文件提取文本并合并 - + Extract text from multiple files and merge the results + Args: - file_paths: 文件路径列表 - + file_paths: List of file paths + Returns: - 合并后的文本 + Merged text content """ all_texts = [] - + for i, file_path in enumerate(file_paths, 1): try: text = cls.extract_text(file_path) filename = Path(file_path).name - all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}") + all_texts.append(f"=== Document {i}: {filename} ===\n{text}") except Exception as e: - all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===") - + all_texts.append(f"=== Document {i}: {file_path} (extraction failed: {str(e)}) ===") + return "\n\n".join(all_texts) def split_text_into_chunks( - text: str, - chunk_size: int = 500, + text: str, + chunk_size: int = 500, overlap: int = 50 ) -> List[str]: """ - 将文本分割成小块 - + Split text into smaller chunks + Args: - text: 原始文本 - chunk_size: 每块的字符数 - overlap: 重叠字符数 - + text: Source text + chunk_size: Number of characters per chunk + overlap: Number of overlapping characters between chunks + Returns: - 文本块列表 + List of text chunks """ if len(text) <= chunk_size: 
return [text] if text.strip() else [] - + chunks = [] start = 0 - + while start < len(text): end = start + chunk_size - - # 尝试在句子边界处分割 + + # Try to split at sentence boundaries if end < len(text): - # 查找最近的句子结束符 + # Find the nearest sentence-ending separator for sep in ['。', '!', '?', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']: last_sep = text[start:end].rfind(sep) if last_sep != -1 and last_sep > chunk_size * 0.3: end = start + last_sep + len(sep) break - + chunk = text[start:end].strip() if chunk: chunks.append(chunk) - - # 下一个块从重叠位置开始 - start = end - overlap if end < len(text) else len(text) - - return chunks + # Next chunk starts at the overlap position + start = end - overlap if end < len(text) else len(text) + + return chunks diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 6c1a81f4..4820f6c4 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -1,19 +1,20 @@ """ -LLM客户端封装 -统一使用OpenAI格式调用 +LLM client wrapper +Unified interface using the OpenAI-compatible API format """ import json import re from typing import Optional, Dict, Any, List +from urllib.parse import urlparse, parse_qs, urlunparse from openai import OpenAI from ..config import Config class LLMClient: - """LLM客户端""" - + """LLM client""" + def __init__( self, api_key: Optional[str] = None, @@ -21,17 +22,32 @@ class LLMClient: model: Optional[str] = None ): self.api_key = api_key or Config.LLM_API_KEY - self.base_url = base_url or Config.LLM_BASE_URL + raw_url = base_url or Config.LLM_BASE_URL self.model = model or Config.LLM_MODEL_NAME - + if not self.api_key: - raise ValueError("LLM_API_KEY 未配置") - + raise ValueError("LLM_API_KEY is not configured") + + # Azure Portal provides full endpoint URLs like: + # https://.cognitiveservices.azure.com/openai/deployments//chat/completions?api-version=... 
+ # The OpenAI SDK expects a base_url and appends /chat/completions itself, + # so we strip that suffix and extract api-version as a default query param. + default_query: Dict[str, str] = {} + if raw_url and '/chat/completions' in raw_url: + parsed = urlparse(raw_url) + qs = parse_qs(parsed.query) + if 'api-version' in qs: + default_query['api-version'] = qs['api-version'][0] + clean_path = parsed.path.replace('/chat/completions', '').rstrip('/') + raw_url = urlunparse(parsed._replace(path=clean_path, query='')) + + self.base_url = raw_url self.client = OpenAI( api_key=self.api_key, - base_url=self.base_url + base_url=self.base_url, + default_query=default_query if default_query else None ) - + def chat( self, messages: List[Dict[str, str]], @@ -40,33 +56,33 @@ class LLMClient: response_format: Optional[Dict] = None ) -> str: """ - 发送聊天请求 - + Send a chat request + Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 - response_format: 响应格式(如JSON模式) - + messages: List of messages + temperature: Temperature parameter + max_tokens: Maximum number of tokens + response_format: Response format (e.g. JSON mode) + Returns: - 模型响应文本 + Model response text """ kwargs = { "model": self.model, "messages": messages, "temperature": temperature, - "max_tokens": max_tokens, + "max_completion_tokens": max_tokens, } - + if response_format: kwargs["response_format"] = response_format - + response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content - # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 + # Some models (e.g. 
MiniMax M2.5) include reasoning content in the response; strip it out content = re.sub(r'[\s\S]*?', '', content).strip() return content - + def chat_json( self, messages: List[Dict[str, str]], @@ -74,15 +90,15 @@ class LLMClient: max_tokens: int = 4096 ) -> Dict[str, Any]: """ - 发送聊天请求并返回JSON - + Send a chat request and return parsed JSON + Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 - + messages: List of messages + temperature: Temperature parameter + max_tokens: Maximum number of tokens + Returns: - 解析后的JSON对象 + Parsed JSON object """ response = self.chat( messages=messages, @@ -90,7 +106,7 @@ class LLMClient: max_tokens=max_tokens, response_format={"type": "json_object"} ) - # 清理markdown代码块标记 + # Strip markdown code-block markers if present cleaned_response = response.strip() cleaned_response = re.sub(r'^```(?:json)?\s*\n?', '', cleaned_response, flags=re.IGNORECASE) cleaned_response = re.sub(r'\n?```\s*$', '', cleaned_response) @@ -99,5 +115,4 @@ class LLMClient: try: return json.loads(cleaned_response) except json.JSONDecodeError: - raise ValueError(f"LLM返回的JSON格式无效: {cleaned_response}") - + raise ValueError(f"Invalid JSON returned by LLM: {cleaned_response}") diff --git a/backend/app/utils/locale.py b/backend/app/utils/locale.py index 23d04aa9..9ad712c7 100644 --- a/backend/app/utils/locale.py +++ b/backend/app/utils/locale.py @@ -66,4 +66,4 @@ def t(key: str, **kwargs) -> str: def get_language_instruction() -> str: locale = get_locale() lang_config = _languages.get(locale, _languages.get('zh', {})) - return lang_config.get('llmInstruction', '请使用中文回答。') + return lang_config.get('llmInstruction', 'Please respond in Chinese.') diff --git a/backend/app/utils/logger.py b/backend/app/utils/logger.py index 1978c0b8..7443022a 100644 --- a/backend/app/utils/logger.py +++ b/backend/app/utils/logger.py @@ -1,6 +1,6 @@ """ -日志配置模块 -提供统一的日志管理,同时输出到控制台和文件 +Logging configuration module +Provides unified log management, writing to both console and file 
""" import os @@ -12,58 +12,58 @@ from logging.handlers import RotatingFileHandler def _ensure_utf8_stdout(): """ - 确保 stdout/stderr 使用 UTF-8 编码 - 解决 Windows 控制台中文乱码问题 + Ensure stdout/stderr use UTF-8 encoding. + Fixes garbled output in Windows consoles. """ if sys.platform == 'win32': - # Windows 下重新配置标准输出为 UTF-8 + # Reconfigure standard streams to UTF-8 on Windows if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 日志目录 +# Log directory LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging.Logger: """ - 设置日志器 - + Set up a logger + Args: - name: 日志器名称 - level: 日志级别 - + name: Logger name + level: Log level + Returns: - 配置好的日志器 + Configured logger instance """ - # 确保日志目录存在 + # Ensure the log directory exists os.makedirs(LOG_DIR, exist_ok=True) - - # 创建日志器 + + # Create logger logger = logging.getLogger(name) logger.setLevel(level) - - # 阻止日志向上传播到根 logger,避免重复输出 + + # Prevent log records from propagating to the root logger to avoid duplicate output logger.propagate = False - - # 如果已经有处理器,不重复添加 + + # Skip adding handlers if they already exist if logger.handlers: return logger - - # 日志格式 + + # Log formatters detailed_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) - + simple_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) - - # 1. 文件处理器 - 详细日志(按日期命名,带轮转) + + # 1. File handler — detailed logs (date-stamped filename with rotation) log_filename = datetime.now().strftime('%Y-%m-%d') + '.log' file_handler = RotatingFileHandler( os.path.join(LOG_DIR, log_filename), @@ -73,30 +73,30 @@ def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging. 
) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(detailed_formatter) - - # 2. 控制台处理器 - 简洁日志(INFO及以上) - # 确保 Windows 下使用 UTF-8 编码,避免中文乱码 + + # 2. Console handler — concise logs (INFO and above) + # Ensure UTF-8 encoding on Windows to avoid garbled output _ensure_utf8_stdout() console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) console_handler.setFormatter(simple_formatter) - - # 添加处理器 + + # Register handlers logger.addHandler(file_handler) logger.addHandler(console_handler) - + return logger def get_logger(name: str = 'mirofish') -> logging.Logger: """ - 获取日志器(如果不存在则创建) - + Get a logger, creating it if it does not exist + Args: - name: 日志器名称 - + name: Logger name + Returns: - 日志器实例 + Logger instance """ logger = logging.getLogger(name) if not logger.handlers: @@ -104,11 +104,11 @@ def get_logger(name: str = 'mirofish') -> logging.Logger: return logger -# 创建默认日志器 +# Create default logger logger = setup_logger() -# 便捷方法 +# Convenience functions def debug(msg, *args, **kwargs): logger.debug(msg, *args, **kwargs) @@ -123,4 +123,3 @@ def error(msg, *args, **kwargs): def critical(msg, *args, **kwargs): logger.critical(msg, *args, **kwargs) - diff --git a/backend/app/utils/retry.py b/backend/app/utils/retry.py index 819b1cfc..d66bbae5 100644 --- a/backend/app/utils/retry.py +++ b/backend/app/utils/retry.py @@ -1,6 +1,6 @@ """ -API调用重试机制 -用于处理LLM等外部API调用的重试逻辑 +API call retry mechanism +Handles retry logic for external API calls such as LLM services """ import time @@ -22,17 +22,17 @@ def retry_with_backoff( on_retry: Optional[Callable[[Exception, int], None]] = None ): """ - 带指数退避的重试装饰器 - + Retry decorator with exponential backoff + Args: - max_retries: 最大重试次数 - initial_delay: 初始延迟(秒) - max_delay: 最大延迟(秒) - backoff_factor: 退避因子 - jitter: 是否添加随机抖动 - exceptions: 需要重试的异常类型 - on_retry: 重试时的回调函数 (exception, retry_count) - + max_retries: Maximum number of retries + initial_delay: Initial delay in seconds + max_delay: Maximum 
delay in seconds + backoff_factor: Backoff multiplier + jitter: Whether to add random jitter + exceptions: Exception types that should trigger a retry + on_retry: Callback invoked on each retry (exception, retry_count) + Usage: @retry_with_backoff(max_retries=3) def call_llm_api(): @@ -43,36 +43,36 @@ def retry_with_backoff( def wrapper(*args, **kwargs) -> Any: last_exception = None delay = initial_delay - + for attempt in range(max_retries + 1): try: return func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == max_retries: - logger.error(f"函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"Function {func.__name__} failed after {max_retries} retries: {str(e)}") raise - - # 计算延迟 + + # Calculate delay current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." + f"Function {func.__name__} attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." 
) - + if on_retry: on_retry(e, attempt + 1) - + time.sleep(current_delay) delay *= backoff_factor - + raise last_exception - + return wrapper return decorator @@ -87,53 +87,53 @@ def retry_with_backoff_async( on_retry: Optional[Callable[[Exception, int], None]] = None ): """ - 异步版本的重试装饰器 + Async version of the retry decorator """ import asyncio - + def decorator(func: Callable) -> Callable: @functools.wraps(func) async def wrapper(*args, **kwargs) -> Any: last_exception = None delay = initial_delay - + for attempt in range(max_retries + 1): try: return await func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == max_retries: - logger.error(f"异步函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"Async function {func.__name__} failed after {max_retries} retries: {str(e)}") raise - + current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"异步函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." + f"Async function {func.__name__} attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." 
) - + if on_retry: on_retry(e, attempt + 1) - + await asyncio.sleep(current_delay) delay *= backoff_factor - + raise last_exception - + return wrapper return decorator class RetryableAPIClient: """ - 可重试的API客户端封装 + Retryable API client wrapper """ - + def __init__( self, max_retries: int = 3, @@ -145,7 +145,7 @@ class RetryableAPIClient: self.initial_delay = initial_delay self.max_delay = max_delay self.backoff_factor = backoff_factor - + def call_with_retry( self, func: Callable, @@ -154,44 +154,44 @@ class RetryableAPIClient: **kwargs ) -> Any: """ - 执行函数调用并在失败时重试 - + Execute a function call and retry on failure + Args: - func: 要调用的函数 - *args: 函数参数 - exceptions: 需要重试的异常类型 - **kwargs: 函数关键字参数 - + func: Function to call + *args: Positional arguments for the function + exceptions: Exception types that should trigger a retry + **kwargs: Keyword arguments for the function + Returns: - 函数返回值 + Return value of the function """ last_exception = None delay = self.initial_delay - + for attempt in range(self.max_retries + 1): try: return func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == self.max_retries: - logger.error(f"API调用在 {self.max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"API call failed after {self.max_retries} retries: {str(e)}") raise - + current_delay = min(delay, self.max_delay) current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"API调用第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." + f"API call attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." 
) - + time.sleep(current_delay) delay *= self.backoff_factor - + raise last_exception - + def call_batch_with_retry( self, items: list, @@ -200,20 +200,20 @@ class RetryableAPIClient: continue_on_failure: bool = True ) -> Tuple[list, list]: """ - 批量调用并对每个失败项单独重试 - + Process a batch of items, retrying individually on failure + Args: - items: 要处理的项目列表 - process_func: 处理函数,接收单个item作为参数 - exceptions: 需要重试的异常类型 - continue_on_failure: 单项失败后是否继续处理其他项 - + items: List of items to process + process_func: Processing function that accepts a single item + exceptions: Exception types that should trigger a retry + continue_on_failure: Whether to continue processing remaining items after a failure + Returns: - (成功结果列表, 失败项列表) + (list of successful results, list of failed items) """ results = [] failures = [] - + for idx, item in enumerate(items): try: result = self.call_with_retry( @@ -222,17 +222,16 @@ class RetryableAPIClient: exceptions=exceptions ) results.append(result) - + except Exception as e: - logger.error(f"处理第 {idx + 1} 项失败: {str(e)}") + logger.error(f"Failed to process item {idx + 1}: {str(e)}") failures.append({ "index": idx, "item": item, "error": str(e) }) - + if not continue_on_failure: raise - - return results, failures + return results, failures diff --git a/backend/app/utils/zep_paging.py b/backend/app/utils/zep_paging.py index 943cd1ae..7967b28f 100644 --- a/backend/app/utils/zep_paging.py +++ b/backend/app/utils/zep_paging.py @@ -1,7 +1,8 @@ -"""Zep Graph 分页读取工具。 +"""Zep Graph paginated fetch utilities. -Zep 的 node/edge 列表接口使用 UUID cursor 分页, -本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。 +Zep's node/edge list endpoints use UUID-cursor-based pagination. +This module wraps the auto-pagination logic (with per-page retries) and +returns the full result list transparently to the caller. 
""" from __future__ import annotations @@ -31,7 +32,7 @@ def _fetch_page_with_retry( page_description: str = "page", **kwargs: Any, ) -> list[Any]: - """单页请求,失败时指数退避重试。仅重试网络/IO类瞬态错误。""" + """Fetch a single page with exponential-backoff retry on transient network/IO errors.""" if max_retries < 1: raise ValueError("max_retries must be >= 1") @@ -64,7 +65,7 @@ def fetch_all_nodes( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱节点,最多返回 max_items 条(默认 2000)。每页请求自带重试。""" + """Fetch all graph nodes with pagination, returning at most max_items (default 2000). Each page request includes retries.""" all_nodes: list[Any] = [] cursor: str | None = None page_num = 0 @@ -109,7 +110,7 @@ def fetch_all_edges( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱所有边,返回完整列表。每页请求自带重试。""" + """Fetch all graph edges with pagination, returning the complete list. Each page request includes retries.""" all_edges: list[Any] = [] cursor: str | None = None page_num = 0 diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 3e56d752..fdab7ac4 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1435,7 +1435,6 @@ "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "license": "ISC", - "peer": true, "engines": { "node": ">=12" } @@ -1913,7 +1912,6 @@ "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -2053,7 +2051,6 @@ "integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.5.0", @@ -2128,7 +2125,6 @@ 
"resolved": "https://registry.npmjs.org/vue/-/vue-3.5.25.tgz", "integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==", "license": "MIT", - "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.25", "@vue/compiler-sfc": "3.5.25", diff --git a/frontend/src/api/index.js b/frontend/src/api/index.js index ec57106c..ef73ef3c 100644 --- a/frontend/src/api/index.js +++ b/frontend/src/api/index.js @@ -4,7 +4,7 @@ import authState, { clearToken } from '../store/auth' // 创建axios实例 const service = axios.create({ - baseURL: import.meta.env.VITE_API_BASE_URL || 'http://localhost:5001', + baseURL: import.meta.env.VITE_API_BASE_URL || '', timeout: 300000, // 5分钟超时(本体生成可能需要较长时间) headers: { 'Content-Type': 'application/json' diff --git a/frontend/src/i18n/index.js b/frontend/src/i18n/index.js index aa265535..9f4c7494 100644 --- a/frontend/src/i18n/index.js +++ b/frontend/src/i18n/index.js @@ -14,7 +14,7 @@ for (const path in localeFiles) { } } -const savedLocale = localStorage.getItem('locale') || 'zh' +const savedLocale = localStorage.getItem('locale') || 'ca' const i18n = createI18n({ legacy: false,