From 20c830af12cecd6fa6f03e8c0aca98b40dd8c858 Mon Sep 17 00:00:00 2001 From: albert Date: Wed, 11 Mar 2026 10:37:06 +0800 Subject: [PATCH] fix(report_agent): handle API token overflow crash with context length error recovery --- .env.example | 2 + backend/app/config.py | 1 + backend/app/services/report_agent.py | 16 ++++-- backend/app/utils/llm_client.py | 79 +++++++++++++++++++++------- 4 files changed, 77 insertions(+), 21 deletions(-) diff --git a/.env.example b/.env.example index 78a3b72c..2ca9c1e3 100644 --- a/.env.example +++ b/.env.example @@ -4,6 +4,8 @@ LLM_API_KEY=your_api_key_here LLM_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1 LLM_MODEL_NAME=qwen-plus +# LLM最大输出token数(根据模型能力调整,默认4096) +# LLM_MAX_TOKENS=4096 # ===== ZEP记忆图谱配置 ===== # 每月免费额度即可支撑简单使用:https://app.getzep.com/ diff --git a/backend/app/config.py b/backend/app/config.py index 953dfa50..b0ef8688 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -31,6 +31,7 @@ class Config: LLM_API_KEY = os.environ.get('LLM_API_KEY') LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'https://api.openai.com/v1') LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'gpt-4o-mini') + LLM_MAX_TOKENS = int(os.environ.get('LLM_MAX_TOKENS', '4096')) # Zep配置 ZEP_API_KEY = os.environ.get('ZEP_API_KEY') diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py index 02ca5bdc..a4033d85 100644 --- a/backend/app/services/report_agent.py +++ b/backend/app/services/report_agent.py @@ -1294,11 +1294,17 @@ class ReportAgent: for iteration in range(max_iterations): if progress_callback: progress_callback( - "generating", + "generating", int((iteration / max_iterations) * 100), f"深度检索与撰写中 ({tool_calls_count}/{self.MAX_TOOL_CALLS_PER_SECTION})" ) - + + # 防止消息历史无限增长导致上下文溢出 + # 保留 system + user prompt(前2条)和最近的对话轮次 + if len(messages) > 14: + messages = messages[:2] + messages[-12:] + logger.info(f"章节 {section.title}: 消息历史已裁剪至 {len(messages)} 条以防止上下文溢出") + # 调用LLM response = self.llm.chat( messages=messages, @@ -1502,7 +1508,11 @@ class ReportAgent: # 达到最大迭代次数,强制生成内容 logger.warning(f"章节 {section.title} 达到最大迭代次数,强制生成") messages.append({"role": "user", "content": REACT_FORCE_FINAL_MSG}) - + + # 裁剪消息以防止强制收尾时上下文溢出 + if len(messages) > 14: + messages = messages[:2] + messages[-12:] + response = self.llm.chat( messages=messages, temperature=0.5, diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 6c1a81f4..9e30d623 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -4,16 +4,19 @@ LLM客户端封装 """ import json +import logging import re from typing import Optional, Dict, Any, List -from openai import OpenAI +from openai import OpenAI, BadRequestError, APIError from ..config import Config +logger = logging.getLogger(__name__) + class LLMClient: """LLM客户端""" - + def __init__( self, api_key: Optional[str] = None, @@ -23,64 +26,105 @@ class LLMClient: self.api_key = api_key or Config.LLM_API_KEY self.base_url = base_url or Config.LLM_BASE_URL self.model = model or Config.LLM_MODEL_NAME - + self.default_max_tokens = Config.LLM_MAX_TOKENS + if not self.api_key: raise ValueError("LLM_API_KEY 未配置") - + self.client = OpenAI( api_key=self.api_key, base_url=self.base_url ) - + + @staticmethod + def _trim_messages(messages: List[Dict[str, str]]) -> List[Dict[str, str]]: + """ + 当消息列表过长导致上下文溢出时,裁剪中间的历史消息。 + 保留第一条(system prompt)和最后几条消息,移除中间部分。 + """ + if len(messages) <= 4: + return messages + + # 保留 system prompt(第1条)+ 最近3轮对话(最后6条) + keep_tail = min(6, len(messages) - 1) + trimmed = [messages[0]] + messages[-keep_tail:] + logger.warning( + f"消息上下文过长,已裁剪: {len(messages)} -> {len(trimmed)} 条消息" + ) + return trimmed + def chat( self, messages: List[Dict[str, str]], temperature: float = 0.7, - max_tokens: int = 4096, + max_tokens: Optional[int] = None, response_format: Optional[Dict] = None ) -> str: """ 发送聊天请求 - + Args: messages: 消息列表 temperature: 温度参数 - max_tokens: 最大token数 + max_tokens: 最大token数(默认使用配置值) response_format: 响应格式(如JSON模式) - + Returns: 模型响应文本 """ + if max_tokens is None: + max_tokens = self.default_max_tokens + kwargs = { "model": self.model, "messages": messages, "temperature": temperature, "max_tokens": max_tokens, } - + if response_format: kwargs["response_format"] = response_format - - response = self.client.chat.completions.create(**kwargs) + + try: + response = self.client.chat.completions.create(**kwargs) + except BadRequestError as e: + error_msg = str(e).lower() + # 处理上下文长度超限错误:自动裁剪消息后重试一次 + if "context_length" in error_msg or "maximum context" in error_msg or "token" in error_msg: + logger.warning( + f"上下文长度超限,尝试裁剪消息后重试: {e}" + ) + trimmed_messages = self._trim_messages(messages) + if len(trimmed_messages) == len(messages): + # 无法进一步裁剪,向上抛出 + raise + kwargs["messages"] = trimmed_messages + response = self.client.chat.completions.create(**kwargs) + else: + raise + except APIError as e: + logger.error(f"LLM API 调用失败: {e}") + raise + content = response.choices[0].message.content # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 content = re.sub(r'[\s\S]*?', '', content).strip() return content - + def chat_json( self, messages: List[Dict[str, str]], temperature: float = 0.3, - max_tokens: int = 4096 + max_tokens: Optional[int] = None ) -> Dict[str, Any]: """ 发送聊天请求并返回JSON - + Args: messages: 消息列表 temperature: 温度参数 - max_tokens: 最大token数 - + max_tokens: 最大token数(默认使用配置值) + Returns: 解析后的JSON对象 """ @@ -100,4 +144,3 @@ class LLMClient: return json.loads(cleaned_response) except json.JSONDecodeError: raise ValueError(f"LLM返回的JSON格式无效: {cleaned_response}") -