From af75f626fd3139a89e79bcf76859884477da1870 Mon Sep 17 00:00:00 2001
From: Pedro Renan <pedrorenan@gmail.com>
Date: Fri, 24 Apr 2026 21:46:12 +0100
Subject: [PATCH] feat(llm): add LLM_JSON_MODE to support runtimes without
 response_format

OpenAI-compatible runtimes differ in how they handle `response_format`:
many endpoints (OpenAI, Qwen/Dashscope, Ollama) accept
`{"type": "json_object"}`, while some local runtimes such as LM Studio
and llama.cpp server reject it with HTTP 400 and accept only
`json_schema` or `text`. This prevented MiroFish from running against
fully-local stacks.

Introduce `LLM_JSON_MODE` (default `json_object`) so users can opt out
of strict JSON response mode by setting `LLM_JSON_MODE=none`. The
existing prompt-based JSON + markdown-tolerant parsing already handles
the unstructured response path robustly, so `none` is viable for any
OpenAI-compatible endpoint.

Applied at all three call sites that send `response_format`:
- utils/llm_client.py (chat_json helper)
- services/oasis_profile_generator.py (persona synthesis)
- services/simulation_config_generator.py (time/event/agent config)

Documented in .env.example with guidance on when to pick each value.
---
 .env.example                                       |  6 ++++++
 backend/app/config.py                              |  4 ++++
 backend/app/services/oasis_profile_generator.py    | 14 ++++++++------
 .../app/services/simulation_config_generator.py    | 14 ++++++++------
 backend/app/utils/llm_client.py                    |  2 +-
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/.env.example b/.env.example
index 78a3b72c..49a60525 100644
--- a/.env.example
+++ b/.env.example
@@ -5,6 +5,12 @@ LLM_API_KEY=your_api_key_here
 LLM_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
 LLM_MODEL_NAME=qwen-plus
 
+# LLM JSON output mode (optional, default: json_object)
+#   json_object — OpenAI, Ollama, Qwen Cloud, Anthropic via an OpenAI-compatible proxy
+#   none        — LM Studio, llama.cpp server, or any runtime that rejects
+#                 `{"type": "json_object"}`. Relies on prompt-based JSON + robust parsing.
+# LLM_JSON_MODE=json_object
+
 # ===== ZEP记忆图谱配置 =====
 # 每月免费额度即可支撑简单使用：https://app.getzep.com/
 ZEP_API_KEY=your_zep_api_key_here
diff --git a/backend/app/config.py b/backend/app/config.py
index 953dfa50..d28ea904 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -31,6 +31,10 @@ class Config:
     LLM_API_KEY = os.environ.get('LLM_API_KEY')
     LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'https://api.openai.com/v1')
     LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'gpt-4o-mini')
+    # JSON output mode for LLM calls (value is case-insensitive). Options:
+    #   "json_object" — OpenAI-compatible default (Ollama, Qwen Cloud, Anthropic via proxy)
+    #   "none"        — runtimes that reject json_object (LM Studio, llama.cpp server); any other value acts the same
+    LLM_JSON_MODE = os.environ.get('LLM_JSON_MODE', 'json_object').lower()
     
     # Zep配置
     ZEP_API_KEY = os.environ.get('ZEP_API_KEY')
diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py
index 7704a627..10fbe8ee 100644
--- a/backend/app/services/oasis_profile_generator.py
+++ b/backend/app/services/oasis_profile_generator.py
@@ -527,16 +527,18 @@ class OasisProfileGenerator:
         
         for attempt in range(max_attempts):
             try:
-                response = self.client.chat.completions.create(
-                    model=self.model_name,
-                    messages=[
+                create_kwargs = {
+                    "model": self.model_name,
+                    "messages": [
                         {"role": "system", "content": self._get_system_prompt(is_individual)},
                         {"role": "user", "content": prompt}
                     ],
-                    response_format={"type": "json_object"},
-                    temperature=0.7 - (attempt * 0.1)  # 每次重试降低温度
+                    "temperature": 0.7 - (attempt * 0.1)  # 每次重试降低温度
                     # 不设置max_tokens，让LLM自由发挥
-                )
+                }
+                if Config.LLM_JSON_MODE == "json_object":
+                    create_kwargs["response_format"] = {"type": "json_object"}
+                response = self.client.chat.completions.create(**create_kwargs)
                 
                 content = response.choices[0].message.content
                 
diff --git a/backend/app/services/simulation_config_generator.py b/backend/app/services/simulation_config_generator.py
index cb77f6b6..9be15449 100644
--- a/backend/app/services/simulation_config_generator.py
+++ b/backend/app/services/simulation_config_generator.py
@@ -440,16 +440,18 @@ class SimulationConfigGenerator:
         
         for attempt in range(max_attempts):
             try:
-                response = self.client.chat.completions.create(
-                    model=self.model_name,
-                    messages=[
+                create_kwargs = {
+                    "model": self.model_name,
+                    "messages": [
                         {"role": "system", "content": system_prompt},
                         {"role": "user", "content": prompt}
                     ],
-                    response_format={"type": "json_object"},
-                    temperature=0.7 - (attempt * 0.1)  # 每次重试降低温度
+                    "temperature": 0.7 - (attempt * 0.1)  # 每次重试降低温度
                     # 不设置max_tokens，让LLM自由发挥
-                )
+                }
+                if Config.LLM_JSON_MODE == "json_object":
+                    create_kwargs["response_format"] = {"type": "json_object"}
+                response = self.client.chat.completions.create(**create_kwargs)
                 
                 content = response.choices[0].message.content
                 finish_reason = response.choices[0].finish_reason
diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py
index 6c1a81f4..a136e04c 100644
--- a/backend/app/utils/llm_client.py
+++ b/backend/app/utils/llm_client.py
@@ -88,7 +88,7 @@ class LLMClient:
             messages=messages,
             temperature=temperature,
             max_tokens=max_tokens,
-            response_format={"type": "json_object"}
+            response_format={"type": "json_object"} if Config.LLM_JSON_MODE == "json_object" else None
         )
         # 清理markdown代码块标记
         cleaned_response = response.strip()