# MicroFish/backend/app/services/ontology_generator.py

"""
本体生成服务
接口1：分析文本内容，生成适合社会模拟的实体和关系类型定义
"""

import json
import math
import re
import traceback
from typing import Dict, Any, List, Optional
from ..utils.llm_client import LLMClient
from ..utils.locale import get_language_instruction
from ..utils.logger import get_logger
from ..config import Config

# Module-level logger obtained from the project's get_logger helper under the
# name 'mirofish.ontology'; every log call in this module goes through it.
logger = get_logger('mirofish.ontology')


def _estimate_tokens(text: str) -> int:
    """Conservative token estimate for local vLLM context budgeting."""
    text = text or ""
    cjk_chars = len(re.findall(r'[\u3400-\u9fff\uf900-\ufaff]', text))
    non_cjk_chars = len(text) - cjk_chars
    return cjk_chars + math.ceil(non_cjk_chars / 4)


def _to_pascal_case(name: str) -> str:
    """将任意格式的名称转换为 PascalCase（如 'works_for' -> 'WorksFor', 'person' -> 'Person'）"""
    # 按非字母数字字符分割
    parts = re.split(r'[^a-zA-Z0-9]+', name)
    # 再按 camelCase 边界分割（如 'camelCase' -> ['camel', 'Case']）
    words = []
    for part in parts:
        words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_'))
    # 每个词首字母大写，过滤空串
    result = ''.join(word.capitalize() for word in words if word)
    return result if result else 'Unknown'


# System prompt for ontology generation.
# It tells the model to answer with JSON only, to produce exactly 10 entity
# types (the last two being the Person and Organization catch-alls), 6-10
# relationship types, and 1-3 attributes per entity type that avoid the
# reserved names listed in the prompt. OntologyGenerator.generate() appends a
# language instruction and English naming rules to this text before sending it.
ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求，设计适合**社交媒体舆论模拟**的实体类型和关系类型。

**重要：你必须输出有效的JSON格式数据，不要输出任何其他内容。**

## 核心任务背景

我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中：
- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体"
- 实体之间会相互影响、转发、评论、回应
- 我们需要模拟舆论事件中各方的反应和信息传播路径

因此，**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**：

**可以是**：
- 具体的个人（公众人物、当事人、意见领袖、专家学者、普通人）
- 公司、企业（包括其官方账号）
- 组织机构（大学、协会、NGO、工会等）
- 政府部门、监管机构
- 媒体机构（报纸、电视台、自媒体、网站）
- 社交媒体平台本身
- 特定群体代表（如校友会、粉丝团、维权群体等）

**不可以是**：
- 抽象概念（如"舆论"、"情绪"、"趋势"）
- 主题/话题（如"学术诚信"、"教育改革"）
- 观点/态度（如"支持方"、"反对方"）

## 输出格式

请输出JSON格式，包含以下结构：

```json
{
    "entity_types": [
        {
            "name": "实体类型名称（英文，PascalCase）",
            "description": "简短描述（英文，不超过100字符）",
            "attributes": [
                {
                    "name": "属性名（英文，snake_case）",
                    "type": "text",
                    "description": "属性描述"
                }
            ],
            "examples": ["示例实体1", "示例实体2"]
        }
    ],
    "edge_types": [
        {
            "name": "关系类型名称（英文，UPPER_SNAKE_CASE）",
            "description": "简短描述（英文，不超过100字符）",
            "source_targets": [
                {"source": "源实体类型", "target": "目标实体类型"}
            ],
            "attributes": []
        }
    ],
    "analysis_summary": "对文本内容的简要分析说明"
}
```

## 设计指南（极其重要！）

### 1. 实体类型设计 - 必须严格遵守

**数量要求：必须正好10个实体类型**

**层次结构要求（必须同时包含具体类型和兜底类型）**：

你的10个实体类型必须包含以下层次：

A. **兜底类型（必须包含，放在列表最后2个）**：
   - `Person`: 任何自然人个体的兜底类型。当一个人不属于其他更具体的人物类型时，归入此类。
   - `Organization`: 任何组织机构的兜底类型。当一个组织不属于其他更具体的组织类型时，归入此类。

B. **具体类型（8个，根据文本内容设计）**：
   - 针对文本中出现的主要角色，设计更具体的类型
   - 例如：如果文本涉及学术事件，可以有 `Student`, `Professor`, `University`
   - 例如：如果文本涉及商业事件，可以有 `Company`, `CEO`, `Employee`

**为什么需要兜底类型**：
- 文本中会出现各种人物，如"中小学教师"、"路人甲"、"某位网友"
- 如果没有专门的类型匹配，他们应该被归入 `Person`
- 同理，小型组织、临时团体等应该归入 `Organization`

**具体类型的设计原则**：
- 从文本中识别出高频出现或关键的角色类型
- 每个具体类型应该有明确的边界，避免重叠
- description 必须清晰说明这个类型和兜底类型的区别

### 2. 关系类型设计

- 数量：6-10个
- 关系应该反映社媒互动中的真实联系
- 确保关系的 source_targets 涵盖你定义的实体类型

### 3. 属性设计

- 每个实体类型1-3个关键属性
- **注意**：属性名不能使用 `name`、`uuid`、`group_id`、`created_at`、`summary`（这些是系统保留字）
- 推荐使用：`full_name`, `title`, `role`, `position`, `location`, `description` 等

## 实体类型参考

**个人类（具体）**：
- Student: 学生
- Professor: 教授/学者
- Journalist: 记者
- Celebrity: 明星/网红
- Executive: 高管
- Official: 政府官员
- Lawyer: 律师
- Doctor: 医生

**个人类（兜底）**：
- Person: 任何自然人（不属于上述具体类型时使用）

**组织类（具体）**：
- University: 高校
- Company: 公司企业
- GovernmentAgency: 政府机构
- MediaOutlet: 媒体机构
- Hospital: 医院
- School: 中小学
- NGO: 非政府组织

**组织类（兜底）**：
- Organization: 任何组织机构（不属于上述具体类型时使用）

## 关系类型参考

- WORKS_FOR: 工作于
- STUDIES_AT: 就读于
- AFFILIATED_WITH: 隶属于
- REPRESENTS: 代表
- REGULATES: 监管
- REPORTS_ON: 报道
- COMMENTS_ON: 评论
- RESPONDS_TO: 回应
- SUPPORTS: 支持
- OPPOSES: 反对
- COLLABORATES_WITH: 合作
- COMPETES_WITH: 竞争
"""


class OntologyGenerator:
    """
    本体生成器
    分析文本内容，生成实体和关系类型定义
    """

    def __init__(self, llm_client: Optional[LLMClient] = None):
        """Store the LLM client, building a default ``LLMClient`` when none (or a falsy one) is supplied."""
        if not llm_client:
            llm_client = LLMClient()
        self.llm_client = llm_client

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Generate an ontology definition.

        The documents are split into chunks, each chunk is sent to the LLM
        separately, and the usable partial ontologies are merged. When no chunk
        yields a usable ontology the document-aware fallback is used instead.
        The outcome always goes through ``_validate_and_process``.

        Args:
            document_texts: List of document texts.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra context for the prompt.

        Returns:
            The ontology definition (entity_types, edge_types, ...).
        """
        lang_instruction = get_language_instruction()
        system_prompt = f"{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above."

        chunks = self._build_document_chunks(
            document_texts=document_texts,
            simulation_requirement=simulation_requirement,
            additional_context=additional_context,
            system_prompt=system_prompt,
        )
        # Computed up front: it is both the last-resort result and the
        # fill_ontology handed to validation.
        fallback_ontology = self._document_aware_fallback(document_texts, simulation_requirement)
        total = len(chunks)
        logger.info("Ontology generation split into %s LLM chunk(s)", total)

        usable: List[Dict[str, Any]] = []
        for position, chunk in enumerate(chunks, start=1):
            prompt = self._build_user_message(
                [chunk],
                simulation_requirement,
                additional_context,
                chunk_index=position,
                chunk_count=total,
            )
            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ]

            # Best effort per chunk: one failing chunk must not abort the others.
            try:
                raw_result = self.llm_client.chat_json(
                    messages=conversation,
                    temperature=0.3,
                    max_tokens=Config.ONTOLOGY_MAX_OUTPUT_TOKENS,
                    max_retries=Config.LLM_JSON_MAX_RETRIES,
                )
                candidate = self._coerce_ontology_result(raw_result)
                if not self._has_usable_ontology(candidate):
                    logger.warning(
                        "Ontology LLM chunk %s/%s returned JSON without usable entity_types/edge_types: %s",
                        position,
                        total,
                        str(raw_result)[:1000],
                    )
                    continue
                usable.append(candidate)
            except Exception as exc:
                logger.error("Ontology LLM chunk %s/%s failed: %s", position, total, exc)
                logger.debug(traceback.format_exc())

        if usable:
            ontology = self._merge_ontologies(usable, simulation_requirement)
        else:
            logger.error("All ontology LLM chunks failed, using document-aware fallback ontology")
            ontology = fallback_ontology

        # Validation and post-processing.
        return self._validate_and_process(ontology, fill_ontology=fallback_ontology)

    def _coerce_ontology_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Accept common local-LLM schema variants and normalize to our shape.

        Args:
            result: Parsed JSON returned by the LLM (anything that is not a
                dict is rejected).

        Returns:
            ``{}`` for non-dict input. Otherwise a shallow copy of the payload
            whose ``entity_types`` and ``edge_types`` are always lists of
            coerced dicts (possibly empty); other keys are passed through.
        """
        if not isinstance(result, dict):
            return {}

        # Canonical key first, then the aliases local models tend to emit.
        entity_keys = ("entity_types", "entities", "entityTypes", "node_types", "nodeTypes", "nodes")
        edge_keys = (
            "edge_types", "relationships", "relations", "edges",
            "edgeTypes", "relation_types", "relationship_types",
        )

        def _first_list(payload: Dict[str, Any], keys) -> list:
            """Return the first list stored under any of *keys*, else []."""
            for key in keys:
                value = payload.get(key)
                if isinstance(value, list):
                    return value
            return []

        # Unwrap {"ontology": {...}} style envelopes, but only when the top
        # level does not already carry the type lists; otherwise an unrelated
        # "data"/"result" object would replace the real payload.
        if not _first_list(result, entity_keys) and not _first_list(result, edge_keys):
            for wrapper_key in ("ontology", "data", "result"):
                wrapped = result.get(wrapper_key)
                if isinstance(wrapped, dict):
                    result = wrapped
                    break

        normalized = dict(result)
        # _first_list also guards against null / non-list values under the
        # canonical keys, which previously raised TypeError during iteration.
        normalized["entity_types"] = [
            self._coerce_entity_type(item)
            for item in _first_list(normalized, entity_keys)
            if isinstance(item, dict)
        ]
        normalized["edge_types"] = [
            self._coerce_edge_type(item)
            for item in _first_list(normalized, edge_keys)
            if isinstance(item, dict)
        ]
        return normalized

    def _coerce_entity_type(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize one entity-type dict to name/description/attributes/examples.

        The name is read from ``name``, ``type`` or ``label`` and is always
        returned as a stripped string (``""`` when absent). The description
        falls back to ``summary`` and then to a generated sentence; values that
        are not non-blank strings are replaced by that generated sentence.
        """
        raw_name = item.get("name") or item.get("type") or item.get("label")
        # Never hand None downstream: str(None) would turn into a type called "None".
        name = str(raw_name).strip() if raw_name is not None else ""

        description = item.get("description") or item.get("summary")
        if not isinstance(description, str) or not description.strip():
            description = f"{name} entity." if name else "Unnamed entity."

        attributes = item.get("attributes")
        examples = item.get("examples")
        return {
            "name": name,
            "description": description,
            # The prompt schema defines attributes as objects; drop anything else.
            "attributes": (
                [attr for attr in attributes if isinstance(attr, dict)]
                if isinstance(attributes, list)
                else []
            ),
            "examples": examples if isinstance(examples, list) else [],
        }

    def _coerce_edge_type(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize one edge-type dict to name/description/source_targets/attributes.

        The name is read from ``name``, ``type`` or ``relation`` and is always
        returned as a stripped string (``""`` when absent). ``source_targets``
        keeps only dict entries; when it is missing, not a list, or contains
        nothing usable, a single pair is built from ``source``/``source_type``
        and ``target``/``target_type`` (defaulting to Person -> Organization).
        An explicitly empty list is preserved.
        """
        raw_name = item.get("name") or item.get("type") or item.get("relation")
        # Never hand None downstream: str(None) would turn into an edge called "None".
        name = str(raw_name).strip() if raw_name is not None else ""

        description = item.get("description") or item.get("summary")
        if not isinstance(description, str) or not description.strip():
            description = f"{name} relation." if name else "Unnamed relation."

        raw_pairs = item.get("source_targets")
        if isinstance(raw_pairs, list):
            # Validation calls .get() on every pair, so strings, nulls or
            # nested lists must not survive this step.
            pairs = [pair for pair in raw_pairs if isinstance(pair, dict)]
            needs_default = bool(raw_pairs) and not pairs
        else:
            pairs = []
            needs_default = True
        if needs_default:
            source = item.get("source") or item.get("source_type")
            target = item.get("target") or item.get("target_type")
            pairs = [{"source": source or "Person", "target": target or "Organization"}]

        attributes = item.get("attributes")
        return {
            "name": name,
            "description": description,
            "source_targets": pairs,
            "attributes": (
                [attr for attr in attributes if isinstance(attr, dict)]
                if isinstance(attributes, list)
                else []
            ),
        }

    def _has_usable_ontology(self, result: Dict[str, Any]) -> bool:
        """Return True only when both ``entity_types`` and ``edge_types`` are present and non-empty."""
        if not result.get("entity_types"):
            return False
        return bool(result.get("edge_types"))

    def _document_aware_fallback(
        self,
        document_texts: List[str],
        simulation_requirement: str,
    ) -> Dict[str, Any]:
        """Choose a deterministic fallback ontology based on the document text.

        The first 50,000 characters of the joined documents are scanned for
        Fate/Lostbelt markers. On a match a narrative-oriented ontology is
        returned; otherwise the generic ``_fallback_ontology`` is used.

        Args:
            document_texts: Document texts; ``None`` and ``None`` entries are
                treated as empty.
            simulation_requirement: Requirement text; its first 200 characters
                are echoed in ``analysis_summary`` (``None`` is treated as "").

        Returns:
            A freshly built ontology dict with ``entity_types``, ``edge_types``
            and ``analysis_summary``.
        """
        requirement = simulation_requirement or ""
        corpus = "\n".join(text or "" for text in (document_texts or []))[:50000]
        lower = corpus.lower()
        # The former single-character marker "勒" was removed: it occurs in
        # everyday words and names (勒索, 泰勒, ...) and switched unrelated
        # documents to this ontology. "阿瓦隆" already covers the chapter title.
        # NOTE(review): "摩根" and "妖精" are also fairly generic (e.g. bank
        # names containing 摩根) - confirm whether they should stay.
        fate_markers = (
            "fate/grand order",
            "lostbelt",
            "阿瓦隆",
            "妖精",
            "摩根",
            "迦勒底",
            "不列顛",
            "奧伯龍",
            "科爾努諾斯",
        )
        if not any(marker in lower for marker in fate_markers):
            return self._fallback_ontology(requirement)

        # (name, description, attribute name, attribute description, examples)
        entity_rows = (
            ("FictionalCharacter", "Named story character or role in the Lostbelt narrative.",
             "role", "Narrative role", ("Morgan", "Artoria Caster")),
            ("Faction", "Political, military, or social group in the setting.",
             "alignment", "Faction alignment", ("Chaldea", "Round Table")),
            ("FairyClan", "Fairy clan or species group in Britain.",
             "clan_role", "Clan role", ("Wind clan", "Fang clan")),
            ("Kingdom", "Realm, court, or governing power.",
             "ruler", "Known ruler", ("Camelot", "Fairy Britain")),
            ("Deity", "Godlike or mythic entity affecting events.",
             "domain", "Mythic domain", ("Cernunnos",)),
            ("Location", "Named place or region in the chronology.",
             "region_type", "Place category", ("Avalon", "Britain")),
            ("NarrativeEvent", "Major battle, calamity, or turning point.",
             "era", "Era or time marker", ("Great Calamity", "Queen Morgan battle")),
            ("SourceMaterial", "Official or community source cited by the report.",
             "source_type", "Source category", ("Road to 7", "official soundtrack")),
            ("Person", "Any individual not fitting other specific person types.",
             "full_name", "Full name", ("writer", "commentator")),
            ("Organization", "Any organization not fitting other specific organization types.",
             "org_name", "Organization name", ("publisher", "studio")),
        )
        # (name, description, ((source, target), ...))
        edge_rows = (
            ("APPEARS_IN", "Entity appears in a source, era, or event.",
             (("FictionalCharacter", "NarrativeEvent"),)),
            ("RULES", "Character or power rules a realm or group.",
             (("FictionalCharacter", "Kingdom"),)),
            ("ALLIED_WITH", "Entity is allied or cooperating with another.",
             (("Faction", "Faction"), ("FictionalCharacter", "Faction"))),
            ("OPPOSES", "Entity opposes another entity or faction.",
             (("FictionalCharacter", "FictionalCharacter"), ("Faction", "Faction"))),
            ("LOCATED_IN", "Entity or event is located in a place.",
             (("NarrativeEvent", "Location"), ("Kingdom", "Location"))),
            ("CAUSES", "Entity or event causes another event.",
             (("NarrativeEvent", "NarrativeEvent"), ("Deity", "NarrativeEvent"))),
            ("DOCUMENTS", "Source material documents an entity or event.",
             (("SourceMaterial", "NarrativeEvent"), ("SourceMaterial", "FictionalCharacter"))),
            ("TRANSFORMS_INTO", "Entity changes form, role, or state.",
             (("FictionalCharacter", "FictionalCharacter"), ("NarrativeEvent", "NarrativeEvent"))),
        )

        # Build fresh dicts/lists on every call so later in-place processing
        # of the result cannot leak into subsequent calls.
        return {
            "entity_types": [
                {
                    "name": name,
                    "description": description,
                    "attributes": [{"name": attr_name, "type": "text", "description": attr_description}],
                    "examples": list(examples),
                }
                for name, description, attr_name, attr_description, examples in entity_rows
            ],
            "edge_types": [
                {
                    "name": name,
                    "description": description,
                    "source_targets": [{"source": source, "target": target} for source, target in pairs],
                    "attributes": [],
                }
                for name, description, pairs in edge_rows
            ],
            "analysis_summary": f"Document-aware fallback ontology generated for Fate/Lostbelt content: {requirement[:200]}",
        }

    def _fallback_ontology(self, simulation_requirement: str) -> Dict[str, Any]:
        """Deterministic ontology used when a local LLM fails JSON generation.

        Returns a newly built dict with ten entity types (ending in the Person
        and Organization catch-alls), eight edge types, and an
        ``analysis_summary`` echoing the first 200 characters of the
        requirement.
        """
        # (name, description, attribute name, attribute description, examples)
        entity_table = (
            ("Journalist", "Reporter or editor participating in public discourse.",
             "role", "Media role", ("reporter", "editor")),
            ("MediaOutlet", "Media organization publishing news or commentary.",
             "org_name", "Outlet name", ("newspaper", "online media")),
            ("Company", "Business organization involved in the issue.",
             "industry", "Industry", ("company", "platform")),
            ("GovernmentAgency", "Government or regulator relevant to the event.",
             "jurisdiction", "Jurisdiction", ("regulator", "department")),
            ("Official", "Public official or authority figure.",
             "title", "Official title", ("mayor", "spokesperson")),
            ("Expert", "Analyst, scholar, or professional commentator.",
             "specialty", "Expertise", ("researcher", "lawyer")),
            ("CommunityGroup", "Grassroots group or collective actor.",
             "focus", "Group focus", ("local group", "advocacy group")),
            ("Influencer", "Online personality with audience influence.",
             "platform", "Main platform", ("blogger", "creator")),
            ("Person", "Any individual person not fitting specific person types.",
             "full_name", "Full name", ("ordinary citizen", "witness")),
            ("Organization", "Any organization not fitting specific organization types.",
             "org_name", "Organization name", ("association", "small organization")),
        )
        # (name, description, ((source, target), ...))
        edge_table = (
            ("WORKS_FOR", "Employment or affiliation relationship.",
             (("Person", "Organization"), ("Journalist", "MediaOutlet"))),
            ("REPORTS_ON", "Publishes or reports about an actor.",
             (("MediaOutlet", "Organization"), ("Journalist", "Person"))),
            ("RESPONDS_TO", "Publicly responds to another actor.",
             (("Person", "Person"), ("Organization", "Organization"))),
            ("SUPPORTS", "Expresses support for another actor.",
             (("Person", "Organization"), ("Organization", "Person"))),
            ("OPPOSES", "Expresses opposition to another actor.",
             (("Person", "Organization"), ("Organization", "Person"))),
            ("COLLABORATES_WITH", "Cooperates with another actor.",
             (("Organization", "Organization"), ("Person", "Person"))),
            ("INFLUENCES", "Influences opinions or decisions.",
             (("Influencer", "Person"), ("MediaOutlet", "Person"))),
            ("REGULATES", "Regulatory or oversight relation.",
             (("GovernmentAgency", "Company"), ("Official", "Organization"))),
        )

        entity_types = []
        for name, description, attr_name, attr_description, examples in entity_table:
            entity_types.append({
                "name": name,
                "description": description,
                "attributes": [{"name": attr_name, "type": "text", "description": attr_description}],
                "examples": list(examples),
            })

        edge_types = []
        for name, description, pairs in edge_table:
            edge_types.append({
                "name": name,
                "description": description,
                "source_targets": [{"source": source, "target": target} for source, target in pairs],
                "attributes": [],
            })

        return {
            "entity_types": entity_types,
            "edge_types": edge_types,
            "analysis_summary": f"Fallback ontology generated for: {simulation_requirement[:200]}",
        }

    def _context_input_budget(
        self,
        system_prompt: str,
        simulation_requirement: str,
        additional_context: Optional[str],
    ) -> int:
        """Return how many estimated tokens remain for document text in one request.

        The context window is reduced by the estimated size of the system
        prompt, of a user message built with an empty document, by the
        configured output-token allowance and by the configured safety margin.
        The result never drops below 512.
        """
        # A user message with an empty document measures the fixed prompt overhead.
        baseline_message = self._build_user_message(
            [""],
            simulation_requirement,
            additional_context,
            chunk_index=1,
            chunk_count=1,
        )
        overhead = _estimate_tokens(system_prompt)
        overhead = overhead + _estimate_tokens(baseline_message)
        overhead = overhead + Config.ONTOLOGY_MAX_OUTPUT_TOKENS
        overhead = overhead + Config.ONTOLOGY_PROMPT_MARGIN_TOKENS

        available = Config.LLM_CONTEXT_WINDOW - overhead
        return max(512, available)

    def _build_document_chunks(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str],
        system_prompt: str,
    ) -> List[str]:
        """Split the documents into chunks that fit the per-request input budget.

        Each document is chunked on its own: its parts are packed greedily,
        joined with a blank line, until adding the next part would exceed the
        budget. Blank documents contribute nothing. At most
        ``Config.ONTOLOGY_MAX_CHUNKS`` chunks (minimum 1) are kept; extra
        chunks are dropped with a warning.

        Returns:
            A non-empty list of chunk strings; ``[""]`` when there is no text
            at all, so the caller still issues one request.
        """
        budget = self._context_input_budget(system_prompt, simulation_requirement, additional_context)
        chunks: List[str] = []

        # Tolerate None the same way _document_aware_fallback does.
        for text in document_texts or []:
            current_parts: List[str] = []
            current_tokens = 0
            for part in self._iter_text_parts(text or "", budget):
                if not part:
                    # Blank documents come back as a single "" placeholder; an
                    # empty chunk would waste an LLM call and a chunk slot.
                    continue
                part_tokens = _estimate_tokens(part)
                if current_parts and current_tokens + part_tokens > budget:
                    chunks.append("\n\n".join(current_parts))
                    current_parts = []
                    current_tokens = 0
                current_parts.append(part)
                current_tokens += part_tokens
            if current_parts:
                chunks.append("\n\n".join(current_parts))

        if not chunks:
            return [""]

        max_chunks = max(1, Config.ONTOLOGY_MAX_CHUNKS)
        if len(chunks) > max_chunks:
            logger.warning(
                "Ontology input produced %s chunks; keeping first %s to avoid endpoint overload",
                len(chunks),
                max_chunks,
            )
            chunks = chunks[:max_chunks]

        return chunks

    def _iter_text_parts(self, text: str, budget: int):
        """Yield pieces of *text* that each fit within *budget* estimated tokens.

        Paragraphs (separated by blank lines) are yielded whole when they fit.
        An oversized paragraph is split into sentences that are re-packed,
        joined by a newline, up to the budget; a single sentence that is still
        too large is delegated to ``_split_oversize_text``. Blank input yields
        one empty string.
        """
        paragraphs = [part.strip() for part in re.split(r'\n{2,}', text) if part.strip()]
        if not paragraphs:
            # Only reachable for blank text: non-blank text always produces at
            # least one non-empty paragraph above.
            paragraphs = [""]

        for paragraph in paragraphs:
            if _estimate_tokens(paragraph) <= budget:
                yield paragraph
                continue

            # CJK sentence punctuation ends a sentence even without trailing
            # whitespace. ASCII '.', '!' and '?' must be followed by whitespace,
            # otherwise decimals ("3.14"), URLs and abbreviations would be cut
            # apart and re-joined with a newline in the middle.
            sentences = [
                part.strip()
                for part in re.split(r'(?<=[。！？])\s*|(?<=[.!?])\s+', paragraph)
                if part.strip()
            ]
            if not sentences:
                sentences = [paragraph]

            current = ""
            for sentence in sentences:
                if _estimate_tokens(sentence) > budget:
                    # Flush what has been packed so far to preserve text order.
                    if current:
                        yield current
                        current = ""
                    yield from self._split_oversize_text(sentence, budget)
                    continue
                candidate = f"{current}\n{sentence}".strip() if current else sentence
                if current and _estimate_tokens(candidate) > budget:
                    yield current
                    current = sentence
                else:
                    current = candidate
            if current:
                yield current

    def _split_oversize_text(self, text: str, budget: int):
        """Yield consecutive slices of *text*, shrinking each one toward *budget* tokens.

        A slice starts at a window of ``budget * 2`` characters (at least 800,
        at most the text length) and is halved until its estimate fits or it
        is down to 500 characters, so a slice may still exceed the budget.
        """
        total = len(text)
        # Conservative char window: CJK can be close to one token per char.
        window = max(800, min(total, budget * 2))
        cursor = 0
        while cursor < total:
            stop = min(total, cursor + window)
            piece = text[cursor:stop]
            # Halve the slice, but never go below 500 characters.
            while len(piece) > 500 and _estimate_tokens(piece) > budget:
                stop = cursor + max(500, (stop - cursor) // 2)
                piece = text[cursor:stop]
            yield piece
            cursor = stop

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str],
        chunk_index: int = 1,
        chunk_count: int = 1,
    ) -> str:
        """Build the user message sent to the LLM.

        The message contains the simulation requirement, the documents joined
        by a ``---`` separator, the optional additional context, the fixed rule
        list, and - only when there is more than one chunk - a note telling the
        model which chunk it is looking at.
        """
        # Merge the document texts.
        combined_text = "\n\n---\n\n".join(document_texts)

        sections = [
            f"""## 模拟需求

{simulation_requirement}

## 文档内容

{combined_text}
"""
        ]

        if additional_context:
            sections.append(f"""
## 额外说明

{additional_context}
""")

        sections.append("""
请根据以上内容，设计适合社会舆论模拟的实体类型和关系类型。

**必须遵守的规则**：
1. 必须正好输出10个实体类型
2. 最后2个必须是兜底类型：Person（个人兜底）和 Organization（组织兜底）
3. 前8个是根据文本内容设计的具体类型
4. 所有实体类型必须是现实中可以发声的主体，不能是抽象概念
5. 属性名不能使用 name、uuid、group_id 等保留字，用 full_name、org_name 等替代
""")

        if chunk_count > 1:
            sections.append(
                f"\n当前是第 {chunk_index}/{chunk_count} 个文本分片。请只基于当前分片生成候选本体，后续系统会合并去重。\n"
            )

        return "".join(sections)

    def _merge_ontologies(self, results: List[Dict[str, Any]], simulation_requirement: str) -> Dict[str, Any]:
        """Merge per-chunk ontologies into one, keeping the first definition of each name.

        Entity names are converted to PascalCase and edge names to
        UPPER_SNAKE_CASE before de-duplication. Entries without a usable name
        are dropped. Up to three chunk summaries are joined into
        ``analysis_summary``.

        Args:
            results: Coerced ontologies, one per successful chunk.
            simulation_requirement: Used for the default summary and passed to
                the generic fallback.

        Returns:
            The merged ontology, or ``_fallback_ontology(...)`` when no entity
            type or no edge type survives the merge.
        """
        merged_entities: List[Dict[str, Any]] = []
        merged_edges: List[Dict[str, Any]] = []
        summaries: List[str] = []
        seen_entities = set()
        seen_edges = set()

        for result in results:
            for entity in result.get("entity_types") or []:
                raw_name = entity.get("name")
                # Check the raw value: str(None) is "None" and _to_pascal_case
                # maps "" to "Unknown", so a check after conversion never fires.
                if raw_name is None or not str(raw_name).strip():
                    continue
                name = _to_pascal_case(str(raw_name))
                if name in seen_entities:
                    continue
                seen_entities.add(name)
                merged_entities.append({**entity, "name": name})

            for edge in result.get("edge_types") or []:
                raw_name = edge.get("name")
                if raw_name is None:
                    continue
                # Plain upper() left spaces, hyphens and camelCase untouched,
                # so "works for", "works-for" and "worksFor" neither matched
                # the required snake-case format nor de-duplicated.
                snake = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', str(raw_name))
                name = re.sub(r'[^A-Za-z0-9]+', '_', snake).strip('_').upper()
                if not name or name in seen_edges:
                    continue
                seen_edges.add(name)
                merged_edges.append({**edge, "name": name})

            # "or" keeps a null summary from being rendered as the text "None".
            summary = str(result.get("analysis_summary") or "").strip()
            if summary:
                summaries.append(summary)

        if not merged_entities or not merged_edges:
            return self._fallback_ontology(simulation_requirement)

        return {
            "entity_types": merged_entities,
            "edge_types": merged_edges,
            "analysis_summary": " ".join(summaries[:3]) or f"Ontology generated for: {simulation_requirement[:200]}",
        }

    def _validate_and_process(
        self,
        result: Dict[str, Any],
        fill_ontology: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Validate and post-process a raw ontology dict produced by the LLM.

        The dict is normalised in place and also returned. Processing steps:

        * guarantee the ``entity_types`` / ``edge_types`` lists and the
          ``analysis_summary`` key exist (a JSON ``null`` or a non-list value
          is treated as missing, non-dict list items are discarded);
        * convert entity names to PascalCase and edge names to
          SCREAMING_SNAKE_CASE, and rewrite ``source_targets`` references so
          they follow the renamed entities;
        * cap descriptions at 100 characters;
        * drop nameless and duplicate entity/edge types (first one wins);
        * make sure the ``Person`` and ``Organization`` fallback entity types
          survive inside the 10-entity limit;
        * top up entity types (to 10) and edge types (to 6) from
          ``fill_ontology`` and enforce the upper limits.

        Args:
            result: Parsed ontology as returned by the LLM.
            fill_ontology: Ontology used to top up missing types. When falsy,
                ``self._fallback_ontology("")`` is used.

        Returns:
            The same ``result`` dict after normalisation.
        """
        # Zep API limits: at most 10 custom entity types and 10 custom edge types.
        MAX_ENTITY_TYPES = 10
        MAX_EDGE_TYPES = 10
        # Minimum number of edge types to hand to downstream graph setup.
        MIN_EDGE_TYPES = 6
        MAX_DESCRIPTION_LENGTH = 100

        # Ensure required fields exist. LLM output may contain null or a scalar
        # where a list is expected; treat that the same as a missing field and
        # ignore list items that are not objects.
        for key in ("entity_types", "edge_types"):
            items = result.get(key)
            if not isinstance(items, list):
                items = []
            result[key] = [item for item in items if isinstance(item, dict)]
        if "analysis_summary" not in result:
            result["analysis_summary"] = ""

        # ---- Entity types ----
        # Remember original name -> PascalCase name so edge source_targets
        # references can be fixed up afterwards.
        entity_name_map: Dict[str, str] = {}
        for entity in result["entity_types"]:
            original_name = entity.get("name")
            if isinstance(original_name, str):
                # Entity names must be PascalCase (Zep API requirement).
                entity["name"] = _to_pascal_case(original_name)
                if entity["name"] != original_name:
                    logger.warning(f"Entity type name '{original_name}' auto-converted to '{entity['name']}'")
                entity_name_map[original_name] = entity["name"]
            else:
                # A non-string name cannot be normalised; remove it so the
                # de-duplication step below drops the entry.
                entity.pop("name", None)
            if not isinstance(entity.get("attributes"), list):
                entity["attributes"] = []
            if not isinstance(entity.get("examples"), list):
                entity["examples"] = []
            # Keep the description within 100 characters.
            description = entity.get("description")
            if isinstance(description, str) and len(description) > MAX_DESCRIPTION_LENGTH:
                entity["description"] = description[:MAX_DESCRIPTION_LENGTH - 3] + "..."

        # ---- Edge types ----
        for edge in result["edge_types"]:
            original_name = edge.get("name")
            if isinstance(original_name, str):
                # Edge names must be SCREAMING_SNAKE_CASE (Zep API requirement).
                # Split camelCase boundaries and collapse every run of
                # non-alphanumeric characters into one underscore before
                # upper-casing; names that are already valid are unchanged.
                snake = re.sub(r'([a-z])([A-Z])', r'\1_\2', original_name)
                snake = re.sub(r'[^a-zA-Z0-9]+', '_', snake).strip('_')
                edge["name"] = snake.upper()
                if edge["name"] != original_name:
                    logger.warning(f"Edge type name '{original_name}' auto-converted to '{edge['name']}'")
            else:
                edge.pop("name", None)

            # Point source_targets at the converted PascalCase entity names.
            source_targets = edge.get("source_targets")
            if not isinstance(source_targets, list):
                source_targets = []
            source_targets = [st for st in source_targets if isinstance(st, dict)]
            for st in source_targets:
                for endpoint in ("source", "target"):
                    reference = st.get(endpoint)
                    if isinstance(reference, str) and reference in entity_name_map:
                        st[endpoint] = entity_name_map[reference]
            edge["source_targets"] = source_targets

            if not isinstance(edge.get("attributes"), list):
                edge["attributes"] = []
            description = edge.get("description")
            if isinstance(description, str) and len(description) > MAX_DESCRIPTION_LENGTH:
                edge["description"] = description[:MAX_DESCRIPTION_LENGTH - 3] + "..."

        # De-duplicate entity types by name, keeping the first occurrence.
        # Entries without a usable name are dropped.
        seen_names = set()
        deduped = []
        for entity in result["entity_types"]:
            name = entity.get("name", "")
            if name and name not in seen_names:
                seen_names.add(name)
                deduped.append(entity)
            elif name in seen_names:
                logger.warning(f"Duplicate entity type '{name}' removed during validation")
        result["entity_types"] = deduped

        # Same treatment for edge types: a nameless edge would fail later in
        # code generation, and duplicates would collide in the name-keyed maps.
        seen_edge_names = set()
        deduped_edges = []
        for edge in result["edge_types"]:
            name = edge.get("name", "")
            if not name:
                logger.warning("Edge type without a usable name removed during validation")
            elif name in seen_edge_names:
                logger.warning(f"Duplicate edge type '{name}' removed during validation")
            else:
                seen_edge_names.add(name)
                deduped_edges.append(edge)
        result["edge_types"] = deduped_edges

        # Fallback type definitions.
        person_fallback = {
            "name": "Person",
            "description": "Any individual person not fitting other specific person types.",
            "attributes": [
                {"name": "full_name", "type": "text", "description": "Full name of the person"},
                {"name": "role", "type": "text", "description": "Role or occupation"}
            ],
            "examples": ["ordinary citizen", "anonymous netizen"]
        }

        organization_fallback = {
            "name": "Organization",
            "description": "Any organization not fitting other specific organization types.",
            "attributes": [
                {"name": "org_name", "type": "text", "description": "Name of the organization"},
                {"name": "org_type", "type": "text", "description": "Type of organization"}
            ],
            "examples": ["small business", "community group"]
        }

        # A fallback type only counts as present when it sits inside the first
        # MAX_ENTITY_TYPES entries; anything later would be cut by the final
        # truncation and leave the ontology without that fallback.
        entities = result["entity_types"]
        provided_by_name = {e["name"]: e for e in entities}
        names_within_limit = {e["name"] for e in entities[:MAX_ENTITY_TYPES]}

        fallbacks_to_add = []
        for default_type in (person_fallback, organization_fallback):
            type_name = default_type["name"]
            if type_name in names_within_limit:
                continue
            # Prefer the model's own definition when it exists past the cutoff.
            fallbacks_to_add.append(provided_by_name.get(type_name, default_type))

        if fallbacks_to_add:
            reserved_names = {fb["name"] for fb in fallbacks_to_add}
            entities = [e for e in entities if e["name"] not in reserved_names]
            # Make room by trimming from the end, keeping the earlier (more
            # important) specific types, then append the fallback types.
            room = MAX_ENTITY_TYPES - len(fallbacks_to_add)
            result["entity_types"] = entities[:room] + fallbacks_to_add

        # Local LLMs sometimes return too few types. Fill from deterministic
        # social-simulation defaults so downstream graph setup always has a
        # usable ontology instead of failing later.
        fill_ontology = fill_ontology or self._fallback_ontology("")

        if len(result["entity_types"]) < MAX_ENTITY_TYPES:
            entity_names = {e["name"] for e in result["entity_types"]}
            for fallback_entity in fill_ontology.get("entity_types") or []:
                if len(result["entity_types"]) >= MAX_ENTITY_TYPES:
                    break
                if fallback_entity["name"] in entity_names:
                    continue
                result["entity_types"].append(fallback_entity)
                entity_names.add(fallback_entity["name"])

        # Final guard so the limits are never exceeded (defensive).
        if len(result["entity_types"]) > MAX_ENTITY_TYPES:
            result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES]

        if len(result["edge_types"]) > MAX_EDGE_TYPES:
            result["edge_types"] = result["edge_types"][:MAX_EDGE_TYPES]

        if len(result["edge_types"]) < MIN_EDGE_TYPES:
            edge_names = {edge.get("name") for edge in result["edge_types"]}
            for fallback_edge in fill_ontology.get("edge_types") or []:
                if len(result["edge_types"]) >= MIN_EDGE_TYPES:
                    break
                if fallback_edge["name"] in edge_names:
                    continue
                result["edge_types"].append(fallback_edge)
                edge_names.add(fallback_edge["name"])

        return result

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """
        Render an ontology definition as Python source code (similar to ontology.py).

        Free-text values (descriptions, names used as dictionary keys and
        source/target references) are escaped, so quotes, backslashes or
        newlines in LLM-produced text cannot break the generated module. Text
        without such characters is rendered exactly as before.

        Args:
            ontology: Ontology definition with optional "entity_types" and
                "edge_types" lists. Every listed type and attribute must
                carry a "name".

        Returns:
            Python source code as a single string.

        Raises:
            KeyError: If an entity type, edge type or attribute has no "name".
        """

        def as_literal(value: Any) -> str:
            # json.dumps emits a double-quoted literal whose escape sequences
            # (\", \\, \n, \uXXXX, ...) are also valid in a Python string.
            return json.dumps(str(value), ensure_ascii=False)

        def as_docstring(value: Any) -> str:
            # Escape backslashes first, then quotes, so the text can neither
            # close the triple-quoted docstring early nor form a stray escape.
            text = str(value).replace('\\', '\\\\').replace('"', '\\"')
            return f'    """{text}"""'

        def edge_class_name(edge_name: str) -> str:
            # SCREAMING_SNAKE_CASE edge name -> PascalCase class name.
            return ''.join(word.capitalize() for word in edge_name.split('_'))

        def class_body(attrs: Any) -> List[str]:
            # One optional EntityText field per attribute, or `pass` when the
            # type declares no attributes.
            if not attrs:
                return ['    pass']
            lines: List[str] = []
            for attr in attrs:
                attr_name = attr["name"]
                attr_desc = attr.get("description") or attr_name
                lines.extend([
                    f'    {attr_name}: EntityText = Field(',
                    f'        description={as_literal(attr_desc)},',
                    '        default=None',
                    '    )',
                ])
            return lines

        entity_types = ontology.get("entity_types") or []
        edge_types = ontology.get("edge_types") or []

        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成，用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Entity type classes.
        for entity in entity_types:
            name = entity["name"]
            desc = entity.get("description") or f"A {name} entity."

            code_lines.append(f'class {name}(EntityModel):')
            code_lines.append(as_docstring(desc))
            code_lines.extend(class_body(entity.get("attributes")))
            code_lines.append('')
            code_lines.append('')

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Edge type classes.
        for edge in edge_types:
            name = edge["name"]
            desc = edge.get("description") or f"A {name} relationship."

            code_lines.append(f'class {edge_class_name(name)}(EdgeModel):')
            code_lines.append(as_docstring(desc))
            code_lines.extend(class_body(edge.get("attributes")))
            code_lines.append('')
            code_lines.append('')

        # Name -> class lookup dictionaries.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in entity_types:
            name = entity["name"]
            code_lines.append(f'    {as_literal(name)}: {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in edge_types:
            name = edge["name"]
            code_lines.append(f'    {as_literal(name)}: {edge_class_name(name)},')
        code_lines.append('}')
        code_lines.append('')

        # Edge name -> allowed (source, target) pairs; edges without any
        # source_targets are left out of the mapping.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in edge_types:
            name = edge["name"]
            source_targets = edge.get("source_targets") or []
            if source_targets:
                st_list = ', '.join(
                    '{"source": ' + as_literal(st.get("source", "Entity"))
                    + ', "target": ' + as_literal(st.get("target", "Entity")) + '}'
                    for st in source_targets
                )
                code_lines.append(f'    {as_literal(name)}: [{st_list}],')
        code_lines.append('}')

        return '\n'.join(code_lines)