MicroFish/backend/app/services/ontology_generator.py

"""Ontology generation service.

Pipeline step 1: analyze the source text and propose entity and relationship
types that fit a social-media opinion simulation.
"""

import json
import logging
import re
from typing import Dict, Any, List, Optional
from ..utils.llm_client import LLMClient
from ..utils.locale import get_language_instruction

logger = logging.getLogger(__name__)


def _to_pascal_case(name: str) -> str:
    """Convert an arbitrary identifier to PascalCase (e.g. ``works_for`` -> ``WorksFor``)."""
    # Split on non-alphanumeric separators first.
    parts = re.split(r'[^a-zA-Z0-9]+', name)
    # Then split on camelCase boundaries (e.g. ``camelCase`` -> ``['camel', 'Case']``).
    words = []
    for part in parts:
        words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_'))
    # Title-case each non-empty word and concatenate.
    result = ''.join(word.capitalize() for word in words if word)
    return result if result else 'Unknown'


# System prompt template for ontology generation.
ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge-graph ontology designer. Your task is to analyze the supplied text and simulation requirement and design entity types and relationship types suitable for a **social-media public-opinion simulation**.

**Important: you must output valid JSON data and nothing else.**

## Core Task Background

We are building a **social-media public-opinion simulation system**. In this system:
- Every entity is an "account" or "actor" that can post on social media, interact with other accounts, and propagate information.
- Entities influence each other, repost, comment on, and respond to one another.
- We need to simulate how each side of a public-opinion event reacts and how information flows.

Therefore, **entities must be real-world subjects that can plausibly post on social media and interact with others**:

**Acceptable**:
- Specific individuals (public figures, parties to the event, opinion leaders, experts and scholars, ordinary people)
- Companies and businesses (including their official accounts)
- Organizations (universities, associations, NGOs, unions, etc.)
- Government departments and regulators
- Media organizations (newspapers, broadcasters, independent media, websites)
- Social-media platforms themselves
- Representatives of specific groups (alumni associations, fan communities, advocacy groups, etc.)

**Not acceptable**:
- Abstract concepts (such as "public opinion", "sentiment", "trend")
- Topics or subjects (such as "academic integrity", "education reform")
- Viewpoints or stances (such as "supporters", "opponents")

## Output Format

Return JSON with the following structure:

```json
{
    "entity_types": [
        {
            "name": "entity type name (English, PascalCase)",
            "description": "short description (English, no more than 100 characters)",
            "attributes": [
                {
                    "name": "attribute name (English, snake_case)",
                    "type": "text",
                    "description": "attribute description"
                }
            ],
            "examples": ["example entity 1", "example entity 2"]
        }
    ],
    "edge_types": [
        {
            "name": "relationship type name (English, UPPER_SNAKE_CASE)",
            "description": "short description (English, no more than 100 characters)",
            "source_targets": [
                {"source": "source entity type", "target": "target entity type"}
            ],
            "attributes": []
        }
    ],
    "analysis_summary": "brief analytical summary of the text content"
}
```

## Design Guidelines (must be followed)

### 1. Entity Type Design - strictly required

**Count requirement: exactly 10 entity types.**

**Hierarchy requirement (must include both concrete types and fallback types)**:

Your 10 entity types must form the following hierarchy:

A. **Fallback types (mandatory; placed as the last 2 entries)**:
   - `Person`: the fallback type for any individual. When a person does not fit any more specific person type, classify them here.
   - `Organization`: the fallback type for any organization. When an organization does not fit any more specific organization type, classify it here.

B. **Concrete types (8 entries, designed from the text content)**:
   - Define more specific types for the major roles that appear in the text.
   - Example: for an academic event, you might use `Student`, `Professor`, `University`.
   - Example: for a business event, you might use `Company`, `CEO`, `Employee`.

**Why fallback types are required**:
- The text will mention many kinds of people, e.g. "primary-school teachers", "passersby", "an anonymous netizen".
- When no dedicated type fits, they should fall into `Person`.
- Likewise, small organizations and ad-hoc groups should fall into `Organization`.

**Principles for concrete types**:
- Identify the high-frequency or pivotal role types in the text.
- Each concrete type should have a clear boundary and avoid overlap.
- The description must clearly state how the concrete type differs from the corresponding fallback type.

### 2. Relationship Type Design

- Count: 6 to 10.
- Relationships should reflect realistic interactions on social media.
- Ensure each relationship's source_targets cover the entity types you defined.

### 3. Attribute Design

- 1 to 3 key attributes per entity type.
- **Note**: attribute names must not use `name`, `uuid`, `group_id`, `created_at`, or `summary` (these are reserved by the system).
- Recommended names: `full_name`, `title`, `role`, `position`, `location`, `description`, etc.

## Entity Type Reference

**Individuals (concrete)**:
- Student: a student.
- Professor: a professor or scholar.
- Journalist: a journalist.
- Celebrity: a celebrity or internet personality.
- Executive: a senior business leader.
- Official: a government official.
- Lawyer: a lawyer.
- Doctor: a physician.

**Individuals (fallback)**:
- Person: any individual person (use when no concrete person type above applies).

**Organizations (concrete)**:
- University: a university or higher-education institution.
- Company: a company or business.
- GovernmentAgency: a government agency.
- MediaOutlet: a media organization.
- Hospital: a hospital.
- School: a primary or secondary school.
- NGO: a non-governmental organization.

**Organizations (fallback)**:
- Organization: any organization (use when no concrete organization type above applies).

## Relationship Type Reference

- WORKS_FOR: works for.
- STUDIES_AT: studies at.
- AFFILIATED_WITH: is affiliated with.
- REPRESENTS: represents.
- REGULATES: regulates.
- REPORTS_ON: reports on.
- COMMENTS_ON: comments on.
- RESPONDS_TO: responds to.
- SUPPORTS: supports.
- OPPOSES: opposes.
- COLLABORATES_WITH: collaborates with.
- COMPETES_WITH: competes with.
"""


class OntologyGenerator:
    """Generate an entity- and edge-type ontology from arbitrary input text."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        self.llm_client = llm_client or LLMClient()

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate an ontology definition.

        Args:
            document_texts: Source document text segments.
            simulation_requirement: Description of the simulation goal.
            additional_context: Optional supplemental context.

        Returns:
            The ontology dict with ``entity_types``, ``edge_types``, and a summary.
        """
        # Compose the user message that frames the LLM request.
        user_message = self._build_user_message(
            document_texts,
            simulation_requirement,
            additional_context
        )

        lang_instruction = get_language_instruction()
        system_prompt = f"{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above."
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]

        # Invoke the LLM.
        result = self.llm_client.chat_json(
            messages=messages,
            temperature=0.3,
            max_tokens=4096
        )

        # Validate the LLM response and post-process it.
        result = self._validate_and_process(result)

        return result

    # Maximum length of source text passed to the LLM (50k characters).
    MAX_TEXT_LENGTH_FOR_LLM = 50000

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str]
    ) -> str:
        """Build the user-message string for the ontology LLM call."""

        # Concatenate the source documents into a single string.
        combined_text = "\n\n---\n\n".join(document_texts)
        original_length = len(combined_text)

        # If the combined text exceeds the LLM input cap, truncate it for the
        # LLM call only. The full text is still used for graph construction.
        if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM:
            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
            combined_text += f"\n\n...(original text is {original_length} characters; only the first {self.MAX_TEXT_LENGTH_FOR_LLM} characters were used for ontology analysis)..."

        message = f"""## Simulation Requirement

{simulation_requirement}

## Document Content

{combined_text}
"""

        if additional_context:
            message += f"""
## Additional Context

{additional_context}
"""

        message += """
Based on the content above, design entity types and relationship types suitable for a social-media public-opinion simulation.

**Rules that must be followed**:
1. You must output exactly 10 entity types.
2. The last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback).
3. The first 8 are concrete types designed from the text content.
4. Every entity type must be a real-world subject that can post on social media; abstract concepts are not allowed.
5. Attribute names must not use reserved words such as name, uuid, group_id; use alternatives such as full_name, org_name, etc.
"""

        return message

    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and post-process the LLM-generated ontology dict."""

        # Ensure required top-level fields exist.
        if "entity_types" not in result:
            result["entity_types"] = []
        if "edge_types" not in result:
            result["edge_types"] = []
        if "analysis_summary" not in result:
            result["analysis_summary"] = ""

        # Validate entity types.
        # Track original-name -> PascalCase mapping so edge source_targets
        # references can be fixed up consistently below.
        entity_name_map = {}
        for entity in result["entity_types"]:
            # Force entity names to PascalCase (required by the Zep API).
            if "name" in entity:
                original_name = entity["name"]
                entity["name"] = _to_pascal_case(original_name)
                if entity["name"] != original_name:
                    logger.warning(f"Entity type name '{original_name}' auto-converted to '{entity['name']}'")
                entity_name_map[original_name] = entity["name"]
            if "attributes" not in entity:
                entity["attributes"] = []
            if "examples" not in entity:
                entity["examples"] = []
            # Truncate descriptions longer than 100 characters.
            if len(entity.get("description", "")) > 100:
                entity["description"] = entity["description"][:97] + "..."

        # Validate edge types.
        for edge in result["edge_types"]:
            # Force edge names to SCREAMING_SNAKE_CASE (required by the Zep API).
            if "name" in edge:
                original_name = edge["name"]
                edge["name"] = original_name.upper()
                if edge["name"] != original_name:
                    logger.warning(f"Edge type name '{original_name}' auto-converted to '{edge['name']}'")
            # Rewrite source_targets entity-name references to match the
            # PascalCase-normalized entity names.
            for st in edge.get("source_targets", []):
                if st.get("source") in entity_name_map:
                    st["source"] = entity_name_map[st["source"]]
                if st.get("target") in entity_name_map:
                    st["target"] = entity_name_map[st["target"]]
            if "source_targets" not in edge:
                edge["source_targets"] = []
            if "attributes" not in edge:
                edge["attributes"] = []
            if len(edge.get("description", "")) > 100:
                edge["description"] = edge["description"][:97] + "..."

        # Zep API caps: at most 10 custom entity types and 10 custom edge types.
        MAX_ENTITY_TYPES = 10
        MAX_EDGE_TYPES = 10

        # Deduplicate by name, keeping the first occurrence.
        seen_names = set()
        deduped = []
        for entity in result["entity_types"]:
            name = entity.get("name", "")
            if name and name not in seen_names:
                seen_names.add(name)
                deduped.append(entity)
            elif name in seen_names:
                logger.warning(f"Duplicate entity type '{name}' removed during validation")
        result["entity_types"] = deduped

        # Fallback entity-type definitions used when the LLM omits them.
        person_fallback = {
            "name": "Person",
            "description": "Any individual person not fitting other specific person types.",
            "attributes": [
                {"name": "full_name", "type": "text", "description": "Full name of the person"},
                {"name": "role", "type": "text", "description": "Role or occupation"}
            ],
            "examples": ["ordinary citizen", "anonymous netizen"]
        }

        organization_fallback = {
            "name": "Organization",
            "description": "Any organization not fitting other specific organization types.",
            "attributes": [
                {"name": "org_name", "type": "text", "description": "Name of the organization"},
                {"name": "org_type", "type": "text", "description": "Type of organization"}
            ],
            "examples": ["small business", "community group"]
        }

        # Check whether the fallback types are already present.
        entity_names = {e["name"] for e in result["entity_types"]}
        has_person = "Person" in entity_names
        has_organization = "Organization" in entity_names

        # Collect missing fallback types to add below.
        fallbacks_to_add = []
        if not has_person:
            fallbacks_to_add.append(person_fallback)
        if not has_organization:
            fallbacks_to_add.append(organization_fallback)

        if fallbacks_to_add:
            current_count = len(result["entity_types"])
            needed_slots = len(fallbacks_to_add)

            # If adding the fallbacks would exceed the cap, drop some existing types.
            if current_count + needed_slots > MAX_ENTITY_TYPES:
                to_remove = current_count + needed_slots - MAX_ENTITY_TYPES
                # Drop trailing types first; the more specific types come earlier.
                result["entity_types"] = result["entity_types"][:-to_remove]

            result["entity_types"].extend(fallbacks_to_add)

        # Defensive cap enforcement: hard-trim if anything slipped through.
        if len(result["entity_types"]) > MAX_ENTITY_TYPES:
            result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES]

        if len(result["edge_types"]) > MAX_EDGE_TYPES:
            result["edge_types"] = result["edge_types"][:MAX_EDGE_TYPES]

        return result

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """Render the ontology definition as Python source code.

        Args:
            ontology: Ontology definition dict.

        Returns:
            Python source code as a single string.
        """
        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成，用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Emit each entity type as a Python class.
        for entity in ontology.get("entity_types", []):
            name = entity["name"]
            desc = entity.get("description", f"A {name} entity.")

            code_lines.append(f'class {name}(EntityModel):')
            code_lines.append(f'    """{desc}"""')

            attrs = entity.get("attributes", [])
            if attrs:
                for attr in attrs:
                    attr_name = attr["name"]
                    attr_desc = attr.get("description", attr_name)
                    code_lines.append(f'    {attr_name}: EntityText = Field(')
                    code_lines.append(f'        description="{attr_desc}",')
                    code_lines.append(f'        default=None')
                    code_lines.append(f'    )')
            else:
                code_lines.append('    pass')

            code_lines.append('')
            code_lines.append('')

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Emit each edge type as a Python class.
        for edge in ontology.get("edge_types", []):
            name = edge["name"]
            # Convert SCREAMING_SNAKE_CASE -> PascalCase for the class name.
            class_name = ''.join(word.capitalize() for word in name.split('_'))
            desc = edge.get("description", f"A {name} relationship.")

            code_lines.append(f'class {class_name}(EdgeModel):')
            code_lines.append(f'    """{desc}"""')

            attrs = edge.get("attributes", [])
            if attrs:
                for attr in attrs:
                    attr_name = attr["name"]
                    attr_desc = attr.get("description", attr_name)
                    code_lines.append(f'    {attr_name}: EntityText = Field(')
                    code_lines.append(f'        description="{attr_desc}",')
                    code_lines.append(f'        default=None')
                    code_lines.append(f'    )')
            else:
                code_lines.append('    pass')

            code_lines.append('')
            code_lines.append('')

        # Emit the type registries.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in ontology.get("entity_types", []):
            name = entity["name"]
            code_lines.append(f'    "{name}": {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in ontology.get("edge_types", []):
            name = edge["name"]
            class_name = ''.join(word.capitalize() for word in name.split('_'))
            code_lines.append(f'    "{name}": {class_name},')
        code_lines.append('}')
        code_lines.append('')

        # Emit the edge source_targets map.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in ontology.get("edge_types", []):
            name = edge["name"]
            source_targets = edge.get("source_targets", [])
            if source_targets:
                st_list = ', '.join([
                    f'{{"source": "{st.get("source", "Entity")}", "target": "{st.get("target", "Entity")}"}}'
                    for st in source_targets
                ])
                code_lines.append(f'    "{name}": [{st_list}],')
        code_lines.append('}')

        return '\n'.join(code_lines)