From dfabab5343e4690b4dbe706b61f21e51edf1aa29 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 25 Apr 2026 19:19:28 +0000 Subject: [PATCH] feat(graph): expand ontology to 20 types, pass attributes to Graphiti, add retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ontology_generator: increase entity limit to 20 (18 specific + 2 fallback), edge types to 20 (12-20 requested); bump max_tokens to 8192 - ontology_generator: fallback type names (Person/Organization) now resolved via i18n so Catalan gets Persona/Organització - locales: add ontologyFallback* keys to en/zh/es/ca - graph_builder: pass entity/edge attributes to non-Zep set_ontology (were discarded) - graphiti_backend: _make_model builds real Pydantic fields per attribute - graphiti_backend: _build_extraction_instructions includes per-entity attribute hints so the LLM knows which fields to extract - graphiti_backend: add_batch retries up to 3x on "node not found" race condition with exponential backoff (2s, 4s) before propagating the error Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 80 ++++++ backend/app/graph/graphiti_backend.py | 294 ++++++++++++++++++--- backend/app/services/graph_builder.py | 50 ++-- backend/app/services/ontology_generator.py | 128 ++++----- locales/ca.json | 6 + locales/en.json | 6 + locales/es.json | 6 + locales/zh.json | 6 + 8 files changed, 447 insertions(+), 129 deletions(-) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..baa5f878 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,80 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +MiroFish is an AI-powered multi-agent simulation platform. 
It extracts entities from user-provided documents (PDF/MD/TXT), builds a knowledge graph (via Zep Cloud or Graphiti + Neo4j, depending on the configured graph backend), generates agent personas, runs social interaction simulations (via OASIS/CAMEL-AI), and produces analytical reports. + +## Commands + +### Setup +```bash +cp .env.example .env # Configure API keys before first run +npm run setup:all # Install Node + Python dependencies (root, frontend, backend) +npm run setup # Node deps only +npm run setup:backend # Python deps only (uv venv) +``` + +### Development +```bash +npm run dev # Start both frontend (port 3000) & backend (port 5001) concurrently +npm run frontend # Frontend only +npm run backend # Backend only (uv run python run.py) +``` + +### Build +```bash +npm run build # Vite production build of frontend +``` + +### Testing +```bash +pytest # Run Python tests (pytest + pytest-asyncio available in venv) +``` + +Python 3.11–3.12 required (strict constraint). Node 18+ required. + +## Architecture + +### Overview +Full-stack monorepo: **Vue 3 SPA** (frontend, port 3000) + **Flask API** (backend, port 5001). Vite proxies all `/api/*` requests to the backend. + +### 5-Step Workflow Pipeline +1. **Graph Build** — Upload seed documents → entity/relationship extraction via LLM → knowledge graph (Zep Cloud or Graphiti + Neo4j) +2. **Environment Setup** — Agent persona generation (OASIS profiles) from the graph +3. **Simulation** — OASIS multi-agent simulation (Twitter + Reddit platforms) run as a subprocess +4. **Report** — ReportAgent (LLM with tool calling) analyzes simulation output +5. **Interaction** — Live chat with simulated agents + +### Key Backend Patterns +- **`models/project.py`** — `ProjectManager` singleton. Projects persist as `uploads/projects/{uuid}/project.json` + files. The server is the source of truth; frontend only holds a `projectId`. +- **`models/task.py`** — `TaskManager` singleton. In-memory async task tracking (PENDING → PROCESSING → COMPLETED|FAILED). Frontend polls `GET /api/.../task/{taskId}`. 
+- **`services/simulation_runner.py`** — Spawns OASIS as a subprocess. Communicates via IPC files at `/tmp/mirofish_sim_{id}_*.json`. Atexit cleanup registered. +- **`services/report_agent.py`** — Multi-turn LLM agent with tool use (Zep queries). Max 5 tool calls, 2 reflection rounds. +- **`utils/locale.py`** — Thread-local locale storage. Reads `Accept-Language` header from requests; falls back to thread-local for background workers (default: `zh`). + +### Key Frontend Patterns +- **`api/index.js`** — Axios instance with retry (`requestWithRetry`, 3 attempts, exponential backoff) and response interceptor. Auto-injects `Accept-Language` header from current locale. +- **`store/pendingUpload.js`** — Lightweight reactive state (no Vuex/Pinia) for deferred file uploads between views. +- Views are self-contained; no shared state beyond `projectId` in the URL route. + +### i18n +Translation files at `/locales/{en,zh,es,ca}.json` are shared by both frontend and backend. The frontend uses `vue-i18n` v11 with `localStorage` persistence. The backend reads the `Accept-Language` header. `/locales/languages.json` also contains per-language LLM prompt instructions (to force LLM output language). 
+ +### Configuration (`backend/app/config.py`) +Required environment variables (from `.env`): +- `LLM_API_KEY`, `LLM_BASE_URL`, `LLM_MODEL_NAME` — any OpenAI-compatible API (default: Qwen-plus via Alibaba Bailian) +- `ZEP_API_KEY` — Zep Cloud memory graph +- Optional: `LLM_BOOST_API_KEY`, `LLM_BOOST_BASE_URL`, `LLM_BOOST_MODEL_NAME` — faster secondary LLM + +## Git Remotes +- `origin` — this fork: `https://github.com/jaumemir/MiroFish` +- `upstream` — original project: `https://github.com/666ghj/MiroFish` + +To cherry-pick from upstream branches or PRs: +```bash +git fetch upstream +git cherry-pick +# or: git merge upstream/ +``` diff --git a/backend/app/graph/graphiti_backend.py b/backend/app/graph/graphiti_backend.py index 5a2436d4..b27b8060 100644 --- a/backend/app/graph/graphiti_backend.py +++ b/backend/app/graph/graphiti_backend.py @@ -1,6 +1,8 @@ """Graphiti + Neo4j implementation of GraphBackend.""" import asyncio +import json import threading +import typing import uuid as uuid_mod from typing import Any, Dict, List, Optional @@ -8,14 +10,77 @@ from .base import GraphBackend from ..config import Config from ..utils.logger import get_logger + +def _neo4j_val(v: Any) -> Any: + """Convert Neo4j native types to JSON-serializable Python types.""" + if v is None: + return None + t = type(v).__name__ + if t in ('DateTime', 'Date', 'Time', 'LocalDateTime', 'LocalTime', 'Duration'): + return str(v) + if isinstance(v, (list, tuple)): + return [_neo4j_val(i) for i in v] + if isinstance(v, dict): + return {k: _neo4j_val(vv) for k, vv in v.items()} + return v + + +def _neo4j_props(node_or_rel: Any) -> Dict[str, Any]: + """Return a JSON-safe dict of a Neo4j node or relationship's properties.""" + return {k: _neo4j_val(v) for k, v in dict(node_or_rel).items()} + logger = get_logger('mirofish.graph.graphiti') -def _run_async(coro): +def _make_azure_generic_client(config, client): + """Return an OpenAIGenericClient subclass that uses max_completion_tokens + instead of 
max_tokens — required by gpt-5 / o-series models on Azure.""" + from graphiti_core.llm_client.openai_generic_client import OpenAIGenericClient + import openai as _openai + from graphiti_core.llm_client.errors import RateLimitError as _RateLimitError + from pydantic import BaseModel as _BaseModel + + class _AzureGenericClient(OpenAIGenericClient): + async def _generate_response(self, messages, response_model=None, max_tokens=None, model_size=None): + from openai.types.chat import ChatCompletionMessageParam + if max_tokens is None: + max_tokens = self.max_tokens + openai_messages: list[ChatCompletionMessageParam] = [] + for m in messages: + if m.role == 'user': + openai_messages.append({'role': 'user', 'content': m.content}) + elif m.role == 'system': + openai_messages.append({'role': 'system', 'content': m.content}) + response_format: dict[str, Any] = {'type': 'json_object'} + if response_model is not None: + schema_name = getattr(response_model, '__name__', 'structured_response') + response_format = { + 'type': 'json_schema', + 'json_schema': { + 'name': schema_name, + 'schema': response_model.model_json_schema(), + }, + } + try: + response = await self.client.chat.completions.create( + model=self.model, + messages=openai_messages, + temperature=self.temperature, + max_completion_tokens=max_tokens, + response_format=response_format, + ) + return json.loads(response.choices[0].message.content or '{}') + except _openai.RateLimitError as e: + raise _RateLimitError from e + + return _AzureGenericClient(config=config, client=client) + + +def _run_async(coro, timeout=300): """Run an async coroutine from a sync context using a dedicated thread loop.""" loop = _get_event_loop() future = asyncio.run_coroutine_threadsafe(coro, loop) - return future.result(timeout=120) + return future.result(timeout=timeout) _loop: Optional[asyncio.AbstractEventLoop] = None @@ -45,51 +110,204 @@ class GraphitiBackend(GraphBackend): self._password = password or Config.NEO4J_PASSWORD if not 
self._password: raise ValueError("NEO4J_PASSWORD is not configured") + self._entity_types: Dict[str, Any] = {} + self._edge_types: Dict[str, Any] = {} + self._entity_defs: Dict[str, Any] = {} + self._edge_defs: Dict[str, Any] = {} self._client = self._build_client() + @staticmethod + def _parse_azure_url(raw_url: str): + """Strip /chat/completions or /embeddings suffix from Azure endpoint URLs. + Returns (clean_base_url, default_query_dict).""" + from urllib.parse import urlparse, parse_qs, urlunparse + default_query = {} + if raw_url and ('/chat/completions' in raw_url or '/embeddings' in raw_url): + parsed = urlparse(raw_url) + qs = parse_qs(parsed.query) + if 'api-version' in qs: + default_query['api-version'] = qs['api-version'][0] + clean_path = parsed.path.replace('/chat/completions', '').replace('/embeddings', '').rstrip('/') + raw_url = urlunparse(parsed._replace(path=clean_path, query='')) + return raw_url, default_query + def _build_client(self): from graphiti_core import Graphiti - from graphiti_core.llm_client.openai_client import OpenAIClient + from graphiti_core.llm_client.openai_generic_client import OpenAIGenericClient + from graphiti_core.llm_client.config import LLMConfig from graphiti_core.embedder.openai import OpenAIEmbedder, OpenAIEmbedderConfig - from neo4j import AsyncGraphDatabase + from graphiti_core.cross_encoder.openai_reranker_client import OpenAIRerankerClient + from openai import AsyncOpenAI - llm_client = OpenAIClient( + llm_base_url, llm_query = self._parse_azure_url(Config.LLM_BASE_URL) + small_base_url, small_query = self._parse_azure_url(Config.LLM_SMALL_BASE_URL) + embed_base_url, embed_query = self._parse_azure_url(Config.LLM_EMBED_BASE_URL) + + # Pre-built async clients so api-version is passed as default_query (Azure requirement) + async_llm_client = AsyncOpenAI( + api_key=Config.LLM_API_KEY, + base_url=llm_base_url, + default_query=llm_query or None, + ) + async_small_client = AsyncOpenAI( + api_key=Config.LLM_SMALL_API_KEY, 
+ base_url=small_base_url, + default_query=small_query or None, + ) + async_embed_client = AsyncOpenAI( + api_key=Config.LLM_EMBED_API_KEY, + base_url=embed_base_url, + default_query=embed_query or None, + ) + + llm_config = LLMConfig( api_key=Config.LLM_API_KEY, model=Config.LLM_MODEL_NAME, - base_url=Config.LLM_BASE_URL, + small_model=Config.LLM_SMALL_MODEL_NAME, + base_url=llm_base_url, ) + llm_client = _make_azure_generic_client(config=llm_config, client=async_llm_client) embedder = OpenAIEmbedder( - OpenAIEmbedderConfig( - api_key=Config.LLM_API_KEY, - base_url=Config.LLM_BASE_URL, - ) + config=OpenAIEmbedderConfig( + api_key=Config.LLM_EMBED_API_KEY, + base_url=embed_base_url, + embedding_model=Config.LLM_EMBED_MODEL_NAME, + ), + client=async_embed_client, ) - driver = AsyncGraphDatabase.driver( - self._uri, auth=(self._user, self._password) + cross_encoder = OpenAIRerankerClient(config=llm_config, client=async_small_client) + return Graphiti( + uri=self._uri, + user=self._user, + password=self._password, + llm_client=llm_client, + embedder=embedder, + cross_encoder=cross_encoder, ) - return Graphiti(driver=driver, llm_client=llm_client, embedder=embedder) def create_graph(self, graph_id: str, name: str, description: str = "") -> None: logger.info(f"Graphiti graph namespace ready: {graph_id}") def set_ontology(self, graph_ids: List[str], entities: Dict[str, Any], edges: Dict[str, Any]) -> None: - logger.info("Graphiti uses LLM-driven ontology extraction; set_ontology is a no-op.") + from pydantic import BaseModel as _BaseModel, Field as _Field + + def _make_model(name: str, type_def: Any) -> Any: + if isinstance(type_def, dict): + doc = type_def.get("description", "") + attrs_defs = type_def.get("attributes", []) + else: + doc = getattr(type_def, "__doc__", "") or "" + attrs_defs = [] + + annotations: Dict[str, Any] = {} + fields: Dict[str, Any] = {"__doc__": doc, "__annotations__": annotations} + for attr in attrs_defs: + attr_name = attr.get("name", "") + 
attr_desc = attr.get("description", attr_name) + if not attr_name: + continue + annotations[attr_name] = Optional[str] + fields[attr_name] = _Field(default=None, description=attr_desc) + + return type(name, (_BaseModel,), fields) + + self._entity_types: Dict[str, Any] = { + name: _make_model(name, td) for name, td in (entities or {}).items() + } + self._edge_types: Dict[str, Any] = { + name: _make_model(name, td) for name, td in (edges or {}).items() + } + # Keep a separate plain dict for use in extraction instructions + self._entity_defs: Dict[str, Any] = dict(entities or {}) + self._edge_defs: Dict[str, Any] = dict(edges or {}) + if self._entity_types: + logger.info(f"Graphiti entity types: {list(self._entity_types.keys())}") + if self._edge_types: + logger.info(f"Graphiti edge types: {list(self._edge_types.keys())}") + + def _build_extraction_instructions(self) -> Optional[str]: + """Return custom instructions that constrain extraction to ontology types and attributes.""" + entity_defs = self._entity_defs or {} + edge_defs = self._edge_defs or {} + if not entity_defs and not edge_defs: + return None + + parts = [] + + if entity_defs: + entity_lines = [] + for name, td in entity_defs.items(): + desc = td.get("description", "") if isinstance(td, dict) else "" + attrs = td.get("attributes", []) if isinstance(td, dict) else [] + if attrs: + attr_str = ", ".join( + f"{a['name']} ({a.get('description', a['name'])})" + for a in attrs if a.get("name") + ) + entity_lines.append(f" - {name}: {desc} [attributes: {attr_str}]") + else: + entity_lines.append(f" - {name}: {desc}") + parts.append( + "Only classify entities using these types (use 'Entity' only if none fits):\n" + + "\n".join(entity_lines) + + "\nFor each entity, extract values for the listed attributes when present in the text." + ) + + if edge_defs: + edge_names = list(edge_defs.keys()) + parts.append( + f"Only use these relationship types: {', '.join(edge_names)}. 
" + "Do not invent new relationship type names." + ) + + return "\n\n".join(parts) def add_batch(self, graph_id: str, episodes: List[Any]) -> List[str]: from graphiti_core.nodes import EpisodeType + from datetime import datetime, timezone + import time as _time + + entity_types = self._entity_types or None + edge_types = self._edge_types or None + instructions = self._build_extraction_instructions() ids = [] + for ep in episodes: data = ep["data"] if isinstance(ep, dict) else ep.data ep_id = str(uuid_mod.uuid4()) - _run_async( - self._client.add_episode( - name=ep_id, - episode_body=data, - source=EpisodeType.text, - group_id=graph_id, - ) - ) ids.append(ep_id) + + last_exc = None + for attempt in range(3): + try: + _run_async( + self._client.add_episode( + name=ep_id, + episode_body=data, + source_description="MiroFish document chunk", + reference_time=datetime.now(timezone.utc), + source=EpisodeType.text, + group_id=graph_id, + entity_types=entity_types, + edge_types=edge_types, + custom_extraction_instructions=instructions, + ), + timeout=300, + ) + last_exc = None + break + except Exception as exc: + last_exc = exc + # "node not found" race condition — wait and retry + if "not found" in str(exc).lower() and attempt < 2: + logger.warning(f"Episode {ep_id} attempt {attempt + 1} failed ({exc}), retrying...") + _time.sleep(2 * (attempt + 1)) + else: + raise + + if last_exc: + raise last_exc + return ids def get_episode(self, uuid_: str) -> Any: @@ -100,19 +318,19 @@ class GraphitiBackend(GraphBackend): def get_all_nodes(self, graph_id: str) -> List[Dict[str, Any]]: results = _run_async( self._client.driver.execute_query( - "MATCH (n {group_id: $gid}) RETURN n", - {"gid": graph_id}, + "MATCH (n:Entity {group_id: $gid}) RETURN n", + params={"gid": graph_id}, ) ) nodes = [] for record in results.records: n = record["n"] nodes.append({ - "uuid": n.get("uuid", str(n.id)), + "uuid": n.get("uuid", n.element_id), "name": n.get("name", ""), "labels": list(n.labels), 
"summary": n.get("summary", ""), - "attributes": dict(n), + "attributes": _neo4j_props(n), "created_at": str(n.get("created_at", "")), }) return nodes @@ -121,20 +339,20 @@ class GraphitiBackend(GraphBackend): results = _run_async( self._client.driver.execute_query( "MATCH (s)-[r]->(t) WHERE r.group_id = $gid RETURN s, r, t", - {"gid": graph_id}, + params={"gid": graph_id}, ) ) edges = [] for record in results.records: r = record["r"] edges.append({ - "uuid": r.get("uuid", str(r.id)), + "uuid": r.get("uuid", r.element_id), "name": r.get("name", type(r).__name__), "fact": r.get("fact", ""), "source_node_uuid": record["s"].get("uuid", ""), "target_node_uuid": record["t"].get("uuid", ""), "fact_type": r.get("fact_type", ""), - "attributes": dict(r), + "attributes": _neo4j_props(r), "created_at": str(r.get("created_at", "")), "valid_at": str(r.get("valid_at", "")), "invalid_at": str(r.get("invalid_at", "")), @@ -147,7 +365,7 @@ class GraphitiBackend(GraphBackend): results = _run_async( self._client.driver.execute_query( "MATCH (n {uuid: $uuid}) RETURN n LIMIT 1", - {"uuid": uuid_}, + params={"uuid": uuid_}, ) ) if not results.records: @@ -158,7 +376,7 @@ class GraphitiBackend(GraphBackend): "name": n.get("name", ""), "labels": list(n.labels), "summary": n.get("summary", ""), - "attributes": dict(n), + "attributes": _neo4j_props(n), } def get_node_edges(self, node_uuid: str) -> List[Dict[str, Any]]: @@ -166,14 +384,14 @@ class GraphitiBackend(GraphBackend): self._client.driver.execute_query( "MATCH (n {uuid: $uuid})-[r]->(t) RETURN r, t " "UNION MATCH (s)-[r]->(n {uuid: $uuid}) RETURN r, s as t", - {"uuid": node_uuid}, + params={"uuid": node_uuid}, ) ) edges = [] for record in results.records: r = record["r"] edges.append({ - "uuid": r.get("uuid", str(r.id)), + "uuid": r.get("uuid", r.element_id), "name": r.get("name", ""), "fact": r.get("fact", ""), "source_node_uuid": r.get("source_node_uuid", node_uuid), @@ -198,14 +416,20 @@ class GraphitiBackend(GraphBackend): 
return {"edges": edges, "nodes": []} def add_text(self, graph_id: str, data: str) -> None: - ep_id = str(uuid_mod.uuid4()) from graphiti_core.nodes import EpisodeType + from datetime import datetime, timezone + ep_id = str(uuid_mod.uuid4()) _run_async( self._client.add_episode( name=ep_id, episode_body=data, + source_description="MiroFish document chunk", + reference_time=datetime.now(timezone.utc), source=EpisodeType.text, group_id=graph_id, + entity_types=self._entity_types or None, + edge_types=self._edge_types or None, + custom_extraction_instructions=self._build_extraction_instructions(), ) ) @@ -213,6 +437,6 @@ class GraphitiBackend(GraphBackend): _run_async( self._client.driver.execute_query( "MATCH (n {group_id: $gid}) DETACH DELETE n", - {"gid": graph_id}, + params={"gid": graph_id}, ) ) diff --git a/backend/app/services/graph_builder.py b/backend/app/services/graph_builder.py index d6aa33c3..01882ef7 100644 --- a/backend/app/services/graph_builder.py +++ b/backend/app/services/graph_builder.py @@ -10,8 +10,6 @@ import threading from typing import Dict, Any, List, Optional, Callable from dataclasses import dataclass -from zep_cloud import EntityEdgeSourceTarget - from ..config import Config from ..graph import get_graph_backend from ..models.task import TaskManager, TaskStatus @@ -197,6 +195,25 @@ class GraphBuilderService: def set_ontology(self, graph_id: str, ontology: Dict[str, Any]): """Set graph ontology (public method)""" + from ..config import Config + if Config.GRAPH_BACKEND != "zep": + entities = { + e["name"]: { + "description": e.get("description", ""), + "attributes": e.get("attributes", []), + } + for e in ontology.get("entity_types", []) + } + edges = { + e["name"]: { + "description": e.get("description", ""), + "attributes": e.get("attributes", []), + } + for e in ontology.get("edge_types", []) + } + self._graph.set_ontology(graph_ids=[graph_id], entities=entities, edges=edges) + return + import warnings from typing import Optional from 
pydantic import Field @@ -210,60 +227,51 @@ class GraphBuilderService: RESERVED_NAMES = {'uuid', 'name', 'group_id', 'name_embedding', 'summary', 'created_at'} def safe_attr_name(attr_name: str) -> str: - """Convert reserved names to safe attribute names""" if attr_name.lower() in RESERVED_NAMES: return f"entity_{attr_name}" return attr_name - + # Dynamically create entity types entity_types = {} for entity_def in ontology.get("entity_types", []): name = entity_def["name"] description = entity_def.get("description", f"A {name} entity.") - # Build attribute dict and type annotations (required by Pydantic v2) attrs = {"__doc__": description} annotations = {} for attr_def in entity_def.get("attributes", []): - attr_name = safe_attr_name(attr_def["name"]) # Use safe name + attr_name = safe_attr_name(attr_def["name"]) attr_desc = attr_def.get("description", attr_name) - # Zep API requires Field description — this is mandatory attrs[attr_name] = Field(description=attr_desc, default=None) - annotations[attr_name] = Optional[EntityText] # Type annotation + annotations[attr_name] = Optional[EntityText] attrs["__annotations__"] = annotations - - # Dynamically create class entity_class = type(name, (EntityModel,), attrs) entity_class.__doc__ = description entity_types[name] = entity_class - + # Dynamically create edge types edge_definitions = {} for edge_def in ontology.get("edge_types", []): name = edge_def["name"] description = edge_def.get("description", f"A {name} relationship.") - # Build attribute dict and type annotations attrs = {"__doc__": description} annotations = {} for attr_def in edge_def.get("attributes", []): - attr_name = safe_attr_name(attr_def["name"]) # Use safe name + attr_name = safe_attr_name(attr_def["name"]) attr_desc = attr_def.get("description", attr_name) - # Zep API requires Field description — this is mandatory attrs[attr_name] = Field(description=attr_desc, default=None) - annotations[attr_name] = Optional[str] # Edge attributes use str type + 
annotations[attr_name] = Optional[str] attrs["__annotations__"] = annotations - - # Dynamically create class class_name = ''.join(word.capitalize() for word in name.split('_')) edge_class = type(class_name, (EdgeModel,), attrs) edge_class.__doc__ = description - - # Build source_targets + + from zep_cloud import EntityEdgeSourceTarget source_targets = [] for st in edge_def.get("source_targets", []): source_targets.append( @@ -272,10 +280,10 @@ class GraphBuilderService: target=st.get("target", "Entity") ) ) - + if source_targets: edge_definitions[name] = (edge_class, source_targets) - + if entity_types or edge_definitions: self._graph.set_ontology( graph_ids=[graph_id], diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index 1dd4d879..d7579d81 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -8,7 +8,7 @@ import logging import re from typing import Dict, Any, List, Optional from ..utils.llm_client import LLMClient -from ..utils.locale import get_language_instruction +from ..utils.locale import get_language_instruction, t logger = logging.getLogger(__name__) @@ -62,29 +62,29 @@ Please output JSON format with the following structure: { "entity_types": [ { - "name": "Entity type name (English, PascalCase)", - "description": "Brief description (English, max 100 characters)", + "name": "Entity type name (PascalCase, in the language specified by the language instruction)", + "description": "Brief description (in the language specified by the language instruction, max 100 characters)", "attributes": [ { - "name": "Attribute name (English, snake_case)", + "name": "Attribute name (snake_case, in the language specified by the language instruction)", "type": "text", - "description": "Attribute description" + "description": "Attribute description (in the language specified by the language instruction)" } ], - "examples": ["Example entity 1", "Example entity 2"] + 
"examples": ["Example entity 1 (in the language specified by the language instruction)", "Example entity 2"] } ], "edge_types": [ { - "name": "Relationship type name (English, UPPER_SNAKE_CASE)", - "description": "Brief description (English, max 100 characters)", + "name": "Relationship type name (UPPER_SNAKE_CASE, in the language specified by the language instruction)", + "description": "Brief description (in the language specified by the language instruction, max 100 characters)", "source_targets": [ {"source": "Source entity type", "target": "Target entity type"} ], "attributes": [] } ], - "analysis_summary": "Brief analysis summary of the text content" + "analysis_summary": "Brief analysis summary of the text content (in the language specified by the language instruction)" } ``` @@ -92,20 +92,21 @@ Please output JSON format with the following structure: ### 1. Entity Type Design — Must Be Strictly Followed -**Quantity requirement: exactly 10 entity types** +**Quantity requirement: exactly 20 entity types** **Hierarchy requirement (must include both specific types and fallback types)**: -Your 10 entity types must include the following levels: +Your 20 entity types must include the following levels: A. **Fallback types (required, placed as the last 2 in the list)**: - `Person`: Fallback type for any individual person. Use this when a person does not fit any other more specific person type. - `Organization`: Fallback type for any organization. Use this when an organization does not fit any other more specific organization type. -B. **Specific types (8 types, designed based on text content)**: +B. 
**Specific types (18 types, designed based on text content)**: - Design more specific types for the main roles that appear in the text - - Example: if the text involves an academic event, you might have `Student`, `Professor`, `University` - - Example: if the text involves a business event, you might have `Company`, `CEO`, `Employee` + - Example: if the text involves an academic event, you might have `Student`, `Professor`, `University`, `ResearchGroup`, `Alumni`, etc. + - Example: if the text involves a business event, you might have `Company`, `CEO`, `Employee`, `Investor`, `Regulator`, etc. + - Ensure broad coverage of all actor categories present in the text **Why fallback types are needed**: - Various people appear in text, such as "primary and secondary school teachers", "passersby", "some netizen" @@ -119,9 +120,10 @@ B. **Specific types (8 types, designed based on text content)**: ### 2. Relationship Type Design -- Quantity: 6-10 +- Quantity: 12-20 - Relationships should reflect real connections in social media interactions - Ensure the source_targets in relationships cover the entity types you have defined +- Aim for rich coverage: include hierarchical, collaborative, adversarial, and informational relationships ### 3. Attribute Design @@ -129,47 +131,23 @@ B. **Specific types (8 types, designed based on text content)**: - **Note**: Attribute names must not use `name`, `uuid`, `group_id`, `created_at`, `summary` (these are system reserved words) - Recommended: `full_name`, `title`, `role`, `position`, `location`, `description`, etc. -## Entity Type Reference +## Entity and Relationship Type Reference -**Individual types (specific)**: -- Student: student -- Professor: professor/scholar -- Journalist: journalist -- Celebrity: celebrity/influencer -- Executive: corporate executive -- Official: government official -- Lawyer: lawyer -- Doctor: doctor +Use the language specified in the language instruction for ALL names. 
Keep PascalCase for entity names and UPPER_SNAKE_CASE for relationship names, but use words from the target language. -**Individual types (fallback)**: -- Person: any individual (use when not fitting the specific types above) +**Individual type examples** (translate to target language): +- A person who is a student → StudentName in target language, PascalCase +- A person who is a journalist → JournalistName in target language, PascalCase +- Fallback for any individual → PersonName in target language, PascalCase -**Organization types (specific)**: -- University: university/college -- Company: company/enterprise -- GovernmentAgency: government agency -- MediaOutlet: media organization -- Hospital: hospital -- School: primary/secondary school -- NGO: non-governmental organization +**Organization type examples** (translate to target language): +- A university → UniversityName in target language, PascalCase +- A government agency → AgencyName in target language, PascalCase +- Fallback for any organization → OrganizationName in target language, PascalCase -**Organization types (fallback)**: -- Organization: any organization (use when not fitting the specific types above) - -## Relationship Type Reference - -- WORKS_FOR: works for -- STUDIES_AT: studies at -- AFFILIATED_WITH: affiliated with -- REPRESENTS: represents -- REGULATES: regulates -- REPORTS_ON: reports on -- COMMENTS_ON: comments on -- RESPONDS_TO: responds to -- SUPPORTS: supports -- OPPOSES: opposes -- COLLABORATES_WITH: collaborates with -- COMPETES_WITH: competes with +**Relationship type examples** (translate to target language): +- works for → WORKS_FOR translated to target language, UPPER_SNAKE_CASE +- reports on → REPORTS_ON translated to target language, UPPER_SNAKE_CASE """ @@ -209,17 +187,17 @@ class OntologyGenerator: lang_instruction ) - system_prompt = f"LANGUAGE INSTRUCTION (HIGHEST PRIORITY — MUST BE FOLLOWED): {lang_instruction} All description fields, analysis_summary, and examples MUST be 
written in this language.\n\n{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above." + system_prompt = f"LANGUAGE INSTRUCTION (HIGHEST PRIORITY — MUST BE FOLLOWED): {lang_instruction} ALL fields including names, descriptions, analysis_summary, and examples MUST be written in this language.\n\n{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in PascalCase (e.g., 'AgenciaGovern', 'FuncionariPublic'). Relationship type names MUST be in UPPER_SNAKE_CASE (e.g., 'TREBALLA_PER', 'RESPON_A'). Attribute names MUST be in snake_case. All names, descriptions, and examples must use the language specified above." messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_message} ] - # Call LLM + # Call LLM — 20 entity types + 20 edge types need more tokens than the old 10+10 result = self.llm_client.chat_json( messages=messages, temperature=0.3, - max_tokens=4096 + max_tokens=8192 ) # Validate and post-process @@ -268,11 +246,12 @@ class OntologyGenerator: Based on the content above, design entity types and relationship types suitable for social opinion simulation. **Mandatory rules**: -1. Output exactly 10 entity types +1. Output exactly 20 entity types 2. The last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback) -3. The first 8 are specific types designed from the document content +3. The first 18 are specific types designed from the document content 4. All entity types must be real-world subjects capable of speaking out, not abstract concepts 5. Attribute names must not use reserved words: name, uuid, group_id — use full_name, org_name, etc. instead +6. 
Output 12-20 relationship types covering hierarchical, collaborative, adversarial, and informational relationships {lang_instruction} """ @@ -330,9 +309,10 @@ Based on the content above, design entity types and relationship types suitable if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." - # Zep API limit: maximum 10 custom entity types and 10 custom edge types - MAX_ENTITY_TYPES = 10 - MAX_EDGE_TYPES = 10 + # Limits: Graphiti/Neo4j has no hard cap; Zep Cloud allows max 10 of each. + # We keep a generous cap for Graphiti and enforce Zep's limit at build time via config. + MAX_ENTITY_TYPES = 20 + MAX_EDGE_TYPES = 20 # Deduplicate: keep first occurrence by name seen_names = set() @@ -346,31 +326,35 @@ Based on the content above, design entity types and relationship types suitable logger.warning(f"Duplicate entity type '{name}' removed during validation") result["entity_types"] = deduped - # Fallback type definitions + # Fallback type definitions — names and descriptions come from i18n so they match + # the locale used for the rest of the ontology (e.g. "Persona"/"Organització" in Catalan). 
+ person_fallback_name = _to_pascal_case(t("step1.ontologyFallbackPersonName") or "Person") + org_fallback_name = _to_pascal_case(t("step1.ontologyFallbackOrgName") or "Organization") + person_fallback = { - "name": "Person", - "description": "Any individual person not fitting other specific person types.", + "name": person_fallback_name, + "description": t("step1.ontologyFallbackPersonDesc") or "Any individual person not fitting other specific person types.", "attributes": [ {"name": "full_name", "type": "text", "description": "Full name of the person"}, {"name": "role", "type": "text", "description": "Role or occupation"} ], - "examples": ["ordinary citizen", "anonymous netizen"] + "examples": t("step1.ontologyFallbackPersonExamples") or ["ordinary citizen", "anonymous netizen"] } organization_fallback = { - "name": "Organization", - "description": "Any organization not fitting other specific organization types.", + "name": org_fallback_name, + "description": t("step1.ontologyFallbackOrgDesc") or "Any organization not fitting other specific organization types.", "attributes": [ {"name": "org_name", "type": "text", "description": "Name of the organization"}, {"name": "org_type", "type": "text", "description": "Type of organization"} ], - "examples": ["small business", "community group"] + "examples": t("step1.ontologyFallbackOrgExamples") or ["small business", "community group"] } - # Check whether fallback types already exist + # Check whether fallback types already exist (match by i18n name) entity_names = {e["name"] for e in result["entity_types"]} - has_person = "Person" in entity_names - has_organization = "Organization" in entity_names + has_person = person_fallback_name in entity_names + has_organization = org_fallback_name in entity_names # Collect fallback types to add fallbacks_to_add = [] @@ -383,11 +367,9 @@ Based on the content above, design entity types and relationship types suitable current_count = len(result["entity_types"]) needed_slots = 
len(fallbacks_to_add) - # If adding them would exceed 10, remove some existing types + # If adding them would exceed the limit, remove some existing types from the end if current_count + needed_slots > MAX_ENTITY_TYPES: - # Calculate how many to remove to_remove = current_count + needed_slots - MAX_ENTITY_TYPES - # Remove from the end (preserve the more important specific types at the front) result["entity_types"] = result["entity_types"][:-to_remove] # Add fallback types diff --git a/locales/ca.json b/locales/ca.json index 96b1cb47..1d7843bf 100644 --- a/locales/ca.json +++ b/locales/ca.json @@ -89,6 +89,12 @@ "ontologyGenerating": "Generant", "ontologyPending": "Pendent", "ontologyDesc": "El LLM analitza el contingut del document i els requisits de simulació, extreu llavors de realitat i auto-genera una estructura d'ontologia adequada", + "ontologyFallbackPersonName": "Persona", + "ontologyFallbackPersonDesc": "Qualsevol persona individual que no encaixa en altres tipus de persona més específics.", + "ontologyFallbackPersonExamples": ["ciutadà ordinari", "internauta anònim"], + "ontologyFallbackOrgName": "Organització", + "ontologyFallbackOrgDesc": "Qualsevol organització que no encaixa en altres tipus d'organització més específics.", + "ontologyFallbackOrgExamples": ["petita empresa", "grup comunitari"], "analyzingDocs": "Analitzant documents...", "graphRagBuild": "Construcció de GraphRAG", "graphRagDesc": "Basant-se en l'ontologia generada, els documents es divideixen automàticament en fragments i s'envien a Zep per construir un graf de coneixement, extraient entitats i relacions, formant memòria temporal i resums de comunitat", diff --git a/locales/en.json b/locales/en.json index 768796ac..0ab01c4c 100644 --- a/locales/en.json +++ b/locales/en.json @@ -89,6 +89,12 @@ "ontologyGenerating": "Generating", "ontologyPending": "Pending", "ontologyDesc": "LLM analyzes document content and simulation requirements, extracts reality seeds, and auto-generates a suitable 
ontology structure", + "ontologyFallbackPersonName": "Person", + "ontologyFallbackPersonDesc": "Any individual person not fitting other specific person types.", + "ontologyFallbackPersonExamples": ["ordinary citizen", "anonymous netizen"], + "ontologyFallbackOrgName": "Organization", + "ontologyFallbackOrgDesc": "Any organization not fitting other specific organization types.", + "ontologyFallbackOrgExamples": ["small business", "community group"], "analyzingDocs": "Analyzing documents...", "graphRagBuild": "GraphRAG Build", "graphRagDesc": "Based on the generated ontology, documents are auto-chunked and sent to Zep to build a knowledge graph, extracting entities and relations, forming temporal memory and community summaries", diff --git a/locales/es.json b/locales/es.json index 11ede00a..dce645e8 100644 --- a/locales/es.json +++ b/locales/es.json @@ -89,6 +89,12 @@ "ontologyGenerating": "Generando", "ontologyPending": "Pendiente", "ontologyDesc": "El LLM analiza el contenido del documento y los requisitos de simulación, extrae semillas de la realidad y genera automáticamente la estructura ontológica adecuada", + "ontologyFallbackPersonName": "Person", + "ontologyFallbackPersonDesc": "Cualquier persona individual que no encaja en otros tipos de persona más específicos.", + "ontologyFallbackPersonExamples": ["ciudadano ordinario", "internauta anónimo"], + "ontologyFallbackOrgName": "Organization", + "ontologyFallbackOrgDesc": "Cualquier organización que no encaja en otros tipos de organización más específicos.", + "ontologyFallbackOrgExamples": ["pequeña empresa", "grupo comunitario"], "analyzingDocs": "Analizando documentos...", "graphRagBuild": "Construcción de GraphRAG", "graphRagDesc": "A partir de la ontología generada, los documentos se fragmentan automáticamente y se envían a Zep para construir un grafo de conocimiento, extrayendo entidades y relaciones, formando memoria temporal y resúmenes de comunidad", diff --git a/locales/zh.json b/locales/zh.json index 
c17e414c..ba88b109 100644 --- a/locales/zh.json +++ b/locales/zh.json @@ -89,6 +89,12 @@ "ontologyGenerating": "生成中", "ontologyPending": "等待", "ontologyDesc": "LLM分析文档内容与模拟需求,提取出现实种子,自动生成合适的本体结构", + "ontologyFallbackPersonName": "Person", + "ontologyFallbackPersonDesc": "任何不适合其他具体人物类型的个人。", + "ontologyFallbackPersonExamples": ["普通市民", "匿名网友"], + "ontologyFallbackOrgName": "Organization", + "ontologyFallbackOrgDesc": "任何不适合其他具体组织类型的组织。", + "ontologyFallbackOrgExamples": ["小型企业", "社区团体"], "analyzingDocs": "正在分析文档...", "graphRagBuild": "GraphRAG构建", "graphRagDesc": "基于生成的本体,将文档自动分块后调用 Zep 构建知识图谱,提取实体和关系,并形成时序记忆与社区摘要",