"""
Ontology generation service.

Interface 1: analyze text content and generate entity-type and
relationship-type definitions suitable for social simulation.
"""
|
||
|
||
import json
|
||
import logging
|
||
import re
|
||
from typing import Dict, Any, List, Optional
|
||
from ..utils.llm_client import LLMClient
|
||
from ..utils.locale import get_language_instruction
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _to_pascal_case(name: str) -> str:
|
||
"""将任意格式的名称转换为 PascalCase(如 'works_for' -> 'WorksFor', 'person' -> 'Person')"""
|
||
# 按非字母数字字符分割
|
||
parts = re.split(r'[^a-zA-Z0-9]+', name)
|
||
# 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case'])
|
||
words = []
|
||
for part in parts:
|
||
words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_'))
|
||
# 每个词首字母大写,过滤空串
|
||
result = ''.join(word.capitalize() for word in words if word)
|
||
return result if result else 'Unknown'
|
||
|
||
|
||
# System prompt for ontology generation. It is sent to the LLM verbatim, so
# every character inside the literal is runtime text: it must contain only the
# intended instructions (no stray extraction artifacts).
ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge-graph ontology designer. Your task is to analyze the supplied text and simulation requirement and design entity types and relationship types suitable for a **social-media public-opinion simulation**.

**Important: you must output valid JSON data and nothing else.**

## Core Task Background

We are building a **social-media public-opinion simulation system**. In this system:
- Every entity is an "account" or "actor" that can post on social media, interact with other accounts, and propagate information.
- Entities influence each other, repost, comment on, and respond to one another.
- We need to simulate how each side of a public-opinion event reacts and how information flows.

Therefore, **entities must be real-world subjects that can plausibly post on social media and interact with others**:

**Acceptable**:
- Specific individuals (public figures, parties to the event, opinion leaders, experts and scholars, ordinary people)
- Companies and businesses (including their official accounts)
- Organizations (universities, associations, NGOs, unions, etc.)
- Government departments and regulators
- Media organizations (newspapers, broadcasters, independent media, websites)
- Social-media platforms themselves
- Representatives of specific groups (alumni associations, fan communities, advocacy groups, etc.)

**Not acceptable**:
- Abstract concepts (such as "public opinion", "sentiment", "trend")
- Topics or subjects (such as "academic integrity", "education reform")
- Viewpoints or stances (such as "supporters", "opponents")

## Output Format

Return JSON with the following structure:

```json
{
"entity_types": [
{
"name": "entity type name (English, PascalCase)",
"description": "short description (English, no more than 100 characters)",
"attributes": [
{
"name": "attribute name (English, snake_case)",
"type": "text",
"description": "attribute description"
}
],
"examples": ["example entity 1", "example entity 2"]
}
],
"edge_types": [
{
"name": "relationship type name (English, UPPER_SNAKE_CASE)",
"description": "short description (English, no more than 100 characters)",
"source_targets": [
{"source": "source entity type", "target": "target entity type"}
],
"attributes": []
}
],
"analysis_summary": "brief analytical summary of the text content"
}
```

## Design Guidelines (must be followed)

### 1. Entity Type Design - strictly required

**Count requirement: exactly 10 entity types.**

**Hierarchy requirement (must include both concrete types and fallback types)**:

Your 10 entity types must form the following hierarchy:

A. **Fallback types (mandatory; placed as the last 2 entries)**:
- `Person`: the fallback type for any individual. When a person does not fit any more specific person type, classify them here.
- `Organization`: the fallback type for any organization. When an organization does not fit any more specific organization type, classify it here.

B. **Concrete types (8 entries, designed from the text content)**:
- Define more specific types for the major roles that appear in the text.
- Example: for an academic event, you might use `Student`, `Professor`, `University`.
- Example: for a business event, you might use `Company`, `CEO`, `Employee`.

**Why fallback types are required**:
- The text will mention many kinds of people, e.g. "primary-school teachers", "passersby", "an anonymous netizen".
- When no dedicated type fits, they should fall into `Person`.
- Likewise, small organizations and ad-hoc groups should fall into `Organization`.

**Principles for concrete types**:
- Identify the high-frequency or pivotal role types in the text.
- Each concrete type should have a clear boundary and avoid overlap.
- The description must clearly state how the concrete type differs from the corresponding fallback type.

### 2. Relationship Type Design

- Count: 6 to 10.
- Relationships should reflect realistic interactions on social media.
- Ensure each relationship's source_targets cover the entity types you defined.

### 3. Attribute Design

- 1 to 3 key attributes per entity type.
- **Note**: attribute names must not use `name`, `uuid`, `group_id`, `created_at`, or `summary` (these are reserved by the system).
- Recommended names: `full_name`, `title`, `role`, `position`, `location`, `description`, etc.

## Entity Type Reference

**Individuals (concrete)**:
- Student: a student.
- Professor: a professor or scholar.
- Journalist: a journalist.
- Celebrity: a celebrity or internet personality.
- Executive: a senior business leader.
- Official: a government official.
- Lawyer: a lawyer.
- Doctor: a physician.

**Individuals (fallback)**:
- Person: any individual person (use when no concrete person type above applies).

**Organizations (concrete)**:
- University: a university or higher-education institution.
- Company: a company or business.
- GovernmentAgency: a government agency.
- MediaOutlet: a media organization.
- Hospital: a hospital.
- School: a primary or secondary school.
- NGO: a non-governmental organization.

**Organizations (fallback)**:
- Organization: any organization (use when no concrete organization type above applies).

## Relationship Type Reference

- WORKS_FOR: works for.
- STUDIES_AT: studies at.
- AFFILIATED_WITH: is affiliated with.
- REPRESENTS: represents.
- REGULATES: regulates.
- REPORTS_ON: reports on.
- COMMENTS_ON: comments on.
- RESPONDS_TO: responds to.
- SUPPORTS: supports.
- OPPOSES: opposes.
- COLLABORATES_WITH: collaborates with.
- COMPETES_WITH: competes with.
"""
|
||
|
||
|
||
class OntologyGenerator:
    """Ontology generator.

    Asks an LLM to analyze document text and propose entity-type and
    relationship-type definitions, normalizes that (untrusted) output so it
    respects the naming rules and size limits noted in this class, and can
    render the resulting ontology as Python source code.
    """

    # Maximum number of characters of document text passed to the LLM (50,000).
    MAX_TEXT_LENGTH_FOR_LLM = 50000

    # Zep API limits (per the original author's note): at most 10 custom
    # entity types and at most 10 custom edge types.
    MAX_ENTITY_TYPES = 10
    MAX_EDGE_TYPES = 10

    # Descriptions longer than this are cut and suffixed with "...".
    MAX_DESCRIPTION_LENGTH = 100

    # Names of the catch-all types that must always survive trimming.
    _FALLBACK_TYPE_NAMES = ("Person", "Organization")

    def __init__(self, llm_client: Optional["LLMClient"] = None):
        """Store the LLM client, creating a default ``LLMClient()`` if none is given.

        Args:
            llm_client: Client used for the chat completion call. When omitted
                (or falsy) a new ``LLMClient`` is constructed.
        """
        self.llm_client = llm_client or LLMClient()

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate an ontology definition.

        Args:
            document_texts: List of document texts.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra context appended to the prompt.

        Returns:
            The ontology definition (``entity_types``, ``edge_types``,
            ``analysis_summary``) after validation and post-processing.
        """
        user_message = self._build_user_message(
            document_texts,
            simulation_requirement,
            additional_context
        )

        lang_instruction = get_language_instruction()
        # Identifiers must stay English regardless of the output language
        # requested by lang_instruction; only free-text fields are localized.
        system_prompt = (
            f"{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\n"
            "IMPORTANT: Entity type names MUST be in English PascalCase "
            "(e.g., 'PersonEntity', 'MediaOrganization'). "
            "Relationship type names MUST be in English UPPER_SNAKE_CASE "
            "(e.g., 'WORKS_FOR'). "
            "Attribute names MUST be in English snake_case. "
            "Only description fields and analysis_summary should use the "
            "specified language above."
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]

        # Call the LLM. NOTE(review): chat_json presumably returns the parsed
        # JSON object as a dict - confirm against LLMClient.
        result = self.llm_client.chat_json(
            messages=messages,
            temperature=0.3,
            max_tokens=4096
        )

        # Validate and post-process the raw LLM output.
        return self._validate_and_process(result)

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str]
    ) -> str:
        """Build the user message sent to the LLM.

        Args:
            document_texts: Document texts; joined with a ``---`` separator.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra context; its section is added
                only when this value is truthy.

        Returns:
            The Markdown-formatted user message.
        """
        combined_text = "\n\n---\n\n".join(document_texts)
        original_length = len(combined_text)

        # Truncate over-long input. This only limits what is sent to the LLM
        # for ontology analysis; per the original note it does not affect
        # graph construction.
        if original_length > self.MAX_TEXT_LENGTH_FOR_LLM:
            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
            combined_text += f"\n\n...(original text is {original_length} characters; only the first {self.MAX_TEXT_LENGTH_FOR_LLM} characters were used for ontology analysis)..."

        parts = [
            "## Simulation Requirement\n\n",
            f"{simulation_requirement}\n\n",
            "## Document Content\n\n",
            f"{combined_text}\n",
        ]

        if additional_context:
            parts.append(f"\n## Additional Context\n\n{additional_context}\n")

        parts.append(
            "\n"
            "Based on the content above, design entity types and relationship types suitable for a social-media public-opinion simulation.\n"
            "\n"
            "**Rules that must be followed**:\n"
            "1. You must output exactly 10 entity types.\n"
            "2. The last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback).\n"
            "3. The first 8 are concrete types designed from the text content.\n"
            "4. Every entity type must be a real-world subject that can post on social media; abstract concepts are not allowed.\n"
            "5. Attribute names must not use reserved words such as name, uuid, group_id; use alternatives such as full_name, org_name, etc.\n"
        )

        return "".join(parts)

    @staticmethod
    def _to_upper_snake_case(name: str) -> str:
        """Normalize a relationship name to SCREAMING_SNAKE_CASE.

        ``'worksFor'``, ``'works for'`` and ``'works-for'`` all become
        ``'WORKS_FOR'``; names already made of upper-case words joined by
        single underscores are returned unchanged. When the name contains no
        ASCII letters or digits it is only upper-cased.
        """
        # Mark camelCase boundaries, then split on any non-alphanumeric run.
        marked = re.sub(r'([a-z])([A-Z])', r'\1_\2', name)
        words = [word for word in re.split(r'[^a-zA-Z0-9]+', marked) if word]
        if not words:
            return name.upper()
        return '_'.join(words).upper()

    @staticmethod
    def _truncate_description(item: Dict[str, Any], limit: int) -> None:
        """Cut ``item['description']`` to ``limit`` characters, ending in '...'.

        Missing or non-string descriptions are left untouched.
        """
        description = item.get("description")
        if isinstance(description, str) and len(description) > limit:
            item["description"] = description[:limit - 3] + "..."

    @staticmethod
    def _fallback_definitions() -> List[Dict[str, Any]]:
        """Return fresh definitions of the two fallback entity types."""
        return [
            {
                "name": "Person",
                "description": "Any individual person not fitting other specific person types.",
                "attributes": [
                    {"name": "full_name", "type": "text", "description": "Full name of the person"},
                    {"name": "role", "type": "text", "description": "Role or occupation"}
                ],
                "examples": ["ordinary citizen", "anonymous netizen"]
            },
            {
                "name": "Organization",
                "description": "Any organization not fitting other specific organization types.",
                "attributes": [
                    {"name": "org_name", "type": "text", "description": "Name of the organization"},
                    {"name": "org_type", "type": "text", "description": "Type of organization"}
                ],
                "examples": ["small business", "community group"]
            },
        ]

    @classmethod
    def _finalize_entity_types(
        cls,
        entity_types: List[Dict[str, Any]],
        max_types: int
    ) -> List[Dict[str, Any]]:
        """Append missing fallback types and enforce the entity-type limit.

        Missing ``Person`` / ``Organization`` definitions are appended at the
        end. If the list is then longer than ``max_types``, concrete types are
        removed starting from the end (earlier types are treated as more
        important); fallback types are never removed to make room, so an
        already-present fallback cannot be lost while adding the other one.

        Args:
            entity_types: De-duplicated entity type dicts with a ``name`` key.
            max_types: Maximum number of entity types to keep.

        Returns:
            A new list with at most ``max_types`` entries.
        """
        present = {entity.get("name") for entity in entity_types}
        finalized = list(entity_types)
        finalized.extend(
            fallback for fallback in cls._fallback_definitions()
            if fallback["name"] not in present
        )

        overflow = len(finalized) - max_types
        if overflow > 0:
            kept_reversed = []
            for entity in reversed(finalized):
                if overflow > 0 and entity.get("name") not in cls._FALLBACK_TYPE_NAMES:
                    overflow -= 1
                    continue
                kept_reversed.append(entity)
            finalized = kept_reversed[::-1]

        # Defensive: only relevant when max_types is smaller than the number
        # of fallback types.
        return finalized[:max_types]

    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and post-process the raw LLM result in place.

        Guarantees the required top-level fields, normalizes entity names to
        PascalCase and edge names to SCREAMING_SNAKE_CASE (noted in the
        original code as Zep API requirements), rewrites ``source_targets``
        references to the normalized entity names, truncates descriptions,
        removes duplicate or unusable entries, ensures the ``Person`` and
        ``Organization`` fallback types exist, and enforces the type limits.

        Args:
            result: Parsed LLM output; treated as untrusted.

        Returns:
            The same ``result`` dict, normalized.
        """
        # Make sure the required fields exist with usable container types.
        if not isinstance(result.get("entity_types"), list):
            result["entity_types"] = []
        if not isinstance(result.get("edge_types"), list):
            result["edge_types"] = []
        if result.get("analysis_summary") is None:
            result["analysis_summary"] = ""

        # --- Entity types -------------------------------------------------
        # Maps each original name to its PascalCase form so that edge
        # source_targets references can be corrected afterwards.
        entity_name_map = {}
        seen_names = set()
        entities = []
        for entity in result["entity_types"]:
            if not isinstance(entity, dict):
                logger.warning("Ignoring malformed entity type entry: %r", entity)
                continue

            original_name = entity.get("name")
            if not isinstance(original_name, str):
                # Entries without a usable name cannot be registered.
                continue

            name = _to_pascal_case(original_name)
            entity["name"] = name
            if name != original_name:
                logger.warning("Entity type name '%s' auto-converted to '%s'", original_name, name)
            entity_name_map[original_name] = name

            if not isinstance(entity.get("attributes"), list):
                entity["attributes"] = []
            if not isinstance(entity.get("examples"), list):
                entity["examples"] = []
            self._truncate_description(entity, self.MAX_DESCRIPTION_LENGTH)

            # De-duplicate by name, keeping the first occurrence.
            if name in seen_names:
                logger.warning("Duplicate entity type '%s' removed during validation", name)
                continue
            seen_names.add(name)
            entities.append(entity)

        def resolve_entity_ref(ref):
            """Map a source/target reference onto a normalized entity name."""
            if not isinstance(ref, str):
                return ref
            if ref in entity_name_map:
                return entity_name_map[ref]
            # The LLM may spell a reference differently from the entity name
            # (e.g. 'media_outlet' for 'MediaOutlet').
            candidate = _to_pascal_case(ref)
            return candidate if candidate in seen_names else ref

        # --- Edge types ---------------------------------------------------
        edges = []
        for edge in result["edge_types"]:
            if not isinstance(edge, dict):
                logger.warning("Ignoring malformed edge type entry: %r", edge)
                continue

            original_name = edge.get("name")
            if not isinstance(original_name, str) or not original_name:
                logger.warning("Edge type without a usable name removed during validation")
                continue

            edge["name"] = self._to_upper_snake_case(original_name)
            if edge["name"] != original_name:
                logger.warning("Edge type name '%s' auto-converted to '%s'", original_name, edge["name"])

            source_targets = edge.get("source_targets")
            if not isinstance(source_targets, list):
                source_targets = []
            source_targets = [st for st in source_targets if isinstance(st, dict)]
            for st in source_targets:
                if "source" in st:
                    st["source"] = resolve_entity_ref(st["source"])
                if "target" in st:
                    st["target"] = resolve_entity_ref(st["target"])
            edge["source_targets"] = source_targets

            if not isinstance(edge.get("attributes"), list):
                edge["attributes"] = []
            self._truncate_description(edge, self.MAX_DESCRIPTION_LENGTH)
            edges.append(edge)

        # --- Fallback types and limits -------------------------------------
        names_before = [entity["name"] for entity in entities]
        result["entity_types"] = self._finalize_entity_types(entities, self.MAX_ENTITY_TYPES)
        names_after = {entity["name"] for entity in result["entity_types"]}
        for name in names_before:
            if name not in names_after:
                logger.warning(
                    "Entity type '%s' removed to stay within the limit of %d entity types",
                    name, self.MAX_ENTITY_TYPES
                )

        result["edge_types"] = edges[:self.MAX_EDGE_TYPES]

        return result

    @staticmethod
    def _docstring_line(text: Any) -> str:
        """Render ``text`` as an indented one-statement class docstring.

        Backslashes and double quotes are escaped so that arbitrary
        description text cannot terminate the generated string literal.
        """
        escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
        return f'    """{escaped}"""'

    @staticmethod
    def _attribute_lines(attrs: Any) -> List[str]:
        """Render attribute definitions as ``EntityText`` field declarations.

        Attributes whose name is missing or not a valid Python identifier are
        skipped, because they cannot be emitted as class attributes. Returns
        ``['    pass']`` when no field is emitted.
        """
        lines = []
        for attr in attrs if isinstance(attrs, list) else []:
            if not isinstance(attr, dict):
                continue
            attr_name = attr.get("name")
            if not isinstance(attr_name, str) or not attr_name.isidentifier():
                continue
            attr_desc = attr.get("description") or attr_name
            lines.append(f'    {attr_name}: EntityText = Field(')
            # json.dumps yields a double-quoted literal with quotes,
            # backslashes and control characters escaped.
            lines.append(f'        description={json.dumps(str(attr_desc), ensure_ascii=False)},')
            lines.append('        default=None')
            lines.append('    )')
        return lines or ['    pass']

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """Convert an ontology definition into Python source code (similar to ontology.py).

        The generated module declares one ``EntityModel`` subclass per entity
        type, one ``EdgeModel`` subclass per edge type, and the lookup tables
        ``ENTITY_TYPES``, ``EDGE_TYPES`` and ``EDGE_SOURCE_TARGETS``.

        Args:
            ontology: Ontology definition with ``entity_types`` and
                ``edge_types`` lists. Type names are emitted as class names
                and are expected to be valid identifiers.

        Returns:
            The generated Python code as a string.

        Raises:
            KeyError: If an entity or edge type has no ``name`` key.
        """
        entity_types = ontology.get("entity_types", [])
        edge_types = ontology.get("edge_types", [])

        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成,用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Entity type classes.
        for entity in entity_types:
            name = entity["name"]
            desc = entity.get("description") or f"A {name} entity."
            code_lines.append(f'class {name}(EntityModel):')
            code_lines.append(self._docstring_line(desc))
            code_lines.extend(self._attribute_lines(entity.get("attributes", [])))
            code_lines.append('')
            code_lines.append('')

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Relationship type classes; the UPPER_SNAKE_CASE name becomes a
        # PascalCase class name.
        edge_class_names = {}
        for edge in edge_types:
            name = edge["name"]
            class_name = ''.join(word.capitalize() for word in name.split('_'))
            edge_class_names[name] = class_name
            desc = edge.get("description") or f"A {name} relationship."
            code_lines.append(f'class {class_name}(EdgeModel):')
            code_lines.append(self._docstring_line(desc))
            code_lines.extend(self._attribute_lines(edge.get("attributes", [])))
            code_lines.append('')
            code_lines.append('')

        # Lookup tables.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in entity_types:
            name = entity["name"]
            code_lines.append(f'    "{name}": {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in edge_types:
            name = edge["name"]
            code_lines.append(f'    "{name}": {edge_class_names[name]},')
        code_lines.append('}')
        code_lines.append('')

        # Allowed source/target pairs per edge type; edges without any pair
        # are omitted from the mapping.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in edge_types:
            name = edge["name"]
            source_targets = edge.get("source_targets", [])
            if source_targets:
                st_list = ', '.join(
                    json.dumps(
                        {
                            "source": str(st.get("source", "Entity")),
                            "target": str(st.get("target", "Entity")),
                        },
                        ensure_ascii=False
                    )
                    for st in source_targets
                )
                code_lines.append(f'    "{name}": [{st_list}],')
        code_lines.append('}')

        return '\n'.join(code_lines)
|
||
|