MicroFish/backend/app/utils/local_graph_store.py

291 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
本地JSON文件图谱存储
替代Zep Cloud将图谱数据节点、边、情节存储在本地JSON文件中
存储目录结构:
{storage_dir}/
{graph_id}/
metadata.json - 图谱元数据和本体定义
nodes.json - 节点列表
edges.json - 边列表
episodes.jsonl - 情节文本日志(追加写入)
"""
from __future__ import annotations
import json
import os
import shutil
import threading
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional
from .logger import get_logger
logger = get_logger('mirofish.local_graph_store')
# One lock per graph, so that concurrent writes to the same graph are safe.
_global_lock = threading.Lock()  # guards the _graph_locks registry itself
_graph_locks: Dict[str, threading.Lock] = {}


def _lock_for(graph_id: str) -> threading.Lock:
    """Return the lock dedicated to *graph_id*, creating it on first use.

    The registry lookup and insertion happen under ``_global_lock`` so two
    threads asking for the same new graph id receive the same lock object.
    """
    with _global_lock:
        lock = _graph_locks.get(graph_id)
        if lock is None:
            lock = threading.Lock()
            _graph_locks[graph_id] = lock
        return lock
class LocalGraphStore:
    """Graph store backed by local JSON files.

    Every graph lives in its own directory ``{storage_dir}/{graph_id}/``
    holding ``metadata.json``, ``nodes.json``, ``edges.json`` and an
    append-only ``episodes.jsonl``.
    """

    def __init__(self, storage_dir: str):
        """Remember *storage_dir* and create the directory if it is missing."""
        self.storage_dir = storage_dir
        os.makedirs(storage_dir, exist_ok=True)

    # ── Graph lifecycle ──────────────────────────────────────────────────────

    def create_graph(self, graph_id: str, name: str, description: str = "") -> None:
        """Create the directory and initial files for *graph_id*.

        ``metadata.json`` is always (re)written, which resets ``ontology`` to
        ``None``. Existing ``nodes.json`` / ``edges.json`` files are kept;
        they are only initialised to an empty list when absent.
        """
        os.makedirs(self._graph_dir(graph_id), exist_ok=True)
        self._write_json(self._meta_path(graph_id), {
            "graph_id": graph_id,
            "name": name,
            "description": description,
            "created_at": datetime.now().isoformat(),
            "ontology": None,
        })
        for path in (self._nodes_path(graph_id), self._edges_path(graph_id)):
            if not os.path.exists(path):
                self._write_json(path, [])
        logger.info(f"本地图谱已创建: {graph_id}")

    def delete_graph(self, graph_id: str) -> None:
        """Remove the whole directory of *graph_id*; a missing graph is a no-op."""
        graph_dir = self._graph_dir(graph_id)
        if os.path.exists(graph_dir):
            shutil.rmtree(graph_dir)
            logger.info(f"本地图谱已删除: {graph_id}")

    def graph_exists(self, graph_id: str) -> bool:
        """Return True when the graph's ``metadata.json`` exists."""
        return os.path.exists(self._meta_path(graph_id))

    # ── Ontology ─────────────────────────────────────────────────────────────

    def set_ontology(self, graph_id: str, ontology: Dict[str, Any]) -> None:
        """Store *ontology* under the ``"ontology"`` key of the graph metadata."""
        # Read-modify-write of metadata.json: hold the per-graph lock like the
        # other read-modify-write operations so concurrent updates are not lost.
        with _lock_for(graph_id):
            meta = self._read_json(self._meta_path(graph_id)) or {}
            meta["ontology"] = ontology
            self._write_json(self._meta_path(graph_id), meta)

    def get_ontology(self, graph_id: str) -> Optional[Dict[str, Any]]:
        """Return the stored ontology, or None when unset or metadata is missing."""
        meta = self._read_json(self._meta_path(graph_id)) or {}
        return meta.get("ontology")

    def get_metadata(self, graph_id: str) -> Optional[Dict[str, Any]]:
        """Return the parsed ``metadata.json``, or None when the file is missing."""
        return self._read_json(self._meta_path(graph_id))

    # ── Episodes ─────────────────────────────────────────────────────────────

    def add_episode(self, graph_id: str, text: str) -> str:
        """Append one episode text to ``episodes.jsonl`` and return its uuid.

        The record is written with ``"processed": True``: the local store
        treats an episode as processed as soon as it is stored.
        """
        episode_id = uuid.uuid4().hex
        record = {
            "uuid": episode_id,
            "text": text,
            "created_at": datetime.now().isoformat(),
            "processed": True,
        }
        ep_path = self._episodes_path(graph_id)
        with _lock_for(graph_id):
            with open(ep_path, 'a', encoding='utf-8') as f:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
        return episode_id

    def add_episodes_batch(self, graph_id: str, texts: List[str]) -> List[str]:
        """Append every text as its own episode; return the uuids in input order."""
        return [self.add_episode(graph_id, t) for t in texts]

    def episode_is_processed(self, graph_id: str, episode_uuid: str) -> bool:
        """Always True: local episodes are considered processed immediately."""
        return True

    # ── Nodes ────────────────────────────────────────────────────────────────

    def get_nodes(self, graph_id: str) -> List[Dict[str, Any]]:
        """Return all nodes of the graph (empty list when the file is missing)."""
        return self._read_json(self._nodes_path(graph_id)) or []

    def get_node(self, graph_id: str, node_uuid: str) -> Optional[Dict[str, Any]]:
        """Return the first node whose ``uuid`` equals *node_uuid*, else None."""
        return next(
            (node for node in self.get_nodes(graph_id) if node.get("uuid") == node_uuid),
            None,
        )

    def upsert_node(
        self,
        graph_id: str,
        name: str,
        labels: Optional[List[str]] = None,
        summary: str = "",
        attributes: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Update the node called *name*, or create it; return its uuid.

        Names are compared case-insensitively. On a match the labels are
        merged, *summary* is stored only if the node has no summary yet, and
        *attributes* are merged over the existing ones. *labels* defaults to
        ``["Entity"]``.
        """
        labels = labels or ["Entity"]
        attributes = attributes or {}
        wanted = name.lower()
        nodes_path = self._nodes_path(graph_id)
        with _lock_for(graph_id):
            nodes = self._read_json(nodes_path) or []
            for node in nodes:
                if node.get("name", "").lower() != wanted:
                    continue
                # Merge labels, de-duplicated, keeping first-seen order so the
                # stored list is deterministic (a set would shuffle it).
                node["labels"] = list(
                    dict.fromkeys([*(node.get("labels") or []), *labels])
                )
                # Only fill the summary when the node does not have one yet.
                if summary and not node.get("summary"):
                    node["summary"] = summary
                if attributes:
                    node.setdefault("attributes", {}).update(attributes)
                self._write_json(nodes_path, nodes)
                return node["uuid"]

            # No node with that name: create a new one.
            node_uuid = uuid.uuid4().hex
            nodes.append({
                "uuid": node_uuid,
                "name": name,
                "labels": labels,
                "summary": summary,
                "attributes": attributes,
                "created_at": datetime.now().isoformat(),
            })
            self._write_json(nodes_path, nodes)
            return node_uuid

    # ── Edges ────────────────────────────────────────────────────────────────

    def get_edges(self, graph_id: str) -> List[Dict[str, Any]]:
        """Return all edges of the graph (empty list when the file is missing)."""
        return self._read_json(self._edges_path(graph_id)) or []

    def get_node_edges(self, graph_id: str, node_uuid: str) -> List[Dict[str, Any]]:
        """Return every edge that has *node_uuid* as its source or its target."""
        return [
            e for e in self.get_edges(graph_id)
            if node_uuid in (e.get("source_node_uuid"), e.get("target_node_uuid"))
        ]

    def add_edge(self, graph_id: str, edge: Dict[str, Any]) -> str:
        """Append *edge* to the graph and return its uuid.

        The caller's dict is copied, not mutated. A uuid is generated when the
        edge has none, and ``created_at``, ``valid_at``, ``invalid_at``,
        ``expired_at`` and ``attributes`` are given defaults when absent.
        """
        edge_uuid = edge.get("uuid") or uuid.uuid4().hex
        edge = dict(edge)
        edge["uuid"] = edge_uuid
        edge.setdefault("created_at", datetime.now().isoformat())
        for key in ("valid_at", "invalid_at", "expired_at"):
            edge.setdefault(key, None)
        edge.setdefault("attributes", {})
        with _lock_for(graph_id):
            edges = self._read_json(self._edges_path(graph_id)) or []
            edges.append(edge)
            self._write_json(self._edges_path(graph_id), edges)
        return edge_uuid

    def add_fact_edge(
        self,
        graph_id: str,
        source_uuid: str,
        target_uuid: str,
        name: str,
        fact: str,
    ) -> str:
        """Convenience wrapper: add a named fact edge between two nodes."""
        return self.add_edge(graph_id, {
            "name": name,
            "fact": fact,
            "source_node_uuid": source_uuid,
            "target_node_uuid": target_uuid,
        })

    # ── Search ───────────────────────────────────────────────────────────────

    def search(
        self,
        graph_id: str,
        query: str,
        limit: int = 10,
        scope: str = "edges",
    ) -> Dict[str, Any]:
        """Keyword-based local search over edges and/or nodes.

        A text scores 100 when it contains the whole lower-cased query, and
        otherwise 10 for every keyword it contains. Keywords are the query
        tokens longer than one character, split on whitespace and on ASCII or
        full-width commas. Edges are scored on ``fact`` + ``name``, nodes on
        ``name`` + ``summary``; items scoring 0 are dropped and at most
        *limit* items per kind are returned, best first.

        Note that an empty query is a substring of every non-empty text and
        therefore matches all of them.

        Args:
            scope: ``"edges"``, ``"nodes"`` or ``"both"``.

        Returns:
            ``{"facts": [...], "edges": [...], "nodes": [...]}`` where
            ``facts`` holds the matched edge facts followed by
            ``"[name]: summary"`` lines for matched nodes that have a summary.
        """
        query_lower = query.lower()
        # Treat both the ASCII comma and the full-width comma (U+FF0C) as
        # separators. Replacing '' here would put a space between every
        # character and leave no usable keyword at all.
        keywords = [
            word
            for word in query_lower.replace(',', ' ').replace('\uff0c', ' ').split()
            if len(word) > 1
        ]

        def score(text: str) -> int:
            if not text:
                return 0
            tl = text.lower()
            if query_lower in tl:
                return 100
            return sum(10 for kw in keywords if kw in tl)

        def ranked(items: List[Dict[str, Any]], fields: tuple) -> List[Dict[str, Any]]:
            """Return up to *limit* items with a positive score, best first."""
            hits = []
            for item in items:
                total = sum(score(item.get(field, "")) for field in fields)
                if total > 0:
                    hits.append((total, item))
            # Stable sort: ties keep their order from the stored file.
            hits.sort(key=lambda pair: pair[0], reverse=True)
            return [item for _, item in hits[:limit]]

        result_edges: List[Dict] = []
        result_nodes: List[Dict] = []
        facts: List[str] = []

        if scope in ("edges", "both"):
            result_edges = ranked(self.get_edges(graph_id), ("fact", "name"))
            facts.extend(edge["fact"] for edge in result_edges if edge.get("fact"))

        if scope in ("nodes", "both"):
            result_nodes = ranked(self.get_nodes(graph_id), ("name", "summary"))
            facts.extend(
                f"[{node.get('name', '')}]: {node['summary']}"
                for node in result_nodes
                if node.get("summary")
            )

        return {"facts": facts, "edges": result_edges, "nodes": result_nodes}

    # ── Internal path helpers ────────────────────────────────────────────────

    def _graph_dir(self, graph_id: str) -> str:
        """Return the directory of *graph_id* inside the storage directory."""
        # NOTE(review): graph_id is joined as-is with no validation; callers
        # are presumably expected to pass a safe directory name — confirm.
        return os.path.join(self.storage_dir, graph_id)

    def _meta_path(self, graph_id: str) -> str:
        """Return the path of the graph's ``metadata.json``."""
        return os.path.join(self._graph_dir(graph_id), "metadata.json")

    def _nodes_path(self, graph_id: str) -> str:
        """Return the path of the graph's ``nodes.json``."""
        return os.path.join(self._graph_dir(graph_id), "nodes.json")

    def _edges_path(self, graph_id: str) -> str:
        """Return the path of the graph's ``edges.json``."""
        return os.path.join(self._graph_dir(graph_id), "edges.json")

    def _episodes_path(self, graph_id: str) -> str:
        """Return the path of the graph's ``episodes.jsonl``."""
        return os.path.join(self._graph_dir(graph_id), "episodes.jsonl")

    def _read_json(self, path: str) -> Any:
        """Return the parsed JSON content of *path*, or None if it is missing."""
        # Open first and handle absence afterwards, so a file deleted between
        # an existence check and the open cannot raise.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, NotADirectoryError):
            return None

    def _write_json(self, path: str, data: Any) -> None:
        """Serialise *data* to *path* as UTF-8 JSON, replacing it atomically.

        The data is written to a uniquely named temporary file next to
        *path* and then moved over it, so readers never see a half-written
        file and a failed serialisation leaves the previous content intact.

        Raises:
            OSError: the directory of *path* does not exist or is not writable.
            TypeError: *data* is not JSON serialisable.
        """
        tmp_path = f"{path}.{uuid.uuid4().hex}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, path)
        finally:
            # Only still present when the write or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)