feat: add entity deduplication after graph building

This commit is contained in:
Stayfoool 2026-03-11 21:17:15 +08:00
parent 985f89f49a
commit a728540a25
6 changed files with 937 additions and 3 deletions

View File

@ -447,7 +447,7 @@ def build_graph():
) )
def wait_progress_callback(msg, progress_ratio): def wait_progress_callback(msg, progress_ratio):
progress = 55 + int(progress_ratio * 35) # 55% - 90% progress = 55 + int(progress_ratio * 25) # 55% - 80%
task_manager.update_task( task_manager.update_task(
task_id, task_id,
message=msg, message=msg,
@ -456,6 +456,34 @@ def build_graph():
builder._wait_for_episodes(episode_uuids, wait_progress_callback) builder._wait_for_episodes(episode_uuids, wait_progress_callback)
# 实体去重
task_manager.update_task(
task_id,
message="执行实体去重...",
progress=80
)
dedup_result = None
try:
from ..services.entity_deduplicator import EntityDeduplicator
deduplicator = EntityDeduplicator()
dedup_report = deduplicator.deduplicate(
graph_id=graph_id,
progress_callback=lambda msg, prog: task_manager.update_task(
task_id,
message=f"去重: {msg}",
progress=80 + int(prog * 10), # 80% - 90%
),
)
dedup_result = dedup_report.to_dict()
build_logger.info(
f"[{task_id}] 实体去重完成: "
f"发现 {dedup_report.groups_found} 组重复, "
f"删除 {dedup_report.nodes_removed} 个节点, "
f"迁移 {dedup_report.edges_migrated} 条边"
)
except Exception as dedup_err:
build_logger.warning(f"[{task_id}] 实体去重失败(不影响图谱构建): {dedup_err}")
# 获取图谱数据 # 获取图谱数据
task_manager.update_task( task_manager.update_task(
task_id, task_id,
@ -483,7 +511,8 @@ def build_graph():
"graph_id": graph_id, "graph_id": graph_id,
"node_count": node_count, "node_count": node_count,
"edge_count": edge_count, "edge_count": edge_count,
"chunk_count": total_chunks "chunk_count": total_chunks,
"dedup_report": dedup_result
} }
) )
@ -615,3 +644,62 @@ def delete_graph(graph_id: str):
"error": str(e), "error": str(e),
"traceback": traceback.format_exc() "traceback": traceback.format_exc()
}), 500 }), 500
# ============== 接口:实体去重 ==============
@graph_bp.route('/deduplicate', methods=['POST'])
def deduplicate_graph():
    """
    Run entity deduplication on an already-built graph.

    Request JSON:
        {
            "graph_id": "mirofish_xxxx",  // required
            "dry_run": false              // optional, default false; when true
                                          // duplicates are only detected, not merged
        }

    Response:
        {
            "success": true,
            "data": { ...DeduplicationReport.to_dict()... }
        }

    Status codes:
        400 - graph_id missing (including an absent, malformed or non-object body)
        500 - ZEP_API_KEY / LLM_API_KEY not configured, or deduplication raised
    """
    try:
        if not Config.ZEP_API_KEY:
            return jsonify({
                "success": False,
                "error": "ZEP_API_KEY未配置"
            }), 500

        if not Config.LLM_API_KEY:
            return jsonify({
                "success": False,
                "error": "LLM_API_KEY未配置实体去重需要 LLM 支持)"
            }), 500

        # silent=True: a missing/invalid JSON body must not raise here, because
        # the broad `except` below would turn that client mistake into a 500.
        # With an empty payload the request falls through to the 400 below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, string, ...) has no fields.
            data = {}

        graph_id = data.get('graph_id')
        dry_run = data.get('dry_run', False)

        if not graph_id:
            return jsonify({
                "success": False,
                "error": "请提供 graph_id"
            }), 400

        # Imported lazily, matching the other use of this service in this module.
        from ..services.entity_deduplicator import EntityDeduplicator

        deduplicator = EntityDeduplicator()
        report = deduplicator.deduplicate(graph_id=graph_id, dry_run=dry_run)

        return jsonify({
            "success": True,
            "data": report.to_dict()
        })

    except Exception as e:
        return jsonify({
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }), 500

View File

@ -7,6 +7,7 @@ from .graph_builder import GraphBuilderService
from .text_processor import TextProcessor from .text_processor import TextProcessor
from .zep_entity_reader import ZepEntityReader, EntityNode, FilteredEntities from .zep_entity_reader import ZepEntityReader, EntityNode, FilteredEntities
from .oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile from .oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile
from .entity_deduplicator import EntityDeduplicator, DeduplicationReport
from .simulation_manager import SimulationManager, SimulationState, SimulationStatus from .simulation_manager import SimulationManager, SimulationState, SimulationStatus
from .simulation_config_generator import ( from .simulation_config_generator import (
SimulationConfigGenerator, SimulationConfigGenerator,
@ -46,6 +47,8 @@ __all__ = [
'FilteredEntities', 'FilteredEntities',
'OasisProfileGenerator', 'OasisProfileGenerator',
'OasisAgentProfile', 'OasisAgentProfile',
'EntityDeduplicator',
'DeduplicationReport',
'SimulationManager', 'SimulationManager',
'SimulationState', 'SimulationState',
'SimulationStatus', 'SimulationStatus',

View File

@ -0,0 +1,673 @@
"""
实体去重服务
图谱构建完成后识别并合并指向同一现实实体的重复节点
典型场景
- "特朗普" "美国总统特朗普" Zep 识别为两个不同节点
- 本服务通过 LLM 判断它们是否指向同一实体并自动合并
"""
import json
import time
from typing import Dict, Any, List, Optional, Callable, Tuple
from dataclasses import dataclass, field
import httpx
from zep_cloud.client import Zep
from ..config import Config
from ..utils.llm_client import LLMClient
from ..utils.logger import get_logger
from ..utils.zep_paging import fetch_all_nodes, fetch_all_edges
# Base URL for the raw HTTP calls (node DELETE / PATCH) issued through httpx
# by EntityDeduplicator, alongside the Zep SDK client.
ZEP_API_BASE = "https://api.getzep.com/api/v2"
# Module-level logger shared by everything in this file.
logger = get_logger('mirofish.entity_deduplicator')
DEDUP_SYSTEM_PROMPT = """你是一个实体消歧专家。你的任务是从一组知识图谱节点中,识别出指向同一现实世界实体的重复节点。
**重要你必须输出有效的JSON格式数据不要输出任何其他内容**
## 判断标准
两个节点应被判定为"同一实体"当且仅当
- 它们指向现实世界中完全相同的人组织或事物
- 仅仅是称呼不同如全名 vs 简称带头衔 vs 不带头衔
- 例如"特朗普""美国总统特朗普"是同一个人
## 硬性规则(必须严格遵守)
1. **类型必须一致**人物只能与人物合并组织只能与组织合并地点只能与地点合并绝对不允许跨类型合并
2. **上下级关系不合并**国务院领事馆总部分公司部门下属机构它们是不同实体
3. **关联关系不合并**某人在某组织任职不代表这个人和组织是同一实体
4. **信息来源不是实体**新闻媒体数据平台等信息来源与它们报道的实体不是同一事物
5. **宁可漏判不可误判**如果不确定两个节点是否为同一实体就不要合并
## 反例(以下情况绝对不应合并)
- "丹凯恩斯将军" "美国参谋长联席会议" 不合并人物 vs 组织
- "美国国务院" "驻土耳其阿达纳总领馆" 不合并上下级机构
- "伊朗驻华大使法兹里" "金十数据" 不合并外交官 vs 财经平台
- "霍尔木兹海峡" "美国" 不合并地理位置 vs 国家
- "新华网" "新华社" 不合并网站 vs 通讯社虽有关联但是不同实体
## 正例(以下情况应该合并)
- "特朗普" "美国总统特朗普" 合并同一个人简称 vs 带头衔
- "阿拉格齐" "伊朗外交部长阿拉格齐" 合并同一个人简称 vs 全称+头衔canonical_name 应为 "阿拉格齐"
- "伊朗革命卫队" "伊朗伊斯兰革命卫队" 合并同一组织简称 vs 全称
## 输出格式
```json
{
"duplicate_groups": [
{
"canonical_name": "应保留的标准名称(选择最简洁常用的)",
"members": [
{"uuid": "节点uuid", "name": "节点名称"}
],
"reason": "合并理由(简短)"
}
]
}
```
规则
- 每个 duplicate_group 至少包含 2 members
- canonical_name 应选择最常用最简洁辨识度最高的名称"特朗普"优于"美国总统特朗普"
- 如果没有发现任何重复返回 `{"duplicate_groups": []}`
"""
# NOTE(review): not referenced anywhere in the visible code; candidate clusters
# are sent to the LLM whole regardless of size. Presumably intended as a cap on
# nodes per LLM call - confirm or remove.
DEDUP_BATCH_SIZE = 80
# Minimum character-set Jaccard similarity for two names to be treated as a
# candidate duplicate pair by the name pre-filter.
NAME_JACCARD_THRESHOLD = 0.5
@dataclass
class MergeAction:
    """Record of one merge: a group of duplicate nodes collapsed into one kept node."""
    # Canonical name chosen for the group.
    group_canonical_name: str
    # UUID of the node that is kept.
    keep_node_uuid: str
    # Name of the kept node as it was when the merge was planned.
    keep_node_name: str
    # One {"uuid": ..., "name": ...} dict per duplicate node folded into the kept node.
    removed_nodes: List[Dict[str, str]]
    # Number of edges re-created on the kept node for this group.
    edges_migrated: int
    # Short justification for the merge.
    reason: str
@dataclass
class DeduplicationReport:
    """Outcome of one deduplication run over a graph."""
    graph_id: str
    total_nodes_before: int
    total_nodes_after: int
    groups_found: int
    nodes_removed: int
    edges_migrated: int
    merge_actions: List[MergeAction] = field(default_factory=list)
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the report, with each merge action flattened."""
        serialized_actions = []
        for action in self.merge_actions:
            kept = {"uuid": action.keep_node_uuid, "name": action.keep_node_name}
            serialized_actions.append({
                "canonical_name": action.group_canonical_name,
                "keep_node": kept,
                "removed_nodes": action.removed_nodes,
                "edges_migrated": action.edges_migrated,
                "reason": action.reason,
            })

        payload: Dict[str, Any] = {
            "graph_id": self.graph_id,
            "total_nodes_before": self.total_nodes_before,
            "total_nodes_after": self.total_nodes_after,
            "groups_found": self.groups_found,
            "nodes_removed": self.nodes_removed,
            "edges_migrated": self.edges_migrated,
            "merge_actions": serialized_actions,
            "errors": self.errors,
        }
        return payload
class EntityDeduplicator:
    """
    Entity deduplication service.

    Workflow:
    1. Fetch all nodes of the graph from Zep.
    2. Pre-filter candidates by name similarity and label compatibility, then
       ask the LLM to confirm which candidates are duplicates.
    3. For each duplicate group, pick a primary node and merge the other
       nodes' edges and summaries into it.
    4. Delete the redundant nodes.
    """
def __init__(
    self,
    zep_api_key: Optional[str] = None,
    llm_client: Optional[LLMClient] = None,
):
    """
    Set up the Zep SDK client, the LLM client and a raw HTTP client.

    Args:
        zep_api_key: Zep API key; falls back to Config.ZEP_API_KEY when falsy.
        llm_client: LLM client to use; a default LLMClient() is built when falsy.

    Raises:
        ValueError: if no Zep API key is available.
    """
    resolved_key = zep_api_key or Config.ZEP_API_KEY
    self.zep_api_key = resolved_key
    if not resolved_key:
        raise ValueError("ZEP_API_KEY 未配置")

    self.zep_client = Zep(api_key=resolved_key)
    self.llm_client = llm_client or LLMClient()

    # Raw HTTP client for the node DELETE / PATCH calls made in this class.
    auth_headers = {"Authorization": f"Api-Key {resolved_key}"}
    self._http = httpx.Client(
        base_url=ZEP_API_BASE,
        headers=auth_headers,
        timeout=60.0,
    )
def deduplicate(
    self,
    graph_id: str,
    dry_run: bool = False,
    progress_callback: Optional[Callable[[str, float], None]] = None,
) -> DeduplicationReport:
    """
    Detect and (unless dry_run) merge duplicate entity nodes of a graph.

    Args:
        graph_id: Zep graph ID.
        dry_run: when True, only detect duplicates and report what would be
            merged; nothing is written to the graph.
        progress_callback: optional callback (message, progress in 0..1).

    Returns:
        DeduplicationReport. Per-node merge failures do not abort the run;
        they are collected in report.errors.
    """
    def _progress(msg: str, pct: float):
        logger.info(f"[dedup] {msg}")
        if progress_callback:
            progress_callback(msg, pct)

    def _unchanged_report(total: int) -> DeduplicationReport:
        # Report for runs that found nothing to merge.
        return DeduplicationReport(
            graph_id=graph_id,
            total_nodes_before=total,
            total_nodes_after=total,
            groups_found=0,
            nodes_removed=0,
            edges_migrated=0,
        )

    _progress("读取图谱节点...", 0.0)
    nodes = fetch_all_nodes(self.zep_client, graph_id)
    total_before = len(nodes)

    if total_before < 2:
        _progress("节点数不足,无需去重", 1.0)
        return _unchanged_report(total_before)

    # Plain-dict snapshot of the nodes; everything below works on these.
    node_list = [
        {
            "uuid": getattr(n, 'uuid_', None) or getattr(n, 'uuid', ''),
            "name": n.name or "",
            "labels": n.labels or [],
            "summary": n.summary or "",
        }
        for n in nodes
    ]
    node_map = {n["uuid"]: n for n in node_list}

    _progress(f"{total_before} 个节点,开始 LLM 去重识别...", 0.1)
    duplicate_groups = self._find_duplicates(node_list)
    groups_found = len(duplicate_groups)

    if groups_found == 0:
        _progress("未发现重复实体", 1.0)
        return _unchanged_report(total_before)

    _progress(f"发现 {groups_found} 组重复实体", 0.3)

    if dry_run:
        actions = []
        total_removable = 0
        for g in duplicate_groups:
            members = g["members"]
            # Use the same primary-selection rule as the real merge so the
            # preview names the node that would actually be kept.
            keep = self._pick_primary_node(members, node_map, g["canonical_name"])
            removable = [m for m in members if m["uuid"] != keep["uuid"]]
            total_removable += len(removable)
            actions.append(MergeAction(
                group_canonical_name=g["canonical_name"],
                keep_node_uuid=keep["uuid"],
                keep_node_name=keep["name"],
                removed_nodes=[{"uuid": r["uuid"], "name": r["name"]} for r in removable],
                edges_migrated=0,
                reason=g.get("reason", ""),
            ))
        _progress(f"Dry-run 完成:可合并 {total_removable} 个重复节点", 1.0)
        return DeduplicationReport(
            graph_id=graph_id,
            total_nodes_before=total_before,
            total_nodes_after=total_before - total_removable,
            groups_found=groups_found,
            nodes_removed=0,
            edges_migrated=0,
            merge_actions=actions,
        )

    _progress("开始合并重复节点...", 0.4)
    report = DeduplicationReport(
        graph_id=graph_id,
        total_nodes_before=total_before,
        total_nodes_after=total_before,
        groups_found=groups_found,
        nodes_removed=0,
        edges_migrated=0,
    )

    for idx, group in enumerate(duplicate_groups):
        group_progress = 0.4 + 0.55 * (idx / groups_found)
        canonical = group["canonical_name"]
        members = group["members"]
        reason = group.get("reason", "")

        if len(members) < 2:
            continue

        # Nodes removed by an earlier group are no longer in node_map.
        valid_members = [m for m in members if m["uuid"] in node_map]
        if len(valid_members) < 2:
            continue

        keep_node = self._pick_primary_node(valid_members, node_map, canonical)
        dup_nodes = [m for m in valid_members if m["uuid"] != keep_node["uuid"]]

        _progress(
            f"合并组 [{canonical}]: 保留 '{keep_node['name']}'"
            f"删除 {len(dup_nodes)} 个重复节点",
            group_progress,
        )

        edges_migrated = 0
        removed = []
        for dup in dup_nodes:
            try:
                migrated = self._merge_node_into(
                    graph_id, keep_node["uuid"], dup["uuid"], node_map
                )
                edges_migrated += migrated
                removed.append({"uuid": dup["uuid"], "name": dup["name"]})
            except Exception as e:
                err_msg = f"合并节点 '{dup['name']}' 失败: {str(e)}"
                logger.error(err_msg)
                report.errors.append(err_msg)

        if not removed:
            # Every merge in this group failed (already recorded in
            # report.errors): leave the primary node untouched and do not
            # report a merge that did not happen.
            continue

        # Only fold in summaries of nodes that were actually merged; a member
        # whose merge failed still exists in the graph with its own summary.
        removed_uuids = {r["uuid"] for r in removed}
        merged_members = [keep_node] + [
            m for m in dup_nodes if m["uuid"] in removed_uuids
        ]
        self._update_primary_node(keep_node["uuid"], canonical, merged_members, node_map)

        report.merge_actions.append(MergeAction(
            group_canonical_name=canonical,
            keep_node_uuid=keep_node["uuid"],
            keep_node_name=keep_node["name"],
            removed_nodes=removed,
            edges_migrated=edges_migrated,
            reason=reason,
        ))
        report.nodes_removed += len(removed)
        report.edges_migrated += edges_migrated

        for r in removed:
            node_map.pop(r["uuid"], None)

    report.total_nodes_after = total_before - report.nodes_removed

    _progress(
        f"去重完成:合并 {report.groups_found} 组,"
        f"删除 {report.nodes_removed} 个节点,"
        f"迁移 {report.edges_migrated} 条边",
        1.0,
    )
    return report
# ------------------------------------------------------------------
# 名称相似度 & 类型兼容性 预筛选
# ------------------------------------------------------------------
@staticmethod
def _labels_compatible(labels_a: List[str], labels_b: List[str]) -> bool:
    """
    Return True when two nodes' type labels are compatible.

    Compatible means the label lists share at least one label, or at least
    one of them is empty/None (no type information to contradict a match).
    """
    # NOTE(review): if every node carries a common generic label, any two
    # labelled nodes would count as compatible - confirm against real data.
    if labels_a and labels_b:
        return not set(labels_a).isdisjoint(labels_b)
    return True
@staticmethod
def _name_similar(name_a: str, name_b: str) -> bool:
    """
    Return True when two names are similar enough to be a candidate pair.

    After stripping surrounding whitespace, names match when one contains the
    other (which includes equality), or when the Jaccard similarity of their
    character sets reaches NAME_JACCARD_THRESHOLD. A blank name never matches.
    """
    left, right = name_a.strip(), name_b.strip()
    if not (left and right):
        return False

    # Containment also covers the equal-names case.
    if left in right or right in left:
        return True

    left_chars, right_chars = set(left), set(right)
    shared = len(left_chars & right_chars)
    # Both names are non-empty here, so the union is never empty.
    return shared / len(left_chars | right_chars) >= NAME_JACCARD_THRESHOLD
def _build_candidate_clusters(
    self, node_list: List[Dict[str, Any]]
) -> List[List[Dict[str, Any]]]:
    """
    Pre-filter: group nodes into candidate clusters.

    Two nodes are linked when their labels are compatible and their names are
    similar; clusters are the connected components of those links, computed
    with union-find. Only clusters with at least two nodes are returned.
    """
    size = len(node_list)
    root_of = list(range(size))

    def _root(idx: int) -> int:
        # Locate the representative, then point the walked path straight at it.
        top = idx
        while root_of[top] != top:
            top = root_of[top]
        while root_of[idx] != top:
            root_of[idx], idx = top, root_of[idx]
        return top

    for i, first in enumerate(node_list):
        for j in range(i + 1, size):
            second = node_list[j]
            if not self._labels_compatible(first["labels"], second["labels"]):
                continue
            if not self._name_similar(first["name"], second["name"]):
                continue
            root_i, root_j = _root(i), _root(j)
            if root_i != root_j:
                root_of[root_i] = root_j

    grouped: Dict[int, List[Dict[str, Any]]] = {}
    for i, node in enumerate(node_list):
        grouped.setdefault(_root(i), []).append(node)

    return [members for members in grouped.values() if len(members) >= 2]
# ------------------------------------------------------------------
# LLM 重复检测
# ------------------------------------------------------------------
def _find_duplicates(
    self, node_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Find duplicate groups among the given nodes.

    Nodes are first clustered by name similarity and label compatibility;
    each cluster is then sent to the LLM separately, so the LLM only compares
    pre-filtered candidates. Returns the confirmed groups from all clusters.
    """
    clusters = self._build_candidate_clusters(node_list)

    if clusters:
        candidate_count = sum(map(len, clusters))
        logger.info(
            f"[dedup] 名称相似度预筛选: "
            f"{len(clusters)} 组候选 ({candidate_count} 个节点,"
            f"{len(node_list)} 个节点中筛出)"
        )

        confirmed: List[Dict[str, Any]] = []
        for cluster in clusters:
            confirmed += self._find_duplicates_single_batch(cluster)
        return confirmed

    logger.info("[dedup] 名称相似度预筛选: 未发现候选重复节点")
    return []
def _find_duplicates_single_batch(
    self, node_list: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Ask the LLM which nodes of one candidate cluster are the same entity.

    Each node is described by uuid, name, labels and the first 100 characters
    of its summary. The LLM's groups are passed through _validate_groups.
    Any failure of the LLM call or validation is logged and yields [].
    """
    def _describe(node: Dict[str, Any]) -> str:
        return (
            f"- uuid: {node['uuid']} | name: {node['name']} "
            f"| labels: {', '.join(node['labels'])} "
            f"| summary: {(node.get('summary') or '')[:100]}"
        )

    nodes_desc = "\n".join(_describe(node) for node in node_list)

    prompt_parts = [
        "以下是知识图谱中一组名称相似的实体节点,请判断其中是否有",
        "指向同一现实实体的重复节点:\n\n",
        f"{nodes_desc}\n\n",
        "请严格按要求的JSON格式返回结果。",
        "注意:名称相似不等于是同一实体,请仔细分析 labels 和 summary。",
        '如果这些节点都不是重复的,返回 {"duplicate_groups": []}',
    ]
    user_message = "".join(prompt_parts)

    messages = [
        {"role": "system", "content": DEDUP_SYSTEM_PROMPT},
        {"role": "user", "content": user_message},
    ]

    try:
        response = self.llm_client.chat_json(
            messages=messages, temperature=0.1, max_tokens=4096
        )
        return self._validate_groups(response.get("duplicate_groups", []), node_list)
    except Exception as e:
        logger.error(f"LLM 去重识别失败: {e}")
        return []
def _validate_groups(
    self,
    groups: List[Dict[str, Any]],
    node_list: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Sanitize the duplicate groups returned by the LLM.

    The LLM output is untrusted: only the member UUIDs are taken from it, and
    each is checked against node_list. Member names are rebuilt from
    node_list, so a missing or altered "name" in the LLM output cannot raise
    or mislabel a node.

    Per group:
      - members that are not dicts, have an unknown UUID, or repeat an
        earlier UUID are dropped;
      - members whose labels share nothing with the first member's labels are
        dropped (members without labels are kept);
      - the group is kept only if at least two members remain.

    Args:
        groups: the "duplicate_groups" value from the LLM; anything other
            than a list yields no groups.
        node_list: the node dicts ("uuid", "name", "labels") sent to the LLM.

    Returns:
        List of {"canonical_name", "members": [{"uuid", "name"}], "reason"}.
    """
    validated: List[Dict[str, Any]] = []
    if not isinstance(groups, list):
        return validated

    nodes_by_uuid = {n["uuid"]: n for n in node_list}

    for g in groups:
        if not isinstance(g, dict):
            continue
        members = g.get("members", [])
        if not isinstance(members, list) or len(members) < 2:
            continue

        # Known, de-duplicated members, rebuilt from trusted node data.
        seen_uuids: set = set()
        valid_members: List[Dict[str, str]] = []
        for m in members:
            if not isinstance(m, dict):
                continue
            member_uuid = m.get("uuid")
            # The isinstance check also keeps unhashable values out of the lookups.
            if not isinstance(member_uuid, str) or member_uuid not in nodes_by_uuid:
                continue
            if member_uuid in seen_uuids:
                continue
            seen_uuids.add(member_uuid)
            valid_members.append({
                "uuid": member_uuid,
                "name": nodes_by_uuid[member_uuid].get("name") or "",
            })
        if len(valid_members) < 2:
            continue

        # Type consistency: the first member's labels are the baseline.
        base_labels = set(nodes_by_uuid[valid_members[0]["uuid"]].get("labels") or [])
        if base_labels:
            type_checked = [valid_members[0]]
            for m in valid_members[1:]:
                m_labels = set(nodes_by_uuid[m["uuid"]].get("labels") or [])
                if not m_labels or (m_labels & base_labels):
                    type_checked.append(m)
                else:
                    logger.warning(
                        f"[dedup] 类型不一致,拒绝合并: "
                        f"'{m.get('name')}' ({list(m_labels)}) vs "
                        f"'{valid_members[0].get('name')}' ({list(base_labels)})"
                    )
            valid_members = type_checked
            if len(valid_members) < 2:
                continue

        # Fall back to the first member's name when the LLM gave no usable name.
        canonical_name = g.get("canonical_name")
        if not isinstance(canonical_name, str) or not canonical_name.strip():
            canonical_name = valid_members[0]["name"]

        reason = g.get("reason")
        if not isinstance(reason, str):
            reason = ""

        validated.append({
            "canonical_name": canonical_name,
            "members": valid_members,
            "reason": reason,
        })

    return validated
# ------------------------------------------------------------------
# 节点合并操作
# ------------------------------------------------------------------
def _pick_primary_node(
    self,
    members: List[Dict[str, str]],
    node_map: Dict[str, Dict[str, Any]],
    canonical_name: str,
) -> Dict[str, str]:
    """
    Choose which member of a duplicate group is kept.

    Priority:
    1. the first member whose name equals canonical_name exactly;
    2. otherwise the member with the longest summary in node_map;
    3. on a tie, the member that appears first in the list.
    """
    exact_match = next((m for m in members if m["name"] == canonical_name), None)
    if exact_match is not None:
        return exact_match

    def _summary_len(member: Dict[str, str]) -> int:
        return len(node_map.get(member["uuid"], {}).get("summary", ""))

    chosen = members[0]
    longest = _summary_len(chosen)
    for candidate in members[1:]:
        candidate_len = _summary_len(candidate)
        # Strictly greater: earlier members win ties.
        if candidate_len > longest:
            chosen, longest = candidate, candidate_len
    return chosen
def _merge_node_into(
    self,
    graph_id: str,
    keep_uuid: str,
    remove_uuid: str,
    node_map: Dict[str, Dict[str, Any]],
) -> int:
    """
    Re-create the edges of remove_uuid on keep_uuid, then remove remove_uuid.

    Edges are re-created as fact triples addressed by node *name*, using the
    names in node_map. Skipped: edges between the two nodes being merged,
    self-loops on the removed node, and edges whose other endpoint is not in
    node_map.

    Args:
        graph_id: Zep graph ID.
        keep_uuid: UUID of the node that survives.
        remove_uuid: UUID of the duplicate node to remove.
        node_map: uuid -> node dict ("name", ...) of the current nodes.

    Returns:
        Number of edges successfully re-created on the kept node.

    Raises:
        RuntimeError: if the edges of remove_uuid cannot be fetched. The node
            is then left untouched; deleting it without migrating its edges
            would silently discard all of its relations.
    """
    try:
        edges = self.zep_client.graph.node.get_edges(node_uuid=remove_uuid)
    except Exception as e:
        raise RuntimeError(f"获取节点 {remove_uuid} 的边失败: {e}") from e

    migrated = 0
    keep_name = node_map.get(keep_uuid, {}).get("name", "")
    old_edge_uuids: List[str] = []

    for edge in edges or []:
        source_uuid = edge.source_node_uuid
        target_uuid = edge.target_node_uuid
        fact = edge.fact or ""
        edge_name = edge.name or ""
        old_edge_uuids.append(edge.uuid_)

        outgoing = source_uuid == remove_uuid
        other_uuid = target_uuid if outgoing else source_uuid

        # An edge between the two merged nodes would become a self-loop on the
        # kept node. A self-loop on the removed node would be re-created
        # against the removed node's own name and bring the duplicate back.
        if other_uuid in (keep_uuid, remove_uuid):
            continue

        other_name = node_map.get(other_uuid, {}).get("name", "")
        if not other_name:
            logger.warning(f"跳过边 '{edge_name}': 未知的对端节点 {other_uuid}")
            continue

        if outgoing:
            src_name, tgt_name = keep_name, other_name
        else:
            src_name, tgt_name = other_name, keep_name

        try:
            # NOTE(review): endpoints are identified by name only. If the SDK
            # in use accepts endpoint UUIDs for fact triples, passing them
            # would pin the edge to the existing nodes - confirm against the
            # Zep API reference.
            self.zep_client.graph.add_fact_triple(
                graph_id=graph_id,
                fact=fact if fact else f"{src_name} {edge_name} {tgt_name}",
                fact_name=edge_name,
                source_node_name=src_name,
                target_node_name=tgt_name,
            )
            migrated += 1
            time.sleep(0.3)  # pause between consecutive writes
        except Exception as e:
            # Best effort: one failed edge does not abort the merge.
            logger.warning(f"迁移边 '{edge_name}' 失败: {e}")

    self._remove_node(remove_uuid, old_edge_uuids)
    return migrated
def _remove_node(self, node_uuid: str, edge_uuids: List[str]):
    """
    Remove a node, falling back to detaching it.

    First tries an HTTP DELETE on the node. If that fails, deletes the given
    edges one by one through the SDK instead, which leaves the node in the
    graph as an isolated node. Individual edge failures are logged and skipped.
    """
    try:
        response = self._http.delete(f"graph/node/{node_uuid}")
        response.raise_for_status()
        logger.info(f"已通过 HTTP API 删除节点 {node_uuid}")
        return
    except Exception as e:
        logger.warning(f"HTTP 删除节点 {node_uuid} 失败 ({e}),降级为删除关联边")

    # Fallback path: strip the node's edges so it no longer links to anything.
    deleted_edges = 0
    for edge_uuid in edge_uuids:
        try:
            self.zep_client.graph.edge.delete(uuid_=edge_uuid)
        except Exception as e:
            logger.warning(f"删除边 {edge_uuid} 失败: {e}")
            continue
        deleted_edges += 1
        time.sleep(0.2)

    logger.info(f"已删除节点 {node_uuid}{deleted_edges}/{len(edge_uuids)} 条边(节点变为孤立节点)")
def _update_primary_node(
    self,
    keep_uuid: str,
    canonical_name: str,
    all_members: List[Dict[str, str]],
    node_map: Dict[str, Dict[str, Any]],
):
    """
    Give the kept node the canonical name and the group's combined summary.

    The combined summary is the members' non-empty summaries, de-duplicated
    in first-seen order and joined by blank lines. Nothing is sent when
    neither the name nor the summary would change. A failed HTTP PATCH is
    logged and otherwise ignored.
    """
    member_summaries = (
        node_map.get(member["uuid"], {}).get("summary", "") for member in all_members
    )
    # dict.fromkeys drops repeats while preserving first-seen order.
    distinct_summaries = dict.fromkeys(text for text in member_summaries if text)
    merged_summary = "\n\n".join(distinct_summaries)

    patch_body: Dict[str, Any] = {}
    if node_map.get(keep_uuid, {}).get("name", "") != canonical_name:
        patch_body["name"] = canonical_name
    if merged_summary:
        patch_body["summary"] = merged_summary

    if not patch_body:
        return

    try:
        response = self._http.patch(f"graph/node/{keep_uuid}", json=patch_body)
        response.raise_for_status()
        logger.info(f"已更新主节点 {keep_uuid} 名称/摘要")
    except Exception as e:
        logger.warning(f"更新主节点 {keep_uuid} 失败 (HTTP PATCH): {e},跳过名称/摘要更新")

View File

@ -140,6 +140,28 @@
<span class="stat-label">SCHEMA类型</span> <span class="stat-label">SCHEMA类型</span>
</div> </div>
</div> </div>
<!-- Dedup Report -->
<div v-if="dedupReport && dedupReport.groups_found > 0" class="dedup-section">
<div class="dedup-title">
实体去重合并 <strong>{{ dedupReport.groups_found }}</strong> 删除 <strong>{{ dedupReport.nodes_removed }}</strong> 个冗余节点
</div>
<div class="dedup-list">
<div
v-for="(action, idx) in dedupReport.merge_actions"
:key="idx"
class="dedup-item"
>
<span class="dedup-keep"> {{ action.keep_node.name }}</span>
<span class="dedup-arrow"></span>
<span
v-for="(removed, ri) in action.removed_nodes"
:key="ri"
class="dedup-removed"
>{{ removed.name }}<span v-if="ri < action.removed_nodes.length - 1"></span></span>
</div>
</div>
</div>
</div> </div>
</div> </div>
@ -199,6 +221,7 @@ const props = defineProps({
ontologyProgress: Object, ontologyProgress: Object,
buildProgress: Object, buildProgress: Object,
graphData: Object, graphData: Object,
dedupReport: Object,
systemLogs: { type: Array, default: () => [] } systemLogs: { type: Array, default: () => [] }
}) })
@ -598,6 +621,61 @@ watch(() => props.systemLogs.length, () => {
display: block; display: block;
} }
/* Dedup Report */
.dedup-section {
margin-top: 12px;
padding: 10px;
background: #FAFAFA;
border-radius: 6px;
border-left: 3px solid #FF6B35;
}
.dedup-title {
font-size: 11px;
color: #555;
margin-bottom: 8px;
}
.dedup-title strong {
color: #FF6B35;
}
.dedup-list {
display: flex;
flex-direction: column;
gap: 4px;
max-height: 120px;
overflow-y: auto;
}
.dedup-item {
display: flex;
align-items: center;
gap: 5px;
font-size: 11px;
padding: 3px 6px;
background: #FFF;
border-radius: 3px;
flex-wrap: wrap;
}
.dedup-keep {
color: #2E7D32;
font-weight: 600;
white-space: nowrap;
}
.dedup-arrow {
color: #999;
font-size: 10px;
}
.dedup-removed {
color: #B71C1C;
text-decoration: line-through;
opacity: 0.7;
}
/* Step 03 Button */ /* Step 03 Button */
.action-btn { .action-btn {
width: 100%; width: 100%;

View File

@ -56,6 +56,7 @@
:ontologyProgress="ontologyProgress" :ontologyProgress="ontologyProgress"
:buildProgress="buildProgress" :buildProgress="buildProgress"
:graphData="graphData" :graphData="graphData"
:dedupReport="dedupReport"
:systemLogs="systemLogs" :systemLogs="systemLogs"
@next-step="handleNextStep" @next-step="handleNextStep"
/> />
@ -100,6 +101,7 @@ const graphLoading = ref(false)
const error = ref('') const error = ref('')
const projectData = ref(null) const projectData = ref(null)
const graphData = ref(null) const graphData = ref(null)
const dedupReport = ref(null)
const currentPhase = ref(-1) // -1: Upload, 0: Ontology, 1: Build, 2: Complete const currentPhase = ref(-1) // -1: Upload, 0: Ontology, 1: Build, 2: Complete
const ontologyProgress = ref(null) const ontologyProgress = ref(null)
const buildProgress = ref(null) const buildProgress = ref(null)
@ -332,9 +334,20 @@ const pollTaskStatus = async (taskId) => {
buildProgress.value = { progress: task.progress || 0, message: task.message } buildProgress.value = { progress: task.progress || 0, message: task.message }
if (task.status === 'completed') { if (task.status === 'completed') {
if (task.result?.dedup_report) {
dedupReport.value = task.result.dedup_report
const dr = task.result.dedup_report
if (dr.groups_found > 0) {
addLog(`实体去重: 合并 ${dr.groups_found} 组, 删除 ${dr.nodes_removed} 个冗余节点`)
for (const action of dr.merge_actions) {
const removedNames = action.removed_nodes.map(n => n.name).join(', ')
addLog(`${action.canonical_name}${removedNames}`)
}
}
}
addLog('Graph build task completed.') addLog('Graph build task completed.')
stopPolling() stopPolling()
stopGraphPolling() // Stop polling, do final load stopGraphPolling()
currentPhase.value = 2 currentPhase.value = 2
// Final load // Final load

View File

@ -355,6 +355,31 @@
</div> </div>
</div> </div>
</div> </div>
<div class="detail-section" v-if="dedupReport && dedupReport.groups_found > 0">
<div class="detail-label">实体去重</div>
<div class="dedup-summary">
<span class="dedup-stat">合并 <strong>{{ dedupReport.groups_found }}</strong> 组重复实体</span>
<span class="dedup-stat">删除 <strong>{{ dedupReport.nodes_removed }}</strong> 个冗余节点</span>
</div>
<div class="dedup-details">
<div
v-for="(action, idx) in dedupReport.merge_actions"
:key="idx"
class="dedup-group"
>
<div class="dedup-group-header">
<span class="dedup-keep"> {{ action.keep_node.name }}</span>
<span class="dedup-arrow"></span>
<span
v-for="(removed, ri) in action.removed_nodes"
:key="ri"
class="dedup-removed"
>{{ removed.name }}<span v-if="ri < action.removed_nodes.length - 1"></span></span>
</div>
</div>
</div>
</div>
</div> </div>
</div> </div>
@ -430,6 +455,7 @@ const graphLoading = ref(false)
const error = ref('') const error = ref('')
const projectData = ref(null) const projectData = ref(null)
const graphData = ref(null) const graphData = ref(null)
const dedupReport = ref(null)
const buildProgress = ref(null) const buildProgress = ref(null)
const ontologyProgress = ref(null) // const ontologyProgress = ref(null) //
const currentPhase = ref(-1) // -1: , 0: , 1: , 2: const currentPhase = ref(-1) // -1: , 0: , 1: , 2:
@ -799,6 +825,10 @@ const pollTaskStatus = async (taskId) => {
if (task.status === 'completed') { if (task.status === 'completed') {
console.log('✅ 图谱构建完成,正在加载完整数据...') console.log('✅ 图谱构建完成,正在加载完整数据...')
if (task.result?.dedup_report) {
dedupReport.value = task.result.dedup_report
}
stopPolling() stopPolling()
stopGraphPolling() stopGraphPolling()
currentPhase.value = 2 currentPhase.value = 2
@ -1946,6 +1976,55 @@ onUnmounted(() => {
letter-spacing: 0.05em; letter-spacing: 0.05em;
} }
/* 去重报告 */
.dedup-summary {
display: flex;
gap: 16px;
margin-bottom: 10px;
font-size: 0.8rem;
color: #555;
}
.dedup-summary strong {
color: #FF6B35;
}
.dedup-details {
display: flex;
flex-direction: column;
gap: 6px;
max-height: 160px;
overflow-y: auto;
}
.dedup-group-header {
display: flex;
align-items: center;
gap: 6px;
font-size: 0.78rem;
padding: 5px 8px;
background: #F5F5F5;
border-radius: 4px;
flex-wrap: wrap;
}
.dedup-keep {
color: #2E7D32;
font-weight: 600;
white-space: nowrap;
}
.dedup-arrow {
color: #999;
font-size: 0.7rem;
}
.dedup-removed {
color: #B71C1C;
text-decoration: line-through;
opacity: 0.75;
}
/* 下一步按钮 */ /* 下一步按钮 */
.next-step-section { .next-step-section {
margin-top: 24px; margin-top: 24px;