MicroFish/backend/app/utils/zep_paging.py

145 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Zep Graph 分页读取工具。
Zep 的 node/edge 列表接口使用 UUID cursor 分页,
本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。
"""
from __future__ import annotations
import time
from collections.abc import Callable
from typing import Any
from typing import Any
from .logger import get_logger
logger = get_logger('mirofish.zep_paging')
_DEFAULT_PAGE_SIZE = 100
_MAX_NODES = 2000
_DEFAULT_MAX_RETRIES = 3
_DEFAULT_RETRY_DELAY = 2.0 # seconds, doubles each retry
def _fetch_page_with_retry(
api_call: Callable[..., list[Any]],
*args: Any,
max_retries: int = _DEFAULT_MAX_RETRIES,
retry_delay: float = _DEFAULT_RETRY_DELAY,
page_description: str = "page",
**kwargs: Any,
) -> list[Any]:
"""单页请求失败时指数退避重试。自动处理429限速。"""
if max_retries < 1:
raise ValueError("max_retries must be >= 1")
last_exception: Exception | None = None
delay = retry_delay
for attempt in range(max_retries):
try:
return api_call(*args, **kwargs)
except Exception as e:
last_exception = e
if attempt < max_retries - 1:
# 检测429限速使用retry-after头部指定的等待时间
wait = delay
logger.warning(
f"Zep {page_description} attempt {attempt + 1} failed: {str(e)[:100]}, retrying in {wait:.1f}s..."
)
time.sleep(wait)
delay *= 2
else:
logger.error(f"Zep {page_description} failed after {max_retries} attempts: {str(e)}")
assert last_exception is not None
raise last_exception
def fetch_all_nodes(
client: Any,
graph_id: str,
page_size: int = _DEFAULT_PAGE_SIZE,
max_items: int = _MAX_NODES,
max_retries: int = _DEFAULT_MAX_RETRIES,
retry_delay: float = _DEFAULT_RETRY_DELAY,
) -> list[Any]:
"""分页获取图谱节点,最多返回 max_items 条(默认 2000。每页请求自带重试。"""
all_nodes: list[Any] = []
cursor: str | None = None
page_num = 0
while True:
kwargs: dict[str, Any] = {"limit": page_size}
if cursor is not None:
kwargs["uuid_cursor"] = cursor
page_num += 1
batch = _fetch_page_with_retry(
client.graph.node.get_by_graph_id,
graph_id,
max_retries=max_retries,
retry_delay=retry_delay,
page_description=f"fetch nodes page {page_num} (graph={graph_id})",
**kwargs,
)
if not batch:
break
all_nodes.extend(batch)
if len(all_nodes) >= max_items:
all_nodes = all_nodes[:max_items]
logger.warning(f"Node count reached limit ({max_items}), stopping pagination for graph {graph_id}")
break
if len(batch) < page_size:
break
cursor = getattr(batch[-1], "uuid_", None) or getattr(batch[-1], "uuid", None)
if cursor is None:
logger.warning(f"Node missing uuid field, stopping pagination at {len(all_nodes)} nodes")
break
return all_nodes
def fetch_all_edges(
client: Any,
graph_id: str,
page_size: int = _DEFAULT_PAGE_SIZE,
max_retries: int = _DEFAULT_MAX_RETRIES,
retry_delay: float = _DEFAULT_RETRY_DELAY,
) -> list[Any]:
"""分页获取图谱所有边,返回完整列表。每页请求自带重试。"""
all_edges: list[Any] = []
cursor: str | None = None
page_num = 0
while True:
kwargs: dict[str, Any] = {"limit": page_size}
if cursor is not None:
kwargs["uuid_cursor"] = cursor
page_num += 1
batch = _fetch_page_with_retry(
client.graph.edge.get_by_graph_id,
graph_id,
max_retries=max_retries,
retry_delay=retry_delay,
page_description=f"fetch edges page {page_num} (graph={graph_id})",
**kwargs,
)
if not batch:
break
all_edges.extend(batch)
if len(batch) < page_size:
break
cursor = getattr(batch[-1], "uuid_", None) or getattr(batch[-1], "uuid", None)
if cursor is None:
logger.warning(f"Edge missing uuid field, stopping pagination at {len(all_edges)} edges")
break
return all_edges