diff --git a/.env.example b/.env.example index 0bde1892..e18a407e 100644 --- a/.env.example +++ b/.env.example @@ -18,6 +18,14 @@ LLM_MODEL_NAME=qwen-plus # EMBEDDING_BASE_URL= EMBEDDING_MODEL=text-embedding-3-small +# Local embeddings via Ollama (run: ollama pull mxbai-embed-large). +# mxbai-embed-large is 1024-dim, matching Graphiti's default EMBEDDING_DIM. +# 768-dim models (e.g. nomic-embed-text) are NOT supported until EMBEDDING_DIM +# becomes configurable. Use host.docker.internal in Docker, localhost in host mode. +# EMBEDDING_BASE_URL=http://host.docker.internal:11434/v1 +# EMBEDDING_API_KEY=ollama +# EMBEDDING_MODEL=mxbai-embed-large + # Knowledge graph — Neo4j (default works for both Docker and host modes). # Docker compose overrides NEO4J_URI to bolt://neo4j:7687 inside the stack. NEO4J_URI=bolt://localhost:7687 diff --git a/.github/workflows/i18n-cjk-guard.yml b/.github/workflows/i18n-cjk-guard.yml new file mode 100644 index 00000000..067d06b5 --- /dev/null +++ b/.github/workflows/i18n-cjk-guard.yml @@ -0,0 +1,26 @@ +name: i18n CJK Guard + +on: + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + guard: + runs-on: ubuntu-latest + timeout-minutes: 1 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Run i18n CJK guard + run: python scripts/ci/i18n_cjk_guard.py diff --git a/.kiro/specs/graphiti-ollama-embedder/design.md b/.kiro/specs/graphiti-ollama-embedder/design.md new file mode 100644 index 00000000..db4cc600 --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/design.md @@ -0,0 +1,296 @@ +# Design Document — graphiti-ollama-embedder + +## Overview + +**Purpose**: Add first-class documentation for using a local Ollama embedder (`mxbai-embed-large`) with the Graphiti adapter, and remove the silent placeholder-UUID fallback in `_GraphNamespace.add_batch` so embedding failures terminate the 
surrounding graph-build `Task` with the underlying error visible. + +**Users**: Self-hosting MiroFish operators who run the LLM/embedder stack locally on Ollama, and any operator hitting a misconfigured embedder (which currently produces an empty graph that *looks* successfully built). + +**Impact**: The graph-build pipeline becomes correctly observable: invalid `EMBEDDING_*` configuration produces `Task.status = FAILED` with the underlying error, instead of `COMPLETED` with no nodes. The change is invisible on the OpenAI/Gemini happy path. + +### Goals +- R1 — `.env.example`, `CLAUDE.md`, `README.md`, `docker-compose.yml` document Ollama as a supported embedder configuration with `mxbai-embed-large` and a `curl` smoke test. +- R2 — embedding failures in `_GraphNamespace.add_batch` propagate to the calling background task, which terminates with `status=FAILED` and a non-empty `error`. The failure is logged at ERROR level instead of `WARNING`. +- R3 — OpenAI- and Gemini-based deployments are unchanged; no new env var; the 1024-dim constraint is documented. + +### Non-Goals +- Adding a startup-time embedder health probe. +- Making `EMBEDDING_DIM` env-configurable to support 768-dim models. +- Adding an `ollama` provider literal in `_build_llm_and_embedder` (Ollama uses the existing `openai` branch with a different `EMBEDDING_BASE_URL`). +- Generic retry/backoff for transient embedder errors. Tracked as an explicit follow-up. + +## Boundary Commitments + +### This Spec Owns +- The documentation surface for Ollama embedder configuration in `.env.example`, `CLAUDE.md`, `README.md`, and `docker-compose.yml` comments. +- The error-propagation contract of `_GraphNamespace.add_batch` in `backend/app/services/graphiti_adapter.py`. +- Adapter-level ERROR-log emission for failed `add_episode` calls. + +### Out of Boundary +- Behavior of `_GraphNamespace.add(...)` (single-episode path; already correct). 
+- Behavior of `_GraphNamespace.search(...)` (still allowed to log-and-return-empty per steering). +- The `_build_graph_worker` outer `try/except` and `fail_task` plumbing — already implements the contract this spec depends on. +- Any change to `_build_llm_and_embedder` (no provider literal added; existing `openai` branch is sufficient). +- Generic retry policy. + +### Allowed Dependencies +- `app.utils.logger.get_logger(...)` for ERROR-level emission. +- The existing `_run` helper that drives async Graphiti calls on the persistent loop. +- The existing `Task` lifecycle methods (`fail_task`) called from `_build_graph_worker` — relied on, not modified. +- `graphiti_core.embedder.openai.OpenAIEmbedder` configured with arbitrary `base_url`. + +### Revalidation Triggers +- Any future provider literal added to `_build_llm_and_embedder` (would change which env vars feed which embedder). +- Any change to the contract that `_GraphNamespace.add_batch` returns one `_EpisodeResult` per input episode in input order. +- Any change to how `_build_graph_worker` translates exceptions into `Task` failures (would invalidate the assumption that propagating from the adapter is sufficient). + +## Architecture + +### Existing Architecture Analysis +- The Graphiti adapter (`backend/app/services/graphiti_adapter.py`) is the **single** read/write surface for Neo4j (`tech.md`: "All graph reads/writes go through the `graphiti_adapter`"). +- Graph build runs as a background `Task` (`models/task.py`), tracked through the `Task` model with `status`, `progress`, `error`, polled by the frontend. +- `error-handling.md` mandates that long-running tasks always reach `COMPLETED` or `FAILED`. The current silent-swallow path violates this by producing `COMPLETED` with no nodes. +- The `OpenAIEmbedder` from `graphiti_core` accepts an arbitrary `base_url` / `api_key` / `embedding_model`. Ollama's `/v1/embeddings` is OpenAI-compatible. No new client class is needed. 
+ +### Architecture Pattern & Boundary Map + +```mermaid +flowchart TD + UI[Frontend Step 1
Graph Build] -->|POST /api/graph/build| API[graph_bp handler] + API --> SVC[GraphBuilderService.build_graph_async] + SVC -->|spawn thread| W[_build_graph_worker] + W --> ADD[GraphBuilderService.add_text_batches] + ADD --> NS["_GraphNamespace.add_batch
(this spec)"] + NS -->|_run| GR[graphiti_core.add_episode] + GR -->|/v1/embeddings| EMB[OpenAI-SDK embedder
OpenAI / Gemini / Ollama] + + NS -. raise on failure .-> ADD + ADD -. raise .-> W + W -. fail_task(error) .-> TM[TaskManager] + TM -. status=FAILED .-> UI + + classDef changed fill:#fef3c7,stroke:#92400e,stroke-width:2px; + class NS changed; +``` + +**Architecture Integration**: +- **Selected pattern**: minimal extension of the existing adapter pattern — fix one method's failure semantics, add no new layer. +- **Domain/feature boundaries**: error propagation stays at the adapter; task-state translation stays in the worker; UI rendering of failed tasks is unchanged. +- **Existing patterns preserved**: single-surface graph adapter; background-task `Task` lifecycle; `_run` async-loop helper; `OpenAIEmbedder` reuse for any OpenAI-SDK target. +- **New components rationale**: none — no new module is introduced. +- **Steering compliance**: + - `error-handling.md` § Background Task Errors — failure now terminates the task with a real error. + - `error-handling.md` § Logging — ERROR level for unrecoverable; WARNING reserved for retry/recovered. + - `tech.md` § Key Libraries — adapter remains the single graph read/write surface. + +### Technology Stack & Alignment + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Frontend / CLI | Vue 3.5 (unchanged) | Polls `Task` status; renders failure | No code change. | +| Backend / Services | Python ≥3.11, Flask 3.0, `graphiti-core ≥ 0.3` | `_GraphNamespace.add_batch` failure propagation | One method edited. | +| Data / Storage | Neo4j 5.x via `bolt://` (unchanged) | Same writes attempted through the adapter (the only write path); episodes ingested before a failing episode remain committed, as today. | — | +| Messaging / Events | None | — | — | +| Infrastructure / Runtime | Optional Ollama daemon at `http://host.docker.internal:11434/v1` | Source of `mxbai-embed-large` embeddings (1024-dim). | Documented, not enforced. 
| + +## File Structure Plan + +### Modified Files +- `backend/app/services/graphiti_adapter.py` — replace the broad `except Exception` in `_GraphNamespace.add_batch` (lines ~471–473) with `logger.exception(...)` + `raise`. Remove the placeholder-UUID fallback. ~5 LOC delta. +- `.env.example` — add a commented Ollama embedder block (3 commented env-var lines + a 1-line comment about `ollama pull`). +- `CLAUDE.md` — extend the "Required Environment Variables" section to list three supported embedder providers (OpenAI, Gemini, Ollama) and the 1024-dim constraint. +- `README.md` — replace the single Gemini hint comment in the Required Environment Variables block with a short three-option block (OpenAI, Gemini, Ollama) and append a one-line `curl` smoke-test snippet inside the same setup section. +- `docker-compose.yml` — one comment line above the `mirofish` service noting that Ollama on the host is reached via `host.docker.internal:11434`. + +### New Files +- None. + +> No code is moved or split. All edits are local and additive except the 5-line deletion in `_GraphNamespace.add_batch`. 
+ +## System Flows + +### Failure flow (the change) + +```mermaid +sequenceDiagram + autonumber + participant W as _build_graph_worker + participant A as add_text_batches + participant NS as _GraphNamespace.add_batch + participant G as graphiti_core.add_episode + participant E as Embedder (Ollama / OpenAI) + participant TM as TaskManager + + W->>A: chunks, batch_size + loop per batch + A->>NS: add_batch(group_id, episodes) + loop per episode + NS->>G: _run(add_episode(...)) + G->>E: POST /v1/embeddings + alt embedder OK + E-->>G: 200, vector(1024) + G-->>NS: EpisodeResult + else embedder error (404 / 401 / connection) + E-->>G: 4xx/5xx + G-->>NS: raise exception + Note right of NS: logger.exception(...); raise + end + end + end + + Note over A: try/except wraps add_batch and re-raises + NS-->>A: raise + A-->>W: raise + W->>TM: fail_task(task_id, str(e) + traceback) + TM-->>W: Task.status = FAILED +``` + +Decisions reflected in the diagram: +- The adapter raises immediately on any exception from `_g.add_episode`. +- The single-episode `add()` path (not shown) is unchanged because it already raises naturally. +- `add_text_batches` already re-raises after a localized progress message — no edit needed there. 
+ +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1 | `.env.example` Ollama block | `.env.example` (modified file) | n/a | n/a | +| 1.2 | `CLAUDE.md` lists three providers + 1024-dim constraint | `CLAUDE.md` (modified file) | n/a | n/a | +| 1.3 | docker-compose / README note about `host.docker.internal:11434` | `docker-compose.yml`, `README.md` (modified files) | n/a | n/a | +| 1.4 | `curl` smoke-test snippet | `README.md` (modified file) | n/a | n/a | +| 1.5 | End-to-end build with `mxbai-embed-large` | `graphiti_adapter._build_llm_and_embedder` (unchanged) | `OpenAIEmbedderConfig` | Failure flow (happy path is identical to today) | +| 2.1 | No placeholder UUID on failure | `_GraphNamespace.add_batch` | `_EpisodeResult` (only emitted on success) | Failure flow | +| 2.2 | Propagate exception | `_GraphNamespace.add_batch` | n/a | Failure flow | +| 2.3 | `Task.FAILED` with non-empty error | `_build_graph_worker` (unchanged) | `TaskManager.fail_task` | Failure flow | +| 2.4 | Log at ERROR level | `_GraphNamespace.add_batch` | `logger.exception(...)` | Failure flow | +| 2.5 | UI shows error, no fake-success placeholder | Frontend Step 1 (unchanged) | Task polling | Failure flow | +| 2.6 | Preserve happy-path UUID contract | `_GraphNamespace.add_batch` | `_EpisodeResult.uuid_` | n/a | +| 3.1 | OpenAI/Gemini behavior unchanged | `_build_llm_and_embedder` (unchanged) | n/a | n/a | +| 3.2 | No new env var | scope rule | n/a | n/a | +| 3.3 | 1024-dim constraint documented | `CLAUDE.md` (modified file) | n/a | n/a | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies (P0/P1) | Contracts | +|-----------|--------------|--------|--------------|--------------------------|-----------| +| `_GraphNamespace.add_batch` | services / graph-adapter | Ingest a batch of text episodes; raise on first failure; preserve UUIDs on 
success | 2.1, 2.2, 2.4, 2.6 | `graphiti_core.add_episode` (P0), `app.utils.logger` (P0) | Service | +| Documentation set (`.env.example`, `CLAUDE.md`, `README.md`, `docker-compose.yml`) | docs | Describe Ollama embedder configuration and constraints | 1.1, 1.2, 1.3, 1.4, 3.3 | none | Doc | + +### graph-adapter / `_GraphNamespace.add_batch` + +| Field | Detail | +|-------|--------| +| Intent | Ingest each episode through `graphiti_core.add_episode`; propagate the first failure to the caller; never substitute a placeholder UUID. | +| Requirements | 2.1, 2.2, 2.4, 2.6 | + +**Responsibilities & Constraints** +- Iterate `episodes` in input order. +- For each episode, call `_run(self._g.add_episode(...))` and append a `_EpisodeResult` whose `uuid_` matches the Graphiti-assigned episode UUID. +- On any exception from `_run(...)`, emit `logger.exception(...)` (ERROR level with traceback) including the `graph_id` and the index of the failing episode for diagnosability, then `raise`. +- Do **not** swallow the exception. Do **not** return a `_EpisodeResult` for the failed episode. Do **not** continue the loop after a failure. +- Domain boundary: the method speaks Graphiti and Python exceptions; it does not know about `Task` lifecycles. +- Data ownership: emits `_EpisodeResult` instances only for successfully ingested episodes. + +**Dependencies** +- Inbound: `GraphBuilderService.add_text_batches` (P0, sole production caller for this method). +- Outbound: `graphiti_core.add_episode` via `_run(...)` (P0). +- External: `app.utils.logger.get_logger("mirofish.graph_builder")` (P0). + +**Contracts**: Service [x] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +##### Service Interface +```python +class _GraphNamespace: + def add_batch(self, graph_id: str, episodes: List[Any]) -> List[_EpisodeResult]: + """Add a batch of episodes. + + Returns a list of _EpisodeResult, one per successfully ingested + episode, in input order. 
Raises the underlying exception on the + first failure; partial results are not returned. + + Preconditions: + - graph_id is a non-empty per-project group_id. + - Each item in `episodes` exposes a `data` attribute (str) + or stringifies to a meaningful body. + + Postconditions: + - On success: len(returned list) == len(episodes), each + `_EpisodeResult.uuid_` is the Graphiti-assigned UUID. + - On failure: an exception is raised; no `_EpisodeResult` + is returned for the failing episode and no further episodes + are attempted; partial successes prior to the failure are + committed in Neo4j (this matches today's behavior because + `add_episode` is invoked synchronously per episode). + + Invariants: + - Never returns a `_EpisodeResult` whose UUID was generated + locally as a placeholder. + """ +``` + +- Preconditions: as above. +- Postconditions: as above. +- Invariants: never emit a placeholder UUID. + +**Implementation Notes** +- Integration: the method is called from `GraphBuilderService.add_text_batches` (graph_builder.py:289–308), which already wraps the call in `try/except Exception: progress_callback(...); raise`. No caller-side change. +- Validation: input shape unchanged. +- Risks: an environment that was producing "successful" empty graphs because of the silent fallback will now produce a failed `Task`. This is the intended correction; PR description must call it out. + +### Documentation set + +**Edits (verbatim intent)**: +- `.env.example` — add an opt-in commented block, e.g.: + ```env + # Local embeddings via Ollama (run: ollama pull mxbai-embed-large). + # mxbai-embed-large is 1024-dim, matching Graphiti's default EMBEDDING_DIM. + # EMBEDDING_BASE_URL=http://host.docker.internal:11434/v1 + # EMBEDDING_API_KEY=ollama + # EMBEDDING_MODEL=mxbai-embed-large + ``` +- `CLAUDE.md` — extend the embedder note to enumerate OpenAI / Gemini / Ollama and call out the 1024-dim constraint. 
+- `README.md` — keep the existing Gemini comment, add the Ollama three-line example, append the `curl` smoke-test below the env block. +- `docker-compose.yml` — one comment above the `mirofish` service: `# Note: Ollama on the host is reachable from this container via host.docker.internal:11434`. + +These edits are doc-only; they do not affect the runtime contract. + +## Data Models + +No new data models. The `_EpisodeResult` dataclass shape is unchanged. The `Task` model is unchanged. The `Project.status` lifecycle is unchanged. + +## Error Handling + +### Error Strategy +- The adapter raises on first failure; the worker catches and routes to `Task.fail_task`. This is the existing project pattern (`error-handling.md` § Background Task Errors), and this spec aligns the adapter with it. +- No retries inside `add_batch`. Transient resilience, if added later, belongs at a layer that owns idempotency considerations (out of scope). + +### Error Categories and Responses +- **Embedder configuration errors** (404 unknown model, 401 unauthorized, connection refused) → adapter raises → worker fails the task with the exception's `str()` plus traceback → frontend renders `Task.error`. Operator action: fix `EMBEDDING_*` env vars per the new docs and re-run the build. +- **Embedder transient errors** (timeouts, intermittent 5xx) → today, treated identically to configuration errors (task fails). Future follow-up may narrow this with `retry_with_backoff`. +- **Graphiti-internal errors** unrelated to embeddings (e.g., Neo4j unavailable) → already raised by `_run(...)` and currently swallowed; this fix surfaces them too. Treated as a positive side effect. + +### Monitoring +- `logger.exception(...)` in `_GraphNamespace.add_batch` adds a full traceback at ERROR level, enabling existing log-aggregation setups to alert on adapter-level errors. +- `_build_graph_worker` already calls `logger.exception(f"task {task_id} failed")`; the two log lines are complementary (adapter-context vs. 
task-context). + +## Testing Strategy + +This is an extension feature — the project's testing stance is intentionally minimal (`tech.md`: "pytest is wired ... but coverage is intentionally minimal. Don't add a heavy test harness without discussing scope."). + +### Unit Tests (lightweight, optional) +- If we add a test, the right scope is a single pytest case for `_GraphNamespace.add_batch` that monkeypatches `self._g.add_episode` to raise, calls `add_batch`, and asserts the exception propagates and no `_EpisodeResult` is returned. Do not add a heavier harness. + +### Manual / End-to-End +1. **Happy path (OpenAI)**: existing setup — verify graph build still completes with real nodes/edges (no behavior change expected). +2. **Happy path (Ollama)**: `ollama pull mxbai-embed-large`; set the three `EMBEDDING_*` env vars per `.env.example`; run the smoke-test `curl` to confirm 1024-dim response; run a graph build through the UI; verify Neo4j has nodes/edges. +3. **Failure path (typo'd model)**: set `EMBEDDING_MODEL=text-embedding-3-small-typo` against an Ollama base URL; trigger a graph build; verify the task transitions to `FAILED` with the underlying 404 message visible in `Task.error` and the UI; verify backend logs include the ERROR-level traceback. + +### Performance / Load +- Not applicable. No throughput change expected on the happy path. Failure path returns earlier than today (bonus). + +## Security Considerations +- No new secrets introduced. `EMBEDDING_API_KEY=ollama` is documented as a placeholder string ignored by Ollama; this is consistent with the project's existing handling of `ZEP_API_KEY` (empty string acceptable). +- `error-handling.md` § Logging forbids logging API keys / full prompts. `logger.exception(...)` includes the exception message and traceback — Graphiti's exceptions do not echo API keys, but the ERROR log line should not include the request body. Implementation note: log only `graph_id` and episode index alongside the exception. 
+ +## Migration Strategy +- None. The change is purely additive in documentation, plus a strictly-more-correct behavior change in `add_batch`. Operators do not need to take action unless their graphs were silently empty, in which case this surfacing IS the migration trigger. diff --git a/.kiro/specs/graphiti-ollama-embedder/gap-analysis.md b/.kiro/specs/graphiti-ollama-embedder/gap-analysis.md new file mode 100644 index 00000000..b31cde62 --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/gap-analysis.md @@ -0,0 +1,99 @@ +# Gap Analysis — graphiti-ollama-embedder + +## 1. Current State Investigation + +### Domain assets touched by this feature +- `backend/app/services/graphiti_adapter.py` + - Lines 92–139 — `_build_llm_and_embedder(provider)`. Builds an `OpenAIEmbedder` (when `provider == "openai"`) using `EMBEDDING_API_KEY or LLM_API_KEY`, `EMBEDDING_BASE_URL or LLM_BASE_URL`, and `EMBEDDING_MODEL`. Already supports pointing the embedder at any OpenAI-compatible endpoint — no code change is needed for Ollama support. This is a **documentation gap, not a code gap**. + - Lines 455–475 — `_GraphNamespace.add_batch`. Iterates episodes, calls `add_episode`, and on `except Exception as e` logs a one-line `WARNING` and substitutes a fresh placeholder UUID. This is the silent-swallow path. + - Lines 441–453 — `_GraphNamespace.add(...)`. Single-episode path. **Already raises naturally** because there is no `try/except`. + - Lines 504–506 — `_GraphNamespace.search(...)`. Has its own `except Exception` that logs and returns empty results. Per `error-handling.md` ("for non-fatal search failures, log and return empty results") this is the documented contract; out of scope. +- `backend/app/services/graph_builder.py` + - Lines 256–310 — `add_text_batches(...)`. Already wraps `client.graph.add_batch(...)` in `try/except Exception` and **re-raises** after a progress message. So if `_GraphNamespace.add_batch` raises, the exception propagates correctly. 
+ - Lines 143–234 — `_build_graph_worker`. Outer `try/except Exception` calls `self.task_manager.fail_task(task_id, error_msg)` with `f"{str(e)}\n{traceback.format_exc()}"`. This already implements the "task always terminates" rule from `error-handling.md`. +- `backend/app/config.py` + - Lines 40, 50–51 — defines `EMBEDDING_MODEL`, `EMBEDDING_API_KEY`, `EMBEDDING_BASE_URL`. No change required. +- `.env.example` (project root) — currently only documents the OpenAI/Gemini path with commented-out `EMBEDDING_API_KEY` / `EMBEDDING_BASE_URL` lines. +- `CLAUDE.md` lines 60–82 — "Required Environment Variables" section lists `EMBEDDING_MODEL` with a note about Gemini overrides only. +- `README.md` lines 148–165 — "Required Environment Variables" section, mentions "uncomment if using a non-OpenAI provider, e.g. Gemini" but no Ollama example. +- `docker-compose.yml` lines 21–37 — `mirofish` service uses `env_file: .env` and overrides `NEO4J_URI`. No Ollama hint, but the standard `host.docker.internal` route works. + +### Conventions extracted from steering +- `tech.md`: "All graph reads/writes go through the `graphiti_adapter`; do not call Neo4j drivers directly from feature code." — adapter is the right place for the fix. +- `error-handling.md`: "Long-running tasks must always reach a terminal state (`COMPLETED` or `FAILED`)" — silent placeholder UUID violates this. +- `error-handling.md`: "Don't catch `Exception` inside an API handler just to log and continue" — same anti-pattern in the adapter today. +- `error-handling.md` § Logging: `WARNING` is for "retry triggered, transient failure, recovered state"; `ERROR` is for "task failure, unrecoverable exception". The current `WARNING` mislabels what is actually an unrecoverable failure for the task. +- `tech.md`: Ollama is **not currently** an officially listed provider. CLAUDE.md only enumerates OpenAI and Gemini. 
+- `commits.md` / `dev-guidelines.md`: 4-space indent, max 120 chars/line, double-quoted Python strings, snake_case, conventional commits. + +### Integration surfaces +- The `OpenAIEmbedder` from `graphiti_core.embedder.openai` already accepts an arbitrary `base_url`. Ollama exposes `/v1/embeddings` at `http://localhost:11434/v1`. No new client class is required. +- Background-task lifecycle: API handler → `GraphBuilderService.build_graph_async()` → background thread → `_build_graph_worker` → `fail_task(task_id, msg)`. Already in place; this feature just needs to stop short-circuiting it. + +## 2. Requirements Feasibility Analysis + +| Req | Need | Maps to | Gap | +| --- | --- | --- | --- | +| R1.1 | `.env.example` Ollama block | `.env.example` | **Missing** (docs) | +| R1.2 | `CLAUDE.md` lists OpenAI/Gemini/Ollama, dim constraint | `CLAUDE.md` | **Missing** (docs) | +| R1.3 | Docker-compose / README note about `host.docker.internal:11434` | `docker-compose.yml` comments / `README.md` | **Missing** (docs) | +| R1.4 | `curl` smoke-test snippet | `README.md` | **Missing** (docs) | +| R1.5 | Pipeline works end-to-end with mxbai-embed-large | adapter is already provider-agnostic via OpenAI-SDK | **No code gap** — already supported, just undocumented | +| R2.1 | Drop placeholder-UUID fallback | `graphiti_adapter.py:471–473` | **Constraint** — narrow change only | +| R2.2 | Propagate ingest exception | `graphiti_adapter.py:471–473` + caller | **Missing** — adapter swallows; caller re-raises if it sees an exception | +| R2.3 | `Task` transitions to `FAILED` with non-empty `error` | `graph_builder.py:231–234` | **Already implemented** — relies on R2.2 | +| R2.4 | Log at `ERROR` level | `graphiti_adapter.py:472` | **Missing** — currently `WARNING` | +| R2.5 | UI shows error, no fake-success placeholder | downstream of R2.3 | **Already implemented** via task polling | +| R2.6 | Preserve happy-path UUID contract | `graphiti_adapter.py:455–474` | **Constraint** — keep return 
shape on success | +| R3.1 | OpenAI/Gemini behavior unchanged | `_build_llm_and_embedder` | **No change needed** — branch untouched | +| R3.2 | No new env var | scope rule | **Constraint** | +| R3.3 | Document 1024-dim constraint | `CLAUDE.md` | **Missing** (docs) | + +### Research needed +- None for this feature — `OpenAIEmbedder` already supports custom `base_url`, and Ollama's `/v1/embeddings` is OpenAI-compatible (well-known and used in many projects). The 1024-dim constraint comes from `graphiti_core/embedder/client.py:22` (`EMBEDDING_DIM = 1024`) and is documented in the ticket itself. +- One mild unknown: whether to narrow the `except` to a transient subset (e.g., `httpx.TimeoutException`, `httpx.NetworkError`) and retry, or simply drop the catch entirely. Decided in design phase, not blocking. + +### Complexity signal +- Mostly documentation. The code change is **5 lines** in one method. + +## 3. Implementation Approach Options + +### Option A — Pure narrow fix in `_GraphNamespace.add_batch` + docs only (RECOMMENDED) +- **What**: delete the `except Exception` block in `add_batch` (or replace with `logger.exception(...)` + `raise`); update `.env.example`, `CLAUDE.md`, `README.md`, `docker-compose.yml` comments. +- **Files**: `backend/app/services/graphiti_adapter.py`, `.env.example`, `CLAUDE.md`, `README.md`, `docker-compose.yml`. +- **Trade-offs**: + - ✅ Minimal blast radius — adapter behavior outside `add_batch` is untouched. + - ✅ Existing background-task contract carries the failure to the UI for free. + - ✅ Honors steering rules: don't catch `Exception` to log-and-continue; tasks must terminate; ERROR-level logging for unrecoverable failures. + - ❌ Loses the (currently broken) "best effort, keep going on a partial failure" intent. In practice that intent never produced a usable graph anyway, so the loss is theoretical. 
+ +### Option B — Narrow the catch to transient errors and retry, fail loud on the rest +- **What**: keep a `try/except`, but only catch a small set of transient classes (`httpx.TimeoutException`, `httpx.NetworkError`, `openai.APIConnectionError`), wrap the whole `add_episode` call in `retry_with_backoff` from `app/utils/retry.py`, and re-raise everything else immediately. +- **Trade-offs**: + - ✅ Adds small resilience for genuinely transient blips. + - ✅ Aligns with the existing `retry_with_backoff` pattern. + - ❌ More moving parts; broader change for a bug fix. + - ❌ Single-episode `add()` would also need the same treatment to avoid two divergent retry semantics. + - ❌ Out-of-scope creep: ticket is focused on stopping the silent swallow + documenting Ollama. + +### Option C — Per-provider embedder factory + Option A +- **What**: extend `_build_llm_and_embedder` with a third provider literal (`"ollama"`) that uses `OpenAIEmbedder` under the hood with hardcoded sensible defaults. +- **Trade-offs**: + - ✅ Symmetric with `openai`/`gemini`. + - ❌ The ticket explicitly lists "per-provider embedder factory" as out of scope. + - ❌ Duplicate code path — Ollama is just OpenAI-SDK with a different base URL. + +## 4. Effort & Risk + +- **Effort**: **S** (≤1 day). One file, one method, ~5 LOC delta plus 4 doc edits. +- **Risk**: **Low**. The change makes a previously-silent failure loud; it cannot break the happy path because the happy-path branch is the same return statement. Documentation changes are not load-bearing. + +One non-zero risk: if there are real-world users today whose graph builds succeed only by accident (i.e., the fallback hides intermittent embedding failures), they will start seeing failed tasks instead of (broken) successful ones. This is the intended correction — but worth noting in the PR description so the operator can re-check their embedder credentials. + +## 5. Recommendations for design phase + +- **Preferred approach**: **Option A**. 
Smallest correct fix; documentation reflects the already-supported configuration; follows steering's error-handling philosophy literally. +- **Key decisions to lock in design**: + 1. Drop the `except` entirely, or narrow it? Default: drop. Rationale: the only failures worth retrying are transient network blips, and those would also kill the surrounding `_run` loop today; addressing them would be a follow-up using the project's `retry_with_backoff` decorator on the underlying graph driver call, not a band-aid in `add_batch`. + 2. Which doc files mention Ollama? Default: `.env.example`, `CLAUDE.md`, `README.md`, `docker-compose.yml` comment. Open question: would a two- or three-file subset suffice? +- **Carry-forward research**: none. diff --git a/.kiro/specs/graphiti-ollama-embedder/requirements.md b/.kiro/specs/graphiti-ollama-embedder/requirements.md new file mode 100644 index 00000000..24096bf1 --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/requirements.md @@ -0,0 +1,128 @@ +# Requirements Document + +## Project Description (Input) +Fix Graphiti embedding integration with Ollama (mxbai-embed-large) and stop silently swallowing embedding failures. Two bugs: (1) No first-class support for local Ollama embedders — `EMBEDDING_MODEL` defaults to OpenAI's `text-embedding-3-small` and the embedder reuses `LLM_BASE_URL` when `EMBEDDING_BASE_URL` is unset, so Ollama users get 404s; `.env.example` and `CLAUDE.md` don't document Ollama. (2) `backend/app/services/graphiti_adapter.py:471-473` catches every exception during episode ingestion, logs a truncated `WARNING`, and substitutes a placeholder UUID, so a graph build appears to succeed but writes nothing. Tracked as GitHub issue #18. 
+ +## Introduction +This feature adds first-class documentation for using a local Ollama embedder +(`mxbai-embed-large`, 1024-dim) with the Graphiti adapter and removes the +silent placeholder-UUID fallback in `_GraphNamespace.add_batch` so that +embedding failures terminate the surrounding background `Task` with the +underlying error visible in the UI and logs. + +The work spans two narrowly scoped changes: + +1. **Documentation update** — `.env.example`, `CLAUDE.md`, and the README / + docker-compose comments gain a short Ollama section that explains how to + point the embedder at a local Ollama instance, why `mxbai-embed-large` is + the recommended model (1024-dim, matches Graphiti's default + `EMBEDDING_DIM`), and how to smoke-test connectivity with one `curl` + command before kicking off a graph build. +2. **Loud failure** — the broad `except Exception` in + `_GraphNamespace.add_batch` is removed (or narrowed to a small set of + transient network errors). Episode ingestion failures now propagate to + the calling background task, which marks itself `FAILED` with the + underlying error message attached, rather than logging a `WARNING` and + returning a fake UUID. + +No new dependency, environment variable, or config flag is introduced. +All existing OpenAI/Gemini configurations continue to work unchanged. + +## Boundary Context +- **In scope**: documenting Ollama as a third supported embedder provider + in `.env.example`, `CLAUDE.md`, and the docker-compose / README comments; + removing the silent placeholder-UUID fallback in + `_GraphNamespace.add_batch`; surfacing the underlying ingestion error to + the background `Task` so it terminates with `status=FAILED`; documenting + a one-line `curl` smoke test for embedder connectivity. 
+- **Out of scope**: a startup-time embedder health probe that refuses to + boot on dim/model mismatch; making `EMBEDDING_DIM` env-configurable so + 768-dim or 1536-dim embedders can be used; adding a per-provider + embedder factory (today the adapter only branches on `openai` and + `gemini`); generic retry/backoff policy changes elsewhere in the + pipeline. +- **Adjacent expectations**: the existing background-task error-handling + contract from `.kiro/steering/error-handling.md` already specifies that + worker exceptions must call `fail_task(...)`. This feature relies on + that contract — it does not introduce a new one. The single-episode + `_GraphNamespace.add(...)` path is left untouched because it already + re-raises naturally. + +## Requirements + +### Requirement 1: Ollama Embedder Documentation +**Objective:** As a self-hosting MiroFish operator, I want first-class +documentation for using a local Ollama embedder, so that I can run the +Graphiti pipeline without needing an OpenAI- or Gemini-compatible +embeddings endpoint. + +#### Acceptance Criteria +1. The `.env.example` file shall contain a commented Ollama embedder block + showing `EMBEDDING_BASE_URL`, `EMBEDDING_API_KEY`, and `EMBEDDING_MODEL` + set to `http://host.docker.internal:11434/v1`, a non-empty placeholder + string, and `mxbai-embed-large` respectively, with a comment noting the + `ollama pull mxbai-embed-large` prerequisite. +2. The `CLAUDE.md` file shall list the three supported embedder providers + (OpenAI, Gemini, Ollama) and shall state the 1024-dim constraint that + forces `mxbai-embed-large` over `nomic-embed-text` (768-dim). +3. Where the user runs MiroFish in Docker, the docker-compose comments or + README shall note that Ollama on the host is reached from the + `mirofish` container via `host.docker.internal:11434`. +4. 
The documentation shall include a one-line `curl` example that calls + `$EMBEDDING_BASE_URL/embeddings` with the configured model and confirms + the response embedding length is 1024. +5. When the operator follows the documented Ollama configuration with + `mxbai-embed-large` pulled in Ollama, the existing graph-build pipeline + shall complete end-to-end and write real nodes and edges to Neo4j with + no code changes beyond the env-var configuration. + +### Requirement 2: Loud Embedding Failure +**Objective:** As a MiroFish operator, I want embedding failures during +graph build to surface as a visible task failure with the underlying +error, so that I can fix my embedder configuration instead of seeing an +"empty graph" with no diagnostic. + +#### Acceptance Criteria +1. The `_GraphNamespace.add_batch` method shall not return a placeholder + `_EpisodeResult` UUID when the underlying `add_episode` call raises an + exception. +2. If `add_episode` raises any exception other than a narrowly defined set + of transient network errors, then `_GraphNamespace.add_batch` shall + propagate the exception to its caller. +3. When `_GraphNamespace.add_batch` propagates an exception, the + surrounding graph-build background `Task` shall transition to + `FAILED` with `Task.error` containing a non-empty message derived from + the underlying exception (per the existing + `.kiro/steering/error-handling.md` contract). +4. While a graph-build task is failing because of a misconfigured + `EMBEDDING_MODEL`, `EMBEDDING_BASE_URL`, or `EMBEDDING_API_KEY`, the + adapter shall log the underlying `add_episode` error at `ERROR` level + (not `WARNING`) before raising, so the root cause is visible in + server logs. +5. Where the configured `EMBEDDING_MODEL` is invalid (e.g. 
a typo, or a + model not pulled in Ollama), the user-facing project state shall move + out of `GRAPH_BUILDING` and the task shall surface the underlying + embedder error to the frontend without producing a placeholder-UUID + "successful" episode. +6. The `_GraphNamespace.add_batch` method shall preserve its current + contract for successful episodes: each successfully ingested episode + shall still produce one `_EpisodeResult` whose `uuid_` matches the + Graphiti-assigned episode UUID, in input order. + +### Requirement 3: Backwards Compatibility +**Objective:** As an existing MiroFish operator already running with an +OpenAI- or Gemini-compatible embedder, I want this change to be invisible +on the happy path, so that no upgrade action is required. + +#### Acceptance Criteria +1. Where `EMBEDDING_BASE_URL`, `EMBEDDING_API_KEY`, and `EMBEDDING_MODEL` + are unset or set to OpenAI/Gemini-compatible values, the embedder + construction in `_build_llm_and_embedder` shall behave identically to + the current implementation. +2. The graph-build pipeline shall not require any new environment + variable to function; Ollama support shall be enabled purely by + setting the three existing `EMBEDDING_*` variables. +3. While Graphiti's default `EMBEDDING_DIM` is 1024, the documentation + shall explicitly note that any embedder model with a different output + dimension is unsupported by this change and is an explicit follow-up + item. 
diff --git a/.kiro/specs/graphiti-ollama-embedder/research.md b/.kiro/specs/graphiti-ollama-embedder/research.md new file mode 100644 index 00000000..7c79d614 --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/research.md @@ -0,0 +1,103 @@ +# Research & Design Decisions — graphiti-ollama-embedder + +## Summary +- **Feature**: `graphiti-ollama-embedder` +- **Discovery Scope**: Extension (small, narrowly scoped change to an existing adapter + supporting docs) +- **Key Findings**: + - The Graphiti `OpenAIEmbedder` already accepts an arbitrary `base_url` and `api_key`. Pointing it at Ollama's OpenAI-compatible `/v1/embeddings` endpoint requires **no code change** — only documentation. + - The silent placeholder-UUID fallback in `_GraphNamespace.add_batch` violates the project's existing background-task error-handling contract (`error-handling.md`: "Long-running tasks must always reach a terminal state"). The plumbing to surface a failure already exists in `_build_graph_worker`. + - `mxbai-embed-large` is the only widely-available local embedder that matches Graphiti's hard-coded `EMBEDDING_DIM = 1024`. Smaller models (`nomic-embed-text` at 768) would silently mis-fit Neo4j vector indexes and are out of scope. + +## Research Log + +### Ollama's OpenAI-compatible embeddings API +- **Context**: Verify that no Ollama-specific Graphiti embedder class is required. +- **Sources Consulted**: Existing code at `backend/app/services/graphiti_adapter.py:92–115` (`OpenAIEmbedderConfig` accepts arbitrary `base_url`); ticket #18 description; Graphiti `embedder/client.py:22` (`EMBEDDING_DIM = 1024`). +- **Findings**: + - Ollama exposes `POST /v1/embeddings` mirroring the OpenAI shape. + - The current `_build_llm_and_embedder("openai")` branch already uses `EMBEDDING_API_KEY or LLM_API_KEY` and `EMBEDDING_BASE_URL or LLM_BASE_URL`, so any OpenAI-compatible endpoint just works. 
+ - Ollama ignores the auth header but `OpenAIEmbedderConfig` requires a non-empty `api_key`; the literal string `"ollama"` is the de-facto convention. +- **Implications**: This is a documentation-only ask for R1. No new provider literal, no new factory branch. + +### Failure-propagation contract +- **Context**: Confirm that removing the broad `except` in `_GraphNamespace.add_batch` will result in `Task.status = FAILED` in the UI. +- **Sources Consulted**: + - `.kiro/steering/error-handling.md` § Background Task Errors — outer `except Exception` in worker calls `fail_task(task_id, str(e))`. + - `backend/app/services/graph_builder.py:289–308` — `add_text_batches` already wraps `client.graph.add_batch` in `try/except` and re-raises after a localized progress message. + - `backend/app/services/graph_builder.py:231–234` — `_build_graph_worker` catches every exception and calls `self.task_manager.fail_task(task_id, error_msg)` with a full traceback. +- **Findings**: The chain `add_episode → _GraphNamespace.add_batch → add_text_batches → _build_graph_worker → fail_task` is intact except for the swallow at the adapter layer. Removing the swallow is sufficient; no caller-side change is required. +- **Implications**: R2.3 / R2.5 are realized for free as soon as R2.2 is implemented. + +### Single vs. batch ingestion path +- **Context**: Determine whether the single-episode `_GraphNamespace.add(...)` (line 441) needs a parallel fix. +- **Sources Consulted**: `graphiti_adapter.py:441–453`. No `try/except`; exceptions bubble naturally. +- **Findings**: Only the batch path swallows. The single path already complies. +- **Implications**: Fix is local to `add_batch`. Do not introduce symmetric handling in `add(...)`. + +### Logging level +- **Context**: Decide between `WARNING` and `ERROR` for the failure log line. 
+- **Sources Consulted**: `.kiro/steering/error-handling.md` § Logging: + - `ERROR` — task failure, unrecoverable exception + - `WARNING` — retry triggered, transient failure, recovered state +- **Findings**: A failure that terminates the surrounding task is unrecoverable from the task's perspective, so `ERROR` is correct. The current `WARNING` is mislabelled. +- **Implications**: R2.4 — change to `logger.exception(...)` (which logs at ERROR with traceback). + +### Documentation surfaces +- **Context**: Decide which files need updating to satisfy R1. +- **Sources Consulted**: `.env.example` (canonical config), `CLAUDE.md` lines 60–82, `README.md` lines 148–165, `docker-compose.yml` lines 21–37. +- **Findings**: All four are appropriate. `README.md` already has a placeholder for "non-OpenAI provider" and is the natural home for the `curl` smoke test snippet. `docker-compose.yml` benefits from one additional comment about `host.docker.internal`. +- **Implications**: Update all four; keep edits minimal and additive. + +## Architecture Pattern Evaluation + +| Option | Description | Strengths | Risks / Limitations | Notes | +|--------|-------------|-----------|---------------------|-------| +| A. Drop swallow + docs | Remove `except` block in `add_batch`; update four docs files | Smallest surface; honors steering rules; symmetric with `add()` | Loses (broken) "best effort" intent | Recommended | +| B. Narrow + retry | Catch only transient classes (`httpx.TimeoutException`, `openai.APIConnectionError`); use `retry_with_backoff` from `app/utils/retry.py`; raise everything else | Adds resilience to genuine network blips | More moving parts; would also need to update `add()` for symmetry | Defer to follow-up | +| C. 
New `ollama` provider literal | Extend `_build_llm_and_embedder` with a third branch | Symmetric with `openai`/`gemini` | Explicitly out of scope per ticket; duplicate code path (Ollama is OpenAI-SDK with custom `base_url`) | Rejected | + +## Design Decisions + +### Decision: Adopt Option A (drop the placeholder fallback entirely; documentation only for Ollama support) +- **Context**: R2 mandates that embedding failures during graph build surface as visible task failures. R1 mandates documentation for an Ollama embedder. The adapter already supports any OpenAI-compatible base URL. +- **Alternatives Considered**: + 1. **Option B (narrow + retry)** — keep a small `except` clause for transient errors and use the project's `retry_with_backoff`. + 2. **Option C (new provider literal)** — add an `ollama` branch in `_build_llm_and_embedder`. +- **Selected Approach**: + - In `_GraphNamespace.add_batch`, replace the `try/except Exception` block with a straightforward call. Failures from `_run(self._g.add_episode(...))` propagate to the caller. + - Strictly speaking, calling `logger.exception(...)` immediately before the re-raise is redundant — `_build_graph_worker` already calls `logger.exception(f"task {task_id} failed")` per the error-handling steering. To honor R2.4 explicitly, however, wrap the call in a narrow `try/except: logger.exception(...); raise` so the adapter-level context (`group_id`, episode index) is captured before bubbling; the extra adapter-level log line is an accepted trade-off. + - Update `.env.example`, `CLAUDE.md`, `README.md`, and `docker-compose.yml` to document Ollama configuration (R1). +- **Rationale**: + - The ticket explicitly lists transient-retry behavior and per-provider factory as out of scope. + - Steering's error-handling chapter forbids catch-and-continue in service code. + - Smaller surface = lower regression risk. +- **Trade-offs**: + - +Visibility: real config errors now surface at the UI. + - +Code symmetry: `add()` and `add_batch()` behave the same on failure.
+ - −One-time noise: operators whose graph builds were "succeeding" only because of the silent fallback will now see a failed task. This is the intended correction; mention in PR body. +- **Follow-up**: + - If transient blips become an operational issue, revisit Option B in a separate ticket using `retry_with_backoff` against `_g.add_episode`. + +### Decision: Use `logger.exception(...)` not `logger.error(...)` +- **Context**: R2.4 requires ERROR-level logging of the underlying exception. +- **Alternatives Considered**: `logger.error(str(e))` (no traceback), `logger.warning(...)` (current behavior). +- **Selected Approach**: `logger.exception("Episode add failed (group_id=%s)", graph_id)` then `raise`. +- **Rationale**: `logger.exception` logs at ERROR with the full traceback, which is what the steering doc prescribes for unrecoverable adapter failures. +- **Trade-offs**: A small amount of duplication if `_build_graph_worker` also logs via `logger.exception`. Acceptable — the two log lines describe different layers (adapter vs. task) and have different identifying context. + +### Decision: Document Ollama under the existing OpenAI provider, not as a separate provider literal +- **Context**: The ticket lists "per-provider embedder factory" as out of scope; Ollama is already reachable via the existing `openai` branch. +- **Selected Approach**: Document Ollama as a configuration *choice* of the existing `openai` Graphiti provider (set the three `EMBEDDING_*` env vars). +- **Rationale**: Avoids code duplication and matches the ticket's scope. + +## Risks & Mitigations +- **Risk**: Operators currently relying on the silent fallback see new failed tasks. **Mitigation**: PR body calls this out explicitly with a "what changed" note pointing at the embedder env vars. +- **Risk**: The `except` is removed but a transient timeout intermittently fails the entire graph build. **Mitigation**: Documented as a known follow-up (Option B). 
Acceptable today because the alternative was an empty graph that *looked* successful. +- **Risk**: Documentation drifts between `.env.example`, `CLAUDE.md`, `README.md`, and `docker-compose.yml`. **Mitigation**: Keep all four edits in this PR and reference the same env-var triple verbatim. + +## References +- Ticket #18 — `.ticket/18.md` (snapshot in this repo) +- Steering — `.kiro/steering/error-handling.md` § Background Task Errors and § Logging +- Steering — `.kiro/steering/tech.md` § Key Libraries (`graphiti-core` adapter rule) +- Code — `backend/app/services/graphiti_adapter.py:92–115, :441–475` +- Code — `backend/app/services/graph_builder.py:143–234, :256–310` diff --git a/.kiro/specs/graphiti-ollama-embedder/spec.json b/.kiro/specs/graphiti-ollama-embedder/spec.json new file mode 100644 index 00000000..61f6946c --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/spec.json @@ -0,0 +1,23 @@ +{ + "feature_name": "graphiti-ollama-embedder", + "created_at": "2026-05-07T20:24:55Z", + "updated_at": "2026-05-07T20:35:00Z", + "language": "en", + "phase": "tasks-generated", + "ticket": 18, + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true +} diff --git a/.kiro/specs/graphiti-ollama-embedder/tasks.md b/.kiro/specs/graphiti-ollama-embedder/tasks.md new file mode 100644 index 00000000..c04de73d --- /dev/null +++ b/.kiro/specs/graphiti-ollama-embedder/tasks.md @@ -0,0 +1,91 @@ +# Implementation Tasks — graphiti-ollama-embedder + +> Source spec: `.kiro/specs/graphiti-ollama-embedder/` +> Ticket: #18 + +## Plan + +This feature has two narrowly scoped deliverables: + +1. **Code change** — remove the silent placeholder-UUID fallback in `_GraphNamespace.add_batch` so embedding failures propagate and the surrounding graph-build `Task` ends in `FAILED`. +2.
**Configuration documentation** — describe the existing-but-undocumented Ollama embedder configuration in `.env.example`, `CLAUDE.md`, `README.md`, and `docker-compose.yml`. + +The code change is self-contained in one method. The configuration-file edits do not depend on the code change and can run in parallel with each other. + +## Tasks + +- [x] 1. Make embedding-batch failures loud (adapter fix) +- [x] 1.1 Replace the silent placeholder-UUID fallback in `_GraphNamespace.add_batch` with ERROR-level logging plus exception propagation + - Open the per-episode `try/except Exception` around the synchronous `add_episode` call in the batch ingestion path of the Graphiti adapter and remove the placeholder-UUID branch entirely. + - Replace the existing `WARNING`-level log line with a `logger.exception(...)` call that captures the `graph_id` and the index of the failing episode in its message; do not include the episode body, API keys, or full traceback duplication beyond what `logger.exception` emits. + - Re-raise the original exception so it bubbles up to `GraphBuilderService.add_text_batches` (which already re-raises) and on to `_build_graph_worker` (which already calls `fail_task`). + - Preserve the happy-path contract: a successful episode still produces exactly one `_EpisodeResult` whose `uuid_` matches the Graphiti-assigned episode UUID, and the returned list keeps input order. + - Leave the single-episode `add(...)` method untouched (it already raises naturally) and leave `_GraphNamespace.search(...)` untouched (its log-and-return-empty contract is documented in steering and out of scope). + - Observable completion: when the embedder is misconfigured (e.g. 
`EMBEDDING_MODEL` set to an unknown model on the configured base URL), starting a graph build through the UI causes the `Task` to transition to `FAILED` with `Task.error` populated by the underlying Graphiti exception message, and the backend log includes an ERROR-level entry from the Graphiti adapter naming the failing `graph_id`. + - _Requirements: 2.1, 2.2, 2.4, 2.6_ + - _Boundary: graphiti_adapter._GraphNamespace.add_batch_ + +- [x] 2. Document the Ollama embedder configuration +- [x] 2.1 (P) Add a commented Ollama embedder block to `.env.example` + - Append three commented environment-variable lines configuring the existing `EMBEDDING_BASE_URL`, `EMBEDDING_API_KEY`, and `EMBEDDING_MODEL` for an Ollama deployment with `mxbai-embed-large`. + - Include a short comment explaining the prerequisite step (`ollama pull mxbai-embed-large`) and the rationale for `mxbai-embed-large` over `nomic-embed-text` (1024-dim vs 768-dim, must match Graphiti's default `EMBEDDING_DIM`). + - Use `http://host.docker.internal:11434/v1` as the base URL example so the snippet works from inside the `mirofish` container; mention that host-mode (`npm run dev`) operators can substitute `http://localhost:11434/v1`. + - Set the example `EMBEDDING_API_KEY` to a non-empty placeholder string (Ollama ignores the value but `OpenAIEmbedderConfig` requires it to be non-empty). + - Leave the existing OpenAI/Gemini commented examples untouched — the Ollama block is additive. + - Observable completion: a fresh `cp .env.example .env` followed by uncommenting only the three Ollama lines and pulling the model in Ollama is sufficient to point the existing `openai`-provider Graphiti embedder at the local Ollama daemon. 
+ - _Requirements: 1.1_ + - _Boundary: .env.example_ + +- [x] 2.2 (P) Extend the "Required Environment Variables" section in `CLAUDE.md` + - Update the `EMBEDDING_MODEL` notes to enumerate the three supported embedder configurations: OpenAI (`text-embedding-3-small`), Gemini (`text-embedding-004`), and Ollama (`mxbai-embed-large`). + - Document the 1024-dim constraint imposed by Graphiti's default `EMBEDDING_DIM` and explicitly note that 768-dim models such as `nomic-embed-text` are unsupported until `EMBEDDING_DIM` is made configurable. + - Cross-reference `.env.example` for the Ollama-specific `EMBEDDING_BASE_URL`/`EMBEDDING_API_KEY`/`EMBEDDING_MODEL` triple instead of duplicating the values inline. + - Observable completion: a new contributor reading only `CLAUDE.md` § "Required Environment Variables" can identify all three supported embedder providers and the dim constraint without consulting external sources. + - _Requirements: 1.2, 3.3_ + - _Boundary: CLAUDE.md_ + +- [x] 2.3 (P) Add an Ollama section and `curl` smoke test to `README.md` + - In the "Required Environment Variables" block, add an Ollama example alongside the existing Gemini hint covering `EMBEDDING_BASE_URL`, `EMBEDDING_API_KEY`, and `EMBEDDING_MODEL`. + - Append a one-line `curl` snippet that POSTs to `$EMBEDDING_BASE_URL/embeddings` with the configured model and a trivial input, then pipes through `jq '.data[0].embedding | length'` to verify a `1024` response — explicitly framed as a pre-build smoke test. + - Use the same `host.docker.internal:11434` convention as `.env.example` and `docker-compose.yml`, with a short note on the `localhost` substitution for host-mode operators. + - Keep the existing copy/install steps untouched; this edit is additive within the same `## Configure Environment Variables` (or equivalent) subsection.
+ - Observable completion: an operator running the new `curl` snippet against a correctly configured Ollama daemon sees `1024` printed to stdout and can use that as a go/no-go signal before kicking off the graph build. + - _Requirements: 1.3, 1.4_ + - _Boundary: README.md_ + +- [x] 2.4 (P) Add a `host.docker.internal` comment to the `mirofish` service in `docker-compose.yml` + - Add a single comment line above (or alongside) the existing `NEO4J_URI` override in the `mirofish` service noting that an Ollama daemon running on the host is reachable from this container via `host.docker.internal:11434` and that this is the value to use for `EMBEDDING_BASE_URL` when running the Compose stack. + - Do not introduce any new service, environment variable, or volume; the change is comment-only. + - Observable completion: a reader of `docker-compose.yml` who sets up Ollama on the host can derive the correct `EMBEDDING_BASE_URL` value without consulting external Docker networking documentation. + - _Requirements: 1.3_ + - _Boundary: docker-compose.yml_ + +- [ ] 3. Manual end-to-end verification (deferred to reviewer — requires running Neo4j + LLM stack) +- [ ] 3.1 Verify the happy and failure paths through the graph-build pipeline (deferred to reviewer) + - Run `npm run dev` against the existing OpenAI/Qwen-style embedder configuration to confirm the graph-build flow still completes with real nodes/edges in Neo4j (regression check for R3.1). + - Set `EMBEDDING_MODEL` to a deliberately invalid value (e.g. `text-embedding-3-small-typo`) against the same base URL, trigger a graph build through the UI, and confirm the project exits `GRAPH_BUILDING`, the backing `Task` reaches `status = FAILED`, and `Task.error` carries the underlying 404/unknown-model message (R2.3, R2.5). Inspect the backend logs for the new ERROR-level entry from the Graphiti adapter (R2.4). 
+ - If an Ollama daemon with `mxbai-embed-large` is available, follow the documented `.env.example` snippet plus the `curl` smoke test, then run a full graph build and confirm Neo4j has nodes/edges scoped to the project's `group_id` (R1.5). + - Note in the PR body that, on a partial-batch failure, episodes successfully written before the failure remain committed in Neo4j (post-condition documented in design.md); a re-run appends rather than overwrites because Graphiti episode UUIDs are unique. + - Observable completion: PR description records the three scenarios (OpenAI happy path, deliberate-typo failure path, optional Ollama happy path) with the resulting `Task` status, an excerpt of `Task.error` for the failure case, and a link to (or extract from) the ERROR-level adapter log. + - _Depends: 1.1, 2.1, 2.2, 2.3, 2.4_ + - _Requirements: 1.5, 2.3, 2.4, 2.5, 3.1, 3.2_ + - _Boundary: end-to-end pipeline (verification only, no code change)_ + +## Requirements Coverage + +| Requirement | Tasks | +|-------------|-------| +| 1.1 | 2.1 | +| 1.2 | 2.2 | +| 1.3 | 2.3, 2.4 | +| 1.4 | 2.3 | +| 1.5 | 3.1 | +| 2.1 | 1.1 | +| 2.2 | 1.1 | +| 2.3 | 3.1 (verification — already implemented in `_build_graph_worker`) | +| 2.4 | 1.1, 3.1 | +| 2.5 | 3.1 (verification — already implemented in frontend task polling) | +| 2.6 | 1.1 | +| 3.1 | 3.1 | +| 3.2 | 3.1 | +| 3.3 | 2.2 | diff --git a/.kiro/specs/i18n-ci-guard/baseline.txt b/.kiro/specs/i18n-ci-guard/baseline.txt new file mode 100644 index 00000000..94f44463 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/baseline.txt @@ -0,0 +1,5 @@ +# Per-path CJK baseline for the i18n CI guard. +# Format: <path>\t<count>. Sorted lexicographically.
+# Refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline +backend/app 307 +frontend/src 124 diff --git a/.kiro/specs/i18n-ci-guard/design.md b/.kiro/specs/i18n-ci-guard/design.md new file mode 100644 index 00000000..d694e1f6 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/design.md @@ -0,0 +1,544 @@ +# Design — i18n-ci-guard + +## Overview + +This feature installs a permanent, PR-time CI guard that blocks +regressions of the project's English-by-default state. It performs two +checks: `locales/en.json` must contain zero CJK characters, and the +total CJK match count under `backend/app/` and `frontend/src/` must not +exceed a committed per-path baseline. The guard is a single Python +script invoked by a single GitHub Actions workflow. + +**Purpose**: This feature delivers an automatic regression gate to the +i18n initiative so reviewers do not have to spot CJK reintroductions +by eye. +**Users**: Project maintainers and PR authors. Maintainers gain a +hard regression gate; PR authors gain a script they can run locally to +catch regressions before pushing. +**Impact**: Adds the project's first `pull_request`-triggered CI +workflow. No production source under `backend/app/`, `frontend/src/`, +or `locales/` is modified by this spec — only new files are added. + +### Goals + +- Fail any PR that introduces a CJK character into `locales/en.json`. +- Fail any PR whose CJK match count under `backend/app/` or + `frontend/src/` exceeds the committed baseline. +- Print a single actionable failure message that includes the exact + command a contributor must run if the regression is intentional. +- Run end-to-end under sixty seconds on `ubuntu-latest`. +- Be reproducible verbatim on a developer machine with Python ≥3.11 + and `git`. + +### Non-Goals + +- Re-implementing the full classification pipeline from + `.kiro/specs/i18n-e2e-english-verification/` (that work belongs to + PR #27). +- Auto-updating the baseline on `main`. 
+- Translating any production source to satisfy a higher baseline. The + initial baseline is recorded against `main` and only ratchets down + over time. +- Gating commits at pre-commit time. The guard is CI-only; a future + spec may wrap it in a hook. + +## Boundary Commitments + +### This Spec Owns + +- The guard script `scripts/ci/i18n_cjk_guard.py` and its CLI + contract. +- The workflow `.github/workflows/i18n-cjk-guard.yml` and its + trigger configuration. +- The baseline file `.kiro/specs/i18n-ci-guard/baseline.txt` and its + format. +- The pass/fail semantics of both checks. + +### Out of Boundary + +- Any change to files under `backend/app/`, `frontend/src/`, or + `locales/` — except `locales/en.json` if it is found to contain CJK + during initial baseline calibration (a remediation translation would + be a separate spec/PR). +- The classification heuristics in PR #27's `classify.py`. +- Pre-commit hooks; IDE integrations; alternative scoped paths beyond + `backend/app/` and `frontend/src/`. + +### Allowed Dependencies + +- Python ≥3.11 standard library. +- `git` (for `git grep -nIP` invocation). +- `actions/checkout@v4` and `actions/setup-python@v5` from the + GitHub Actions Marketplace. + +### Revalidation Triggers + +- Adding a third scoped path → baseline file format changes; consumers + (none today) re-check. +- Changing the regex range → audit pipeline alignment must be + re-confirmed. +- Switching from `pull_request` to `merge_group` or other event → + required-status-check rules in branch protection must be re-checked. + +## Architecture + +### Existing Architecture Analysis + +- **Repo layout**: monorepo split by runtime (`backend/`, `frontend/`) + with shared `locales/` at root. The guard scopes its scan to + `backend/app/`, `frontend/src/`, and `locales/en.json`, matching the + audit pipeline's canonical scope. +- **Existing scripts pattern**: `scripts/<name>.py` for developer + tools.
The new `scripts/ci/` subdirectory introduces a clear, + CI-only home without disturbing the existing developer scripts. +- **Existing CI**: `.github/workflows/docker-image.yml` is tag-only. + No `pull_request` workflow exists. The new workflow is additive and + does not affect the docker-image workflow. + +### Architecture Pattern & Boundary Map + +```mermaid +flowchart LR + PR[Pull Request to main] -->|trigger| WF[.github/workflows/i18n-cjk-guard.yml] + WF -->|setup-python + checkout| RUN[python scripts/ci/i18n_cjk_guard.py] + RUN -->|read| EN[locales/en.json] + RUN -->|git grep -nIP| BAPP[backend/app/] + RUN -->|git grep -nIP| FSRC[frontend/src/] + RUN -->|read| BL[.kiro/specs/i18n-ci-guard/baseline.txt] + RUN -->|exit 0 or 1| WF + WF -->|status| PR + + DEV[Developer terminal] -->|python scripts/ci/i18n_cjk_guard.py| RUN + DEV -->|--update-baseline| RUN + RUN -.->|writes| BL +``` + +**Architecture Integration**: + +- **Selected pattern**: single-purpose script + thin workflow. + Matches the project's existing `scripts/<name>.py` convention. +- **Domain boundaries**: the guard is a pure verification tool with no + side effects on production code. Its only writeable surface is the + baseline file, and only when explicitly invoked with + `--update-baseline`. +- **Existing patterns preserved**: stdlib-only Python tooling + (precedent: `scripts/check_i18n_logs.py`); single-file workflows in + `.github/workflows/`. +- **New components rationale**: a new file rather than an extension of + an existing script — the existing script is scoped to a fixed + module list and is not a regression gate. +- **Steering compliance**: respects layer-based structure (script + lives at repo root in `scripts/ci/`, not under `backend/` or + `frontend/`), no new heavy dependencies, no `os.getenv` calls + outside `backend/app/config.py`.
+ +### Technology Stack + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Frontend / CLI | Python 3.11 stdlib (`argparse`, `json`, `re`, `subprocess`, `pathlib`, `sys`) | Guard CLI | Stdlib only — Req 5.5 | +| Backend / Services | n/a | — | Guard does not touch backend services | +| Data / Storage | Plain-text baseline file under `.kiro/specs/` | Per-path count store | One line per path, `<path>\t<count>` | +| Messaging / Events | n/a | — | — | +| Infrastructure / Runtime | GitHub Actions `ubuntu-latest`, `actions/checkout@v4`, `actions/setup-python@v5` | PR-time runner | `fetch-depth: 1` is sufficient | + +## File Structure Plan + +### Directory Structure + +``` +scripts/ +└── ci/ + └── i18n_cjk_guard.py # Guard CLI (new) + +.github/ +└── workflows/ + └── i18n-cjk-guard.yml # PR-time workflow (new) + +.kiro/specs/i18n-ci-guard/ +├── spec.json # (existing, updated) +├── requirements.md # (existing) +├── gap-analysis.md # (existing) +├── research.md # (existing) +├── design.md # (this file) +├── tasks.md # (created in next phase) +└── baseline.txt # Per-path CJK match counts (new) +``` + +### Modified Files + +- `.kiro/specs/i18n-ci-guard/spec.json` — phase / approval fields + updated by Kiro flow only. +- No production source files are modified by this spec.
+ +## System Flows + +### Guard execution (default mode) + +```mermaid +sequenceDiagram + participant CI as GitHub Actions + participant Script as i18n_cjk_guard.py + participant Repo as Working tree + participant BL as baseline.txt + + CI->>Script: python scripts/ci/i18n_cjk_guard.py + Script->>Repo: read locales/en.json + Script->>Script: scan for CJK chars + alt en.json has CJK + Script-->>CI: exit 1 + per-key findings + else en.json clean + Script->>Repo: git grep -nIP backend/app/ + Script->>Repo: git grep -nIP frontend/src/ + Script->>BL: read baseline counts + alt any current count > baseline + Script-->>CI: exit 1 + per-path delta + refresh hint + else within baseline + Script-->>CI: exit 0 + summary + end + end +``` + +### Baseline refresh + +```mermaid +sequenceDiagram + participant Dev as Developer + participant Script as i18n_cjk_guard.py + participant Repo as Working tree + participant BL as baseline.txt + + Dev->>Script: python scripts/ci/i18n_cjk_guard.py --update-baseline + Script->>Repo: git grep -nIP backend/app/ + Script->>Repo: git grep -nIP frontend/src/ + Script->>BL: write per-path counts (sorted) + Script-->>Dev: exit 0 + new counts +``` + +The two checks run in fixed order: en.json first (cheap, decisive), +then per-path counts. Both run under all conditions; the script does +not short-circuit after the first failure so the contributor sees the +complete diagnostic in one CI log. The `alt`/`else` nesting in the +default-mode diagram above is a simplification and does not imply an +early exit after a locale failure. 
+ +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1 | Scan en.json for CJK | `i18n_cjk_guard.py` | CLI default mode | Guard execution | +| 1.2 | Fail with key:line per offender | `i18n_cjk_guard.py` | CLI stderr output | Guard execution | +| 1.3 | Report clean state | `i18n_cjk_guard.py` | CLI stdout summary | Guard execution | +| 1.4 | Hard error if file missing | `i18n_cjk_guard.py` | CLI stderr + exit 1 | Guard execution | +| 2.1 | Count CJK matches per scoped path | `i18n_cjk_guard.py` | `git grep -nIP` invocation | Guard execution | +| 2.2 | Read baseline counts | `i18n_cjk_guard.py`, `baseline.txt` | File read | Guard execution | +| 2.3 | Fail on regression | `i18n_cjk_guard.py` | Exit 1 | Guard execution | +| 2.4 | Pass when within baseline | `i18n_cjk_guard.py` | Exit 0 | Guard execution | +| 2.5 | Skip binary files | `git grep -I` | — | Guard execution | +| 2.6 | Tracked-only scope | `git grep` default | — | Guard execution | +| 3.1 | Per-key locale failure detail | `i18n_cjk_guard.py` | CLI stderr lines | Guard execution | +| 3.2 | Per-path regression detail | `i18n_cjk_guard.py` | CLI stderr lines | Guard execution | +| 3.3 | Print refresh command | `i18n_cjk_guard.py` | CLI stderr footer | Guard execution | +| 3.4 | Success summary lines | `i18n_cjk_guard.py` | CLI stdout | Guard execution | +| 4.1 | Baseline under spec dir | `baseline.txt` | File path | — | +| 4.2 | Diff-friendly text format | `baseline.txt` | File format | — | +| 4.3 | Refresh via flag | `i18n_cjk_guard.py` | `--update-baseline` | Baseline refresh | +| 4.4 | No implicit baseline writes | `i18n_cjk_guard.py` | CLI default mode | Guard execution | +| 4.5 | Hard error if baseline missing | `i18n_cjk_guard.py` | Exit 1 + message | Guard execution | +| 5.1 | PR-only trigger to main | `i18n-cjk-guard.yml` | `on.pull_request.branches` | — | +| 5.2 | Checkout PR head | 
`i18n-cjk-guard.yml` | `actions/checkout@v4` | — | +| 5.3 | Surface output on failure | `i18n-cjk-guard.yml` | Default GH log | — | +| 5.4 | Pass on exit 0 | `i18n-cjk-guard.yml` | Default | — | +| 5.5 | Stdlib-only, no third-party | `i18n_cjk_guard.py`, `i18n-cjk-guard.yml` | — | — | +| 5.6 | ≤60s runtime | `i18n-cjk-guard.yml` | `timeout-minutes: 1` | — | +| 6.1 | Same result locally | `i18n_cjk_guard.py` | CLI | — | +| 6.2 | Single stable entry point | `scripts/ci/i18n_cjk_guard.py` | Path | — | +| 6.3 | No env vars / secrets | `i18n_cjk_guard.py` | CLI | — | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies | Contracts | +|-----------|--------------|--------|--------------|------------------|-----------| +| `i18n_cjk_guard.py` | CI script | Two-check guard CLI | 1.1–6.3 | `git`, Python stdlib | Service (CLI) | +| `i18n-cjk-guard.yml` | CI workflow | Run guard on every PR to main | 5.1–5.6 | `actions/checkout@v4`, `actions/setup-python@v5` | Batch / Job | +| `baseline.txt` | Data | Per-path baseline counts | 4.1, 4.2, 2.2 | — | State (file) | + +### CI Script + +#### `i18n_cjk_guard.py` + +| Field | Detail | +|-------|--------| +| Intent | Run two CJK-regression checks; optionally refresh the baseline | +| Requirements | 1.1, 1.2, 1.3, 1.4, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 3.1, 3.2, 3.3, 3.4, 4.1, 4.3, 4.4, 4.5, 5.5, 6.1, 6.2, 6.3 | +| Owner / Reviewers | i18n maintainers | + +**Responsibilities & Constraints** + +- Owns the canonical guard semantics: which paths are scoped, which + regex is canonical, what counts as a regression. +- Runs in pure Python 3.11 stdlib + a single `git` subprocess per + scoped path. +- Never modifies any file other than the baseline file, and only when + invoked with `--update-baseline`. +- Always runs both checks (does not short-circuit), so a single CI log + shows every failure mode at once. 
+ +**Dependencies** + +- Inbound: `i18n-cjk-guard.yml` workflow; developers running locally. +- Outbound: `git` subprocess (`git grep`, `git rev-parse`). +- External: none. + +**Contracts**: Service [x] / API [ ] / Event [ ] / Batch [ ] / State [x] + +##### Service Interface (CLI) + +```text +i18n_cjk_guard.py [--update-baseline] [--baseline PATH] [--repo-root PATH] +``` + +Type-annotated module signature (Python type hints, public functions +only): + +```python +def main(argv: list[str]) -> int: ... + +def run_check(repo_root: pathlib.Path, baseline_path: pathlib.Path) -> int: + """Run both checks; return 0 on success, 1 on any failure.""" + +def update_baseline(repo_root: pathlib.Path, baseline_path: pathlib.Path) -> int: + """Refresh the baseline file with current per-path counts; return 0.""" + +def scan_locale_cjk(en_json_path: pathlib.Path) -> list[LocaleFinding]: + """Return a list of (key, line_number, snippet) tuples for every + CJK occurrence in locales/en.json. Empty list when clean.""" + +def count_path_cjk(repo_root: pathlib.Path, scoped_path: str) -> int: + """Return the number of CJK match lines under scoped_path, + using `git grep -nIP '[\\x{4e00}-\\x{9fff}]' -- <scoped_path>`.""" + +def read_baseline(baseline_path: pathlib.Path) -> dict[str, int]: + """Parse the baseline file. Each non-empty, non-comment line is + '<path>\\t<count>'. Raise BaselineError on any malformed input + or missing file.""" + +def write_baseline(baseline_path: pathlib.Path, counts: dict[str, int]) -> None: + """Atomically overwrite the baseline file with sorted entries + and a single trailing newline.""" +``` + +Where: + +```python +LocaleFinding = tuple[str, int, str] # (dotted_key, line_number, snippet) +SCOPED_PATHS: tuple[str, ...] 
= ("backend/app", "frontend/src") +EN_JSON_REL_PATH: str = "locales/en.json" +CJK_PATTERN: str = "[\\x{4e00}-\\x{9fff}]" # passed to git grep -P +CJK_RE: re.Pattern[str] = re.compile(r"[一-鿿]") +SNIPPET_MAX_LEN: int = 80 +``` + +- **Preconditions**: invoked with CWD at the repo root or + `--repo-root` set; `git` is on `$PATH`; the working tree is the + intended scan target. +- **Postconditions** (default mode): exit 0 iff both checks pass; + exit 1 otherwise. Stdout receives the success summary; stderr + receives findings on failure. The baseline file is unchanged. +- **Postconditions** (`--update-baseline`): the baseline file is + rewritten to current per-path counts and exit 0 is returned. +- **Invariants**: regex range, scoped paths, and baseline file path + are constants — no env-var override. + +##### State Management + +- **State model**: a dict `{<scoped_path>: <count>}` parsed from + the baseline file. +- **Persistence**: plain-text file at + `.kiro/specs/i18n-ci-guard/baseline.txt`. Atomic write via + `tmp + os.replace`. +- **Concurrency**: single-writer (developer running + `--update-baseline`); CI workers only read. + +**Implementation Notes** + +- Output format mirrors `scripts/check_i18n_logs.py`: + `<path>:<line>: <code>: <message>` on stderr, summary on stdout, + trailing `OK` or `N issues`. +- The exact refresh command printed on regression failure is: + `python scripts/ci/i18n_cjk_guard.py --update-baseline`. +- `count_path_cjk` invokes `git grep` via `subprocess.run` with + `check=False`; `git grep` exits 1 when there are zero matches, so + the function treats exit codes 0 and 1 as success and any other + code as a hard error. +- Localised key extraction for `en.json` walks the parsed JSON dict; + line numbers are obtained by re-reading the file as text and + matching the value's first textual occurrence. +- Risks: see `research.md` § Risks & Mitigations. 
+ +### CI Workflow + +#### `i18n-cjk-guard.yml` + +| Field | Detail | +|-------|--------| +| Intent | Run the guard on every PR to `main` | +| Requirements | 5.1, 5.2, 5.3, 5.4, 5.5, 5.6 | +| Owner / Reviewers | i18n maintainers | + +**Contracts**: Batch / Job [x] + +##### Batch / Job Contract + +- **Trigger**: `on: pull_request: branches: [main]`. +- **Input / validation**: PR head ref checkout via + `actions/checkout@v4` with `fetch-depth: 1`. Python set up via + `actions/setup-python@v5` with `python-version: '3.11'`. +- **Output / destination**: pass/fail status surfaced as a GitHub + Actions check on the PR. Script stdout/stderr appears in the + workflow log. +- **Idempotency & recovery**: re-running the workflow re-evaluates the + same working tree; no persistent side effects on the runner. + +##### Workflow shape (sketch) + +```yaml +name: i18n CJK Guard +on: + pull_request: + branches: [main] +jobs: + guard: + runs-on: ubuntu-latest + timeout-minutes: 1 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - run: python scripts/ci/i18n_cjk_guard.py +``` + +### Baseline Data File + +#### `baseline.txt` + +| Field | Detail | +|-------|--------| +| Intent | Persist the per-path CJK match-count baseline | +| Requirements | 2.2, 4.1, 4.2 | + +**Contracts**: State [x] + +##### Format + +```text +# Per-path CJK baseline for the i18n CI guard. +# Format: <path>\t<count>. Sorted lexicographically. +# Refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline +backend/app\t<count> +frontend/src\t<count> +``` + +- One header block of `#`-prefixed comments (parser ignores). +- Blank lines ignored. +- Lines must match `^(?P<path>[^\t\n]+)\t(?P<count>\d+)$`. +- Trailing newline mandatory. + +## Data Models + +### Domain Model + +- `LocaleFinding` — value object + `(dotted_key: str, line_number: int, snippet: str)`. +- `PathCount` — pair `(scoped_path: str, count: int)`. 
The full + baseline is a `dict[str, int]` keyed by scoped path. + +Invariants: + +- `count` is a non-negative integer. +- `scoped_path` is one of `SCOPED_PATHS`. +- `LocaleFinding.snippet` is at most `SNIPPET_MAX_LEN` characters, + truncated with an ellipsis when needed. + +## Error Handling + +### Error Strategy + +- All non-zero exits are accompanied by a stderr message identifying + the failing check, the offending file or path, and (for regressions) + the refresh command. The script never raises uncaught exceptions + past `main()` in normal flow; unexpected I/O errors propagate as + `OSError` with a clear traceback so CI logs surface them clearly. + +### Error Categories and Responses + +- **Locale failure** (Req 1.2): one stderr line per offending key + (`locales/en.json:<line>: cjk-in-en: <key> = <snippet>`), then a + trailing `N issues` summary. +- **Regression failure** (Req 3.2): one stderr line per regressed + path (`<path>: cjk-regression: baseline=<n> current=<m> delta=+<d>`) + followed by a one-line refresh hint: + `# refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline`. +- **Missing en.json** (Req 1.4): stderr `locales/en.json: missing + catalogue file`, exit 1. +- **Missing or malformed baseline** (Req 4.5): stderr + `<baseline_path>: missing or malformed; refresh via …`, exit 1. +- **`git grep` unavailable / non-PCRE**: stderr + `git grep failed: <stderr>`, exit 1. + +### Monitoring + +- The guard is a single short-lived script. All observability is + delegated to GitHub Actions logs (stdout/stderr, run duration). + No external telemetry. + +## Testing Strategy + +### Unit Tests (Python) + +Place tests under `scripts/ci/tests/test_i18n_cjk_guard.py` (or invoke +the script directly via subprocess in a tmp git repo). The project's +test runner is `pytest` (already used by `backend/`), but the new +tests must be runnable with `python -m pytest` from the repo root +without backend dependencies. Tests are scoped to: + +1. 
`scan_locale_cjk` — clean catalogue returns empty list; planted CJK + value returns a single `LocaleFinding` with the correct key and + line number. +2. `count_path_cjk` — given a tmp git repo with N planted CJK lines, + returns N; binary file matches are excluded; untracked file + matches are excluded. +3. `read_baseline` / `write_baseline` round-trip — write counts, + re-read, equal. +4. `read_baseline` malformed input — non-tab line → `BaselineError`. +5. `run_check` end-to-end — passing baseline → exit 0; regressed + baseline → exit 1 and stderr contains the refresh command. + +### Integration Tests + +1. Workflow shape — `actionlint` (optional, if installed locally) on + `i18n-cjk-guard.yml`. At minimum, `python -c "import yaml; + yaml.safe_load(open('.github/workflows/i18n-cjk-guard.yml'))"` for + YAML validity. +2. Local end-to-end — run + `python scripts/ci/i18n_cjk_guard.py` from the repo root with the + committed baseline; expect exit 0 on a clean checkout of `main`. +3. Refresh end-to-end — run with `--update-baseline`; verify + baseline file is rewritten and a second default run is exit 0. + +### Performance / Load + +- Single-pass `git grep` over the scoped paths runs in <2 s on the + current repo. The workflow's `timeout-minutes: 1` is a hard ceiling + per Req 5.6. + +## Optional Sections + +### Security Considerations + +- The guard reads only tracked text files; no secrets are accessed. +- The workflow uses `GITHUB_TOKEN` only implicitly via + `actions/checkout`; no additional permissions are requested + (the workflow's `permissions:` block grants only + `contents: read`, which is sufficient). 
diff --git a/.kiro/specs/i18n-ci-guard/gap-analysis.md b/.kiro/specs/i18n-ci-guard/gap-analysis.md new file mode 100644 index 00000000..15bc37de --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/gap-analysis.md @@ -0,0 +1,169 @@ +# Gap Analysis — i18n-ci-guard + +Comparison of the approved requirements against the current MiroFish +codebase, focused on what already exists, what is missing, and what +options the design phase should choose between. + +## 1. Current State Investigation + +### Domain assets already in the repo + +- **`scripts/check_i18n_logs.py`** — Python-stdlib-only, exit-code-based + i18n verification script. Uses the same canonical CJK regex + `[一-鿿]` (`U+4E00..U+9FFF`) the new guard needs, prints findings as + `<path>:<line>: <code>: <message>`, and was written for ticket #6. + Strong precedent for the new guard's CLI surface and output format. +- **`scripts/_apply_translations.py`, `scripts/_codemod_i18n.py`, + `scripts/_merge_locale_keys.py`** — i18n tooling sibling scripts. + Convention is to keep auxiliary i18n scripts under `scripts/` at the + repo root. +- **`.github/workflows/docker-image.yml`** — only existing GH Actions + workflow; triggers on tag pushes and `workflow_dispatch`. No PR-time + workflow exists yet, so the new guard introduces the project's first + PR-blocking CI check. +- **PR #27 / branch `chore/i18n-10-e2e-english-verification`** — defines + the audit methodology referenced by the ticket. Its `audit_cjk.sh` + uses `git grep -nIP '[\x{4e00}-\x{9fff}]' -- backend/app frontend/src + locales/en.json` — the canonical scoped scan command. PR #27 is open; + the new guard must work with or without it merged. +- **`.kiro/specs/<feature>/`** — established home for spec artefacts. + `i18n-externalize-backend-logs/` is the closest precedent for an + i18n-flavoured spec. +- **`locales/en.json`, `locales/zh.json`, `locales/languages.json`** — + shared i18n source consumed by both runtimes. 
+ +### Conventions extracted + +- Auxiliary scripts: `scripts/<name>.py`, Python ≥3.11 stdlib only, + shebang `#!/usr/bin/env python3`, double-quoted strings, snake_case, + Google-style docstrings on the module and public functions. +- Output format: `<path>:<line>: <code>: <message>`, summary line + `OK` or `N issues`, exit `0`/`1`. +- Reuse the canonical regex `[一-鿿]` rather than re-deriving range + literals. +- 4-space indent, ≤120 cols, no trailing whitespace, single trailing + newline (`.claude/rules/dev-guidelines.md`). + +### Integration surfaces + +- **CI**: GitHub Actions, `.github/workflows/`. `ubuntu-latest` runner, + Python 3.11+ via `actions/setup-python@v5` (use the same version + pin already present in the docker-image workflow ecosystem if any). +- **Repo layout boundaries** scoped by the audit: `backend/app/`, + `frontend/src/`, `locales/en.json` — all live at repo root or two + levels deep. +- **Git working tree**: the guard relies on `git grep -I` for tracked, + text-only matches; this binds the guard to a runner that has `git` + available (true on `ubuntu-latest` and on developer machines). + +## 2. Requirement-to-Asset Map + +| Req | Need | Existing asset | Gap | +| --- | --------------------------------- | ----------------------------------------------------------------------------------------------- | ----------- | +| 1 | CJK scan of `locales/en.json` | `scripts/check_i18n_logs.py` already loads `locales/*.json` and runs the canonical regex. | Missing — new guard must scan en.json specifically and emit `key:line` per offender. | +| 2 | CJK count under `backend/app/` and `frontend/src/` against baseline | Audit `audit_cjk.sh` (PR #27) demonstrates `git grep -nIP` is the canonical scan; no baseline file exists yet on main. | Missing — no per-path counter, no baseline file. | +| 3 | Actionable failure messaging | `check_i18n_logs.py` output format reusable. | Missing — need refresh-baseline command in failure text. | +| 4 | Baseline file lifecycle | None. 
| Missing — file format and refresh subcommand to design. | +| 5 | GH Actions PR integration | `.github/workflows/` directory exists; one tag-only workflow. | Missing — new `pull_request` workflow. | +| 6 | Local reproducibility | Existing scripts run locally with stdlib; same pattern reusable. | None — covered by following the existing pattern. | + +## 3. Implementation Approach Options + +### Option A — Extend `scripts/check_i18n_logs.py` + +Add a new `--cjk-guard` mode (catalogue scan + per-path baseline diff) +to the existing script, then call it from the new workflow. + +- ✅ One file to maintain; reuses the regex constant and CLI. +- ❌ The existing script is tightly scoped to the in-scope backend + modules and the parity check. Mixing a PR-gating regression check into + it dilutes its intent and grows it past the SRP line that the + surrounding scripts respect. +- ❌ The existing script targets a fixed list of backend modules; the + new guard scans whole subtrees. The two scopes don't fit one CLI. + +### Option B — New, focused script `scripts/ci/i18n_cjk_guard.py` + new workflow (recommended) + +A new directory `scripts/ci/` holds CI-only scripts; the guard is a +single file that performs both checks and supports a `--refresh-baseline` +flag. New workflow `.github/workflows/i18n-cjk-guard.yml` runs it on +every PR to `main`. + +- ✅ Clean separation: production-i18n script (`check_i18n_logs.py`) + and CI-gating script (`i18n_cjk_guard.py`) live side by side without + overlapping responsibilities. +- ✅ Mirrors the established convention of one script per + responsibility under `scripts/`. +- ✅ The baseline file lives under the spec dir + (`.kiro/specs/i18n-ci-guard/baseline.txt`), matching the ticket's + "baseline must be committed and reviewable" requirement. +- ❌ One more file in the repo, but the file is small (~150 LoC). 
+ +### Option C — Hybrid: shared `cjk_scan.py` helper + thin guard script + +Factor the regex + git-grep logic into a tiny shared helper consumed by +both `check_i18n_logs.py` and the new guard. + +- ✅ DRY for the regex constant. +- ❌ Premature abstraction: today the only shared element is one + one-line regex. The two scripts have different scopes, output + formats, and consumers. Pulling a helper out now satisfies + consistency without paying for itself; defer until a third caller + appears. + +### Recommendation + +**Option B**. It matches the project's established "one focused script +per responsibility" convention, isolates the new CI surface from +existing i18n scripts, and keeps the baseline file collocated with +spec metadata where reviewers expect to find it. + +## 4. Research Items for Design Phase + +- **Baseline file format**: prefer a stable, line-oriented text format + over JSON to minimize diff churn (e.g., `<path>\t<count>` per line, + trailing newline). Confirm in design. +- **`git grep` invocation portability**: `git grep -nIP` works on all + modern git builds (≥2.4 ships PCRE2). `ubuntu-latest` ships ≥2.40. + No portability concern; record the assumption explicitly. +- **`fetch-depth`** for the `actions/checkout@v4` step: `git grep` + scans the working tree, not history, so a shallow clone (`fetch-depth: + 1`) is sufficient. +- **Workflow timeout budget**: capture the empirical runtime of the + full scan locally (already measured: a single `git grep` over the + scoped paths runs in <2 seconds with ~3.6k matches). The 60-second + ceiling in Req 5 is comfortable. +- **Failure-message refresh command** wording: the design should pin + the exact command shown to contributors so it stays one stable + string developers can copy. +- **Initial baseline values**: with `git grep -nIP '[\x{4e00}-\x{9fff}]'` + on the current branch — `backend/app` = 2707, `frontend/src` = 902, + `locales/en.json` = 0. 
The committed baseline must be regenerated + against `main` at implementation time so it reflects the merge target. + +## 5. Effort & Risk + +- **Effort**: **S** (1–3 days). Small, self-contained additions + (one Python script, one workflow file, one baseline file, plus the + spec). All patterns already exist in the repo. +- **Risk**: **Low**. No production-source changes, no new dependencies, + no architectural shifts. The only failure mode is a noisy guard + blocking unrelated PRs — mitigated by the per-path baseline ratchet. + +## 6. Recommendations for Design Phase + +- Adopt **Option B** (new focused script + new workflow + baseline file + under spec dir). +- Lock in the canonical regex `[一-鿿]` and the canonical scan command + `git grep -nIP '[\x{4e00}-\x{9fff}]' -- <path>` to keep this guard + bytewise-aligned with the audit pipeline. +- Use a line-oriented baseline format keyed by scoped path; explicit + `--refresh-baseline` (or equivalent) subcommand updates it; no + implicit overwrite. +- Output: machine-friendly findings on stderr, summary on stdout, + exit `0`/`1`. +- The workflow should run only on `pull_request` to `main` (Req 5.1) + with `fetch-depth: 1` and `actions/setup-python@v5`. No third-party + packages. +- Baseline counts must be recomputed against `main` before the PR + ships; do not commit baselines from a feature branch's working tree. diff --git a/.kiro/specs/i18n-ci-guard/requirements.md b/.kiro/specs/i18n-ci-guard/requirements.md new file mode 100644 index 00000000..78eb6139 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/requirements.md @@ -0,0 +1,189 @@ +# Requirements Document + +## Project Description (Input) +Add a permanent CI guard that runs an i18n CJK audit on every pull request. + +Linked GitHub issue: #26 (.ticket/26.md). + +The guard must fail a PR build when: +1. locales/en.json contains any CJK character (range U+4E00..U+9FFF), or +2. The total count of CJK matches across backend/app/ and frontend/src/ regresses (i.e. 
exceeds) a committed baseline value. + +## Introduction + +The i18n initiative has driven the project toward English-by-default UI, logs, +prompts, and documentation. Manual audits (see PR #27, the +`i18n-e2e-english-verification` spec) have repeatedly surfaced regressions +where Chinese strings re-enter the codebase. This spec installs a permanent, +self-contained CI guard that runs on every pull request and fails the build +when (a) `locales/en.json` is no longer CJK-clean, or (b) the total CJK match +count under `backend/app/` and `frontend/src/` regresses against a committed +baseline. + +The guard is intentionally minimal: it captures the two highest-signal checks +from the larger audit pipeline so it can run on every PR with a sub-minute +budget and without depending on the (currently unmerged) verification spec. +The committed baseline lets the project ratchet down gaps over time without +blocking unrelated PRs on pre-existing CJK content. + +## Boundary Context + +- **In scope**: + - A locally runnable Python script that performs both guard checks on the + current working tree. + - A baseline file committed under the spec directory recording the + accepted CJK match counts per scoped path. + - A GitHub Actions workflow that runs the script on every pull request + targeting `main` and fails the build when either check fails. + - A clear, actionable failure message (which path regressed, baseline + value, current value, command to update the baseline). +- **Out of scope**: + - The full classification pipeline (`classify.py`, `render_report.py`, + `post_comment.sh`) from the unmerged `i18n-e2e-english-verification` + spec — those scripts perform deeper audit work and are not required + for the PR-time guard. + - Auto-updating the baseline on `main` (the baseline is a normal + reviewable file). + - Translation work itself; this spec only enforces a regression gate. 
+ - Any change to production source under `backend/app/`, `frontend/src/`, + or `locales/` apart from translations needed to satisfy the guard + against its own initial baseline. +- **Adjacent expectations**: + - PR #27 (`chore/i18n-10-e2e-english-verification`) provides the + methodology referenced here. This spec must remain functional whether + PR #27 has been merged or not. + - The guard reuses the canonical CJK regex range + `[一-鿿]` already established by that audit. + +## Requirements + +### Requirement 1: Locale-catalogue CJK cleanliness check + +**Objective:** As a maintainer of the English locale catalogue, I want every +PR to fail when `locales/en.json` reintroduces any CJK character, so that the +English catalogue stays CJK-free. + +#### Acceptance Criteria + +1. When the guard script is run from the repository root, the i18n CI Guard + shall scan the contents of `locales/en.json` for any character in the + range `U+4E00..U+9FFF`. +2. If `locales/en.json` contains at least one such character, the i18n CI + Guard shall exit with a non-zero status and report each offending + `key:line` pair on standard output. +3. While `locales/en.json` contains zero such characters, the i18n CI Guard + shall report the catalogue as CJK-clean. +4. If `locales/en.json` is missing or unreadable, the i18n CI Guard shall + exit with a non-zero status and emit an explicit error message naming + the missing file. + +### Requirement 2: Backend/frontend CJK regression check against committed baseline + +**Objective:** As a maintainer of English support across the codebase, I +want every PR to fail when the total CJK match count under `backend/app/` +or `frontend/src/` exceeds a committed baseline, so that the codebase +ratchets monotonically toward English-only without blocking PRs on +pre-existing CJK content. + +#### Acceptance Criteria + +1. 
When the guard script is run, the i18n CI Guard shall count the total + number of CJK matches (range `U+4E00..U+9FFF`, line-level, text files + only) under each of the scoped paths `backend/app/` and `frontend/src/`. +2. The i18n CI Guard shall read the baseline counts from a single + committed baseline file under the spec directory. +3. If the current count for any scoped path exceeds the baseline count for + that path, the i18n CI Guard shall exit with a non-zero status. +4. While the current count for every scoped path is less than or equal to + the baseline, the i18n CI Guard shall exit with status zero for this + check. +5. The i18n CI Guard shall ignore matches inside binary files + (image, font, archive, lockfile, or other non-text formats) by relying + on `git grep -I` semantics. +6. The i18n CI Guard shall scope its scan to tracked files only (matches + in untracked or ignored files shall not contribute to the count). + +### Requirement 3: Actionable failure messaging + +**Objective:** As a contributor whose PR was rejected by the guard, I want +the failure message to tell me exactly what regressed and how to fix it, +so that I can either translate the offending content or — when intentional — +update the baseline through normal review. + +#### Acceptance Criteria + +1. If the locale-catalogue check fails, the i18n CI Guard shall print, for + each offending entry: the dotted catalogue key, the line number in + `locales/en.json`, and a truncated snippet of the value. +2. If the regression check fails, the i18n CI Guard shall print, for each + regressed scoped path: the path name, the baseline count, the current + count, and the delta. +3. If the regression check fails, the i18n CI Guard shall print the exact + shell command a contributor must run locally to refresh the baseline + file so the PR can be re-reviewed against the new value. +4. 
The i18n CI Guard shall print, on success, a one-line summary per check + confirming the catalogue is CJK-clean and the per-path counts are at or + below baseline. + +### Requirement 4: Baseline file lifecycle + +**Objective:** As a reviewer enforcing English support, I want the baseline +to live in the repository as a small, human-readable file that only changes +through code review, so that downward ratcheting is intentional and +auditable. + +#### Acceptance Criteria + +1. The i18n CI Guard shall store the baseline as a single committed file + under `.kiro/specs/i18n-ci-guard/`. +2. The baseline file shall record one count per scoped path, in a stable, + diff-friendly text format (no JSON line shuffling, no trailing + whitespace). +3. When the guard script is invoked with an explicit "refresh baseline" + subcommand or flag, the i18n CI Guard shall overwrite the baseline file + with the current per-path counts and exit with status zero. +4. While no refresh flag is supplied, the i18n CI Guard shall never modify + the baseline file. +5. If the baseline file is missing at check time, the i18n CI Guard shall + exit with a non-zero status and instruct the contributor to refresh it. + +### Requirement 5: GitHub Actions PR integration + +**Objective:** As a project maintainer, I want every pull request targeting +`main` to be gated by the guard, so that no merge silently regresses the +English-only state of the catalogue or codebase. + +#### Acceptance Criteria + +1. The i18n CI Guard workflow shall trigger on every `pull_request` event + whose base ref is `main`. +2. While the workflow runs, the i18n CI Guard shall check out the PR head + commit with full history sufficient for `git grep` to scan tracked + files. +3. When the guard script exits with non-zero status, the workflow shall + fail and surface the script's standard output and standard error in the + GitHub Actions log. +4. When the guard script exits with status zero, the workflow shall pass. +5. 
The workflow shall use only Python from the standard + `actions/setup-python` distribution and tools already available on the + GitHub-hosted `ubuntu-latest` runner (`bash`, `git`); it shall not + install third-party Python packages. +6. The workflow shall complete within sixty seconds of wall-clock time on + a clean `ubuntu-latest` runner. + +### Requirement 6: Local reproducibility + +**Objective:** As a developer preparing a PR, I want to run the same guard +locally before pushing, so that I can catch regressions before CI does. + +#### Acceptance Criteria + +1. When the guard script is invoked from a developer machine that has + Python 3.11 or newer and `git` available, the i18n CI Guard shall + produce the same pass/fail result and the same per-path counts that + it would produce in CI for the same working tree. +2. The i18n CI Guard shall expose a single, stable invocation entry point + (a script under `scripts/ci/`) documented in the spec's design and + README touchpoints. +3. The i18n CI Guard shall require zero environment variables or secrets + to run locally. diff --git a/.kiro/specs/i18n-ci-guard/research.md b/.kiro/specs/i18n-ci-guard/research.md new file mode 100644 index 00000000..65171669 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/research.md @@ -0,0 +1,175 @@ +# Research & Design Decisions — i18n-ci-guard + +## Summary +- **Feature**: `i18n-ci-guard` +- **Discovery Scope**: Simple Addition (one Python script + one GH Actions + workflow + one baseline file). Extension-flavoured because it builds on + established `scripts/` conventions and the canonical CJK regex used by + the larger audit pipeline. +- **Key Findings**: + - The canonical CJK match command `git grep -nIP '[\x{4e00}-\x{9fff}]' + -- <path>` is already used by the unmerged audit pipeline (PR #27) + and is portable on every git ≥2.4 (`ubuntu-latest` ships ≥2.40). 
+ - `scripts/check_i18n_logs.py` is a strong CLI/style precedent: + Python-stdlib-only, exit `0`/`1`, output as `:: + : `, canonical regex `[一-鿿]`. + - The repository has no existing `pull_request`-triggered GH Actions + workflow; this guard introduces the first one. The only existing + workflow (`.github/workflows/docker-image.yml`) runs on tag pushes + only. + - Current per-path counts on this branch: + `backend/app=2707, frontend/src=902, locales/en.json=0`. These are + sample counts; the committed baseline must be regenerated against + `main` at implementation time. + +## Research Log + +### Canonical scan command +- **Context**: Requirement 2 needs a stable per-path CJK count and + Requirement 5.5 forbids third-party packages. +- **Sources Consulted**: + - `audit_cjk.sh` from PR #27 commit `3481408`. + - `git grep` man page. +- **Findings**: + - `git grep -nIP '[\x{4e00}-\x{9fff}]' -- <path>` returns one match + per matching line in tracked, text-only files. `-I` excludes binary + files; `-P` enables PCRE2 so the `\x{...}` Unicode range works. + - This matches the input format consumed by the existing audit + classifier, so the guard's match counts are directly comparable + across pipelines. +- **Implications**: + - The guard re-uses this exact command; no new dependencies. + - Because `-I` skips binary files and tracked-only is the default, + Requirements 2.5 and 2.6 are satisfied by the command itself + rather than by additional script logic. + +### Baseline file format +- **Context**: Requirement 4 needs a diff-friendly committed baseline. +- **Sources Consulted**: + - Diff churn behaviour of JSON vs. line-oriented text in this repo's + history (e.g. `locales/*.json` PR diffs frequently re-key, while + plain-text `parity.txt` from PR #27 reads cleanly). +- **Findings**: + - Line-oriented `<path>\t<count>` files produce minimal diffs and + require no JSON parser. 
+ - A two-line file (one per scoped path) is large enough to be + self-explanatory and small enough to never line-shuffle. +- **Implications**: + - Use plain text, sorted by path, single trailing newline. Reject + the file as malformed if the script cannot parse it (Req 4.5). + +### Locale-catalogue scan path +- **Context**: Requirement 1 wants `key:line` per CJK offender in + `locales/en.json`. +- **Sources Consulted**: + - `scripts/check_i18n_logs.py` (`flatten_keys` reuse pattern). + - `check_parity.py` from PR #27 (`flatten`, `[cjk-in-en]` block). +- **Findings**: + - Both precedents flatten the locale dict and run the canonical + regex against each leaf string value. Line numbers are derivable + by re-reading the file as text and matching the value's first + occurrence (good enough for an actionable error message). + - Empty-string values and non-string leaf values (booleans, null) + are skipped. +- **Implications**: + - Implement a tiny flatten-then-scan helper inside the guard + script; do not add a new shared utility module. + +### GH Actions trigger and budget +- **Context**: Requirements 5.1, 5.5, 5.6. +- **Sources Consulted**: + - GitHub-hosted runners reference (`ubuntu-latest`). + - `actions/setup-python@v5` README. +- **Findings**: + - `ubuntu-latest` has Python 3.10+ pre-installed; `actions/setup-python@v5` + pins to 3.11 in <5 s. + - A single `git grep` over the scoped paths runs in <2 s on this + repo (~3.6k matches). End-to-end the workflow comfortably fits + inside the 60 s ceiling. +- **Implications**: + - Use `actions/checkout@v4` with `fetch-depth: 1`, + `actions/setup-python@v5` with `python-version: '3.11'`, and run + the script directly. No caching layer needed. + +## Architecture Pattern Evaluation + +| Option | Description | Strengths | Risks / Limitations | Notes | +|--------|-------------|-----------|---------------------|-------| +| A. 
Extend `check_i18n_logs.py` | Add `--cjk-guard` mode to existing script | Reuses one file | Conflates two scopes; existing script is module-scoped, guard is subtree-scoped | Rejected | +| B. New `scripts/ci/i18n_cjk_guard.py` + new workflow | Single-purpose script + workflow + baseline file | Clean SRP; matches "one script per responsibility" precedent | One additional file | **Selected** | +| C. Shared `cjk_scan.py` helper + thin guard | Factor regex/git-grep into helper | DRY for regex constant | Premature abstraction; only one shared symbol today | Rejected | + +## Design Decisions + +### Decision: Single-purpose CI script + GH Actions workflow (Option B) +- **Context**: Requirements 1–6 demand a small, self-contained guard. +- **Alternatives Considered**: A (extend), C (shared helper). +- **Selected Approach**: New script `scripts/ci/i18n_cjk_guard.py`, + new workflow `.github/workflows/i18n-cjk-guard.yml`, baseline file + `.kiro/specs/i18n-ci-guard/baseline.txt`. +- **Rationale**: Matches the project's "one focused script per + responsibility" convention; isolates a CI-blocking surface from the + existing i18n developer scripts; keeps the baseline collocated with + the spec for review traceability. +- **Trade-offs**: One more file in `scripts/` vs. tighter cohesion. +- **Follow-up**: When a third caller wants the canonical regex, factor + it out then. + +### Decision: Plain-text baseline format +- **Context**: Requirement 4.2 demands stable, diff-friendly format. +- **Alternatives Considered**: JSON, YAML. +- **Selected Approach**: One line per scoped path: `<path>\t<count>`, + sorted lexicographically by path, single trailing newline. +- **Rationale**: Zero parser dependency; predictable diffs; trivial + to refresh atomically. +- **Trade-offs**: Less expressive than JSON (no nested structure), but + the data model is two integers — nesting is unnecessary. 
+ +### Decision: Refresh via `--update-baseline` subcommand-style flag +- **Context**: Requirement 4.3 needs an explicit refresh path. +- **Alternatives Considered**: Separate `update_baseline.py` script; + Makefile target. +- **Selected Approach**: Single script with two modes: default (check + + exit 0/1) and `--update-baseline` (overwrite baseline + exit 0). +- **Rationale**: One CLI surface to remember; the failure message + prints the exact command to run. +- **Trade-offs**: Slightly more conditional logic in one script; + acceptable given the small total LoC. + +### Decision: Workflow runs only on `pull_request` to `main` +- **Context**: Requirement 5.1. +- **Alternatives Considered**: Run on `push` to all branches as well; + run on `pull_request` to any base branch. +- **Selected Approach**: `on.pull_request.branches: [main]` only. +- **Rationale**: Aligns with how the existing project uses `main` as + the protected branch (see `gh pr list` history; every feature PR + targets `main`). Avoids redundant runs on intra-branch chains. +- **Trade-offs**: A direct push to `main` would not be guarded — but + branch protection already discourages that path (per + `dev-guidelines.md`). + +## Risks & Mitigations + +- **Risk**: Baseline drifts upward unintentionally during + `--update-baseline` runs, hiding real regressions. + - *Mitigation*: Failure message instructs contributors to refresh + *only when intentional*; the baseline file is reviewed in the same + PR diff. Acceptance Criteria 3.3 makes this explicit. +- **Risk**: `git grep -P` not built with PCRE on a developer's local + git build (rare on Linux/macOS, possible on minimal Windows builds). + - *Mitigation*: The guard prints a clear error if `git grep` exits + non-zero with PCRE mode; documents Python ≥3.11 + git ≥2.20 as + prerequisites. +- **Risk**: Baseline counts captured on a feature branch include + changes not yet on `main`, mis-anchoring the ratchet. 
+ - *Mitigation*: The implementation task explicitly recomputes + baseline against `origin/main` before committing; documented in + `tasks.md`. + +## References +- PR #27 audit pipeline (`audit_cjk.sh`, `check_parity.py`, + `classify.py`) — methodology source of truth. +- `scripts/check_i18n_logs.py` — CLI/style precedent. +- `git grep` man page — `-n`, `-I`, `-P` flag semantics. +- GitHub Actions `actions/setup-python@v5` and `actions/checkout@v4` + README pages. diff --git a/.kiro/specs/i18n-ci-guard/spec.json b/.kiro/specs/i18n-ci-guard/spec.json new file mode 100644 index 00000000..3a251576 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/spec.json @@ -0,0 +1,24 @@ +{ + "feature_name": "i18n-ci-guard", + "created_at": "2026-05-08T00:25:37Z", + "updated_at": "2026-05-08T00:40:00Z", + "language": "en", + "phase": "tasks-generated", + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true, + "ticket": "26", + "ticket_url": "https://github.com/salestech-group/MiroFish/issues/26" +} diff --git a/.kiro/specs/i18n-ci-guard/tasks.md b/.kiro/specs/i18n-ci-guard/tasks.md new file mode 100644 index 00000000..cf5e6ad1 --- /dev/null +++ b/.kiro/specs/i18n-ci-guard/tasks.md @@ -0,0 +1,157 @@ +# Implementation Tasks — i18n-ci-guard + +> Approved spec: see `requirements.md`, `design.md`, `research.md`, +> `gap-analysis.md` in this directory. + +## Tasks + +- [x] 1. Foundation: scaffold the CI guard script with stable CLI surface and stdlib-only dependencies +- [x] 1.1 Create the empty guard script and CLI skeleton + - Place the new script at the path designated by the design (`scripts/ci/`). 
+ - Establish the module docstring, the canonical CJK regex constant, the + scoped-paths constant tuple, and the `argparse` parser exposing default + check mode plus an explicit `--update-baseline` flag and a + `--baseline` path override. + - Confirm the script exits 0 on a smoke `--help` invocation and rejects + unknown flags with non-zero exit. + - Observable: running `python scripts/ci/i18n_cjk_guard.py --help` from + the repo root prints usage text containing every documented flag and + exits 0; running with an unknown flag exits non-zero. + - _Requirements: 5.5, 6.2, 6.3_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2. Core: implement the two CJK checks +- [x] 2.1 Implement the locale-catalogue scan + - Recursively walk the parsed `locales/en.json` dict, applying the + canonical regex to every string leaf to gather offending entries. + - Compute the source line number by re-reading the file as text and + matching the value's first textual occurrence; truncate snippets to + the documented snippet length. + - On a missing or unreadable catalogue file, emit a clear stderr + message and exit non-zero. + - Observable: against a synthetic clean catalogue, the function returns + an empty list; against a synthetic catalogue with one CJK value, it + returns exactly one finding tuple with the correct dotted key and + line number. + - _Requirements: 1.1, 1.2, 1.3, 1.4, 3.1_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2.2 (P) Implement the per-path CJK count via `git grep` + - Invoke `git grep -nIP '[\x{4e00}-\x{9fff}]' -- <path>` for each + scoped path; treat exit codes 0 (matches found) and 1 (no matches) as + success, any other exit code as a hard error reported on stderr. + - Count lines of stdout; the result for a zero-match path must be the + integer `0`, never an exception. + - Reject working-tree states where `git` is not available or PCRE is + not enabled, with a clear stderr message. 
+ - Observable: against a tmp git repository with N planted CJK lines + under a scoped path, the function returns N; with zero CJK content, + it returns 0; binary files and untracked files do not contribute. + - _Requirements: 2.1, 2.4, 2.5, 2.6_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 2.3 Implement baseline file read/write with strict format + - Parse the baseline file as `<path>\t<count>` lines, ignoring `#` + comments and blank lines, raising a typed error on malformed input + or missing file. + - Write atomically (`tmp + os.replace`) with sorted entries, a single + header comment block, and a single trailing newline. + - Observable: a round-trip write/read of a deterministic counts dict + yields the same dict; a baseline file containing a non-tab line is + rejected with a clear error; the baseline file ends with exactly one + `\n`. + - _Requirements: 4.2, 4.3_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 3. Integration: wire the two checks into the default and refresh modes +- [x] 3.1 Compose the default check mode + - Run both checks under all conditions (do not short-circuit), so a + single CI log shows every failure in one pass. + - Print a one-line success summary per check on stdout when both pass. + - On locale failure, print `:: : ` lines + on stderr and a trailing `N issues` summary; on regression failure, + print `: cjk-regression: baseline= current= delta=+` + lines plus the exact verbatim refresh command. + - Surface a non-zero exit when either check fails and exit 0 only when + both pass. + - Observable: against a working tree with the committed baseline at or + above the current count and a CJK-clean en.json, exit code is 0 and + stdout contains the success summary; planting one CJK char in + en.json or planting enough new CJK lines to break the baseline + yields exit 1 and the documented stderr text. 
+ - _Requirements: 1.2, 1.3, 1.4, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.4, 4.5_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 3.2 Compose the `--update-baseline` mode + - When the flag is provided, recompute current per-path counts and + overwrite the baseline file via the atomic writer; print the new + counts on stdout; exit 0. + - When the flag is absent, never write the baseline file under any + code path. + - Observable: invoking with `--update-baseline` rewrites the baseline + file's contents to match current counts and exits 0; running the + default mode immediately afterward exits 0. + - _Requirements: 4.3, 4.4_ + - _Boundary: i18n_cjk_guard.py_ + +- [x] 4. Establish the committed baseline anchored to `main` +- [x] 4.1 Capture initial baseline counts against `main` + - Operate from a tree that reflects `origin/main`'s state for the + scoped paths (e.g., a fresh checkout, a worktree at `origin/main`, + or `git checkout origin/main -- backend/app frontend/src` followed + by a clean revert) so the committed baseline does not over- or + under-count relative to the merge target. + - Run `--update-baseline` to materialize the counts; confirm the + resulting file is exactly two non-comment data lines (one per + scoped path) sorted lexicographically. + - Observable: the baseline file is committed to + `.kiro/specs/i18n-ci-guard/baseline.txt` and `python scripts/ci/i18n_cjk_guard.py` + against the same `main`-aligned tree exits 0. + - _Requirements: 4.1, 4.2_ + - _Boundary: baseline.txt_ + +- [x] 5. Wire the guard into GitHub Actions on every PR to `main` +- [x] 5.1 Add the PR-time workflow + - Create the workflow file at the path designated by the design, + triggered on `pull_request` whose base ref is `main`. + - Set explicit minimal permissions (`contents: read`), a one-minute + job timeout, `actions/checkout@v4` with `fetch-depth: 1`, and + `actions/setup-python@v5` pinned to Python 3.11. 
+ - The single executable step invokes the guard script with no + arguments; the workflow surfaces the script's stdout and stderr in + the GitHub Actions log without filtering. + - Observable: the workflow YAML parses cleanly; on a PR with no CJK + regression, the job passes; on a PR that introduces a CJK regression + or CJK in en.json, the job fails and the log shows the documented + failure messages. + - _Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 5.6_ + - _Boundary: i18n-cjk-guard.yml_ + +- [x] 6. Validation: tests and end-to-end checks +- [x] 6.1 Add unit and integration tests for the guard script + - Cover the locale scan against a synthetic clean catalogue and a + synthetic CJK-tainted catalogue, asserting findings tuples match. + - Cover the per-path counter against a tmp git repo with both N>0 + and N=0 planted CJK lines, asserting the zero-match path exits + cleanly with a count of 0. + - Cover the baseline read/write round-trip and the malformed-input + rejection path. + - Cover the default mode end-to-end (pass and fail paths) with the + expected exit codes and stderr fragments, including the verbatim + refresh command on regression failure. + - Observable: `python -m pytest scripts/ci/tests/test_i18n_cjk_guard.py` + from the repo root passes locally with stdlib-only Python. + - _Requirements: 1.1, 1.2, 1.3, 1.4, 2.1, 2.4, 2.5, 2.6, 3.3, 4.3, 4.5, 6.1, 6.3_ + - _Boundary: scripts/ci/tests/_ + +- [x] 6.2 Run the guard locally to confirm reproducibility against the committed baseline + - From a clean working tree at `main` (or a worktree at `origin/main` + + this branch's new files merged on top), invoke the guard with no + arguments and confirm exit code 0 and the success summary. + - Confirm the same command is the documented developer entry point + referenced from the failure-message refresh hint. 
+ - Observable: terminal session shows exit code 0 and the documented + one-line per-check success summary; the same script path (`scripts/ci/i18n_cjk_guard.py`) + appears verbatim in the regression-failure refresh hint. + - _Requirements: 6.1, 6.2, 6.3_ + - _Boundary: i18n_cjk_guard.py, baseline.txt_ diff --git a/.kiro/specs/i18n-e2e-english-verification/audit/9dcaecd2d27e6325bae0c53b9ab41eb86d0269cd/cjk-grep-bucketed.txt b/.kiro/specs/i18n-e2e-english-verification/audit/9dcaecd2d27e6325bae0c53b9ab41eb86d0269cd/cjk-grep-bucketed.txt new file mode 100644 index 00000000..7170dd33 --- /dev/null +++ b/.kiro/specs/i18n-e2e-english-verification/audit/9dcaecd2d27e6325bae0c53b9ab41eb86d0269cd/cjk-grep-bucketed.txt @@ -0,0 +1,2924 @@ +[backend/app] (2792 lines) +backend/app/__init__.py:2:MiroFish Backend - Flask应用工厂 +backend/app/__init__.py:8:# 抑制 multiprocessing resource_tracker 的警告(来自第三方库如 transformers) +backend/app/__init__.py:9:# 需要在所有其他导入之前设置 +backend/app/__init__.py:21: """Flask应用工厂函数""" +backend/app/__init__.py:25: # 设置JSON编码:确保中文直接显示(而不是 \uXXXX 格式) +backend/app/__init__.py:26: # Flask >= 2.3 使用 app.json.ensure_ascii,旧版本使用 JSON_AS_ASCII 配置 +backend/app/__init__.py:30: # 设置日志 +backend/app/__init__.py:33: # 只在 reloader 子进程中打印启动信息(避免 debug 模式下打印两次) +backend/app/__init__.py:43: # 启用CORS +backend/app/__init__.py:46: # 注册模拟进程清理函数(确保服务器关闭时终止所有模拟进程) +backend/app/__init__.py:52: # 请求日志中间件 +backend/app/__init__.py:66: # 注册蓝图 +backend/app/__init__.py:72: # 健康检查 +backend/app/api/__init__.py:2:API路由模块 +backend/app/api/graph.py:2:图谱相关API路由 +backend/app/api/graph.py:3:采用项目上下文机制,服务端持久化状态 +backend/app/api/graph.py:29:# 获取日志器 +backend/app/api/graph.py:34: """检查文件扩展名是否允许""" +backend/app/api/graph.py:41:# ============== 项目管理接口 ============== +backend/app/api/graph.py:46: 获取项目详情 +backend/app/api/graph.py:65: 列出所有项目 +backend/app/api/graph.py:80: 删除项目 +backend/app/api/graph.py:99: 重置项目状态(用于重新构建图谱) +backend/app/api/graph.py:109: # 重置到本体已生成状态 +backend/app/api/graph.py:127:# ============== 
接口1:上传文件并生成本体 ============== +backend/app/api/graph.py:132: 接口1:上传文件,分析生成本体定义 +backend/app/api/graph.py:134: 请求方式:multipart/form-data +backend/app/api/graph.py:136: 参数: +backend/app/api/graph.py:137: files: 上传的文件(PDF/MD/TXT),可多个 +backend/app/api/graph.py:138: simulation_requirement: 模拟需求描述(必填) +backend/app/api/graph.py:139: project_name: 项目名称(可选) +backend/app/api/graph.py:140: additional_context: 额外说明(可选) +backend/app/api/graph.py:142: 返回: +backend/app/api/graph.py:160: # 获取参数 +backend/app/api/graph.py:174: # 获取上传的文件 +backend/app/api/graph.py:182: # 创建项目 +backend/app/api/graph.py:187: # 保存文件并提取文本 +backend/app/api/graph.py:193: # 保存文件到项目目录 +backend/app/api/graph.py:204: # 提取文本 +backend/app/api/graph.py:217: # 保存提取的文本 +backend/app/api/graph.py:222: # 生成本体 +backend/app/api/graph.py:231: # 保存本体到项目 +backend/app/api/graph.py:265:# ============== 接口2:构建图谱 ============== +backend/app/api/graph.py:270: 接口2:根据project_id构建图谱 +backend/app/api/graph.py:272: 请求(JSON): +backend/app/api/graph.py:274: "project_id": "proj_xxxx", // 必填,来自接口1 +backend/app/api/graph.py:275: "graph_name": "图谱名称", // 可选 +backend/app/api/graph.py:276: "chunk_size": 500, // 可选,默认500 +backend/app/api/graph.py:277: "chunk_overlap": 50 // 可选,默认50 +backend/app/api/graph.py:280: 返回: +backend/app/api/graph.py:286: "message": "图谱构建任务已启动" +backend/app/api/graph.py:293: # 检查配置 +backend/app/api/graph.py:296: errors.append("NEO4J未配置") +backend/app/api/graph.py:301: "error": "配置错误: " + "; ".join(errors) +backend/app/api/graph.py:304: # 解析请求 +backend/app/api/graph.py:315: # 获取项目 +backend/app/api/graph.py:323: # 检查项目状态 +backend/app/api/graph.py:324: force = data.get('force', False) # 强制重新构建 +backend/app/api/graph.py:339: # 如果强制重建,重置状态 +backend/app/api/graph.py:346: # 获取配置 +backend/app/api/graph.py:351: # 更新项目配置 +backend/app/api/graph.py:355: # 获取提取的文本 +backend/app/api/graph.py:363: # 获取本体 +backend/app/api/graph.py:371: # 创建异步任务 +backend/app/api/graph.py:373: task_id = task_manager.create_task(f"构建图谱: {graph_name}") 
+backend/app/api/graph.py:376: # 更新项目状态 +backend/app/api/graph.py:381: # 启动后台任务 +backend/app/api/graph.py:385: build_logger.info(f"[{task_id}] 开始构建图谱...") +backend/app/api/graph.py:389: message="初始化图谱构建服务..." +backend/app/api/graph.py:392: # 创建图谱构建服务 +backend/app/api/graph.py:395: # 分块 +backend/app/api/graph.py:398: message="文本分块中...", +backend/app/api/graph.py:408: # 创建图谱 +backend/app/api/graph.py:411: message="创建Zep图谱...", +backend/app/api/graph.py:416: # 更新项目的graph_id +backend/app/api/graph.py:420: # 设置本体 +backend/app/api/graph.py:423: message="设置本体定义...", +backend/app/api/graph.py:428: # 添加文本(progress_callback 签名是 (msg, progress_ratio)) +backend/app/api/graph.py:451: msg_start = (f"断点续传:跳过 {skip_chunks} 个已处理块,继续处理 {remaining} 块..." +backend/app/api/graph.py:452: if skip_chunks > 0 else f"开始添加 {total_chunks} 个文本块...") +backend/app/api/graph.py:463: # 等待Zep处理完成(查询每个episode的processed状态) +backend/app/api/graph.py:466: message="等待Zep处理数据...", +backend/app/api/graph.py:480: # 获取图谱数据 +backend/app/api/graph.py:483: message="获取图谱数据...", +backend/app/api/graph.py:488: # 更新项目状态 +backend/app/api/graph.py:494: build_logger.info(f"[{task_id}] 图谱构建完成: graph_id={graph_id}, 节点={node_count}, 边={edge_count}") +backend/app/api/graph.py:496: # 完成 +backend/app/api/graph.py:500: message="图谱构建完成", +backend/app/api/graph.py:512: # 更新项目状态为失败 +backend/app/api/graph.py:513: build_logger.error(f"[{task_id}] 图谱构建失败: {str(e)}") +backend/app/api/graph.py:523: message=f"构建失败: {str(e)}", +backend/app/api/graph.py:527: # 启动后台线程 +backend/app/api/graph.py:536: "message": "图谱构建任务已启动,请通过 /task/{task_id} 查询进度" +backend/app/api/graph.py:548:# ============== 任务查询接口 ============== +backend/app/api/graph.py:553: 查询任务状态 +backend/app/api/graph.py:572: 列出所有任务 +backend/app/api/graph.py:583:# ============== 图谱数据接口 ============== +backend/app/api/graph.py:611: 获取图谱数据(节点和边)。 +backend/app/api/graph.py:612: - 有缓存且未过期:直接返回缓存,不调用 Zep +backend/app/api/graph.py:613: - 有缓存但已过期:立即返回旧缓存,后台异步刷新 
+backend/app/api/graph.py:614: - 无缓存:后台线程拉取,返回 202 让前端稍后重试 +backend/app/api/graph.py:643: 删除Zep图谱 +backend/app/api/report.py:2:Report API路由 +backend/app/api/report.py:3:提供模拟报告生成、获取、对话等接口 +backend/app/api/report.py:23:# ============== 报告生成接口 ============== +backend/app/api/report.py:28: 生成模拟分析报告(异步任务) +backend/app/api/report.py:30: 这是一个耗时操作,接口会立即返回task_id, +backend/app/api/report.py:31: 使用 GET /api/report/generate/status 查询进度 +backend/app/api/report.py:33: 请求(JSON): +backend/app/api/report.py:35: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/report.py:36: "force_regenerate": false // 可选,强制重新生成 +backend/app/api/report.py:39: 返回: +backend/app/api/report.py:46: "message": "报告生成任务已启动" +backend/app/api/report.py:62: # 获取模拟信息 +backend/app/api/report.py:72: # 检查是否已有报告 +backend/app/api/report.py:87: # 获取项目信息 +backend/app/api/report.py:109: # 提前生成 report_id,以便立即返回给前端 +backend/app/api/report.py:113: # 创建异步任务 +backend/app/api/report.py:127: # 定义后台任务 +backend/app/api/report.py:138: # 创建Report Agent +backend/app/api/report.py:145: # 进度回调 +backend/app/api/report.py:153: # 生成报告(传入预先生成的 report_id) +backend/app/api/report.py:159: # 保存报告 +backend/app/api/report.py:178: # 启动后台线程 +backend/app/api/report.py:206: 查询报告生成任务进度 +backend/app/api/report.py:208: 请求(JSON): +backend/app/api/report.py:210: "task_id": "task_xxxx", // 可选,generate返回的task_id +backend/app/api/report.py:211: "simulation_id": "sim_xxxx" // 可选,模拟ID +backend/app/api/report.py:214: 返回: +backend/app/api/report.py:231: # 如果提供了simulation_id,先检查是否已有完成的报告 +backend/app/api/report.py:275:# ============== 报告获取接口 ============== +backend/app/api/report.py:280: 获取报告详情 +backend/app/api/report.py:282: 返回: +backend/app/api/report.py:322: 根据模拟ID获取报告 +backend/app/api/report.py:324: 返回: +backend/app/api/report.py:361: 列出所有报告 +backend/app/api/report.py:363: Query参数: +backend/app/api/report.py:364: simulation_id: 按模拟ID过滤(可选) +backend/app/api/report.py:365: limit: 返回数量限制(默认50) +backend/app/api/report.py:367: 返回: 
+backend/app/api/report.py:401: 下载报告(Markdown格式) +backend/app/api/report.py:403: 返回Markdown文件 +backend/app/api/report.py:417: # 如果MD文件不存在,生成一个临时文件 +backend/app/api/report.py:446: """删除报告""" +backend/app/api/report.py:470:# ============== Report Agent对话接口 ============== +backend/app/api/report.py:475: 与Report Agent对话 +backend/app/api/report.py:477: Report Agent可以在对话中自主调用检索工具来回答问题 +backend/app/api/report.py:479: 请求(JSON): +backend/app/api/report.py:481: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/report.py:482: "message": "请解释一下舆情走向", // 必填,用户消息 +backend/app/api/report.py:483: "chat_history": [ // 可选,对话历史 +backend/app/api/report.py:489: 返回: +backend/app/api/report.py:493: "response": "Agent回复...", +backend/app/api/report.py:494: "tool_calls": [调用的工具列表], +backend/app/api/report.py:495: "sources": [信息来源] +backend/app/api/report.py:518: # 获取模拟和项目信息 +backend/app/api/report.py:544: # 创建Agent并进行对话 +backend/app/api/report.py:567:# ============== 报告进度与分章节接口 ============== +backend/app/api/report.py:572: 获取报告生成进度(实时) +backend/app/api/report.py:574: 返回: +backend/app/api/report.py:580: "message": "正在生成章节: 关键发现", +backend/app/api/report.py:581: "current_section": "关键发现", +backend/app/api/report.py:582: "completed_sections": ["执行摘要", "模拟背景"], +backend/app/api/report.py:613: 获取已生成的章节列表(分章节输出) +backend/app/api/report.py:615: 前端可以轮询此接口获取已生成的章节内容,无需等待整个报告完成 +backend/app/api/report.py:617: 返回: +backend/app/api/report.py:626: "content": "## 执行摘要\\n\\n..." +backend/app/api/report.py:638: # 获取报告状态 +backend/app/api/report.py:664: 获取单个章节内容 +backend/app/api/report.py:666: 返回: +backend/app/api/report.py:671: "content": "## 执行摘要\\n\\n..." 
+backend/app/api/report.py:705:# ============== 报告状态检查接口 ============== +backend/app/api/report.py:710: 检查模拟是否有报告,以及报告状态 +backend/app/api/report.py:712: 用于前端判断是否解锁Interview功能 +backend/app/api/report.py:714: 返回: +backend/app/api/report.py:733: # 只有报告完成后才解锁interview +backend/app/api/report.py:756:# ============== Agent 日志接口 ============== +backend/app/api/report.py:761: 获取 Report Agent 的详细执行日志 +backend/app/api/report.py:763: 实时获取报告生成过程中的每一步动作,包括: +backend/app/api/report.py:764: - 报告开始、规划开始/完成 +backend/app/api/report.py:765: - 每个章节的开始、工具调用、LLM响应、完成 +backend/app/api/report.py:766: - 报告完成或失败 +backend/app/api/report.py:768: Query参数: +backend/app/api/report.py:769: from_line: 从第几行开始读取(可选,默认0,用于增量获取) +backend/app/api/report.py:771: 返回: +backend/app/api/report.py:782: "section_title": "执行摘要", +backend/app/api/report.py:820: 获取完整的 Agent 日志(一次性获取全部) +backend/app/api/report.py:822: 返回: +backend/app/api/report.py:851:# ============== 控制台日志接口 ============== +backend/app/api/report.py:856: 获取 Report Agent 的控制台输出日志 +backend/app/api/report.py:858: 实时获取报告生成过程中的控制台输出(INFO、WARNING等), +backend/app/api/report.py:859: 这与 agent-log 接口返回的结构化 JSON 日志不同, +backend/app/api/report.py:860: 是纯文本格式的控制台风格日志。 +backend/app/api/report.py:862: Query参数: +backend/app/api/report.py:863: from_line: 从第几行开始读取(可选,默认0,用于增量获取) +backend/app/api/report.py:865: 返回: +backend/app/api/report.py:870: "[19:46:14] INFO: 搜索完成: 找到 15 条相关事实", +backend/app/api/report.py:871: "[19:46:14] INFO: 图谱搜索: graph_id=xxx, query=...", +backend/app/api/report.py:902: 获取完整的控制台日志(一次性获取全部) +backend/app/api/report.py:904: 返回: +backend/app/api/report.py:933:# ============== 工具调用接口(供调试使用)============== +backend/app/api/report.py:938: 图谱搜索工具接口(供调试使用) +backend/app/api/report.py:940: 请求(JSON): +backend/app/api/report.py:943: "query": "搜索查询", +backend/app/api/report.py:986: 图谱统计工具接口(供调试使用) +backend/app/api/report.py:988: 请求(JSON): +backend/app/api/simulation.py:2:模拟相关API路由 +backend/app/api/simulation.py:3:Step2: Zep实体读取与过滤、OASIS模拟准备与运行(全程自动化) 
+backend/app/api/simulation.py:23:# Interview prompt 优化前缀 +backend/app/api/simulation.py:24:# 添加此前缀可以避免Agent调用工具,直接用文本回复 +backend/app/api/simulation.py:25:INTERVIEW_PROMPT_PREFIX = "结合你的人设、所有的过往记忆与行动,不调用任何工具直接用文本回复我:" +backend/app/api/simulation.py:30: 优化Interview提问,添加前缀避免Agent调用工具 +backend/app/api/simulation.py:33: prompt: 原始提问 +backend/app/api/simulation.py:36: 优化后的提问 +backend/app/api/simulation.py:40: # 避免重复添加前缀 +backend/app/api/simulation.py:46:# ============== 实体读取接口 ============== +backend/app/api/simulation.py:51: 获取图谱中的所有实体(已过滤) +backend/app/api/simulation.py:53: 只返回符合预定义实体类型的节点(Labels不只是Entity的节点) +backend/app/api/simulation.py:55: Query参数: +backend/app/api/simulation.py:56: entity_types: 逗号分隔的实体类型列表(可选,用于进一步过滤) +backend/app/api/simulation.py:57: enrich: 是否获取相关边信息(默认true) +backend/app/api/simulation.py:95: """获取单个实体的详细信息""" +backend/app/api/simulation.py:128: """获取指定类型的所有实体""" +backend/app/api/simulation.py:163:# ============== 模拟管理接口 ============== +backend/app/api/simulation.py:168: 创建新的模拟 +backend/app/api/simulation.py:170: 注意:max_rounds等参数由LLM智能生成,无需手动设置 +backend/app/api/simulation.py:172: 请求(JSON): +backend/app/api/simulation.py:174: "project_id": "proj_xxxx", // 必填 +backend/app/api/simulation.py:175: "graph_id": "mirofish_xxxx", // 可选,如不提供则从project获取 +backend/app/api/simulation.py:176: "enable_twitter": true, // 可选,默认true +backend/app/api/simulation.py:177: "enable_reddit": true // 可选,默认true +backend/app/api/simulation.py:180: 返回: +backend/app/api/simulation.py:242: 检查模拟是否已经准备完成 +backend/app/api/simulation.py:244: 检查条件: +backend/app/api/simulation.py:245: 1. state.json 存在且 status 为 "ready" +backend/app/api/simulation.py:246: 2. 
必要文件存在:reddit_profiles.json, twitter_profiles.csv, simulation_config.json +backend/app/api/simulation.py:248: 注意:运行脚本(run_*.py)保留在 backend/scripts/ 目录,不再复制到模拟目录 +backend/app/api/simulation.py:251: simulation_id: 模拟ID +backend/app/api/simulation.py:261: # 检查目录是否存在 +backend/app/api/simulation.py:263: return False, {"reason": "模拟目录不存在"} +backend/app/api/simulation.py:265: # 必要文件列表(不包括脚本,脚本位于 backend/scripts/) +backend/app/api/simulation.py:273: # 检查文件是否存在 +backend/app/api/simulation.py:285: "reason": "缺少必要文件", +backend/app/api/simulation.py:290: # 检查state.json中的状态 +backend/app/api/simulation.py:300: # 详细日志 +backend/app/api/simulation.py:303: # 如果 config_generated=True 且文件存在,认为准备完成 +backend/app/api/simulation.py:304: # 以下状态都说明准备工作已完成: +backend/app/api/simulation.py:305: # - ready: 准备完成,可以运行 +backend/app/api/simulation.py:306: # - preparing: 如果 config_generated=True 说明已完成 +backend/app/api/simulation.py:307: # - running: 正在运行,说明准备早就完成了 +backend/app/api/simulation.py:308: # - completed: 运行完成,说明准备早就完成了 +backend/app/api/simulation.py:309: # - stopped: 已停止,说明准备早就完成了 +backend/app/api/simulation.py:310: # - failed: 运行失败(但准备是完成的) +backend/app/api/simulation.py:313: # 获取文件统计信息 +backend/app/api/simulation.py:323: # 如果状态是preparing但文件已完成,自动更新状态为ready +backend/app/api/simulation.py:350: "reason": f"状态不在已准备列表中或config_generated为false: status={status}, config_generated={config_generated}", +backend/app/api/simulation.py:356: return False, {"reason": f"读取状态文件失败: {str(e)}"} +backend/app/api/simulation.py:362: 准备模拟环境(异步任务,LLM智能生成所有参数) +backend/app/api/simulation.py:364: 这是一个耗时操作,接口会立即返回task_id, +backend/app/api/simulation.py:365: 使用 GET /api/simulation/prepare/status 查询进度 +backend/app/api/simulation.py:367: 特性: +backend/app/api/simulation.py:368: - 自动检测已完成的准备工作,避免重复生成 +backend/app/api/simulation.py:369: - 如果已准备完成,直接返回已有结果 +backend/app/api/simulation.py:370: - 支持强制重新生成(force_regenerate=true) +backend/app/api/simulation.py:372: 步骤: +backend/app/api/simulation.py:373: 1. 
检查是否已有完成的准备工作 +backend/app/api/simulation.py:374: 2. 从Zep图谱读取并过滤实体 +backend/app/api/simulation.py:375: 3. 为每个实体生成OASIS Agent Profile(带重试机制) +backend/app/api/simulation.py:376: 4. LLM智能生成模拟配置(带重试机制) +backend/app/api/simulation.py:377: 5. 保存配置文件和预设脚本 +backend/app/api/simulation.py:379: 请求(JSON): +backend/app/api/simulation.py:381: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:382: "entity_types": ["Student", "PublicFigure"], // 可选,指定实体类型 +backend/app/api/simulation.py:383: "use_llm_for_profiles": true, // 可选,是否用LLM生成人设 +backend/app/api/simulation.py:384: "parallel_profile_count": 5, // 可选,并行生成人设数量,默认5 +backend/app/api/simulation.py:385: "force_regenerate": false // 可选,强制重新生成,默认false +backend/app/api/simulation.py:388: 返回: +backend/app/api/simulation.py:393: "task_id": "task_xxxx", // 新任务时返回 +backend/app/api/simulation.py:395: "message": "准备任务已启动|已有完成的准备工作", +backend/app/api/simulation.py:396: "already_prepared": true|false // 是否已准备完成 +backend/app/api/simulation.py:424: # 检查是否强制重新生成 +backend/app/api/simulation.py:428: # 检查是否已经准备完成(避免重复生成) +backend/app/api/simulation.py:440: "message": "已有完成的准备工作,无需重复生成", +backend/app/api/simulation.py:448: # 从项目获取必要信息 +backend/app/api/simulation.py:456: # 获取模拟需求 +backend/app/api/simulation.py:464: # 获取文档文本 +backend/app/api/simulation.py:471: # ========== 同步获取实体数量(在后台任务启动前) ========== +backend/app/api/simulation.py:472: # 这样前端在调用prepare后立即就能获取到预期Agent总数 +backend/app/api/simulation.py:476: # 快速读取实体(不需要边信息,只统计数量) +backend/app/api/simulation.py:480: enrich_with_edges=False # 不获取边信息,加快速度 +backend/app/api/simulation.py:482: # 保存实体数量到状态(供前端立即获取) +backend/app/api/simulation.py:488: # 失败不影响后续流程,后台任务会重新获取 +backend/app/api/simulation.py:490: # 创建异步任务 +backend/app/api/simulation.py:500: # 更新模拟状态(包含预先获取的实体数量) +backend/app/api/simulation.py:504: # 定义后台任务 +backend/app/api/simulation.py:511: message="开始准备模拟环境..." 
+backend/app/api/simulation.py:514: # 准备模拟(带进度回调) +backend/app/api/simulation.py:515: # 存储阶段进度详情 +backend/app/api/simulation.py:519: # 计算总进度 +backend/app/api/simulation.py:530: # 构建详细进度信息 +backend/app/api/simulation.py:532: "reading": "读取图谱实体", +backend/app/api/simulation.py:533: "generating_profiles": "生成Agent人设", +backend/app/api/simulation.py:534: "generating_config": "生成模拟配置", +backend/app/api/simulation.py:535: "copying_scripts": "准备模拟脚本" +backend/app/api/simulation.py:541: # 更新阶段详情 +backend/app/api/simulation.py:550: # 构建详细进度信息 +backend/app/api/simulation.py:563: # 构建简洁消息 +backend/app/api/simulation.py:589: # 任务完成 +backend/app/api/simulation.py:599: # 更新模拟状态为失败 +backend/app/api/simulation.py:606: # 启动后台线程 +backend/app/api/simulation.py:616: "message": "准备任务已启动,请通过 /api/simulation/prepare/status 查询进度", +backend/app/api/simulation.py:618: "expected_entities_count": state.entities_count, # 预期的Agent总数 +backend/app/api/simulation.py:619: "entity_types": state.entity_types # 实体类型列表 +backend/app/api/simulation.py:641: 查询准备任务进度 +backend/app/api/simulation.py:643: 支持两种查询方式: +backend/app/api/simulation.py:644: 1. 通过task_id查询正在进行的任务进度 +backend/app/api/simulation.py:645: 2. 
通过simulation_id检查是否已有完成的准备工作 +backend/app/api/simulation.py:647: 请求(JSON): +backend/app/api/simulation.py:649: "task_id": "task_xxxx", // 可选,prepare返回的task_id +backend/app/api/simulation.py:650: "simulation_id": "sim_xxxx" // 可选,模拟ID(用于检查已完成的准备) +backend/app/api/simulation.py:653: 返回: +backend/app/api/simulation.py:661: "already_prepared": true|false, // 是否已有完成的准备 +backend/app/api/simulation.py:662: "prepare_info": {...} // 已准备完成时的详细信息 +backend/app/api/simulation.py:674: # 如果提供了simulation_id,先检查是否已准备完成 +backend/app/api/simulation.py:684: "message": "已有完成的准备工作", +backend/app/api/simulation.py:690: # 如果没有task_id,返回错误 +backend/app/api/simulation.py:693: # 有simulation_id但未准备完成 +backend/app/api/simulation.py:700: "message": "尚未开始准备,请调用 /api/simulation/prepare 开始", +backend/app/api/simulation.py:713: # 任务不存在,但如果有simulation_id,检查是否已准备完成 +backend/app/api/simulation.py:724: "message": "任务已完成(准备工作已存在)", +backend/app/api/simulation.py:753: """获取模拟状态""" +backend/app/api/simulation.py:766: # 如果模拟已准备好,附加运行说明 +backend/app/api/simulation.py:787: 列出所有模拟 +backend/app/api/simulation.py:789: Query参数: +backend/app/api/simulation.py:790: project_id: 按项目ID过滤(可选) +backend/app/api/simulation.py:815: 获取 simulation 对应的最新 report_id +backend/app/api/simulation.py:817: 遍历 reports 目录,找出 simulation_id 匹配的 report, +backend/app/api/simulation.py:818: 如果有多个则返回最新的(按 created_at 排序) +backend/app/api/simulation.py:821: simulation_id: 模拟ID +backend/app/api/simulation.py:824: report_id 或 None +backend/app/api/simulation.py:829: # reports 目录路径:backend/uploads/reports +backend/app/api/simulation.py:830: # __file__ 是 app/api/simulation.py,需要向上两级到 backend/ +backend/app/api/simulation.py:863: # 按创建时间倒序排序,返回最新的 +backend/app/api/simulation.py:875: 获取历史模拟列表(带项目详情) +backend/app/api/simulation.py:877: 用于首页历史项目展示,返回包含项目名称、描述等丰富信息的模拟列表 +backend/app/api/simulation.py:879: Query参数: +backend/app/api/simulation.py:880: limit: 返回数量限制(默认20) +backend/app/api/simulation.py:882: 返回: +backend/app/api/simulation.py:889: 
"project_name": "武大舆情分析", +backend/app/api/simulation.py:890: "simulation_requirement": "如果武汉大学发布...", +backend/app/api/simulation.py:913: # 增强模拟数据,只从 Simulation 文件读取 +backend/app/api/simulation.py:918: # 获取模拟配置信息(从 simulation_config.json 读取 simulation_requirement) +backend/app/api/simulation.py:924: # 推荐轮数(后备值) +backend/app/api/simulation.py:934: # 获取运行状态(从 run_state.json 读取用户设置的实际轮数) +backend/app/api/simulation.py:939: # 使用用户设置的 total_rounds,若无则使用推荐轮数 +backend/app/api/simulation.py:946: # 获取关联项目的文件列表(最多3个) +backend/app/api/simulation.py:950: {"filename": f.get("filename", "未知文件")} +backend/app/api/simulation.py:956: # 获取关联的 report_id(查找该 simulation 最新的 report) +backend/app/api/simulation.py:959: # 添加版本号 +backend/app/api/simulation.py:962: # 格式化日期 +backend/app/api/simulation.py:989: 获取模拟的Agent Profile +backend/app/api/simulation.py:991: Query参数: +backend/app/api/simulation.py:992: platform: 平台类型(reddit/twitter,默认reddit) +backend/app/api/simulation.py:1027: 实时获取模拟的Agent Profile(用于在生成过程中实时查看进度) +backend/app/api/simulation.py:1029: 与 /profiles 接口的区别: +backend/app/api/simulation.py:1030: - 直接读取文件,不经过 SimulationManager +backend/app/api/simulation.py:1031: - 适用于生成过程中的实时查看 +backend/app/api/simulation.py:1032: - 返回额外的元数据(如文件修改时间、是否正在生成等) +backend/app/api/simulation.py:1034: Query参数: +backend/app/api/simulation.py:1035: platform: 平台类型(reddit/twitter,默认reddit) +backend/app/api/simulation.py:1037: 返回: +backend/app/api/simulation.py:1044: "total_expected": 93, // 预期总数(如果有) +backend/app/api/simulation.py:1045: "is_generating": true, // 是否正在生成 +backend/app/api/simulation.py:1059: # 获取模拟目录 +backend/app/api/simulation.py:1068: # 确定文件路径 +backend/app/api/simulation.py:1074: # 检查文件是否存在 +backend/app/api/simulation.py:1080: # 获取文件修改时间 +backend/app/api/simulation.py:1096: # 检查是否正在生成(通过 state.json 判断) +backend/app/api/simulation.py:1137: 实时获取模拟配置(用于在生成过程中实时查看进度) +backend/app/api/simulation.py:1139: 与 /config 接口的区别: +backend/app/api/simulation.py:1140: - 直接读取文件,不经过 SimulationManager 
+backend/app/api/simulation.py:1141: - 适用于生成过程中的实时查看 +backend/app/api/simulation.py:1142: - 返回额外的元数据(如文件修改时间、是否正在生成等) +backend/app/api/simulation.py:1143: - 即使配置还没生成完也能返回部分信息 +backend/app/api/simulation.py:1145: 返回: +backend/app/api/simulation.py:1152: "is_generating": true, // 是否正在生成 +backend/app/api/simulation.py:1153: "generation_stage": "generating_config", // 当前生成阶段 +backend/app/api/simulation.py:1154: "config": {...} // 配置内容(如果存在) +backend/app/api/simulation.py:1162: # 获取模拟目录 +backend/app/api/simulation.py:1171: # 配置文件路径 +backend/app/api/simulation.py:1174: # 检查文件是否存在 +backend/app/api/simulation.py:1180: # 获取文件修改时间 +backend/app/api/simulation.py:1191: # 检查是否正在生成(通过 state.json 判断) +backend/app/api/simulation.py:1205: # 判断当前阶段 +backend/app/api/simulation.py:1216: # 构建返回数据 +backend/app/api/simulation.py:1227: # 如果配置存在,提取一些关键统计信息 +backend/app/api/simulation.py:1257: 获取模拟配置(LLM智能生成的完整配置) +backend/app/api/simulation.py:1259: 返回包含: +backend/app/api/simulation.py:1260: - time_config: 时间配置(模拟时长、轮次、高峰/低谷时段) +backend/app/api/simulation.py:1261: - agent_configs: 每个Agent的活动配置(活跃度、发言频率、立场等) +backend/app/api/simulation.py:1262: - event_config: 事件配置(初始帖子、热点话题) +backend/app/api/simulation.py:1263: - platform_configs: 平台配置 +backend/app/api/simulation.py:1264: - generation_reasoning: LLM的配置推理说明 +backend/app/api/simulation.py:1292: """下载模拟配置文件""" +backend/app/api/simulation.py:1322: 下载模拟运行脚本文件(通用脚本,位于 backend/scripts/) +backend/app/api/simulation.py:1324: script_name可选值: +backend/app/api/simulation.py:1331: # 脚本位于 backend/scripts/ 目录 +backend/app/api/simulation.py:1334: # 验证脚本名称 +backend/app/api/simulation.py:1371:# ============== Profile生成接口(独立使用) ============== +backend/app/api/simulation.py:1376: 直接从图谱生成OASIS Agent Profile(不创建模拟) +backend/app/api/simulation.py:1378: 请求(JSON): +backend/app/api/simulation.py:1380: "graph_id": "mirofish_xxxx", // 必填 +backend/app/api/simulation.py:1381: "entity_types": ["Student"], // 可选 +backend/app/api/simulation.py:1382: "use_llm": true, // 可选 
+backend/app/api/simulation.py:1383: "platform": "reddit" // 可选 +backend/app/api/simulation.py:1445:# ============== 模拟运行控制接口 ============== +backend/app/api/simulation.py:1450: 开始运行模拟 +backend/app/api/simulation.py:1452: 请求(JSON): +backend/app/api/simulation.py:1454: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:1455: "platform": "parallel", // 可选: twitter / reddit / parallel (默认) +backend/app/api/simulation.py:1456: "max_rounds": 100, // 可选: 最大模拟轮数,用于截断过长的模拟 +backend/app/api/simulation.py:1457: "enable_graph_memory_update": false, // 可选: 是否将Agent活动动态更新到Zep图谱记忆 +backend/app/api/simulation.py:1458: "force": false // 可选: 强制重新开始(会停止运行中的模拟并清理日志) +backend/app/api/simulation.py:1461: 关于 force 参数: +backend/app/api/simulation.py:1462: - 启用后,如果模拟正在运行或已完成,会先停止并清理运行日志 +backend/app/api/simulation.py:1463: - 清理的内容包括:run_state.json, actions.jsonl, simulation.log 等 +backend/app/api/simulation.py:1464: - 不会清理配置文件(simulation_config.json)和 profile 文件 +backend/app/api/simulation.py:1465: - 适用于需要重新运行模拟的场景 +backend/app/api/simulation.py:1467: 关于 enable_graph_memory_update: +backend/app/api/simulation.py:1468: - 启用后,模拟中所有Agent的活动(发帖、评论、点赞等)都会实时更新到Zep图谱 +backend/app/api/simulation.py:1469: - 这可以让图谱"记住"模拟过程,用于后续分析或AI对话 +backend/app/api/simulation.py:1470: - 需要模拟关联的项目有有效的 graph_id +backend/app/api/simulation.py:1471: - 采用批量更新机制,减少API调用次数 +backend/app/api/simulation.py:1473: 返回: +backend/app/api/simulation.py:1483: "graph_memory_update_enabled": true, // 是否启用了图谱记忆更新 +backend/app/api/simulation.py:1484: "force_restarted": true // 是否是强制重新开始 +backend/app/api/simulation.py:1499: max_rounds = data.get('max_rounds') # 可选:最大模拟轮数 +backend/app/api/simulation.py:1500: enable_graph_memory_update = data.get('enable_graph_memory_update', False) # 可选:是否启用图谱记忆更新 +backend/app/api/simulation.py:1501: force = data.get('force', False) # 可选:强制重新开始 +backend/app/api/simulation.py:1503: # 验证 max_rounds 参数 +backend/app/api/simulation.py:1524: # 检查模拟是否已准备好 
+backend/app/api/simulation.py:1536: # 智能处理状态:如果准备工作已完成,允许重新启动 +backend/app/api/simulation.py:1538: # 检查准备工作是否已完成 +backend/app/api/simulation.py:1542: # 准备工作已完成,检查是否有正在运行的进程 +backend/app/api/simulation.py:1544: # 检查模拟进程是否真的在运行 +backend/app/api/simulation.py:1547: # 进程确实在运行 +backend/app/api/simulation.py:1549: # 强制模式:停止运行中的模拟 +backend/app/api/simulation.py:1561: # 如果是强制模式,清理运行日志 +backend/app/api/simulation.py:1569: # 进程不存在或已结束,重置状态为 ready +backend/app/api/simulation.py:1574: # 准备工作未完成 +backend/app/api/simulation.py:1580: # 获取图谱ID(用于图谱记忆更新) +backend/app/api/simulation.py:1583: # 从模拟状态或项目中获取 graph_id +backend/app/api/simulation.py:1586: # 尝试从项目中获取 +backend/app/api/simulation.py:1599: # 启动模拟 +backend/app/api/simulation.py:1608: # 更新模拟状态 +backend/app/api/simulation.py:1643: 停止模拟 +backend/app/api/simulation.py:1645: 请求(JSON): +backend/app/api/simulation.py:1647: "simulation_id": "sim_xxxx" // 必填,模拟ID +backend/app/api/simulation.py:1650: 返回: +backend/app/api/simulation.py:1672: # 更新模拟状态 +backend/app/api/simulation.py:1699:# ============== 实时状态监控接口 ============== +backend/app/api/simulation.py:1704: 获取模拟运行实时状态(用于前端轮询) +backend/app/api/simulation.py:1706: 返回: +backend/app/api/simulation.py:1762: 获取模拟运行详细状态(包含所有动作) +backend/app/api/simulation.py:1764: 用于前端展示实时动态 +backend/app/api/simulation.py:1766: Query参数: +backend/app/api/simulation.py:1767: platform: 过滤平台(twitter/reddit,可选) +backend/app/api/simulation.py:1769: 返回: +backend/app/api/simulation.py:1791: "twitter_actions": [...], # Twitter 平台的所有动作 +backend/app/api/simulation.py:1792: "reddit_actions": [...] 
# Reddit 平台的所有动作 +backend/app/api/simulation.py:1812: # 获取完整的动作列表 +backend/app/api/simulation.py:1818: # 分平台获取动作 +backend/app/api/simulation.py:1829: # 获取当前轮次的动作(recent_actions 只展示最新一轮) +backend/app/api/simulation.py:1837: # 获取基础状态信息 +backend/app/api/simulation.py:1843: # recent_actions 只展示当前最新一轮两个平台的内容 +backend/app/api/simulation.py:1863: 获取模拟中的Agent动作历史 +backend/app/api/simulation.py:1865: Query参数: +backend/app/api/simulation.py:1866: limit: 返回数量(默认100) +backend/app/api/simulation.py:1867: offset: 偏移量(默认0) +backend/app/api/simulation.py:1868: platform: 过滤平台(twitter/reddit) +backend/app/api/simulation.py:1869: agent_id: 过滤Agent ID +backend/app/api/simulation.py:1870: round_num: 过滤轮次 +backend/app/api/simulation.py:1872: 返回: +backend/app/api/simulation.py:1917: 获取模拟时间线(按轮次汇总) +backend/app/api/simulation.py:1919: 用于前端展示进度条和时间线视图 +backend/app/api/simulation.py:1921: Query参数: +backend/app/api/simulation.py:1922: start_round: 起始轮次(默认0) +backend/app/api/simulation.py:1923: end_round: 结束轮次(默认全部) +backend/app/api/simulation.py:1925: 返回每轮的汇总信息 +backend/app/api/simulation.py:1957: 获取每个Agent的统计信息 +backend/app/api/simulation.py:1959: 用于前端展示Agent活跃度排行、动作分布等 +backend/app/api/simulation.py:1981:# ============== 数据库查询接口 ============== +backend/app/api/simulation.py:1986: 获取模拟中的帖子 +backend/app/api/simulation.py:1988: Query参数: +backend/app/api/simulation.py:1989: platform: 平台类型(twitter/reddit) +backend/app/api/simulation.py:1990: limit: 返回数量(默认50) +backend/app/api/simulation.py:1991: offset: 偏移量 +backend/app/api/simulation.py:1993: 返回帖子列表(从SQLite数据库读取) +backend/app/api/simulation.py:2015: "message": "数据库不存在,模拟可能尚未运行" +backend/app/api/simulation.py:2064: 获取模拟中的评论(仅Reddit) +backend/app/api/simulation.py:2066: Query参数: +backend/app/api/simulation.py:2067: post_id: 过滤帖子ID(可选) +backend/app/api/simulation.py:2068: limit: 返回数量 +backend/app/api/simulation.py:2069: offset: 偏移量 +backend/app/api/simulation.py:2136:# ============== Interview 采访接口 ============== 
+backend/app/api/simulation.py:2141: 采访单个Agent +backend/app/api/simulation.py:2143: 注意:此功能需要模拟环境处于运行状态(完成模拟循环后进入等待命令模式) +backend/app/api/simulation.py:2145: 请求(JSON): +backend/app/api/simulation.py:2147: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:2148: "agent_id": 0, // 必填,Agent ID +backend/app/api/simulation.py:2149: "prompt": "你对这件事有什么看法?", // 必填,采访问题 +backend/app/api/simulation.py:2150: "platform": "twitter", // 可选,指定平台(twitter/reddit) +backend/app/api/simulation.py:2151: // 不指定时:双平台模拟同时采访两个平台 +backend/app/api/simulation.py:2152: "timeout": 60 // 可选,超时时间(秒),默认60 +backend/app/api/simulation.py:2155: 返回(不指定platform,双平台模式): +backend/app/api/simulation.py:2160: "prompt": "你对这件事有什么看法?", +backend/app/api/simulation.py:2173: 返回(指定platform): +backend/app/api/simulation.py:2178: "prompt": "你对这件事有什么看法?", +backend/app/api/simulation.py:2181: "response": "我认为...", +backend/app/api/simulation.py:2195: platform = data.get('platform') # 可选:twitter/reddit/None +backend/app/api/simulation.py:2216: # 验证platform参数 +backend/app/api/simulation.py:2223: # 检查环境状态 +backend/app/api/simulation.py:2230: # 优化prompt,添加前缀避免Agent调用工具 +backend/app/api/simulation.py:2270: 批量采访多个Agent +backend/app/api/simulation.py:2272: 注意:此功能需要模拟环境处于运行状态 +backend/app/api/simulation.py:2274: 请求(JSON): +backend/app/api/simulation.py:2276: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:2277: "interviews": [ // 必填,采访列表 +backend/app/api/simulation.py:2280: "prompt": "你对A有什么看法?", +backend/app/api/simulation.py:2281: "platform": "twitter" // 可选,指定该Agent的采访平台 +backend/app/api/simulation.py:2285: "prompt": "你对B有什么看法?" 
// 不指定platform则使用默认值 +backend/app/api/simulation.py:2288: "platform": "reddit", // 可选,默认平台(被每项的platform覆盖) +backend/app/api/simulation.py:2289: // 不指定时:双平台模拟每个Agent同时采访两个平台 +backend/app/api/simulation.py:2290: "timeout": 120 // 可选,超时时间(秒),默认120 +backend/app/api/simulation.py:2293: 返回: +backend/app/api/simulation.py:2316: platform = data.get('platform') # 可选:twitter/reddit/None +backend/app/api/simulation.py:2331: # 验证platform参数 +backend/app/api/simulation.py:2338: # 验证每个采访项 +backend/app/api/simulation.py:2350: # 验证每项的platform(如果有) +backend/app/api/simulation.py:2358: # 检查环境状态 +backend/app/api/simulation.py:2365: # 优化每个采访项的prompt,添加前缀避免Agent调用工具 +backend/app/api/simulation.py:2408: 全局采访 - 使用相同问题采访所有Agent +backend/app/api/simulation.py:2410: 注意:此功能需要模拟环境处于运行状态 +backend/app/api/simulation.py:2412: 请求(JSON): +backend/app/api/simulation.py:2414: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:2415: "prompt": "你对这件事整体有什么看法?", // 必填,采访问题(所有Agent使用相同问题) +backend/app/api/simulation.py:2416: "platform": "reddit", // 可选,指定平台(twitter/reddit) +backend/app/api/simulation.py:2417: // 不指定时:双平台模拟每个Agent同时采访两个平台 +backend/app/api/simulation.py:2418: "timeout": 180 // 可选,超时时间(秒),默认180 +backend/app/api/simulation.py:2421: 返回: +backend/app/api/simulation.py:2443: platform = data.get('platform') # 可选:twitter/reddit/None +backend/app/api/simulation.py:2458: # 验证platform参数 +backend/app/api/simulation.py:2465: # 检查环境状态 +backend/app/api/simulation.py:2472: # 优化prompt,添加前缀避免Agent调用工具 +backend/app/api/simulation.py:2511: 获取Interview历史记录 +backend/app/api/simulation.py:2513: 从模拟数据库中读取所有Interview记录 +backend/app/api/simulation.py:2515: 请求(JSON): +backend/app/api/simulation.py:2517: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:2518: "platform": "reddit", // 可选,平台类型(reddit/twitter) +backend/app/api/simulation.py:2519: // 不指定则返回两个平台的所有历史 +backend/app/api/simulation.py:2520: "agent_id": 0, // 可选,只获取该Agent的采访历史 +backend/app/api/simulation.py:2521: 
"limit": 100 // 可选,返回数量,默认100 +backend/app/api/simulation.py:2524: 返回: +backend/app/api/simulation.py:2532: "response": "我认为...", +backend/app/api/simulation.py:2533: "prompt": "你对这件事有什么看法?", +backend/app/api/simulation.py:2546: platform = data.get('platform') # 不指定则返回两个平台的历史 +backend/app/api/simulation.py:2583: 获取模拟环境状态 +backend/app/api/simulation.py:2585: 检查模拟环境是否存活(可以接收Interview命令) +backend/app/api/simulation.py:2587: 请求(JSON): +backend/app/api/simulation.py:2589: "simulation_id": "sim_xxxx" // 必填,模拟ID +backend/app/api/simulation.py:2592: 返回: +backend/app/api/simulation.py:2600: "message": "环境正在运行,可以接收Interview命令" +backend/app/api/simulation.py:2617: # 获取更详细的状态信息 +backend/app/api/simulation.py:2621: message = "环境正在运行,可以接收Interview命令" +backend/app/api/simulation.py:2623: message = "环境未运行或已关闭" +backend/app/api/simulation.py:2648: 关闭模拟环境 +backend/app/api/simulation.py:2650: 向模拟发送关闭环境命令,使其优雅退出等待命令模式。 +backend/app/api/simulation.py:2652: 注意:这不同于 /stop 接口,/stop 会强制终止进程, +backend/app/api/simulation.py:2653: 而此接口会让模拟优雅地关闭环境并退出。 +backend/app/api/simulation.py:2655: 请求(JSON): +backend/app/api/simulation.py:2657: "simulation_id": "sim_xxxx", // 必填,模拟ID +backend/app/api/simulation.py:2658: "timeout": 30 // 可选,超时时间(秒),默认30 +backend/app/api/simulation.py:2661: 返回: +backend/app/api/simulation.py:2665: "message": "环境关闭命令已发送", +backend/app/api/simulation.py:2688: # 更新模拟状态 +backend/app/config.py:2:配置管理 +backend/app/config.py:3:统一从项目根目录的 .env 文件加载配置 +backend/app/config.py:9:# 加载项目根目录的 .env 文件 +backend/app/config.py:10:# 路径: MiroFish/.env (相对于 backend/app/config.py) +backend/app/config.py:16: # 如果根目录没有 .env,尝试加载环境变量(用于生产环境) +backend/app/config.py:21: """Flask配置类""" +backend/app/config.py:23: # Flask配置 +backend/app/config.py:27: # JSON配置 - 禁用ASCII转义,让中文直接显示(而不是 \uXXXX 格式) +backend/app/config.py:30: # LLM配置(统一使用OpenAI格式) +backend/app/config.py:35: # Neo4j + Graphiti配置(替代 Zep Cloud) +backend/app/config.py:53: # Zep配置(保留兼容性,已废弃) +backend/app/config.py:56: # 文件上传配置 
+backend/app/config.py:61: # 文本处理配置 +backend/app/config.py:62: DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 +backend/app/config.py:63: DEFAULT_CHUNK_OVERLAP = 50 # 默认重叠大小 +backend/app/config.py:65: # OASIS模拟配置 +backend/app/config.py:69: # OASIS平台可用动作配置 +backend/app/config.py:79: # Report Agent配置 +backend/app/config.py:86: """验证必要配置""" +backend/app/config.py:89: errors.append("LLM_API_KEY 未配置") +backend/app/config.py:91: errors.append("NEO4J_PASSWORD 未配置") +backend/app/models/__init__.py:2:数据模型模块 +backend/app/models/project.py:2:项目上下文管理 +backend/app/models/project.py:3:用于在服务端持久化项目状态,避免前端在接口间传递大量数据 +backend/app/models/project.py:18: """项目状态""" +backend/app/models/project.py:19: CREATED = "created" # 刚创建,文件已上传 +backend/app/models/project.py:20: ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成 +backend/app/models/project.py:21: GRAPH_BUILDING = "graph_building" # 图谱构建中 +backend/app/models/project.py:22: GRAPH_COMPLETED = "graph_completed" # 图谱构建完成 +backend/app/models/project.py:23: FAILED = "failed" # 失败 +backend/app/models/project.py:28: """项目数据模型""" +backend/app/models/project.py:35: # 文件信息 +backend/app/models/project.py:39: # 本体信息(接口1生成后填充) +backend/app/models/project.py:43: # 图谱信息(接口2完成后填充) +backend/app/models/project.py:47: # 配置 +backend/app/models/project.py:52: # 错误信息 +backend/app/models/project.py:56: """转换为字典""" +backend/app/models/project.py:77: """从字典创建""" +backend/app/models/project.py:102: """项目管理器 - 负责项目的持久化存储和检索""" +backend/app/models/project.py:104: # 项目存储根目录 +backend/app/models/project.py:109: """确保项目目录存在""" +backend/app/models/project.py:114: """获取项目目录路径""" +backend/app/models/project.py:119: """获取项目元数据文件路径""" +backend/app/models/project.py:124: """获取项目文件存储目录""" +backend/app/models/project.py:129: """获取项目提取文本存储路径""" +backend/app/models/project.py:135: 创建新项目 +backend/app/models/project.py:138: name: 项目名称 +backend/app/models/project.py:141: 新创建的Project对象 +backend/app/models/project.py:156: # 创建项目目录结构 +backend/app/models/project.py:162: # 保存项目元数据 
+backend/app/models/project.py:169: """保存项目元数据""" +backend/app/models/project.py:179: 获取项目 +backend/app/models/project.py:182: project_id: 项目ID +backend/app/models/project.py:185: Project对象,如果不存在返回None +backend/app/models/project.py:200: 列出所有项目 +backend/app/models/project.py:203: limit: 返回数量限制 +backend/app/models/project.py:206: 项目列表,按创建时间倒序 +backend/app/models/project.py:216: # 按创建时间倒序排序 +backend/app/models/project.py:224: 删除项目及其所有文件 +backend/app/models/project.py:227: project_id: 项目ID +backend/app/models/project.py:230: 是否删除成功 +backend/app/models/project.py:243: 保存上传的文件到项目目录 +backend/app/models/project.py:246: project_id: 项目ID +backend/app/models/project.py:247: file_storage: Flask的FileStorage对象 +backend/app/models/project.py:248: original_filename: 原始文件名 +backend/app/models/project.py:251: 文件信息字典 {filename, path, size} +backend/app/models/project.py:256: # 生成安全的文件名 +backend/app/models/project.py:261: # 保存文件 +backend/app/models/project.py:264: # 获取文件大小 +backend/app/models/project.py:276: """保存提取的文本""" +backend/app/models/project.py:283: """获取提取的文本""" +backend/app/models/project.py:294: """获取项目的所有文件路径""" +backend/app/models/task.py:2:任务状态管理 +backend/app/models/task.py:3:用于跟踪长时间运行的任务(如图谱构建) +backend/app/models/task.py:17: """任务状态枚举""" +backend/app/models/task.py:18: PENDING = "pending" # 等待中 +backend/app/models/task.py:19: PROCESSING = "processing" # 处理中 +backend/app/models/task.py:20: COMPLETED = "completed" # 已完成 +backend/app/models/task.py:21: FAILED = "failed" # 失败 +backend/app/models/task.py:26: """任务数据类""" +backend/app/models/task.py:32: progress: int = 0 # 总进度百分比 0-100 +backend/app/models/task.py:33: message: str = "" # 状态消息 +backend/app/models/task.py:34: result: Optional[Dict] = None # 任务结果 +backend/app/models/task.py:35: error: Optional[str] = None # 错误信息 +backend/app/models/task.py:36: metadata: Dict = field(default_factory=dict) # 额外元数据 +backend/app/models/task.py:37: progress_detail: Dict = field(default_factory=dict) # 详细进度信息 
+backend/app/models/task.py:40: """转换为字典""" +backend/app/models/task.py:58: 任务管理器 +backend/app/models/task.py:59: 线程安全的任务状态管理 +backend/app/models/task.py:66: """单例模式""" +backend/app/models/task.py:77: 创建新任务 +backend/app/models/task.py:80: task_type: 任务类型 +backend/app/models/task.py:81: metadata: 额外元数据 +backend/app/models/task.py:84: 任务ID +backend/app/models/task.py:104: """获取任务""" +backend/app/models/task.py:119: 更新任务状态 +backend/app/models/task.py:122: task_id: 任务ID +backend/app/models/task.py:123: status: 新状态 +backend/app/models/task.py:124: progress: 进度 +backend/app/models/task.py:125: message: 消息 +backend/app/models/task.py:126: result: 结果 +backend/app/models/task.py:127: error: 错误信息 +backend/app/models/task.py:128: progress_detail: 详细进度信息 +backend/app/models/task.py:148: """标记任务完成""" +backend/app/models/task.py:158: """标记任务失败""" +backend/app/models/task.py:167: """列出任务""" +backend/app/models/task.py:175: """清理旧任务""" +backend/app/services/__init__.py:2:业务服务模块 +backend/app/services/graph_builder.py:2:图谱构建服务 +backend/app/services/graph_builder.py:3:接口2:使用Zep API构建Standalone Graph +backend/app/services/graph_builder.py:72: """图谱信息""" +backend/app/services/graph_builder.py:89: 图谱构建服务 +backend/app/services/graph_builder.py:90: 负责调用Zep API构建知识图谱 +backend/app/services/graph_builder.py:107: 异步构建图谱 +backend/app/services/graph_builder.py:110: text: 输入文本 +backend/app/services/graph_builder.py:111: ontology: 本体定义(来自接口1的输出) +backend/app/services/graph_builder.py:112: graph_name: 图谱名称 +backend/app/services/graph_builder.py:113: chunk_size: 文本块大小 +backend/app/services/graph_builder.py:114: chunk_overlap: 块重叠大小 +backend/app/services/graph_builder.py:115: batch_size: 每批发送的块数量 +backend/app/services/graph_builder.py:118: 任务ID +backend/app/services/graph_builder.py:120: # 创建任务 +backend/app/services/graph_builder.py:133: # 在后台线程中执行构建 +backend/app/services/graph_builder.py:154: """图谱构建工作线程""" +backend/app/services/graph_builder.py:164: # 1. 
创建图谱 +backend/app/services/graph_builder.py:172: # 2. 设置本体 +backend/app/services/graph_builder.py:180: # 3. 文本分块 +backend/app/services/graph_builder.py:189: # 4. 分批发送数据 +backend/app/services/graph_builder.py:199: # 5. 等待Zep处理完成 +backend/app/services/graph_builder.py:215: # 6. 获取图谱信息 +backend/app/services/graph_builder.py:224: # 完成 +backend/app/services/graph_builder.py:237: """创建Zep图谱(公开方法)""" +backend/app/services/graph_builder.py:249: """设置图谱本体提示(Graphiti自动提取实体,本体作为提示存储)""" +backend/app/services/graph_builder.py:264: """分批添加文本到图谱,返回所有 episode 的 uuid 列表。 +backend/app/services/graph_builder.py:265: skip_chunks: 跳过已处理的块数(用于断点续传)。""" +backend/app/services/graph_builder.py:282: # 构建episode数据 +backend/app/services/graph_builder.py:288: # 发送到Zep +backend/app/services/graph_builder.py:295: # 收集返回的 episode uuid +backend/app/services/graph_builder.py:302: # 避免请求过快 +backend/app/services/graph_builder.py:318: """等待所有 episode 处理完成(通过查询每个 episode 的 processed 状态)""" +backend/app/services/graph_builder.py:341: # 检查每个 episode 的处理状态 +backend/app/services/graph_builder.py:352: # 忽略单个查询错误,继续 +backend/app/services/graph_builder.py:363: time.sleep(3) # 每3秒检查一次 +backend/app/services/graph_builder.py:369: """获取图谱信息""" +backend/app/services/graph_builder.py:370: # 获取节点(分页) +backend/app/services/graph_builder.py:373: # 获取边(分页) +backend/app/services/graph_builder.py:376: # 统计实体类型 +backend/app/services/graph_builder.py:393: 获取完整图谱数据(包含详细信息) +backend/app/services/graph_builder.py:396: graph_id: 图谱ID +backend/app/services/graph_builder.py:399: 包含nodes和edges的字典,包括时间信息、属性等详细数据 +backend/app/services/graph_builder.py:404: # 创建节点映射用于获取节点名称 +backend/app/services/graph_builder.py:411: # 获取创建时间 +backend/app/services/graph_builder.py:432: # 获取时间信息 +backend/app/services/graph_builder.py:438: # 获取 episodes +backend/app/services/graph_builder.py:445: # 获取 fact_type +backend/app/services/graph_builder.py:474: """删除图谱""" +backend/app/services/oasis_profile_generator.py:2:OASIS Agent Profile生成器 
+backend/app/services/oasis_profile_generator.py:3:将Zep图谱中的实体转换为OASIS模拟平台所需的Agent Profile格式 +backend/app/services/oasis_profile_generator.py:5:优化改进: +backend/app/services/oasis_profile_generator.py:6:1. 调用Zep检索功能二次丰富节点信息 +backend/app/services/oasis_profile_generator.py:7:2. 优化提示词生成非常详细的人设 +backend/app/services/oasis_profile_generator.py:8:3. 区分个人实体和抽象群体实体 +backend/app/services/oasis_profile_generator.py:31: """OASIS Agent Profile数据结构""" +backend/app/services/oasis_profile_generator.py:32: # 通用字段 +backend/app/services/oasis_profile_generator.py:39: # 可选字段 - Reddit风格 +backend/app/services/oasis_profile_generator.py:42: # 可选字段 - Twitter风格 +backend/app/services/oasis_profile_generator.py:47: # 额外人设信息 +backend/app/services/oasis_profile_generator.py:55: # 来源实体信息 +backend/app/services/oasis_profile_generator.py:62: """转换为Reddit平台格式""" +backend/app/services/oasis_profile_generator.py:65: "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) +backend/app/services/oasis_profile_generator.py:73: # 添加额外人设信息(如果有) +backend/app/services/oasis_profile_generator.py:90: """转换为Twitter平台格式""" +backend/app/services/oasis_profile_generator.py:93: "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) +backend/app/services/oasis_profile_generator.py:103: # 添加额外人设信息 +backend/app/services/oasis_profile_generator.py:120: """转换为完整字典格式""" +backend/app/services/oasis_profile_generator.py:145: OASIS Profile生成器 +backend/app/services/oasis_profile_generator.py:147: 将Zep图谱中的实体转换为OASIS模拟所需的Agent Profile +backend/app/services/oasis_profile_generator.py:149: 优化特性: +backend/app/services/oasis_profile_generator.py:150: 1. 调用Zep图谱检索功能获取更丰富的上下文 +backend/app/services/oasis_profile_generator.py:151: 2. 生成非常详细的人设(包括基本信息、职业经历、性格特征、社交媒体行为等) +backend/app/services/oasis_profile_generator.py:152: 3. 
区分个人实体和抽象群体实体 +backend/app/services/oasis_profile_generator.py:155: # MBTI类型列表 +backend/app/services/oasis_profile_generator.py:163: # 常见国家列表 +backend/app/services/oasis_profile_generator.py:169: # 个人类型实体(需要生成具体人设) +backend/app/services/oasis_profile_generator.py:175: # 群体/机构类型实体(需要生成群体代表人设) +backend/app/services/oasis_profile_generator.py:194: raise ValueError("LLM_API_KEY 未配置") +backend/app/services/oasis_profile_generator.py:211: 从Zep实体生成OASIS Agent Profile +backend/app/services/oasis_profile_generator.py:214: entity: Zep实体节点 +backend/app/services/oasis_profile_generator.py:215: user_id: 用户ID(用于OASIS) +backend/app/services/oasis_profile_generator.py:216: use_llm: 是否使用LLM生成详细人设 +backend/app/services/oasis_profile_generator.py:223: # 基础信息 +backend/app/services/oasis_profile_generator.py:227: # 构建上下文信息 +backend/app/services/oasis_profile_generator.py:231: # 使用LLM生成详细人设 +backend/app/services/oasis_profile_generator.py:240: # 使用规则生成基础人设 +backend/app/services/oasis_profile_generator.py:269: """生成用户名""" +backend/app/services/oasis_profile_generator.py:270: # 移除特殊字符,转换为小写 +backend/app/services/oasis_profile_generator.py:274: # 添加随机后缀避免重复 +backend/app/services/oasis_profile_generator.py:280: 使用Zep图谱混合搜索功能获取实体相关的丰富信息 +backend/app/services/oasis_profile_generator.py:282: Zep没有内置混合搜索接口,需要分别搜索edges和nodes然后合并结果。 +backend/app/services/oasis_profile_generator.py:283: 使用并行请求同时搜索,提高效率。 +backend/app/services/oasis_profile_generator.py:286: entity: 实体节点对象 +backend/app/services/oasis_profile_generator.py:289: 包含facts, node_summaries, context的字典 +backend/app/services/oasis_profile_generator.py:304: # 必须有graph_id才能进行搜索 +backend/app/services/oasis_profile_generator.py:312: """搜索边(事实/关系)- 带重试机制""" +backend/app/services/oasis_profile_generator.py:336: """搜索节点(实体摘要)- 带重试机制""" +backend/app/services/oasis_profile_generator.py:360: # 并行执行edges和nodes搜索 +backend/app/services/oasis_profile_generator.py:365: # 获取结果 +backend/app/services/oasis_profile_generator.py:369: # 处理边搜索结果 
+backend/app/services/oasis_profile_generator.py:377: # 处理节点搜索结果 +backend/app/services/oasis_profile_generator.py:384: all_summaries.add(f"相关实体: {node.name}") +backend/app/services/oasis_profile_generator.py:387: # 构建综合上下文 +backend/app/services/oasis_profile_generator.py:390: context_parts.append("事实信息:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) +backend/app/services/oasis_profile_generator.py:392: context_parts.append("相关实体:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) +backend/app/services/oasis_profile_generator.py:406: 构建实体的完整上下文信息 +backend/app/services/oasis_profile_generator.py:408: 包括: +backend/app/services/oasis_profile_generator.py:409: 1. 实体本身的边信息(事实) +backend/app/services/oasis_profile_generator.py:410: 2. 关联节点的详细信息 +backend/app/services/oasis_profile_generator.py:411: 3. Zep混合检索到的丰富信息 +backend/app/services/oasis_profile_generator.py:415: # 1. 添加实体属性信息 +backend/app/services/oasis_profile_generator.py:422: context_parts.append("### 实体属性\n" + "\n".join(attrs)) +backend/app/services/oasis_profile_generator.py:424: # 2. 添加相关边信息(事实/关系) +backend/app/services/oasis_profile_generator.py:428: for edge in entity.related_edges: # 不限制数量 +backend/app/services/oasis_profile_generator.py:438: relationships.append(f"- {entity.name} --[{edge_name}]--> (相关实体)") +backend/app/services/oasis_profile_generator.py:440: relationships.append(f"- (相关实体) --[{edge_name}]--> {entity.name}") +backend/app/services/oasis_profile_generator.py:443: context_parts.append("### 相关事实和关系\n" + "\n".join(relationships)) +backend/app/services/oasis_profile_generator.py:445: # 3. 添加关联节点的详细信息 +backend/app/services/oasis_profile_generator.py:448: for node in entity.related_nodes: # 不限制数量 +backend/app/services/oasis_profile_generator.py:453: # 过滤掉默认标签 +backend/app/services/oasis_profile_generator.py:463: context_parts.append("### 关联实体信息\n" + "\n".join(related_info)) +backend/app/services/oasis_profile_generator.py:465: # 4. 
使用Zep混合检索获取更丰富的信息 +backend/app/services/oasis_profile_generator.py:469: # 去重:排除已存在的事实 +backend/app/services/oasis_profile_generator.py:472: context_parts.append("### Zep检索到的事实信息\n" + "\n".join(f"- {f}" for f in new_facts[:15])) +backend/app/services/oasis_profile_generator.py:475: context_parts.append("### Zep检索到的相关节点\n" + "\n".join(f"- {s}" for s in zep_results["node_summaries"][:10])) +backend/app/services/oasis_profile_generator.py:480: """判断是否是个人类型实体""" +backend/app/services/oasis_profile_generator.py:484: """判断是否是群体/机构类型实体""" +backend/app/services/oasis_profile_generator.py:496: 使用LLM生成非常详细的人设 +backend/app/services/oasis_profile_generator.py:498: 根据实体类型区分: +backend/app/services/oasis_profile_generator.py:499: - 个人实体:生成具体的人物设定 +backend/app/services/oasis_profile_generator.py:500: - 群体/机构实体:生成代表性账号设定 +backend/app/services/oasis_profile_generator.py:514: # 尝试多次生成,直到成功或达到最大重试次数 +backend/app/services/oasis_profile_generator.py:527: temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 +backend/app/services/oasis_profile_generator.py:528: # 不设置max_tokens,让LLM自由发挥 +backend/app/services/oasis_profile_generator.py:533: # 检查是否被截断(finish_reason不是'stop') +backend/app/services/oasis_profile_generator.py:539: # 尝试解析JSON +backend/app/services/oasis_profile_generator.py:543: # 验证必需字段 +backend/app/services/oasis_profile_generator.py:547: result["persona"] = entity_summary or f"{entity_name}是一个{entity_type}。" +backend/app/services/oasis_profile_generator.py:554: # 尝试修复JSON +backend/app/services/oasis_profile_generator.py:566: time.sleep(1 * (attempt + 1)) # 指数退避 +backend/app/services/oasis_profile_generator.py:574: """修复被截断的JSON(输出被max_tokens限制截断)""" +backend/app/services/oasis_profile_generator.py:577: # 如果JSON被截断,尝试闭合它 +backend/app/services/oasis_profile_generator.py:580: # 计算未闭合的括号 +backend/app/services/oasis_profile_generator.py:584: # 检查是否有未闭合的字符串 +backend/app/services/oasis_profile_generator.py:585: # 简单检查:如果最后一个引号后没有逗号或闭合括号,可能是字符串被截断 
+backend/app/services/oasis_profile_generator.py:587: # 尝试闭合字符串 +backend/app/services/oasis_profile_generator.py:590: # 闭合括号 +backend/app/services/oasis_profile_generator.py:597: """尝试修复损坏的JSON""" +backend/app/services/oasis_profile_generator.py:600: # 1. 首先尝试修复被截断的情况 +backend/app/services/oasis_profile_generator.py:603: # 2. 尝试提取JSON部分 +backend/app/services/oasis_profile_generator.py:608: # 3. 处理字符串中的换行符问题 +backend/app/services/oasis_profile_generator.py:609: # 找到所有字符串值并替换其中的换行符 +backend/app/services/oasis_profile_generator.py:612: # 替换字符串内的实际换行符为空格 +backend/app/services/oasis_profile_generator.py:614: # 替换多余空格 +backend/app/services/oasis_profile_generator.py:618: # 匹配JSON字符串值 +backend/app/services/oasis_profile_generator.py:621: # 4. 尝试解析 +backend/app/services/oasis_profile_generator.py:627: # 5. 如果还是失败,尝试更激进的修复 +backend/app/services/oasis_profile_generator.py:629: # 移除所有控制字符 +backend/app/services/oasis_profile_generator.py:631: # 替换所有连续空白 +backend/app/services/oasis_profile_generator.py:639: # 6. 尝试从内容中提取部分信息 +backend/app/services/oasis_profile_generator.py:641: persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # 可能被截断 +backend/app/services/oasis_profile_generator.py:644: persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name}是一个{entity_type}。") +backend/app/services/oasis_profile_generator.py:646: # 如果提取到了有意义的内容,标记为已修复 +backend/app/services/oasis_profile_generator.py:655: # 7. 
完全失败,返回基础结构 +backend/app/services/oasis_profile_generator.py:659: "persona": entity_summary or f"{entity_name}是一个{entity_type}。" +backend/app/services/oasis_profile_generator.py:663: """获取系统提示词""" +backend/app/services/oasis_profile_generator.py:664: base_prompt = "你是社交媒体用户画像生成专家。生成详细、真实的人设用于舆论模拟,最大程度还原已有现实情况。必须返回有效的JSON格式,所有字符串值不能包含未转义的换行符。" +backend/app/services/oasis_profile_generator.py:675: """构建个人实体的详细人设提示词""" +backend/app/services/oasis_profile_generator.py:677: attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" +backend/app/services/oasis_profile_generator.py:678: context_str = context[:3000] if context else "无额外上下文" +backend/app/services/oasis_profile_generator.py:680: return f"""为实体生成详细的社交媒体用户人设,最大程度还原已有现实情况。 +backend/app/services/oasis_profile_generator.py:682:实体名称: {entity_name} +backend/app/services/oasis_profile_generator.py:683:实体类型: {entity_type} +backend/app/services/oasis_profile_generator.py:684:实体摘要: {entity_summary} +backend/app/services/oasis_profile_generator.py:685:实体属性: {attrs_str} +backend/app/services/oasis_profile_generator.py:687:上下文信息: +backend/app/services/oasis_profile_generator.py:690:请生成JSON,包含以下字段: +backend/app/services/oasis_profile_generator.py:692:1. bio: 社交媒体简介,200字 +backend/app/services/oasis_profile_generator.py:693:2. persona: 详细人设描述(2000字的纯文本),需包含: +backend/app/services/oasis_profile_generator.py:694: - 基本信息(年龄、职业、教育背景、所在地) +backend/app/services/oasis_profile_generator.py:695: - 人物背景(重要经历、与事件的关联、社会关系) +backend/app/services/oasis_profile_generator.py:696: - 性格特征(MBTI类型、核心性格、情绪表达方式) +backend/app/services/oasis_profile_generator.py:697: - 社交媒体行为(发帖频率、内容偏好、互动风格、语言特点) +backend/app/services/oasis_profile_generator.py:698: - 立场观点(对话题的态度、可能被激怒/感动的内容) +backend/app/services/oasis_profile_generator.py:699: - 独特特征(口头禅、特殊经历、个人爱好) +backend/app/services/oasis_profile_generator.py:700: - 个人记忆(人设的重要部分,要介绍这个个体与事件的关联,以及这个个体在事件中的已有动作与反应) +backend/app/services/oasis_profile_generator.py:701:3. 
age: 年龄数字(必须是整数) +backend/app/services/oasis_profile_generator.py:702:4. gender: 性别,必须是英文: "male" 或 "female" +backend/app/services/oasis_profile_generator.py:703:5. mbti: MBTI类型(如INTJ、ENFP等) +backend/app/services/oasis_profile_generator.py:704:6. country: 国家(使用中文,如"中国") +backend/app/services/oasis_profile_generator.py:705:7. profession: 职业 +backend/app/services/oasis_profile_generator.py:706:8. interested_topics: 感兴趣话题数组 +backend/app/services/oasis_profile_generator.py:708:重要: +backend/app/services/oasis_profile_generator.py:709:- 所有字段值必须是字符串或数字,不要使用换行符 +backend/app/services/oasis_profile_generator.py:710:- persona必须是一段连贯的文字描述 +backend/app/services/oasis_profile_generator.py:711:- {get_language_instruction()} (gender字段必须用英文male/female) +backend/app/services/oasis_profile_generator.py:712:- 内容要与实体信息保持一致 +backend/app/services/oasis_profile_generator.py:713:- age必须是有效的整数,gender必须是"male"或"female" +backend/app/services/oasis_profile_generator.py:724: """构建群体/机构实体的详细人设提示词""" +backend/app/services/oasis_profile_generator.py:726: attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" +backend/app/services/oasis_profile_generator.py:727: context_str = context[:3000] if context else "无额外上下文" +backend/app/services/oasis_profile_generator.py:729: return f"""为机构/群体实体生成详细的社交媒体账号设定,最大程度还原已有现实情况。 +backend/app/services/oasis_profile_generator.py:731:实体名称: {entity_name} +backend/app/services/oasis_profile_generator.py:732:实体类型: {entity_type} +backend/app/services/oasis_profile_generator.py:733:实体摘要: {entity_summary} +backend/app/services/oasis_profile_generator.py:734:实体属性: {attrs_str} +backend/app/services/oasis_profile_generator.py:736:上下文信息: +backend/app/services/oasis_profile_generator.py:739:请生成JSON,包含以下字段: +backend/app/services/oasis_profile_generator.py:741:1. bio: 官方账号简介,200字,专业得体 +backend/app/services/oasis_profile_generator.py:742:2. 
persona: 详细账号设定描述(2000字的纯文本),需包含: +backend/app/services/oasis_profile_generator.py:743: - 机构基本信息(正式名称、机构性质、成立背景、主要职能) +backend/app/services/oasis_profile_generator.py:744: - 账号定位(账号类型、目标受众、核心功能) +backend/app/services/oasis_profile_generator.py:745: - 发言风格(语言特点、常用表达、禁忌话题) +backend/app/services/oasis_profile_generator.py:746: - 发布内容特点(内容类型、发布频率、活跃时间段) +backend/app/services/oasis_profile_generator.py:747: - 立场态度(对核心话题的官方立场、面对争议的处理方式) +backend/app/services/oasis_profile_generator.py:748: - 特殊说明(代表的群体画像、运营习惯) +backend/app/services/oasis_profile_generator.py:749: - 机构记忆(机构人设的重要部分,要介绍这个机构与事件的关联,以及这个机构在事件中的已有动作与反应) +backend/app/services/oasis_profile_generator.py:750:3. age: 固定填30(机构账号的虚拟年龄) +backend/app/services/oasis_profile_generator.py:751:4. gender: 固定填"other"(机构账号使用other表示非个人) +backend/app/services/oasis_profile_generator.py:752:5. mbti: MBTI类型,用于描述账号风格,如ISTJ代表严谨保守 +backend/app/services/oasis_profile_generator.py:753:6. country: 国家(使用中文,如"中国") +backend/app/services/oasis_profile_generator.py:754:7. profession: 机构职能描述 +backend/app/services/oasis_profile_generator.py:755:8. 
interested_topics: 关注领域数组 +backend/app/services/oasis_profile_generator.py:757:重要: +backend/app/services/oasis_profile_generator.py:758:- 所有字段值必须是字符串或数字,不允许null值 +backend/app/services/oasis_profile_generator.py:759:- persona必须是一段连贯的文字描述,不要使用换行符 +backend/app/services/oasis_profile_generator.py:760:- {get_language_instruction()} (gender字段必须用英文"other") +backend/app/services/oasis_profile_generator.py:761:- age必须是整数30,gender必须是字符串"other" +backend/app/services/oasis_profile_generator.py:762:- 机构账号发言要符合其身份定位""" +backend/app/services/oasis_profile_generator.py:771: """使用规则生成基础人设""" +backend/app/services/oasis_profile_generator.py:773: # 根据实体类型生成不同的人设 +backend/app/services/oasis_profile_generator.py:804: "age": 30, # 机构虚拟年龄 +backend/app/services/oasis_profile_generator.py:805: "gender": "other", # 机构使用other +backend/app/services/oasis_profile_generator.py:806: "mbti": "ISTJ", # 机构风格:严谨保守 +backend/app/services/oasis_profile_generator.py:807: "country": "中国", +backend/app/services/oasis_profile_generator.py:816: "age": 30, # 机构虚拟年龄 +backend/app/services/oasis_profile_generator.py:817: "gender": "other", # 机构使用other +backend/app/services/oasis_profile_generator.py:818: "mbti": "ISTJ", # 机构风格:严谨保守 +backend/app/services/oasis_profile_generator.py:819: "country": "中国", +backend/app/services/oasis_profile_generator.py:825: # 默认人设 +backend/app/services/oasis_profile_generator.py:838: """设置图谱ID用于Zep检索""" +backend/app/services/oasis_profile_generator.py:852: 批量从实体生成Agent Profile(支持并行生成) +backend/app/services/oasis_profile_generator.py:855: entities: 实体列表 +backend/app/services/oasis_profile_generator.py:856: use_llm: 是否使用LLM生成详细人设 +backend/app/services/oasis_profile_generator.py:857: progress_callback: 进度回调函数 (current, total, message) +backend/app/services/oasis_profile_generator.py:858: graph_id: 图谱ID,用于Zep检索获取更丰富上下文 +backend/app/services/oasis_profile_generator.py:859: parallel_count: 并行生成数量,默认5 +backend/app/services/oasis_profile_generator.py:860: realtime_output_path: 
实时写入的文件路径(如果提供,每生成一个就写入一次) +backend/app/services/oasis_profile_generator.py:861: output_platform: 输出平台格式 ("reddit" 或 "twitter") +backend/app/services/oasis_profile_generator.py:864: Agent Profile列表 +backend/app/services/oasis_profile_generator.py:869: # 设置graph_id用于Zep检索 +backend/app/services/oasis_profile_generator.py:874: profiles = [None] * total # 预分配列表保持顺序 +backend/app/services/oasis_profile_generator.py:875: completed_count = [0] # 使用列表以便在闭包中修改 +backend/app/services/oasis_profile_generator.py:878: # 实时写入文件的辅助函数 +backend/app/services/oasis_profile_generator.py:880: """实时保存已生成的 profiles 到文件""" +backend/app/services/oasis_profile_generator.py:885: # 过滤出已生成的 profiles +backend/app/services/oasis_profile_generator.py:892: # Reddit JSON 格式 +backend/app/services/oasis_profile_generator.py:897: # Twitter CSV 格式 +backend/app/services/oasis_profile_generator.py:913: """生成单个profile的工作函数""" +backend/app/services/oasis_profile_generator.py:924: # 实时输出生成的人设到控制台和日志 +backend/app/services/oasis_profile_generator.py:931: # 创建一个基础profile +backend/app/services/oasis_profile_generator.py:945: print(f"开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}") +backend/app/services/oasis_profile_generator.py:948: # 使用线程池并行执行 +backend/app/services/oasis_profile_generator.py:950: # 提交所有任务 +backend/app/services/oasis_profile_generator.py:956: # 收集结果 +backend/app/services/oasis_profile_generator.py:969: # 实时写入文件 +backend/app/services/oasis_profile_generator.py:976: f"已完成 {current}/{total}: {entity.name}({entity_type})" +backend/app/services/oasis_profile_generator.py:997: # 实时写入文件(即使是备用人设) +backend/app/services/oasis_profile_generator.py:1001: print(f"人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent") +backend/app/services/oasis_profile_generator.py:1007: """实时输出生成的人设到控制台(完整内容,不截断)""" +backend/app/services/oasis_profile_generator.py:1010: # 构建完整输出内容(不截断) +backend/app/services/oasis_profile_generator.py:1011: topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else 
'无' +backend/app/services/oasis_profile_generator.py:1017: f"用户名: {profile.user_name}", +backend/app/services/oasis_profile_generator.py:1019: f"【简介】", +backend/app/services/oasis_profile_generator.py:1022: f"【详细人设】", +backend/app/services/oasis_profile_generator.py:1025: f"【基本属性】", +backend/app/services/oasis_profile_generator.py:1026: f"年龄: {profile.age} | 性别: {profile.gender} | MBTI: {profile.mbti}", +backend/app/services/oasis_profile_generator.py:1027: f"职业: {profile.profession} | 国家: {profile.country}", +backend/app/services/oasis_profile_generator.py:1028: f"兴趣话题: {topics_str}", +backend/app/services/oasis_profile_generator.py:1034: # 只输出到控制台(避免重复,logger不再输出完整内容) +backend/app/services/oasis_profile_generator.py:1044: 保存Profile到文件(根据平台选择正确格式) +backend/app/services/oasis_profile_generator.py:1046: OASIS平台格式要求: +backend/app/services/oasis_profile_generator.py:1047: - Twitter: CSV格式 +backend/app/services/oasis_profile_generator.py:1048: - Reddit: JSON格式 +backend/app/services/oasis_profile_generator.py:1051: profiles: Profile列表 +backend/app/services/oasis_profile_generator.py:1052: file_path: 文件路径 +backend/app/services/oasis_profile_generator.py:1053: platform: 平台类型 ("reddit" 或 "twitter") +backend/app/services/oasis_profile_generator.py:1062: 保存Twitter Profile为CSV格式(符合OASIS官方要求) +backend/app/services/oasis_profile_generator.py:1064: OASIS Twitter要求的CSV字段: +backend/app/services/oasis_profile_generator.py:1065: - user_id: 用户ID(根据CSV顺序从0开始) +backend/app/services/oasis_profile_generator.py:1066: - name: 用户真实姓名 +backend/app/services/oasis_profile_generator.py:1067: - username: 系统中的用户名 +backend/app/services/oasis_profile_generator.py:1068: - user_char: 详细人设描述(注入到LLM系统提示中,指导Agent行为) +backend/app/services/oasis_profile_generator.py:1069: - description: 简短的公开简介(显示在用户资料页面) +backend/app/services/oasis_profile_generator.py:1071: user_char vs description 区别: +backend/app/services/oasis_profile_generator.py:1072: - user_char: 内部使用,LLM系统提示,决定Agent如何思考和行动 
+backend/app/services/oasis_profile_generator.py:1073: - description: 外部显示,其他用户可见的简介 +backend/app/services/oasis_profile_generator.py:1077: # 确保文件扩展名是.csv +backend/app/services/oasis_profile_generator.py:1084: # 写入OASIS要求的表头 +backend/app/services/oasis_profile_generator.py:1088: # 写入数据行 +backend/app/services/oasis_profile_generator.py:1090: # user_char: 完整人设(bio + persona),用于LLM系统提示 +backend/app/services/oasis_profile_generator.py:1094: # 处理换行符(CSV中用空格替代) +backend/app/services/oasis_profile_generator.py:1097: # description: 简短简介,用于外部显示 +backend/app/services/oasis_profile_generator.py:1101: idx, # user_id: 从0开始的顺序ID +backend/app/services/oasis_profile_generator.py:1102: profile.name, # name: 真实姓名 +backend/app/services/oasis_profile_generator.py:1103: profile.user_name, # username: 用户名 +backend/app/services/oasis_profile_generator.py:1104: user_char, # user_char: 完整人设(内部LLM使用) +backend/app/services/oasis_profile_generator.py:1105: description # description: 简短简介(外部显示) +backend/app/services/oasis_profile_generator.py:1113: 标准化gender字段为OASIS要求的英文格式 +backend/app/services/oasis_profile_generator.py:1115: OASIS要求: male, female, other +backend/app/services/oasis_profile_generator.py:1122: # 中文映射 +backend/app/services/oasis_profile_generator.py:1124: "男": "male", +backend/app/services/oasis_profile_generator.py:1125: "女": "female", +backend/app/services/oasis_profile_generator.py:1126: "机构": "other", +backend/app/services/oasis_profile_generator.py:1127: "其他": "other", +backend/app/services/oasis_profile_generator.py:1128: # 英文已有 +backend/app/services/oasis_profile_generator.py:1138: 保存Reddit Profile为JSON格式 +backend/app/services/oasis_profile_generator.py:1140: 使用与 to_reddit_format() 一致的格式,确保 OASIS 能正确读取。 +backend/app/services/oasis_profile_generator.py:1141: 必须包含 user_id 字段,这是 OASIS agent_graph.get_agent() 匹配的关键! 
+backend/app/services/oasis_profile_generator.py:1143: 必需字段: +backend/app/services/oasis_profile_generator.py:1144: - user_id: 用户ID(整数,用于匹配 initial_posts 中的 poster_agent_id) +backend/app/services/oasis_profile_generator.py:1145: - username: 用户名 +backend/app/services/oasis_profile_generator.py:1146: - name: 显示名称 +backend/app/services/oasis_profile_generator.py:1147: - bio: 简介 +backend/app/services/oasis_profile_generator.py:1148: - persona: 详细人设 +backend/app/services/oasis_profile_generator.py:1149: - age: 年龄(整数) +backend/app/services/oasis_profile_generator.py:1150: - gender: "male", "female", 或 "other" +backend/app/services/oasis_profile_generator.py:1151: - mbti: MBTI类型 +backend/app/services/oasis_profile_generator.py:1152: - country: 国家 +backend/app/services/oasis_profile_generator.py:1156: # 使用与 to_reddit_format() 一致的格式 +backend/app/services/oasis_profile_generator.py:1158: "user_id": profile.user_id if profile.user_id is not None else idx, # 关键:必须包含 user_id +backend/app/services/oasis_profile_generator.py:1165: # OASIS必需字段 - 确保都有默认值 +backend/app/services/oasis_profile_generator.py:1169: "country": profile.country if profile.country else "中国", +backend/app/services/oasis_profile_generator.py:1172: # 可选字段 +backend/app/services/oasis_profile_generator.py:1185: # 保留旧方法名作为别名,保持向后兼容 +backend/app/services/oasis_profile_generator.py:1192: """[已废弃] 请使用 save_profiles() 方法""" +backend/app/services/ontology_generator.py:2:本体生成服务 +backend/app/services/ontology_generator.py:3:接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 +backend/app/services/ontology_generator.py:17: """将任意格式的名称转换为 PascalCase(如 'works_for' -> 'WorksFor', 'person' -> 'Person')""" +backend/app/services/ontology_generator.py:18: # 按非字母数字字符分割 +backend/app/services/ontology_generator.py:20: # 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case']) +backend/app/services/ontology_generator.py:24: # 每个词首字母大写,过滤空串 +backend/app/services/ontology_generator.py:29:# 本体生成的系统提示词 +backend/app/services/ontology_generator.py:178: 本体生成器 
+backend/app/services/ontology_generator.py:179: 分析文本内容,生成实体和关系类型定义 +backend/app/services/ontology_generator.py:192: 生成本体定义 +backend/app/services/ontology_generator.py:195: document_texts: 文档文本列表 +backend/app/services/ontology_generator.py:196: simulation_requirement: 模拟需求描述 +backend/app/services/ontology_generator.py:197: additional_context: 额外上下文 +backend/app/services/ontology_generator.py:200: 本体定义(entity_types, edge_types等) +backend/app/services/ontology_generator.py:202: # 构建用户消息 +backend/app/services/ontology_generator.py:216: # 调用LLM +backend/app/services/ontology_generator.py:223: # 验证和后处理 +backend/app/services/ontology_generator.py:228: # 传给 LLM 的文本最大长度(5万字) +backend/app/services/ontology_generator.py:237: """构建用户消息""" +backend/app/services/ontology_generator.py:239: # 合并文本 +backend/app/services/ontology_generator.py:243: # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) +backend/app/services/ontology_generator.py:278: """验证和后处理结果""" +backend/app/services/ontology_generator.py:280: # 确保必要字段存在 +backend/app/services/ontology_generator.py:288: # 验证实体类型 +backend/app/services/ontology_generator.py:289: # 记录原始名称到 PascalCase 的映射,用于后续修正 edge 的 source_targets 引用 +backend/app/services/ontology_generator.py:292: # 强制将 entity name 转为 PascalCase(Zep API 要求) +backend/app/services/ontology_generator.py:303: # 确保description不超过100字符 +backend/app/services/ontology_generator.py:307: # 验证关系类型 +backend/app/services/ontology_generator.py:309: # 强制将 edge name 转为 SCREAMING_SNAKE_CASE(Zep API 要求) +backend/app/services/ontology_generator.py:315: # 修正 source_targets 中的实体名称引用,与转换后的 PascalCase 保持一致 +backend/app/services/ontology_generator.py:328: # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 +backend/app/services/ontology_generator.py:332: # 去重:按 name 去重,保留首次出现的 +backend/app/services/ontology_generator.py:344: # 兜底类型定义 +backend/app/services/ontology_generator.py:365: # 检查是否已有兜底类型 +backend/app/services/ontology_generator.py:370: # 需要添加的兜底类型 +backend/app/services/ontology_generator.py:381: # 如果添加后会超过 10 
个,需要移除一些现有类型 +backend/app/services/ontology_generator.py:383: # 计算需要移除多少个 +backend/app/services/ontology_generator.py:385: # 从末尾移除(保留前面更重要的具体类型) +backend/app/services/ontology_generator.py:388: # 添加兜底类型 +backend/app/services/ontology_generator.py:391: # 最终确保不超过限制(防御性编程) +backend/app/services/ontology_generator.py:402: 将本体定义转换为Python代码(类似ontology.py) +backend/app/services/ontology_generator.py:405: ontology: 本体定义 +backend/app/services/ontology_generator.py:408: Python代码字符串 +backend/app/services/ontology_generator.py:412: '自定义实体类型定义', +backend/app/services/ontology_generator.py:413: '由MiroFish自动生成,用于社会舆论模拟', +backend/app/services/ontology_generator.py:420: '# ============== 实体类型定义 ==============', +backend/app/services/ontology_generator.py:424: # 生成实体类型 +backend/app/services/ontology_generator.py:447: code_lines.append('# ============== 关系类型定义 ==============') +backend/app/services/ontology_generator.py:450: # 生成关系类型 +backend/app/services/ontology_generator.py:453: # 转换为PascalCase类名 +backend/app/services/ontology_generator.py:475: # 生成类型字典 +backend/app/services/ontology_generator.py:476: code_lines.append('# ============== 类型配置 ==============') +backend/app/services/ontology_generator.py:492: # 生成边的source_targets映射 +backend/app/services/report_agent.py:2:Report Agent服务 +backend/app/services/report_agent.py:3:使用LangChain + Zep实现ReACT模式的模拟报告生成 +backend/app/services/report_agent.py:5:功能: +backend/app/services/report_agent.py:6:1. 根据模拟需求和Zep图谱信息生成报告 +backend/app/services/report_agent.py:7:2. 先规划目录结构,然后分段生成 +backend/app/services/report_agent.py:8:3. 每段采用ReACT多轮思考与反思模式 +backend/app/services/report_agent.py:9:4. 
支持与用户对话,在对话中自主调用检索工具 +backend/app/services/report_agent.py:38: Report Agent 详细日志记录器 +backend/app/services/report_agent.py:40: 在报告文件夹中生成 agent_log.jsonl 文件,记录每一步详细动作。 +backend/app/services/report_agent.py:41: 每行是一个完整的 JSON 对象,包含时间戳、动作类型、详细内容等。 +backend/app/services/report_agent.py:46: 初始化日志记录器 +backend/app/services/report_agent.py:49: report_id: 报告ID,用于确定日志文件路径 +backend/app/services/report_agent.py:59: """确保日志文件所在目录存在""" +backend/app/services/report_agent.py:64: """获取从开始到现在的耗时(秒)""" +backend/app/services/report_agent.py:76: 记录一条日志 +backend/app/services/report_agent.py:79: action: 动作类型,如 'start', 'tool_call', 'llm_response', 'section_complete' 等 +backend/app/services/report_agent.py:80: stage: 当前阶段,如 'planning', 'generating', 'completed' +backend/app/services/report_agent.py:81: details: 详细内容字典,不截断 +backend/app/services/report_agent.py:82: section_title: 当前章节标题(可选) +backend/app/services/report_agent.py:83: section_index: 当前章节索引(可选) +backend/app/services/report_agent.py:96: # 追加写入 JSONL 文件 +backend/app/services/report_agent.py:101: """记录报告生成开始""" +backend/app/services/report_agent.py:114: """记录大纲规划开始""" +backend/app/services/report_agent.py:122: """记录规划时获取的上下文信息""" +backend/app/services/report_agent.py:133: """记录大纲规划完成""" +backend/app/services/report_agent.py:144: """记录章节生成开始""" +backend/app/services/report_agent.py:154: """记录 ReACT 思考过程""" +backend/app/services/report_agent.py:175: """记录工具调用""" +backend/app/services/report_agent.py:197: """记录工具调用结果(完整内容,不截断)""" +backend/app/services/report_agent.py:206: "result": result, # 完整结果,不截断 +backend/app/services/report_agent.py:221: """记录 LLM 响应(完整内容,不截断)""" +backend/app/services/report_agent.py:229: "response": response, # 完整响应,不截断 +backend/app/services/report_agent.py:244: """记录章节内容生成完成(仅记录内容,不代表整个章节完成)""" +backend/app/services/report_agent.py:251: "content": content, # 完整内容,不截断 +backend/app/services/report_agent.py:265: 记录章节生成完成 +backend/app/services/report_agent.py:267: 前端应监听此日志来判断一个章节是否真正完成,并获取完整内容 
+backend/app/services/report_agent.py:282: """记录报告生成完成""" +backend/app/services/report_agent.py:294: """记录错误""" +backend/app/services/report_agent.py:309: Report Agent 控制台日志记录器 +backend/app/services/report_agent.py:311: 将控制台风格的日志(INFO、WARNING等)写入报告文件夹中的 console_log.txt 文件。 +backend/app/services/report_agent.py:312: 这些日志与 agent_log.jsonl 不同,是纯文本格式的控制台输出。 +backend/app/services/report_agent.py:317: 初始化控制台日志记录器 +backend/app/services/report_agent.py:320: report_id: 报告ID,用于确定日志文件路径 +backend/app/services/report_agent.py:331: """确保日志文件所在目录存在""" +backend/app/services/report_agent.py:336: """设置文件处理器,将日志同时写入文件""" +backend/app/services/report_agent.py:339: # 创建文件处理器 +backend/app/services/report_agent.py:347: # 使用与控制台相同的简洁格式 +backend/app/services/report_agent.py:354: # 添加到 report_agent 相关的 logger +backend/app/services/report_agent.py:362: # 避免重复添加 +backend/app/services/report_agent.py:367: """关闭文件处理器并从 logger 中移除""" +backend/app/services/report_agent.py:385: """析构时确保关闭文件处理器""" +backend/app/services/report_agent.py:390: """报告状态""" +backend/app/services/report_agent.py:400: """报告章节""" +backend/app/services/report_agent.py:411: """转换为Markdown格式""" +backend/app/services/report_agent.py:420: """报告大纲""" +backend/app/services/report_agent.py:433: """转换为Markdown格式""" +backend/app/services/report_agent.py:443: """完整报告""" +backend/app/services/report_agent.py:471:# Prompt 模板常量 +backend/app/services/report_agent.py:474:# ── 工具描述 ── +backend/app/services/report_agent.py:550:# ── 大纲规划 prompt ── +backend/app/services/report_agent.py:613:# ── 章节生成 prompt ── +backend/app/services/report_agent.py:794:# ── ReACT 循环内消息模板 ── +backend/app/services/report_agent.py:861:# ReportAgent 主类 +backend/app/services/report_agent.py:867: Report Agent - 模拟报告生成Agent +backend/app/services/report_agent.py:869: 采用ReACT(Reasoning + Acting)模式: +backend/app/services/report_agent.py:870: 1. 规划阶段:分析模拟需求,规划报告目录结构 +backend/app/services/report_agent.py:871: 2. 
生成阶段:逐章节生成内容,每章节可多次调用工具获取信息 +backend/app/services/report_agent.py:872: 3. 反思阶段:检查内容完整性和准确性 +backend/app/services/report_agent.py:875: # 最大工具调用次数(每个章节) +backend/app/services/report_agent.py:878: # 最大反思轮数 +backend/app/services/report_agent.py:881: # 对话中的最大工具调用次数 +backend/app/services/report_agent.py:893: 初始化Report Agent +backend/app/services/report_agent.py:896: graph_id: 图谱ID +backend/app/services/report_agent.py:897: simulation_id: 模拟ID +backend/app/services/report_agent.py:898: simulation_requirement: 模拟需求描述 +backend/app/services/report_agent.py:899: llm_client: LLM客户端(可选) +backend/app/services/report_agent.py:900: zep_tools: Zep工具服务(可选) +backend/app/services/report_agent.py:909: # 工具定义 +backend/app/services/report_agent.py:912: # 日志记录器(在 generate_report 中初始化) +backend/app/services/report_agent.py:914: # 控制台日志记录器(在 generate_report 中初始化) +backend/app/services/report_agent.py:920: """定义可用工具""" +backend/app/services/report_agent.py:958: 执行工具调用 +backend/app/services/report_agent.py:961: tool_name: 工具名称 +backend/app/services/report_agent.py:962: parameters: 工具参数 +backend/app/services/report_agent.py:963: report_context: 报告上下文(用于InsightForge) +backend/app/services/report_agent.py:966: 工具执行结果(文本格式) +backend/app/services/report_agent.py:983: # 广度搜索 - 获取全貌 +backend/app/services/report_agent.py:996: # 简单搜索 - 快速检索 +backend/app/services/report_agent.py:1009: # 深度采访 - 调用真实的OASIS采访API获取模拟Agent的回答(双平台) +backend/app/services/report_agent.py:1023: # ========== 向后兼容的旧工具(内部重定向到新工具) ========== +backend/app/services/report_agent.py:1026: # 重定向到 quick_search +backend/app/services/report_agent.py:1043: # 重定向到 insight_forge,因为它更强大 +backend/app/services/report_agent.py:1064: # 合法的工具名称集合,用于裸 JSON 兜底解析时校验 +backend/app/services/report_agent.py:1069: 从LLM响应中解析工具调用 +backend/app/services/report_agent.py:1071: 支持的格式(按优先级): +backend/app/services/report_agent.py:1073: 2. 
裸 JSON(响应整体或单行就是一个工具调用 JSON) +backend/app/services/report_agent.py:1077: # 格式1: XML风格(标准格式) +backend/app/services/report_agent.py:1089: # 格式2: 兜底 - LLM 直接输出裸 JSON(没包 标签) +backend/app/services/report_agent.py:1090: # 只在格式1未匹配时尝试,避免误匹配正文中的 JSON +backend/app/services/report_agent.py:1101: # 响应可能包含思考文字 + 裸 JSON,尝试提取最后一个 JSON 对象 +backend/app/services/report_agent.py:1115: """校验解析出的 JSON 是否是合法的工具调用""" +backend/app/services/report_agent.py:1116: # 支持 {"name": ..., "parameters": ...} 和 {"tool": ..., "params": ...} 两种键名 +backend/app/services/report_agent.py:1119: # 统一键名为 name / parameters +backend/app/services/report_agent.py:1128: """生成工具描述文本""" +backend/app/services/report_agent.py:1142: 规划报告大纲 +backend/app/services/report_agent.py:1144: 使用LLM分析模拟需求,规划报告的目录结构 +backend/app/services/report_agent.py:1147: progress_callback: 进度回调函数 +backend/app/services/report_agent.py:1150: ReportOutline: 报告大纲 +backend/app/services/report_agent.py:1157: # 首先获取模拟上下文 +backend/app/services/report_agent.py:1188: # 解析大纲 +backend/app/services/report_agent.py:1210: # 返回默认大纲(3个章节,作为fallback) +backend/app/services/report_agent.py:1230: 使用ReACT模式生成单个章节内容 +backend/app/services/report_agent.py:1232: ReACT循环: +backend/app/services/report_agent.py:1233: 1. Thought(思考)- 分析需要什么信息 +backend/app/services/report_agent.py:1234: 2. Action(行动)- 调用工具获取信息 +backend/app/services/report_agent.py:1235: 3. Observation(观察)- 分析工具返回结果 +backend/app/services/report_agent.py:1236: 4. 重复直到信息足够或达到最大次数 +backend/app/services/report_agent.py:1237: 5. 
Final Answer(最终回答)- 生成章节内容 +backend/app/services/report_agent.py:1240: section: 要生成的章节 +backend/app/services/report_agent.py:1241: outline: 完整大纲 +backend/app/services/report_agent.py:1242: previous_sections: 之前章节的内容(用于保持连贯性) +backend/app/services/report_agent.py:1243: progress_callback: 进度回调 +backend/app/services/report_agent.py:1244: section_index: 章节索引(用于日志记录) +backend/app/services/report_agent.py:1247: 章节内容(Markdown格式) +backend/app/services/report_agent.py:1251: # 记录章节开始日志 +backend/app/services/report_agent.py:1264: # 构建用户prompt - 每个已完成章节各传入最大4000字 +backend/app/services/report_agent.py:1268: # 每个章节最多4000字 +backend/app/services/report_agent.py:1285: # ReACT循环 +backend/app/services/report_agent.py:1287: max_iterations = 5 # 最大迭代轮数 +backend/app/services/report_agent.py:1288: min_tool_calls = 3 # 最少工具调用次数 +backend/app/services/report_agent.py:1289: conflict_retries = 0 # 工具调用与Final Answer同时出现的连续冲突次数 +backend/app/services/report_agent.py:1290: used_tools = set() # 记录已调用过的工具名 +backend/app/services/report_agent.py:1293: # 报告上下文,用于InsightForge的子问题生成 +backend/app/services/report_agent.py:1304: # 调用LLM +backend/app/services/report_agent.py:1311: # 检查 LLM 返回是否为 None(API 异常或内容为空) +backend/app/services/report_agent.py:1314: # 如果还有迭代次数,添加消息并重试 +backend/app/services/report_agent.py:1319: # 最后一次迭代也返回 None,跳出循环进入强制收尾 +backend/app/services/report_agent.py:1324: # 解析一次,复用结果 +backend/app/services/report_agent.py:1329: # ── 冲突处理:LLM 同时输出了工具调用和 Final Answer ── +backend/app/services/report_agent.py:1337: # 前两次:丢弃本次响应,要求 LLM 重新回复 +backend/app/services/report_agent.py:1351: # 第三次:降级处理,截断到第一个工具调用,强制执行 +backend/app/services/report_agent.py:1363: # 记录 LLM 响应日志 +backend/app/services/report_agent.py:1374: # ── 情况1:LLM 输出了 Final Answer ── +backend/app/services/report_agent.py:1376: # 工具调用次数不足,拒绝并要求继续调工具 +backend/app/services/report_agent.py:1391: # 正常结束 +backend/app/services/report_agent.py:1404: # ── 情况2:LLM 尝试调用工具 ── +backend/app/services/report_agent.py:1406: # 工具额度已耗尽 → 明确告知,要求输出 Final 
Answer +backend/app/services/report_agent.py:1418: # 只执行第一个工具调用 +backend/app/services/report_agent.py:1450: # 构建未使用工具提示 +backend/app/services/report_agent.py:1470: # ── 情况3:既没有工具调用,也没有 Final Answer ── +backend/app/services/report_agent.py:1474: # 工具调用次数不足,推荐未用过的工具 +backend/app/services/report_agent.py:1488: # 工具调用已足够,LLM 输出了内容但没带 "Final Answer:" 前缀 +backend/app/services/report_agent.py:1489: # 直接将这段内容作为最终答案,不再空转 +backend/app/services/report_agent.py:1502: # 达到最大迭代次数,强制生成内容 +backend/app/services/report_agent.py:1512: # 检查强制收尾时 LLM 返回是否为 None +backend/app/services/report_agent.py:1521: # 记录章节内容生成完成日志 +backend/app/services/report_agent.py:1538: 生成完整报告(分章节实时输出) +backend/app/services/report_agent.py:1540: 每个章节生成完成后立即保存到文件夹,不需要等待整个报告完成。 +backend/app/services/report_agent.py:1541: 文件结构: +backend/app/services/report_agent.py:1543: meta.json - 报告元信息 +backend/app/services/report_agent.py:1544: outline.json - 报告大纲 +backend/app/services/report_agent.py:1545: progress.json - 生成进度 +backend/app/services/report_agent.py:1546: section_01.md - 第1章节 +backend/app/services/report_agent.py:1547: section_02.md - 第2章节 +backend/app/services/report_agent.py:1549: full_report.md - 完整报告 +backend/app/services/report_agent.py:1552: progress_callback: 进度回调函数 (stage, progress, message) +backend/app/services/report_agent.py:1553: report_id: 报告ID(可选,如果不传则自动生成) +backend/app/services/report_agent.py:1556: Report: 完整报告 +backend/app/services/report_agent.py:1560: # 如果没有传入 report_id,则自动生成 +backend/app/services/report_agent.py:1574: # 已完成的章节标题列表(用于进度追踪) +backend/app/services/report_agent.py:1578: # 初始化:创建报告文件夹并保存初始状态 +backend/app/services/report_agent.py:1581: # 初始化日志记录器(结构化日志 agent_log.jsonl) +backend/app/services/report_agent.py:1589: # 初始化控制台日志记录器(console_log.txt) +backend/app/services/report_agent.py:1598: # 阶段1: 规划大纲 +backend/app/services/report_agent.py:1605: # 记录规划开始日志 +backend/app/services/report_agent.py:1617: # 记录规划完成日志 +backend/app/services/report_agent.py:1620: # 保存大纲到文件 
+backend/app/services/report_agent.py:1630: # 阶段2: 逐章节生成(分章节保存) +backend/app/services/report_agent.py:1634: generated_sections = [] # 保存内容用于上下文 +backend/app/services/report_agent.py:1640: # 更新进度 +backend/app/services/report_agent.py:1655: # 生成主章节内容 +backend/app/services/report_agent.py:1672: # 保存章节 +backend/app/services/report_agent.py:1676: # 记录章节完成日志 +backend/app/services/report_agent.py:1688: # 更新进度 +backend/app/services/report_agent.py:1697: # 阶段3: 组装完整报告 +backend/app/services/report_agent.py:1706: # 使用ReportManager组装完整报告 +backend/app/services/report_agent.py:1711: # 计算总耗时 +backend/app/services/report_agent.py:1714: # 记录报告完成日志 +backend/app/services/report_agent.py:1721: # 保存最终报告 +backend/app/services/report_agent.py:1733: # 关闭控制台日志记录器 +backend/app/services/report_agent.py:1745: # 记录错误日志 +backend/app/services/report_agent.py:1749: # 保存失败状态 +backend/app/services/report_agent.py:1757: pass # 忽略保存失败的错误 +backend/app/services/report_agent.py:1759: # 关闭控制台日志记录器 +backend/app/services/report_agent.py:1772: 与Report Agent对话 +backend/app/services/report_agent.py:1774: 在对话中Agent可以自主调用检索工具来回答问题 +backend/app/services/report_agent.py:1777: message: 用户消息 +backend/app/services/report_agent.py:1778: chat_history: 对话历史 +backend/app/services/report_agent.py:1782: "response": "Agent回复", +backend/app/services/report_agent.py:1783: "tool_calls": [调用的工具列表], +backend/app/services/report_agent.py:1784: "sources": [信息来源] +backend/app/services/report_agent.py:1791: # 获取已生成的报告内容 +backend/app/services/report_agent.py:1796: # 限制报告长度,避免上下文过长 +backend/app/services/report_agent.py:1810: # 构建消息 +backend/app/services/report_agent.py:1813: # 添加历史对话 +backend/app/services/report_agent.py:1814: for h in chat_history[-10:]: # 限制历史长度 +backend/app/services/report_agent.py:1817: # 添加用户消息 +backend/app/services/report_agent.py:1823: # ReACT循环(简化版) +backend/app/services/report_agent.py:1825: max_iterations = 2 # 减少迭代轮数 +backend/app/services/report_agent.py:1833: # 解析工具调用 
+backend/app/services/report_agent.py:1837: # 没有工具调用,直接返回响应 +backend/app/services/report_agent.py:1847: # 执行工具调用(限制数量) +backend/app/services/report_agent.py:1849: for call in tool_calls[:1]: # 每轮最多执行1次工具调用 +backend/app/services/report_agent.py:1855: "result": result[:1500] # 限制结果长度 +backend/app/services/report_agent.py:1859: # 将结果添加到消息 +backend/app/services/report_agent.py:1867: # 达到最大迭代,获取最终响应 +backend/app/services/report_agent.py:1873: # 清理响应 +backend/app/services/report_agent.py:1886: 报告管理器 +backend/app/services/report_agent.py:1888: 负责报告的持久化存储和检索 +backend/app/services/report_agent.py:1890: 文件结构(分章节输出): +backend/app/services/report_agent.py:1893: meta.json - 报告元信息和状态 +backend/app/services/report_agent.py:1894: outline.json - 报告大纲 +backend/app/services/report_agent.py:1895: progress.json - 生成进度 +backend/app/services/report_agent.py:1896: section_01.md - 第1章节 +backend/app/services/report_agent.py:1897: section_02.md - 第2章节 +backend/app/services/report_agent.py:1899: full_report.md - 完整报告 +backend/app/services/report_agent.py:1902: # 报告存储目录 +backend/app/services/report_agent.py:1907: """确保报告根目录存在""" +backend/app/services/report_agent.py:1912: """获取报告文件夹路径""" +backend/app/services/report_agent.py:1917: """确保报告文件夹存在并返回路径""" +backend/app/services/report_agent.py:1924: """获取报告元信息文件路径""" +backend/app/services/report_agent.py:1929: """获取完整报告Markdown文件路径""" +backend/app/services/report_agent.py:1934: """获取大纲文件路径""" +backend/app/services/report_agent.py:1939: """获取进度文件路径""" +backend/app/services/report_agent.py:1944: """获取章节Markdown文件路径""" +backend/app/services/report_agent.py:1949: """获取 Agent 日志文件路径""" +backend/app/services/report_agent.py:1954: """获取控制台日志文件路径""" +backend/app/services/report_agent.py:1960: 获取控制台日志内容 +backend/app/services/report_agent.py:1962: 这是报告生成过程中的控制台输出日志(INFO、WARNING等), +backend/app/services/report_agent.py:1963: 与 agent_log.jsonl 的结构化日志不同。 +backend/app/services/report_agent.py:1966: report_id: 报告ID +backend/app/services/report_agent.py:1967: 
from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) +backend/app/services/report_agent.py:1971: "logs": [日志行列表], +backend/app/services/report_agent.py:1972: "total_lines": 总行数, +backend/app/services/report_agent.py:1973: "from_line": 起始行号, +backend/app/services/report_agent.py:1974: "has_more": 是否还有更多日志 +backend/app/services/report_agent.py:1994: # 保留原始日志行,去掉末尾换行符 +backend/app/services/report_agent.py:2001: "has_more": False # 已读取到末尾 +backend/app/services/report_agent.py:2007: 获取完整的控制台日志(一次性获取全部) +backend/app/services/report_agent.py:2010: report_id: 报告ID +backend/app/services/report_agent.py:2013: 日志行列表 +backend/app/services/report_agent.py:2021: 获取 Agent 日志内容 +backend/app/services/report_agent.py:2024: report_id: 报告ID +backend/app/services/report_agent.py:2025: from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) +backend/app/services/report_agent.py:2029: "logs": [日志条目列表], +backend/app/services/report_agent.py:2030: "total_lines": 总行数, +backend/app/services/report_agent.py:2031: "from_line": 起始行号, +backend/app/services/report_agent.py:2032: "has_more": 是否还有更多日志 +backend/app/services/report_agent.py:2056: # 跳过解析失败的行 +backend/app/services/report_agent.py:2063: "has_more": False # 已读取到末尾 +backend/app/services/report_agent.py:2069: 获取完整的 Agent 日志(用于一次性获取全部) +backend/app/services/report_agent.py:2072: report_id: 报告ID +backend/app/services/report_agent.py:2075: 日志条目列表 +backend/app/services/report_agent.py:2083: 保存报告大纲 +backend/app/services/report_agent.py:2085: 在规划阶段完成后立即调用 +backend/app/services/report_agent.py:2102: 保存单个章节 +backend/app/services/report_agent.py:2104: 在每个章节生成完成后立即调用,实现分章节输出 +backend/app/services/report_agent.py:2107: report_id: 报告ID +backend/app/services/report_agent.py:2108: section_index: 章节索引(从1开始) +backend/app/services/report_agent.py:2109: section: 章节对象 +backend/app/services/report_agent.py:2112: 保存的文件路径 +backend/app/services/report_agent.py:2116: # 构建章节Markdown内容 - 清理可能存在的重复标题 +backend/app/services/report_agent.py:2122: # 保存文件 +backend/app/services/report_agent.py:2134: 清理章节内容 
+backend/app/services/report_agent.py:2136: 1. 移除内容开头与章节标题重复的Markdown标题行 +backend/app/services/report_agent.py:2137: 2. 将所有 ### 及以下级别的标题转换为粗体文本 +backend/app/services/report_agent.py:2140: content: 原始内容 +backend/app/services/report_agent.py:2141: section_title: 章节标题 +backend/app/services/report_agent.py:2144: 清理后的内容 +backend/app/services/report_agent.py:2159: # 检查是否是Markdown标题行 +backend/app/services/report_agent.py:2166: # 检查是否是与章节标题重复的标题(跳过前5行内的重复) +backend/app/services/report_agent.py:2172: # 将所有级别的标题(#, ##, ###, ####等)转换为粗体 +backend/app/services/report_agent.py:2173: # 因为章节标题由系统添加,内容中不应有任何标题 +backend/app/services/report_agent.py:2175: cleaned_lines.append("") # 添加空行 +backend/app/services/report_agent.py:2178: # 如果上一行是被跳过的标题,且当前行为空,也跳过 +backend/app/services/report_agent.py:2186: # 移除开头的空行 +backend/app/services/report_agent.py:2190: # 移除开头的分隔线 +backend/app/services/report_agent.py:2193: # 同时移除分隔线后的空行 +backend/app/services/report_agent.py:2210: 更新报告生成进度 +backend/app/services/report_agent.py:2212: 前端可以通过读取progress.json获取实时进度 +backend/app/services/report_agent.py:2230: """获取报告生成进度""" +backend/app/services/report_agent.py:2242: 获取已生成的章节列表 +backend/app/services/report_agent.py:2244: 返回所有已保存的章节文件信息 +backend/app/services/report_agent.py:2258: # 从文件名解析章节索引 +backend/app/services/report_agent.py:2273: 组装完整报告 +backend/app/services/report_agent.py:2275: 从已保存的章节文件组装完整报告,并进行标题清理 +backend/app/services/report_agent.py:2279: # 构建报告头部 +backend/app/services/report_agent.py:2284: # 按顺序读取所有章节文件 +backend/app/services/report_agent.py:2289: # 后处理:清理整个报告的标题问题 +backend/app/services/report_agent.py:2292: # 保存完整报告 +backend/app/services/report_agent.py:2303: 后处理报告内容 +backend/app/services/report_agent.py:2305: 1. 移除重复的标题 +backend/app/services/report_agent.py:2306: 2. 保留报告主标题(#)和章节标题(##),移除其他级别的标题(###, ####等) +backend/app/services/report_agent.py:2307: 3. 
清理多余的空行和分隔线 +backend/app/services/report_agent.py:2310: content: 原始报告内容 +backend/app/services/report_agent.py:2311: outline: 报告大纲 +backend/app/services/report_agent.py:2314: 处理后的内容 +backend/app/services/report_agent.py:2322: # 收集大纲中的所有章节标题 +backend/app/services/report_agent.py:2332: # 检查是否是标题行 +backend/app/services/report_agent.py:2339: # 检查是否是重复标题(在连续5行内出现相同内容的标题) +backend/app/services/report_agent.py:2351: # 跳过重复标题及其后的空行 +backend/app/services/report_agent.py:2357: # 标题层级处理: +backend/app/services/report_agent.py:2358: # - # (level=1) 只保留报告主标题 +backend/app/services/report_agent.py:2359: # - ## (level=2) 保留章节标题 +backend/app/services/report_agent.py:2360: # - ### 及以下 (level>=3) 转换为粗体文本 +backend/app/services/report_agent.py:2364: # 保留报告主标题 +backend/app/services/report_agent.py:2368: # 章节标题错误使用了#,修正为## +backend/app/services/report_agent.py:2372: # 其他一级标题转为粗体 +backend/app/services/report_agent.py:2378: # 保留章节标题 +backend/app/services/report_agent.py:2382: # 非章节的二级标题转为粗体 +backend/app/services/report_agent.py:2387: # ### 及以下级别的标题转换为粗体文本 +backend/app/services/report_agent.py:2396: # 跳过标题后紧跟的分隔线 +backend/app/services/report_agent.py:2401: # 标题后只保留一个空行 +backend/app/services/report_agent.py:2412: # 清理连续的多个空行(保留最多2个) +backend/app/services/report_agent.py:2428: """保存报告元信息和完整报告""" +backend/app/services/report_agent.py:2431: # 保存元信息JSON +backend/app/services/report_agent.py:2435: # 保存大纲 +backend/app/services/report_agent.py:2439: # 保存完整Markdown报告 +backend/app/services/report_agent.py:2448: """获取报告""" +backend/app/services/report_agent.py:2452: # 兼容旧格式:检查直接存储在reports目录下的文件 +backend/app/services/report_agent.py:2462: # 重建Report对象 +backend/app/services/report_agent.py:2478: # 如果markdown_content为空,尝试从full_report.md读取 +backend/app/services/report_agent.py:2501: """根据模拟ID获取报告""" +backend/app/services/report_agent.py:2506: # 新格式:文件夹 +backend/app/services/report_agent.py:2511: # 兼容旧格式:JSON文件 +backend/app/services/report_agent.py:2522: """列出报告""" +backend/app/services/report_agent.py:2528: 
# 新格式:文件夹 +backend/app/services/report_agent.py:2534: # 兼容旧格式:JSON文件 +backend/app/services/report_agent.py:2542: # 按创建时间倒序 +backend/app/services/report_agent.py:2549: """删除报告(整个文件夹)""" +backend/app/services/report_agent.py:2554: # 新格式:删除整个文件夹 +backend/app/services/report_agent.py:2560: # 兼容旧格式:删除单独的文件 +backend/app/services/simulation_config_generator.py:2:模拟配置智能生成器 +backend/app/services/simulation_config_generator.py:3:使用LLM根据模拟需求、文档内容、图谱信息自动生成细致的模拟参数 +backend/app/services/simulation_config_generator.py:4:实现全程自动化,无需人工设置参数 +backend/app/services/simulation_config_generator.py:6:采用分步生成策略,避免一次性生成过长内容导致失败: +backend/app/services/simulation_config_generator.py:7:1. 生成时间配置 +backend/app/services/simulation_config_generator.py:8:2. 生成事件配置 +backend/app/services/simulation_config_generator.py:9:3. 分批生成Agent配置 +backend/app/services/simulation_config_generator.py:10:4. 生成平台配置 +backend/app/services/simulation_config_generator.py:28:# 中国作息时间配置(北京时间) +backend/app/services/simulation_config_generator.py:30: # 深夜时段(几乎无人活动) +backend/app/services/simulation_config_generator.py:32: # 早间时段(逐渐醒来) +backend/app/services/simulation_config_generator.py:34: # 工作时段 +backend/app/services/simulation_config_generator.py:36: # 晚间高峰(最活跃) +backend/app/services/simulation_config_generator.py:38: # 夜间时段(活跃度下降) +backend/app/services/simulation_config_generator.py:40: # 活跃度系数 +backend/app/services/simulation_config_generator.py:42: "dead": 0.05, # 凌晨几乎无人 +backend/app/services/simulation_config_generator.py:43: "morning": 0.4, # 早间逐渐活跃 +backend/app/services/simulation_config_generator.py:44: "work": 0.7, # 工作时段中等 +backend/app/services/simulation_config_generator.py:45: "peak": 1.5, # 晚间高峰 +backend/app/services/simulation_config_generator.py:46: "night": 0.5 # 深夜下降 +backend/app/services/simulation_config_generator.py:53: """单个Agent的活动配置""" +backend/app/services/simulation_config_generator.py:59: # 活跃度配置 (0.0-1.0) +backend/app/services/simulation_config_generator.py:60: activity_level: float = 0.5 # 整体活跃度 
+backend/app/services/simulation_config_generator.py:62: # 发言频率(每小时预期发言次数) +backend/app/services/simulation_config_generator.py:66: # 活跃时间段(24小时制,0-23) +backend/app/services/simulation_config_generator.py:69: # 响应速度(对热点事件的反应延迟,单位:模拟分钟) +backend/app/services/simulation_config_generator.py:73: # 情感倾向 (-1.0到1.0,负面到正面) +backend/app/services/simulation_config_generator.py:76: # 立场(对特定话题的态度) +backend/app/services/simulation_config_generator.py:79: # 影响力权重(决定其发言被其他Agent看到的概率) +backend/app/services/simulation_config_generator.py:85: """时间模拟配置(基于中国人作息习惯)""" +backend/app/services/simulation_config_generator.py:86: # 模拟总时长(模拟小时数) +backend/app/services/simulation_config_generator.py:87: total_simulation_hours: int = 72 # 默认模拟72小时(3天) +backend/app/services/simulation_config_generator.py:89: # 每轮代表的时间(模拟分钟)- 默认60分钟(1小时),加快时间流速 +backend/app/services/simulation_config_generator.py:92: # 每小时激活的Agent数量范围 +backend/app/services/simulation_config_generator.py:96: # 高峰时段(晚间19-22点,中国人最活跃的时间) +backend/app/services/simulation_config_generator.py:100: # 低谷时段(凌晨0-5点,几乎无人活动) +backend/app/services/simulation_config_generator.py:102: off_peak_activity_multiplier: float = 0.05 # 凌晨活跃度极低 +backend/app/services/simulation_config_generator.py:104: # 早间时段 +backend/app/services/simulation_config_generator.py:108: # 工作时段 +backend/app/services/simulation_config_generator.py:115: """事件配置""" +backend/app/services/simulation_config_generator.py:116: # 初始事件(模拟开始时的触发事件) +backend/app/services/simulation_config_generator.py:119: # 定时事件(在特定时间触发的事件) +backend/app/services/simulation_config_generator.py:122: # 热点话题关键词 +backend/app/services/simulation_config_generator.py:125: # 舆论引导方向 +backend/app/services/simulation_config_generator.py:131: """平台特定配置""" +backend/app/services/simulation_config_generator.py:134: # 推荐算法权重 +backend/app/services/simulation_config_generator.py:135: recency_weight: float = 0.4 # 时间新鲜度 +backend/app/services/simulation_config_generator.py:136: popularity_weight: float = 0.3 # 热度 
+backend/app/services/simulation_config_generator.py:137: relevance_weight: float = 0.3 # 相关性 +backend/app/services/simulation_config_generator.py:139: # 病毒传播阈值(达到多少互动后触发扩散) +backend/app/services/simulation_config_generator.py:142: # 回声室效应强度(相似观点聚集程度) +backend/app/services/simulation_config_generator.py:148: """完整的模拟参数配置""" +backend/app/services/simulation_config_generator.py:149: # 基础信息 +backend/app/services/simulation_config_generator.py:155: # 时间配置 +backend/app/services/simulation_config_generator.py:158: # Agent配置列表 +backend/app/services/simulation_config_generator.py:161: # 事件配置 +backend/app/services/simulation_config_generator.py:164: # 平台配置 +backend/app/services/simulation_config_generator.py:168: # LLM配置 +backend/app/services/simulation_config_generator.py:172: # 生成元数据 +backend/app/services/simulation_config_generator.py:174: generation_reasoning: str = "" # LLM的推理说明 +backend/app/services/simulation_config_generator.py:177: """转换为字典""" +backend/app/services/simulation_config_generator.py:196: """转换为JSON字符串""" +backend/app/services/simulation_config_generator.py:202: 模拟配置智能生成器 +backend/app/services/simulation_config_generator.py:204: 使用LLM分析模拟需求、文档内容、图谱实体信息, +backend/app/services/simulation_config_generator.py:205: 自动生成最佳的模拟参数配置 +backend/app/services/simulation_config_generator.py:207: 采用分步生成策略: +backend/app/services/simulation_config_generator.py:208: 1. 生成时间配置和事件配置(轻量级) +backend/app/services/simulation_config_generator.py:209: 2. 分批生成Agent配置(每批10-20个) +backend/app/services/simulation_config_generator.py:210: 3. 
生成平台配置 +backend/app/services/simulation_config_generator.py:213: # 上下文最大字符数 +backend/app/services/simulation_config_generator.py:215: # 每批生成的Agent数量 +backend/app/services/simulation_config_generator.py:218: # 各步骤的上下文截断长度(字符数) +backend/app/services/simulation_config_generator.py:219: TIME_CONFIG_CONTEXT_LENGTH = 10000 # 时间配置 +backend/app/services/simulation_config_generator.py:220: EVENT_CONFIG_CONTEXT_LENGTH = 8000 # 事件配置 +backend/app/services/simulation_config_generator.py:221: ENTITY_SUMMARY_LENGTH = 300 # 实体摘要 +backend/app/services/simulation_config_generator.py:222: AGENT_SUMMARY_LENGTH = 300 # Agent配置中的实体摘要 +backend/app/services/simulation_config_generator.py:223: ENTITIES_PER_TYPE_DISPLAY = 20 # 每类实体显示数量 +backend/app/services/simulation_config_generator.py:236: raise ValueError("LLM_API_KEY 未配置") +backend/app/services/simulation_config_generator.py:256: 智能生成完整的模拟配置(分步生成) +backend/app/services/simulation_config_generator.py:259: simulation_id: 模拟ID +backend/app/services/simulation_config_generator.py:260: project_id: 项目ID +backend/app/services/simulation_config_generator.py:261: graph_id: 图谱ID +backend/app/services/simulation_config_generator.py:262: simulation_requirement: 模拟需求描述 +backend/app/services/simulation_config_generator.py:263: document_text: 原始文档内容 +backend/app/services/simulation_config_generator.py:264: entities: 过滤后的实体列表 +backend/app/services/simulation_config_generator.py:265: enable_twitter: 是否启用Twitter +backend/app/services/simulation_config_generator.py:266: enable_reddit: 是否启用Reddit +backend/app/services/simulation_config_generator.py:267: progress_callback: 进度回调函数(current_step, total_steps, message) +backend/app/services/simulation_config_generator.py:270: SimulationParameters: 完整的模拟参数 +backend/app/services/simulation_config_generator.py:274: # 计算总步骤数 +backend/app/services/simulation_config_generator.py:276: total_steps = 3 + num_batches # 时间配置 + 事件配置 + N批Agent + 平台配置 +backend/app/services/simulation_config_generator.py:286: # 1. 
构建基础上下文信息 +backend/app/services/simulation_config_generator.py:295: # ========== 步骤1: 生成时间配置 ========== +backend/app/services/simulation_config_generator.py:302: # ========== 步骤2: 生成事件配置 ========== +backend/app/services/simulation_config_generator.py:308: # ========== 步骤3-N: 分批生成Agent配置 ========== +backend/app/services/simulation_config_generator.py:330: # ========== 为初始帖子分配发布者 Agent ========== +backend/app/services/simulation_config_generator.py:336: # ========== 最后一步: 生成平台配置 ========== +backend/app/services/simulation_config_generator.py:361: # 构建最终参数 +backend/app/services/simulation_config_generator.py:387: """构建LLM上下文,截断到最大长度""" +backend/app/services/simulation_config_generator.py:389: # 实体摘要 +backend/app/services/simulation_config_generator.py:392: # 构建上下文 +backend/app/services/simulation_config_generator.py:399: remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # 留500字符余量 +backend/app/services/simulation_config_generator.py:410: """生成实体摘要""" +backend/app/services/simulation_config_generator.py:413: # 按类型分组 +backend/app/services/simulation_config_generator.py:423: # 使用配置的显示数量和摘要长度 +backend/app/services/simulation_config_generator.py:435: """带重试的LLM调用,包含JSON修复逻辑""" +backend/app/services/simulation_config_generator.py:450: temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 +backend/app/services/simulation_config_generator.py:451: # 不设置max_tokens,让LLM自由发挥 +backend/app/services/simulation_config_generator.py:457: # 检查是否被截断 +backend/app/services/simulation_config_generator.py:462: # 尝试解析JSON +backend/app/services/simulation_config_generator.py:468: # 尝试修复JSON +backend/app/services/simulation_config_generator.py:481: raise last_error or Exception("LLM调用失败") +backend/app/services/simulation_config_generator.py:484: """修复被截断的JSON""" +backend/app/services/simulation_config_generator.py:487: # 计算未闭合的括号 +backend/app/services/simulation_config_generator.py:491: # 检查是否有未闭合的字符串 +backend/app/services/simulation_config_generator.py:495: # 闭合括号 
+backend/app/services/simulation_config_generator.py:502: """尝试修复配置JSON""" +backend/app/services/simulation_config_generator.py:505: # 修复被截断的情况 +backend/app/services/simulation_config_generator.py:508: # 提取JSON部分 +backend/app/services/simulation_config_generator.py:513: # 移除字符串中的换行符 +backend/app/services/simulation_config_generator.py:525: # 尝试移除所有控制字符 +backend/app/services/simulation_config_generator.py:536: """生成时间配置""" +backend/app/services/simulation_config_generator.py:537: # 使用配置的上下文截断长度 +backend/app/services/simulation_config_generator.py:540: # 计算最大允许值(80%的agent数) +backend/app/services/simulation_config_generator.py:598: """获取默认时间配置(中国人作息)""" +backend/app/services/simulation_config_generator.py:601: "minutes_per_round": 60, # 每轮1小时,加快时间流速 +backend/app/services/simulation_config_generator.py:612: """解析时间配置结果,并验证agents_per_hour值不超过总agent数""" +backend/app/services/simulation_config_generator.py:613: # 获取原始值 +backend/app/services/simulation_config_generator.py:617: # 验证并修正:确保不超过总agent数 +backend/app/services/simulation_config_generator.py:626: # 确保 min < max +backend/app/services/simulation_config_generator.py:633: minutes_per_round=result.get("minutes_per_round", 60), # 默认每轮1小时 +backend/app/services/simulation_config_generator.py:638: off_peak_activity_multiplier=0.05, # 凌晨几乎无人 +backend/app/services/simulation_config_generator.py:652: """生成事件配置""" +backend/app/services/simulation_config_generator.py:654: # 获取可用的实体类型列表,供 LLM 参考 +backend/app/services/simulation_config_generator.py:659: # 为每种类型列出代表性实体名称 +backend/app/services/simulation_config_generator.py:673: # 使用配置的上下文截断长度 +backend/app/services/simulation_config_generator.py:720: """解析事件配置结果""" +backend/app/services/simulation_config_generator.py:734: 为初始帖子分配合适的发布者 Agent +backend/app/services/simulation_config_generator.py:736: 根据每个帖子的 poster_type 匹配最合适的 agent_id +backend/app/services/simulation_config_generator.py:741: # 按实体类型建立 agent 索引 +backend/app/services/simulation_config_generator.py:749: # 类型映射表(处理 LLM 
可能输出的不同格式) +backend/app/services/simulation_config_generator.py:761: # 记录每种类型已使用的 agent 索引,避免重复使用同一个 agent +backend/app/services/simulation_config_generator.py:769: # 尝试找到匹配的 agent +backend/app/services/simulation_config_generator.py:772: # 1. 直接匹配 +backend/app/services/simulation_config_generator.py:779: # 2. 使用别名匹配 +backend/app/services/simulation_config_generator.py:792: # 3. 如果仍未找到,使用影响力最高的 agent +backend/app/services/simulation_config_generator.py:796: # 按影响力排序,选择影响力最高的 +backend/app/services/simulation_config_generator.py:820: """分批生成Agent配置""" +backend/app/services/simulation_config_generator.py:822: # 构建实体信息(使用配置的摘要长度) +backend/app/services/simulation_config_generator.py:879: # 构建AgentActivityConfig对象 +backend/app/services/simulation_config_generator.py:885: # 如果LLM没有生成,使用规则生成 +backend/app/services/simulation_config_generator.py:909: """基于规则生成单个Agent配置(中国人作息)""" +backend/app/services/simulation_config_generator.py:913: # 官方机构:工作时间活动,低频率,高影响力 +backend/app/services/simulation_config_generator.py:926: # 媒体:全天活动,中等频率,高影响力 +backend/app/services/simulation_config_generator.py:939: # 专家/教授:工作+晚间活动,中等频率 +backend/app/services/simulation_config_generator.py:952: # 学生:晚间为主,高频率 +backend/app/services/simulation_config_generator.py:957: "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 上午+晚间 +backend/app/services/simulation_config_generator.py:965: # 校友:晚间为主 +backend/app/services/simulation_config_generator.py:970: "active_hours": [12, 13, 19, 20, 21, 22, 23], # 午休+晚间 +backend/app/services/simulation_config_generator.py:978: # 普通人:晚间高峰 +backend/app/services/simulation_config_generator.py:983: "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 白天+晚间 +backend/app/services/simulation_ipc.py:2:模拟IPC通信模块 +backend/app/services/simulation_ipc.py:3:用于Flask后端和模拟脚本之间的进程间通信 +backend/app/services/simulation_ipc.py:5:通过文件系统实现简单的命令/响应模式: +backend/app/services/simulation_ipc.py:6:1. Flask写入命令到 commands/ 目录 +backend/app/services/simulation_ipc.py:7:2. 
模拟脚本轮询命令目录,执行命令并写入响应到 responses/ 目录 +backend/app/services/simulation_ipc.py:8:3. Flask轮询响应目录获取结果 +backend/app/services/simulation_ipc.py:27: """命令类型""" +backend/app/services/simulation_ipc.py:28: INTERVIEW = "interview" # 单个Agent采访 +backend/app/services/simulation_ipc.py:29: BATCH_INTERVIEW = "batch_interview" # 批量采访 +backend/app/services/simulation_ipc.py:30: CLOSE_ENV = "close_env" # 关闭环境 +backend/app/services/simulation_ipc.py:34: """命令状态""" +backend/app/services/simulation_ipc.py:43: """IPC命令""" +backend/app/services/simulation_ipc.py:69: """IPC响应""" +backend/app/services/simulation_ipc.py:98: 模拟IPC客户端(Flask端使用) +backend/app/services/simulation_ipc.py:100: 用于向模拟进程发送命令并等待响应 +backend/app/services/simulation_ipc.py:105: 初始化IPC客户端 +backend/app/services/simulation_ipc.py:108: simulation_dir: 模拟数据目录 +backend/app/services/simulation_ipc.py:114: # 确保目录存在 +backend/app/services/simulation_ipc.py:126: 发送命令并等待响应 +backend/app/services/simulation_ipc.py:129: command_type: 命令类型 +backend/app/services/simulation_ipc.py:130: args: 命令参数 +backend/app/services/simulation_ipc.py:131: timeout: 超时时间(秒) +backend/app/services/simulation_ipc.py:132: poll_interval: 轮询间隔(秒) +backend/app/services/simulation_ipc.py:138: TimeoutError: 等待响应超时 +backend/app/services/simulation_ipc.py:147: # 写入命令文件 +backend/app/services/simulation_ipc.py:154: # 等待响应 +backend/app/services/simulation_ipc.py:165: # 清理命令和响应文件 +backend/app/services/simulation_ipc.py:179: # 超时 +backend/app/services/simulation_ipc.py:182: # 清理命令文件 +backend/app/services/simulation_ipc.py:188: raise TimeoutError(f"等待命令响应超时 ({timeout}秒)") +backend/app/services/simulation_ipc.py:198: 发送单个Agent采访命令 +backend/app/services/simulation_ipc.py:202: prompt: 采访问题 +backend/app/services/simulation_ipc.py:203: platform: 指定平台(可选) +backend/app/services/simulation_ipc.py:204: - "twitter": 只采访Twitter平台 +backend/app/services/simulation_ipc.py:205: - "reddit": 只采访Reddit平台 +backend/app/services/simulation_ipc.py:206: - None: 双平台模拟时同时采访两个平台,单平台模拟时采访该平台 
+backend/app/services/simulation_ipc.py:207: timeout: 超时时间 +backend/app/services/simulation_ipc.py:210: IPCResponse,result字段包含采访结果 +backend/app/services/simulation_ipc.py:232: 发送批量采访命令 +backend/app/services/simulation_ipc.py:235: interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} +backend/app/services/simulation_ipc.py:236: platform: 默认平台(可选,会被每个采访项的platform覆盖) +backend/app/services/simulation_ipc.py:237: - "twitter": 默认只采访Twitter平台 +backend/app/services/simulation_ipc.py:238: - "reddit": 默认只采访Reddit平台 +backend/app/services/simulation_ipc.py:239: - None: 双平台模拟时每个Agent同时采访两个平台 +backend/app/services/simulation_ipc.py:240: timeout: 超时时间 +backend/app/services/simulation_ipc.py:243: IPCResponse,result字段包含所有采访结果 +backend/app/services/simulation_ipc.py:257: 发送关闭环境命令 +backend/app/services/simulation_ipc.py:260: timeout: 超时时间 +backend/app/services/simulation_ipc.py:273: 检查模拟环境是否存活 +backend/app/services/simulation_ipc.py:275: 通过检查 env_status.json 文件来判断 +backend/app/services/simulation_ipc.py:291: 模拟IPC服务器(模拟脚本端使用) +backend/app/services/simulation_ipc.py:293: 轮询命令目录,执行命令并返回响应 +backend/app/services/simulation_ipc.py:298: 初始化IPC服务器 +backend/app/services/simulation_ipc.py:301: simulation_dir: 模拟数据目录 +backend/app/services/simulation_ipc.py:307: # 确保目录存在 +backend/app/services/simulation_ipc.py:311: # 环境状态 +backend/app/services/simulation_ipc.py:315: """标记服务器为运行状态""" +backend/app/services/simulation_ipc.py:320: """标记服务器为停止状态""" +backend/app/services/simulation_ipc.py:325: """更新环境状态文件""" +backend/app/services/simulation_ipc.py:335: 轮询命令目录,返回第一个待处理的命令 +backend/app/services/simulation_ipc.py:338: IPCCommand 或 None +backend/app/services/simulation_ipc.py:343: # 按时间排序获取命令文件 +backend/app/services/simulation_ipc.py:365: 发送响应 +backend/app/services/simulation_ipc.py:368: response: IPC响应 +backend/app/services/simulation_ipc.py:374: # 删除命令文件 +backend/app/services/simulation_ipc.py:382: """发送成功响应""" +backend/app/services/simulation_ipc.py:390: """发送错误响应""" 
+backend/app/services/simulation_manager.py:2:OASIS模拟管理器 +backend/app/services/simulation_manager.py:3:管理Twitter和Reddit双平台并行模拟 +backend/app/services/simulation_manager.py:4:使用预设脚本 + LLM智能生成配置参数 +backend/app/services/simulation_manager.py:26: """模拟状态""" +backend/app/services/simulation_manager.py:32: STOPPED = "stopped" # 模拟被手动停止 +backend/app/services/simulation_manager.py:33: COMPLETED = "completed" # 模拟自然完成 +backend/app/services/simulation_manager.py:38: """平台类型""" +backend/app/services/simulation_manager.py:45: """模拟状态""" +backend/app/services/simulation_manager.py:50: # 平台启用状态 +backend/app/services/simulation_manager.py:54: # 状态 +backend/app/services/simulation_manager.py:57: # 准备阶段数据 +backend/app/services/simulation_manager.py:62: # 配置生成信息 +backend/app/services/simulation_manager.py:66: # 运行时数据 +backend/app/services/simulation_manager.py:71: # 时间戳 +backend/app/services/simulation_manager.py:75: # 错误信息 +backend/app/services/simulation_manager.py:79: """完整状态字典(内部使用)""" +backend/app/services/simulation_manager.py:101: """简化状态字典(API返回使用)""" +backend/app/services/simulation_manager.py:117: 模拟管理器 +backend/app/services/simulation_manager.py:119: 核心功能: +backend/app/services/simulation_manager.py:120: 1. 从Zep图谱读取实体并过滤 +backend/app/services/simulation_manager.py:121: 2. 生成OASIS Agent Profile +backend/app/services/simulation_manager.py:122: 3. 使用LLM智能生成模拟配置参数 +backend/app/services/simulation_manager.py:123: 4. 
准备预设脚本所需的所有文件 +backend/app/services/simulation_manager.py:126: # 模拟数据存储目录 +backend/app/services/simulation_manager.py:133: # 确保目录存在 +backend/app/services/simulation_manager.py:136: # 内存中的模拟状态缓存 +backend/app/services/simulation_manager.py:140: """获取模拟数据目录""" +backend/app/services/simulation_manager.py:146: """保存模拟状态到文件""" +backend/app/services/simulation_manager.py:158: """从文件加载模拟状态""" +backend/app/services/simulation_manager.py:202: 创建新的模拟 +backend/app/services/simulation_manager.py:205: project_id: 项目ID +backend/app/services/simulation_manager.py:206: graph_id: Zep图谱ID +backend/app/services/simulation_manager.py:207: enable_twitter: 是否启用Twitter模拟 +backend/app/services/simulation_manager.py:208: enable_reddit: 是否启用Reddit模拟 +backend/app/services/simulation_manager.py:241: 准备模拟环境(全程自动化) +backend/app/services/simulation_manager.py:243: 步骤: +backend/app/services/simulation_manager.py:244: 1. 从Zep图谱读取并过滤实体 +backend/app/services/simulation_manager.py:245: 2. 为每个实体生成OASIS Agent Profile(可选LLM增强,支持并行) +backend/app/services/simulation_manager.py:246: 3. 使用LLM智能生成模拟配置参数(时间、活跃度、发言频率等) +backend/app/services/simulation_manager.py:247: 4. 保存配置文件和Profile文件 +backend/app/services/simulation_manager.py:248: 5. 
复制预设脚本到模拟目录 +backend/app/services/simulation_manager.py:251: simulation_id: 模拟ID +backend/app/services/simulation_manager.py:252: simulation_requirement: 模拟需求描述(用于LLM生成配置) +backend/app/services/simulation_manager.py:253: document_text: 原始文档内容(用于LLM理解背景) +backend/app/services/simulation_manager.py:254: defined_entity_types: 预定义的实体类型(可选) +backend/app/services/simulation_manager.py:255: use_llm_for_profiles: 是否使用LLM生成详细人设 +backend/app/services/simulation_manager.py:256: progress_callback: 进度回调函数 (stage, progress, message) +backend/app/services/simulation_manager.py:257: parallel_profile_count: 并行生成人设的数量,默认3 +backend/app/services/simulation_manager.py:264: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_manager.py:272: # ========== 阶段1: 读取并过滤实体 ========== +backend/app/services/simulation_manager.py:300: state.error = "没有找到符合条件的实体,请检查图谱是否正确构建" +backend/app/services/simulation_manager.py:304: # ========== 阶段2: 生成Agent Profile ========== +backend/app/services/simulation_manager.py:315: # 传入graph_id以启用Zep检索功能,获取更丰富的上下文 +backend/app/services/simulation_manager.py:329: # 设置实时保存的文件路径(优先使用 Reddit JSON 格式) +backend/app/services/simulation_manager.py:343: graph_id=state.graph_id, # 传入graph_id用于Zep检索 +backend/app/services/simulation_manager.py:344: parallel_count=parallel_profile_count, # 并行生成数量 +backend/app/services/simulation_manager.py:345: realtime_output_path=realtime_output_path, # 实时保存路径 +backend/app/services/simulation_manager.py:346: output_platform=realtime_platform # 输出格式 +backend/app/services/simulation_manager.py:351: # 保存Profile文件(注意:Twitter使用CSV格式,Reddit使用JSON格式) +backend/app/services/simulation_manager.py:352: # Reddit 已经在生成过程中实时保存了,这里再保存一次确保完整性 +backend/app/services/simulation_manager.py:369: # Twitter使用CSV格式!这是OASIS的要求 +backend/app/services/simulation_manager.py:384: # ========== 阶段3: LLM智能生成模拟配置 ========== +backend/app/services/simulation_manager.py:422: # 保存配置文件 +backend/app/services/simulation_manager.py:438: # 注意:运行脚本保留在 
backend/scripts/ 目录,不再复制到模拟目录 +backend/app/services/simulation_manager.py:439: # 启动模拟时,simulation_runner 会从 scripts/ 目录运行脚本 +backend/app/services/simulation_manager.py:441: # 更新状态 +backend/app/services/simulation_manager.py:459: """获取模拟状态""" +backend/app/services/simulation_manager.py:463: """列出所有模拟""" +backend/app/services/simulation_manager.py:468: # 跳过隐藏文件(如 .DS_Store)和非目录文件 +backend/app/services/simulation_manager.py:481: """获取模拟的Agent Profile""" +backend/app/services/simulation_manager.py:484: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_manager.py:496: """获取模拟配置""" +backend/app/services/simulation_manager.py:507: """获取运行说明""" +backend/app/services/simulation_manager.py:522: f"1. 激活conda环境: conda activate MiroFish\n" +backend/app/services/simulation_manager.py:523: f"2. 运行模拟 (脚本位于 {scripts_dir}):\n" +backend/app/services/simulation_manager.py:524: f" - 单独运行Twitter: python {scripts_dir}/run_twitter_simulation.py --config {config_path}\n" +backend/app/services/simulation_manager.py:525: f" - 单独运行Reddit: python {scripts_dir}/run_reddit_simulation.py --config {config_path}\n" +backend/app/services/simulation_manager.py:526: f" - 并行运行双平台: python {scripts_dir}/run_parallel_simulation.py --config {config_path}" +backend/app/services/simulation_runner.py:2:OASIS模拟运行器 +backend/app/services/simulation_runner.py:3:在后台运行模拟并记录每个Agent的动作,支持实时状态监控 +backend/app/services/simulation_runner.py:29:# 标记是否已注册清理函数 +backend/app/services/simulation_runner.py:32:# 平台检测 +backend/app/services/simulation_runner.py:37: """运行器状态""" +backend/app/services/simulation_runner.py:50: """Agent动作记录""" +backend/app/services/simulation_runner.py:77: """每轮摘要""" +backend/app/services/simulation_runner.py:103: """模拟运行状态(实时)""" +backend/app/services/simulation_runner.py:107: # 进度信息 +backend/app/services/simulation_runner.py:113: # 各平台独立轮次和模拟时间(用于双平台并行显示) +backend/app/services/simulation_runner.py:119: # 平台状态 +backend/app/services/simulation_runner.py:125: # 平台完成状态(通过检测 
actions.jsonl 中的 simulation_end 事件) +backend/app/services/simulation_runner.py:129: # 每轮摘要 +backend/app/services/simulation_runner.py:132: # 最近动作(用于前端实时展示) +backend/app/services/simulation_runner.py:136: # 时间戳 +backend/app/services/simulation_runner.py:141: # 错误信息 +backend/app/services/simulation_runner.py:144: # 进程ID(用于停止) +backend/app/services/simulation_runner.py:148: """添加动作到最近动作列表""" +backend/app/services/simulation_runner.py:169: # 各平台独立轮次和时间 +backend/app/services/simulation_runner.py:189: """包含最近动作的详细信息""" +backend/app/services/simulation_runner.py:198: 模拟运行器 +backend/app/services/simulation_runner.py:200: 负责: +backend/app/services/simulation_runner.py:201: 1. 在后台进程中运行OASIS模拟 +backend/app/services/simulation_runner.py:202: 2. 解析运行日志,记录每个Agent的动作 +backend/app/services/simulation_runner.py:203: 3. 提供实时状态查询接口 +backend/app/services/simulation_runner.py:204: 4. 支持暂停/停止/恢复操作 +backend/app/services/simulation_runner.py:207: # 运行状态存储目录 +backend/app/services/simulation_runner.py:213: # 脚本目录 +backend/app/services/simulation_runner.py:219: # 内存中的运行状态 +backend/app/services/simulation_runner.py:224: _stdout_files: Dict[str, Any] = {} # 存储 stdout 文件句柄 +backend/app/services/simulation_runner.py:225: _stderr_files: Dict[str, Any] = {} # 存储 stderr 文件句柄 +backend/app/services/simulation_runner.py:227: # 图谱记忆更新配置 +backend/app/services/simulation_runner.py:232: """获取运行状态""" +backend/app/services/simulation_runner.py:236: # 尝试从文件加载 +backend/app/services/simulation_runner.py:244: """从文件加载运行状态""" +backend/app/services/simulation_runner.py:260: # 各平台独立轮次和时间 +backend/app/services/simulation_runner.py:278: # 加载最近动作 +backend/app/services/simulation_runner.py:300: """保存运行状态到文件""" +backend/app/services/simulation_runner.py:317: max_rounds: int = None, # 最大模拟轮数(可选,用于截断过长的模拟) +backend/app/services/simulation_runner.py:318: enable_graph_memory_update: bool = False, # 是否将活动更新到Zep图谱 +backend/app/services/simulation_runner.py:319: graph_id: str = None # Zep图谱ID(启用图谱更新时必需) 
+backend/app/services/simulation_runner.py:322: 启动模拟 +backend/app/services/simulation_runner.py:325: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:326: platform: 运行平台 (twitter/reddit/parallel) +backend/app/services/simulation_runner.py:327: max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) +backend/app/services/simulation_runner.py:328: enable_graph_memory_update: 是否将Agent活动动态更新到Zep图谱 +backend/app/services/simulation_runner.py:329: graph_id: Zep图谱ID(启用图谱更新时必需) +backend/app/services/simulation_runner.py:334: # 检查是否已在运行 +backend/app/services/simulation_runner.py:337: raise ValueError(f"模拟已在运行中: {simulation_id}") +backend/app/services/simulation_runner.py:339: # 加载模拟配置 +backend/app/services/simulation_runner.py:344: raise ValueError(f"模拟配置不存在,请先调用 /prepare 接口") +backend/app/services/simulation_runner.py:349: # 初始化运行状态 +backend/app/services/simulation_runner.py:355: # 如果指定了最大轮数,则截断 +backend/app/services/simulation_runner.py:372: # 如果启用图谱记忆更新,创建更新器 +backend/app/services/simulation_runner.py:375: raise ValueError("启用图谱记忆更新时必须提供 graph_id") +backend/app/services/simulation_runner.py:387: # 确定运行哪个脚本(脚本位于 backend/scripts/ 目录) +backend/app/services/simulation_runner.py:402: raise ValueError(f"脚本不存在: {script_path}") +backend/app/services/simulation_runner.py:404: # 创建动作队列 +backend/app/services/simulation_runner.py:408: # 启动模拟进程 +backend/app/services/simulation_runner.py:410: # 构建运行命令,使用完整路径 +backend/app/services/simulation_runner.py:411: # 新的日志结构: +backend/app/services/simulation_runner.py:412: # twitter/actions.jsonl - Twitter 动作日志 +backend/app/services/simulation_runner.py:413: # reddit/actions.jsonl - Reddit 动作日志 +backend/app/services/simulation_runner.py:414: # simulation.log - 主进程日志 +backend/app/services/simulation_runner.py:417: sys.executable, # Python解释器 +backend/app/services/simulation_runner.py:419: "--config", config_path, # 使用完整配置文件路径 +backend/app/services/simulation_runner.py:422: # 如果指定了最大轮数,添加到命令行参数 +backend/app/services/simulation_runner.py:426: # 创建主日志文件,避免 
stdout/stderr 管道缓冲区满导致进程阻塞 +backend/app/services/simulation_runner.py:430: # 设置子进程环境变量,确保 Windows 上使用 UTF-8 编码 +backend/app/services/simulation_runner.py:431: # 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 +backend/app/services/simulation_runner.py:433: env['PYTHONUTF8'] = '1' # Python 3.7+ 支持,让所有 open() 默认使用 UTF-8 +backend/app/services/simulation_runner.py:434: env['PYTHONIOENCODING'] = 'utf-8' # 确保 stdout/stderr 使用 UTF-8 +backend/app/services/simulation_runner.py:436: # 设置工作目录为模拟目录(数据库等文件会生成在此) +backend/app/services/simulation_runner.py:437: # 使用 start_new_session=True 创建新的进程组,确保可以通过 os.killpg 终止所有子进程 +backend/app/services/simulation_runner.py:442: stderr=subprocess.STDOUT, # stderr 也写入同一个文件 +backend/app/services/simulation_runner.py:444: encoding='utf-8', # 显式指定编码 +backend/app/services/simulation_runner.py:446: env=env, # 传递带有 UTF-8 设置的环境变量 +backend/app/services/simulation_runner.py:447: start_new_session=True, # 创建新进程组,确保服务器关闭时能终止所有相关进程 +backend/app/services/simulation_runner.py:450: # 保存文件句柄以便后续关闭 +backend/app/services/simulation_runner.py:452: cls._stderr_files[simulation_id] = None # 不再需要单独的 stderr +backend/app/services/simulation_runner.py:462: # 启动监控线程 +backend/app/services/simulation_runner.py:483: """监控模拟进程,解析动作日志""" +backend/app/services/simulation_runner.py:487: # 新的日志结构:分平台的动作日志 +backend/app/services/simulation_runner.py:501: while process.poll() is None: # 进程仍在运行 +backend/app/services/simulation_runner.py:502: # 读取 Twitter 动作日志 +backend/app/services/simulation_runner.py:508: # 读取 Reddit 动作日志 +backend/app/services/simulation_runner.py:514: # 更新状态 +backend/app/services/simulation_runner.py:518: # 进程结束后,最后读取一次日志 +backend/app/services/simulation_runner.py:524: # 进程结束 +backend/app/services/simulation_runner.py:533: # 从主日志文件读取错误信息 +backend/app/services/simulation_runner.py:539: error_info = f.read()[-2000:] # 取最后2000字符 +backend/app/services/simulation_runner.py:542: state.error = f"进程退出码: {exit_code}, 错误: {error_info}" +backend/app/services/simulation_runner.py:556: # 
停止图谱记忆更新器 +backend/app/services/simulation_runner.py:565: # 清理进程资源 +backend/app/services/simulation_runner.py:569: # 关闭日志文件句柄 +backend/app/services/simulation_runner.py:592: 读取动作日志文件 +backend/app/services/simulation_runner.py:595: log_path: 日志文件路径 +backend/app/services/simulation_runner.py:596: position: 上次读取位置 +backend/app/services/simulation_runner.py:597: state: 运行状态对象 +backend/app/services/simulation_runner.py:598: platform: 平台名称 (twitter/reddit) +backend/app/services/simulation_runner.py:601: 新的读取位置 +backend/app/services/simulation_runner.py:603: # 检查是否启用了图谱记忆更新 +backend/app/services/simulation_runner.py:618: # 处理事件类型的条目 +backend/app/services/simulation_runner.py:622: # 检测 simulation_end 事件,标记平台已完成 +backend/app/services/simulation_runner.py:633: # 检查是否所有启用的平台都已完成 +backend/app/services/simulation_runner.py:634: # 如果只运行了一个平台,只检查那个平台 +backend/app/services/simulation_runner.py:635: # 如果运行了两个平台,需要两个都完成 +backend/app/services/simulation_runner.py:642: # 更新轮次信息(从 round_end 事件) +backend/app/services/simulation_runner.py:647: # 更新各平台独立的轮次和时间 +backend/app/services/simulation_runner.py:657: # 总体轮次取两个平台的最大值 +backend/app/services/simulation_runner.py:660: # 总体时间取两个平台的最大值 +backend/app/services/simulation_runner.py:678: # 更新轮次 +backend/app/services/simulation_runner.py:682: # 如果启用了图谱记忆更新,将活动发送到Zep +backend/app/services/simulation_runner.py:696: 检查所有启用的平台是否都已完成模拟 +backend/app/services/simulation_runner.py:698: 通过检查对应的 actions.jsonl 文件是否存在来判断平台是否被启用 +backend/app/services/simulation_runner.py:701: True 如果所有启用的平台都已完成 +backend/app/services/simulation_runner.py:707: # 检查哪些平台被启用(通过文件是否存在判断) +backend/app/services/simulation_runner.py:711: # 如果平台被启用但未完成,则返回 False +backend/app/services/simulation_runner.py:717: # 至少有一个平台被启用且已完成 +backend/app/services/simulation_runner.py:723: 跨平台终止进程及其子进程 +backend/app/services/simulation_runner.py:726: process: 要终止的进程 +backend/app/services/simulation_runner.py:727: simulation_id: 模拟ID(用于日志) +backend/app/services/simulation_runner.py:728: timeout: 
等待进程退出的超时时间(秒) +backend/app/services/simulation_runner.py:731: # Windows: 使用 taskkill 命令终止进程树 +backend/app/services/simulation_runner.py:732: # /F = 强制终止, /T = 终止进程树(包括子进程) +backend/app/services/simulation_runner.py:735: # 先尝试优雅终止 +backend/app/services/simulation_runner.py:744: # 强制终止 +backend/app/services/simulation_runner.py:760: # Unix: 使用进程组终止 +backend/app/services/simulation_runner.py:761: # 由于使用了 start_new_session=True,进程组 ID 等于主进程 PID +backend/app/services/simulation_runner.py:765: # 先发送 SIGTERM 给整个进程组 +backend/app/services/simulation_runner.py:771: # 如果超时后还没结束,强制发送 SIGKILL +backend/app/services/simulation_runner.py:778: """停止模拟""" +backend/app/services/simulation_runner.py:781: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_runner.py:784: raise ValueError(f"模拟未在运行: {simulation_id}, status={state.runner_status}") +backend/app/services/simulation_runner.py:789: # 终止进程 +backend/app/services/simulation_runner.py:795: # 进程已经不存在 +backend/app/services/simulation_runner.py:799: # 回退到直接终止进程 +backend/app/services/simulation_runner.py:812: # 停止图谱记忆更新器 +backend/app/services/simulation_runner.py:834: 从单个动作文件中读取动作 +backend/app/services/simulation_runner.py:837: file_path: 动作日志文件路径 +backend/app/services/simulation_runner.py:838: default_platform: 默认平台(当动作记录中没有 platform 字段时使用) +backend/app/services/simulation_runner.py:839: platform_filter: 过滤平台 +backend/app/services/simulation_runner.py:840: agent_id: 过滤 Agent ID +backend/app/services/simulation_runner.py:841: round_num: 过滤轮次 +backend/app/services/simulation_runner.py:857: # 跳过非动作记录(如 simulation_start, round_start, round_end 等事件) +backend/app/services/simulation_runner.py:861: # 跳过没有 agent_id 的记录(非 Agent 动作) +backend/app/services/simulation_runner.py:865: # 获取平台:优先使用记录中的 platform,否则使用默认平台 +backend/app/services/simulation_runner.py:868: # 过滤 +backend/app/services/simulation_runner.py:902: 获取所有平台的完整动作历史(无分页限制) +backend/app/services/simulation_runner.py:905: simulation_id: 模拟ID 
+backend/app/services/simulation_runner.py:906: platform: 过滤平台(twitter/reddit) +backend/app/services/simulation_runner.py:907: agent_id: 过滤Agent +backend/app/services/simulation_runner.py:908: round_num: 过滤轮次 +backend/app/services/simulation_runner.py:911: 完整的动作列表(按时间戳排序,新的在前) +backend/app/services/simulation_runner.py:916: # 读取 Twitter 动作文件(根据文件路径自动设置 platform 为 twitter) +backend/app/services/simulation_runner.py:921: default_platform="twitter", # 自动填充 platform 字段 +backend/app/services/simulation_runner.py:927: # 读取 Reddit 动作文件(根据文件路径自动设置 platform 为 reddit) +backend/app/services/simulation_runner.py:932: default_platform="reddit", # 自动填充 platform 字段 +backend/app/services/simulation_runner.py:938: # 如果分平台文件不存在,尝试读取旧的单一文件格式 +backend/app/services/simulation_runner.py:943: default_platform=None, # 旧格式文件中应该有 platform 字段 +backend/app/services/simulation_runner.py:949: # 按时间戳排序(新的在前) +backend/app/services/simulation_runner.py:965: 获取动作历史(带分页) +backend/app/services/simulation_runner.py:968: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:969: limit: 返回数量限制 +backend/app/services/simulation_runner.py:970: offset: 偏移量 +backend/app/services/simulation_runner.py:971: platform: 过滤平台 +backend/app/services/simulation_runner.py:972: agent_id: 过滤Agent +backend/app/services/simulation_runner.py:973: round_num: 过滤轮次 +backend/app/services/simulation_runner.py:976: 动作列表 +backend/app/services/simulation_runner.py:985: # 分页 +backend/app/services/simulation_runner.py:996: 获取模拟时间线(按轮次汇总) +backend/app/services/simulation_runner.py:999: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1000: start_round: 起始轮次 +backend/app/services/simulation_runner.py:1001: end_round: 结束轮次 +backend/app/services/simulation_runner.py:1004: 每轮的汇总信息 +backend/app/services/simulation_runner.py:1008: # 按轮次分组 +backend/app/services/simulation_runner.py:1041: # 转换为列表 +backend/app/services/simulation_runner.py:1062: 获取每个Agent的统计信息 +backend/app/services/simulation_runner.py:1065: Agent统计列表 
+backend/app/services/simulation_runner.py:1097: # 按总动作数排序 +backend/app/services/simulation_runner.py:1105: 清理模拟的运行日志(用于强制重新开始模拟) +backend/app/services/simulation_runner.py:1107: 会删除以下文件: +backend/app/services/simulation_runner.py:1113: - twitter_simulation.db(模拟数据库) +backend/app/services/simulation_runner.py:1114: - reddit_simulation.db(模拟数据库) +backend/app/services/simulation_runner.py:1115: - env_status.json(环境状态) +backend/app/services/simulation_runner.py:1117: 注意:不会删除配置文件(simulation_config.json)和 profile 文件 +backend/app/services/simulation_runner.py:1120: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1123: 清理结果信息 +backend/app/services/simulation_runner.py:1130: return {"success": True, "message": "模拟目录不存在,无需清理"} +backend/app/services/simulation_runner.py:1135: # 要删除的文件列表(包括数据库文件) +backend/app/services/simulation_runner.py:1141: "twitter_simulation.db", # Twitter 平台数据库 +backend/app/services/simulation_runner.py:1142: "reddit_simulation.db", # Reddit 平台数据库 +backend/app/services/simulation_runner.py:1143: "env_status.json", # 环境状态文件 +backend/app/services/simulation_runner.py:1146: # 要删除的目录列表(包含动作日志) +backend/app/services/simulation_runner.py:1149: # 删除文件 +backend/app/services/simulation_runner.py:1157: errors.append(f"删除 {filename} 失败: {str(e)}") +backend/app/services/simulation_runner.py:1159: # 清理平台目录中的动作日志 +backend/app/services/simulation_runner.py:1169: errors.append(f"删除 {dir_name}/actions.jsonl 失败: {str(e)}") +backend/app/services/simulation_runner.py:1171: # 清理内存中的运行状态 +backend/app/services/simulation_runner.py:1183: # 防止重复清理的标志 +backend/app/services/simulation_runner.py:1189: 清理所有运行中的模拟进程 +backend/app/services/simulation_runner.py:1191: 在服务器关闭时调用,确保所有子进程被终止 +backend/app/services/simulation_runner.py:1193: # 防止重复清理 +backend/app/services/simulation_runner.py:1198: # 检查是否有内容需要清理(避免空进程的进程打印无用日志) +backend/app/services/simulation_runner.py:1203: return # 没有需要清理的内容,静默返回 +backend/app/services/simulation_runner.py:1207: # 首先停止所有图谱记忆更新器(stop_all 
内部会打印日志) +backend/app/services/simulation_runner.py:1214: # 复制字典以避免在迭代时修改 +backend/app/services/simulation_runner.py:1219: if process.poll() is None: # 进程仍在运行 +backend/app/services/simulation_runner.py:1223: # 使用跨平台的进程终止方法 +backend/app/services/simulation_runner.py:1226: # 进程可能已经不存在,尝试直接终止 +backend/app/services/simulation_runner.py:1233: # 更新 run_state.json +backend/app/services/simulation_runner.py:1240: state.error = "服务器关闭,模拟被终止" +backend/app/services/simulation_runner.py:1243: # 同时更新 state.json,将状态设为 stopped +backend/app/services/simulation_runner.py:1264: # 清理文件句柄 +backend/app/services/simulation_runner.py:1281: # 清理内存中的状态 +backend/app/services/simulation_runner.py:1290: 注册清理函数 +backend/app/services/simulation_runner.py:1292: 在 Flask 应用启动时调用,确保服务器关闭时清理所有模拟进程 +backend/app/services/simulation_runner.py:1299: # Flask debug 模式下,只在 reloader 子进程中注册清理(实际运行应用的进程) +backend/app/services/simulation_runner.py:1300: # WERKZEUG_RUN_MAIN=true 表示是 reloader 子进程 +backend/app/services/simulation_runner.py:1301: # 如果不是 debug 模式,则没有这个环境变量,也需要注册 +backend/app/services/simulation_runner.py:1305: # 在 debug 模式下,只在 reloader 子进程中注册;非 debug 模式下始终注册 +backend/app/services/simulation_runner.py:1307: _cleanup_registered = True # 标记已注册,防止子进程再次尝试 +backend/app/services/simulation_runner.py:1310: # 保存原有的信号处理器 +backend/app/services/simulation_runner.py:1313: # SIGHUP 只在 Unix 系统存在(macOS/Linux),Windows 没有 +backend/app/services/simulation_runner.py:1320: """信号处理器:先清理模拟进程,再调用原处理器""" +backend/app/services/simulation_runner.py:1321: # 只有在有进程需要清理时才打印日志 +backend/app/services/simulation_runner.py:1326: # 调用原有的信号处理器,让 Flask 正常退出 +backend/app/services/simulation_runner.py:1332: # SIGHUP: 终端关闭时发送 +backend/app/services/simulation_runner.py:1336: # 默认行为:正常退出 +backend/app/services/simulation_runner.py:1339: # 如果原处理器不可调用(如 SIG_DFL),则使用默认行为 +backend/app/services/simulation_runner.py:1342: # 注册 atexit 处理器(作为备用) +backend/app/services/simulation_runner.py:1345: # 注册信号处理器(仅在主线程中) 
+backend/app/services/simulation_runner.py:1347: # SIGTERM: kill 命令默认信号 +backend/app/services/simulation_runner.py:1351: # SIGHUP: 终端关闭(仅 Unix 系统) +backend/app/services/simulation_runner.py:1355: # 不在主线程中,只能使用 atexit +backend/app/services/simulation_runner.py:1363: 获取所有正在运行的模拟ID列表 +backend/app/services/simulation_runner.py:1371: # ============== Interview 功能 ============== +backend/app/services/simulation_runner.py:1376: 检查模拟环境是否存活(可以接收Interview命令) +backend/app/services/simulation_runner.py:1379: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1382: True 表示环境存活,False 表示环境已关闭 +backend/app/services/simulation_runner.py:1394: 获取模拟环境的详细状态信息 +backend/app/services/simulation_runner.py:1397: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1400: 状态详情字典,包含 status, twitter_available, reddit_available, timestamp +backend/app/services/simulation_runner.py:1437: 采访单个Agent +backend/app/services/simulation_runner.py:1440: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1442: prompt: 采访问题 +backend/app/services/simulation_runner.py:1443: platform: 指定平台(可选) +backend/app/services/simulation_runner.py:1444: - "twitter": 只采访Twitter平台 +backend/app/services/simulation_runner.py:1445: - "reddit": 只采访Reddit平台 +backend/app/services/simulation_runner.py:1446: - None: 双平台模拟时同时采访两个平台,返回整合结果 +backend/app/services/simulation_runner.py:1447: timeout: 超时时间(秒) +backend/app/services/simulation_runner.py:1450: 采访结果字典 +backend/app/services/simulation_runner.py:1453: ValueError: 模拟不存在或环境未运行 +backend/app/services/simulation_runner.py:1454: TimeoutError: 等待响应超时 +backend/app/services/simulation_runner.py:1458: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_runner.py:1463: raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") +backend/app/services/simulation_runner.py:1500: 批量采访多个Agent +backend/app/services/simulation_runner.py:1503: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1504: interviews: 
采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} +backend/app/services/simulation_runner.py:1505: platform: 默认平台(可选,会被每个采访项的platform覆盖) +backend/app/services/simulation_runner.py:1506: - "twitter": 默认只采访Twitter平台 +backend/app/services/simulation_runner.py:1507: - "reddit": 默认只采访Reddit平台 +backend/app/services/simulation_runner.py:1508: - None: 双平台模拟时每个Agent同时采访两个平台 +backend/app/services/simulation_runner.py:1509: timeout: 超时时间(秒) +backend/app/services/simulation_runner.py:1512: 批量采访结果字典 +backend/app/services/simulation_runner.py:1515: ValueError: 模拟不存在或环境未运行 +backend/app/services/simulation_runner.py:1516: TimeoutError: 等待响应超时 +backend/app/services/simulation_runner.py:1520: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_runner.py:1525: raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") +backend/app/services/simulation_runner.py:1559: 采访所有Agent(全局采访) +backend/app/services/simulation_runner.py:1561: 使用相同的问题采访模拟中的所有Agent +backend/app/services/simulation_runner.py:1564: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1565: prompt: 采访问题(所有Agent使用相同问题) +backend/app/services/simulation_runner.py:1566: platform: 指定平台(可选) +backend/app/services/simulation_runner.py:1567: - "twitter": 只采访Twitter平台 +backend/app/services/simulation_runner.py:1568: - "reddit": 只采访Reddit平台 +backend/app/services/simulation_runner.py:1569: - None: 双平台模拟时每个Agent同时采访两个平台 +backend/app/services/simulation_runner.py:1570: timeout: 超时时间(秒) +backend/app/services/simulation_runner.py:1573: 全局采访结果字典 +backend/app/services/simulation_runner.py:1577: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_runner.py:1579: # 从配置文件获取所有Agent信息 +backend/app/services/simulation_runner.py:1582: raise ValueError(f"模拟配置不存在: {simulation_id}") +backend/app/services/simulation_runner.py:1589: raise ValueError(f"模拟配置中没有Agent: {simulation_id}") +backend/app/services/simulation_runner.py:1591: # 构建批量采访列表 
+backend/app/services/simulation_runner.py:1617: 关闭模拟环境(而不是停止模拟进程) +backend/app/services/simulation_runner.py:1619: 向模拟发送关闭环境命令,使其优雅退出等待命令模式 +backend/app/services/simulation_runner.py:1622: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1623: timeout: 超时时间(秒) +backend/app/services/simulation_runner.py:1626: 操作结果字典 +backend/app/services/simulation_runner.py:1630: raise ValueError(f"模拟不存在: {simulation_id}") +backend/app/services/simulation_runner.py:1637: "message": "环境已经关闭" +backend/app/services/simulation_runner.py:1647: "message": "环境关闭命令已发送", +backend/app/services/simulation_runner.py:1652: # 超时可能是因为环境正在关闭 +backend/app/services/simulation_runner.py:1655: "message": "环境关闭命令已发送(等待响应超时,环境可能正在关闭)" +backend/app/services/simulation_runner.py:1666: """从单个数据库获取Interview历史""" +backend/app/services/simulation_runner.py:1725: 获取Interview历史记录(从数据库读取) +backend/app/services/simulation_runner.py:1728: simulation_id: 模拟ID +backend/app/services/simulation_runner.py:1729: platform: 平台类型(reddit/twitter/None) +backend/app/services/simulation_runner.py:1730: - "reddit": 只获取Reddit平台的历史 +backend/app/services/simulation_runner.py:1731: - "twitter": 只获取Twitter平台的历史 +backend/app/services/simulation_runner.py:1732: - None: 获取两个平台的所有历史 +backend/app/services/simulation_runner.py:1733: agent_id: 指定Agent ID(可选,只获取该Agent的历史) +backend/app/services/simulation_runner.py:1734: limit: 每个平台返回数量限制 +backend/app/services/simulation_runner.py:1737: Interview历史记录列表 +backend/app/services/simulation_runner.py:1743: # 确定要查询的平台 +backend/app/services/simulation_runner.py:1747: # 不指定platform时,查询两个平台 +backend/app/services/simulation_runner.py:1760: # 按时间降序排序 +backend/app/services/simulation_runner.py:1763: # 如果查询了多个平台,限制总数 +backend/app/services/text_processor.py:2:文本处理服务 +backend/app/services/text_processor.py:10: """文本处理器""" +backend/app/services/text_processor.py:14: """从多个文件提取文本""" +backend/app/services/text_processor.py:24: 分割文本 +backend/app/services/text_processor.py:27: text: 原始文本 
+backend/app/services/text_processor.py:28: chunk_size: 块大小 +backend/app/services/text_processor.py:29: overlap: 重叠大小 +backend/app/services/text_processor.py:32: 文本块列表 +backend/app/services/text_processor.py:39: 预处理文本 +backend/app/services/text_processor.py:40: - 移除多余空白 +backend/app/services/text_processor.py:41: - 标准化换行 +backend/app/services/text_processor.py:44: text: 原始文本 +backend/app/services/text_processor.py:47: 处理后的文本 +backend/app/services/text_processor.py:51: # 标准化换行 +backend/app/services/text_processor.py:54: # 移除连续空行(保留最多两个换行) +backend/app/services/text_processor.py:57: # 移除行首行尾空白 +backend/app/services/text_processor.py:65: """获取文本统计信息""" +backend/app/services/zep_entity_reader.py:2:Zep实体读取与过滤服务 +backend/app/services/zep_entity_reader.py:3:从Zep图谱中读取节点,筛选出符合预定义实体类型的节点 +backend/app/services/zep_entity_reader.py:19:# 用于泛型返回类型 +backend/app/services/zep_entity_reader.py:25: """实体节点数据结构""" +backend/app/services/zep_entity_reader.py:31: # 相关的边信息 +backend/app/services/zep_entity_reader.py:33: # 相关的其他节点信息 +backend/app/services/zep_entity_reader.py:48: """获取实体类型(排除默认的Entity标签)""" +backend/app/services/zep_entity_reader.py:57: """过滤后的实体集合""" +backend/app/services/zep_entity_reader.py:74: Zep实体读取与过滤服务 +backend/app/services/zep_entity_reader.py:76: 主要功能: +backend/app/services/zep_entity_reader.py:77: 1. 从Zep图谱读取所有节点 +backend/app/services/zep_entity_reader.py:78: 2. 筛选出符合预定义实体类型的节点(Labels不只是Entity的节点) +backend/app/services/zep_entity_reader.py:79: 3. 
获取每个实体的相关边和关联节点信息 +backend/app/services/zep_entity_reader.py:93: 带重试机制的Zep API调用 +backend/app/services/zep_entity_reader.py:96: func: 要执行的函数(无参数的lambda或callable) +backend/app/services/zep_entity_reader.py:97: operation_name: 操作名称,用于日志 +backend/app/services/zep_entity_reader.py:98: max_retries: 最大重试次数(默认3次,即最多尝试3次) +backend/app/services/zep_entity_reader.py:99: initial_delay: 初始延迟秒数 +backend/app/services/zep_entity_reader.py:102: API调用结果 +backend/app/services/zep_entity_reader.py:117: delay *= 2 # 指数退避 +backend/app/services/zep_entity_reader.py:125: 获取图谱的所有节点(分页获取) +backend/app/services/zep_entity_reader.py:128: graph_id: 图谱ID +backend/app/services/zep_entity_reader.py:131: 节点列表 +backend/app/services/zep_entity_reader.py:152: 获取图谱的所有边(分页获取) +backend/app/services/zep_entity_reader.py:155: graph_id: 图谱ID +backend/app/services/zep_entity_reader.py:158: 边列表 +backend/app/services/zep_entity_reader.py:180: 获取指定节点的所有相关边(带重试机制) +backend/app/services/zep_entity_reader.py:183: node_uuid: 节点UUID +backend/app/services/zep_entity_reader.py:186: 边列表 +backend/app/services/zep_entity_reader.py:189: # 使用重试机制调用Zep API +backend/app/services/zep_entity_reader.py:192: operation_name=f"获取节点边(node={node_uuid[:8]}...)" +backend/app/services/zep_entity_reader.py:218: 筛选出符合预定义实体类型的节点 +backend/app/services/zep_entity_reader.py:220: 筛选逻辑: +backend/app/services/zep_entity_reader.py:221: - 如果节点的Labels只有一个"Entity",说明这个实体不符合我们预定义的类型,跳过 +backend/app/services/zep_entity_reader.py:222: - 如果节点的Labels包含除"Entity"和"Node"之外的标签,说明符合预定义类型,保留 +backend/app/services/zep_entity_reader.py:225: graph_id: 图谱ID +backend/app/services/zep_entity_reader.py:226: defined_entity_types: 预定义的实体类型列表(可选,如果提供则只保留这些类型) +backend/app/services/zep_entity_reader.py:227: enrich_with_edges: 是否获取每个实体的相关边信息 +backend/app/services/zep_entity_reader.py:230: FilteredEntities: 过滤后的实体集合 +backend/app/services/zep_entity_reader.py:246: # 获取所有节点 +backend/app/services/zep_entity_reader.py:262: # 获取所有边(用于后续关联查找) 
+backend/app/services/zep_entity_reader.py:265: # 构建节点UUID到节点数据的映射 +backend/app/services/zep_entity_reader.py:268: # 筛选符合条件的实体 +backend/app/services/zep_entity_reader.py:275: # 筛选逻辑:Labels必须包含除"Entity"和"Node"之外的标签 +backend/app/services/zep_entity_reader.py:279: # 只有默认标签,跳过 +backend/app/services/zep_entity_reader.py:282: # 如果指定了预定义类型,检查是否匹配 +backend/app/services/zep_entity_reader.py:293: # 创建实体节点对象 +backend/app/services/zep_entity_reader.py:302: # 获取相关边和节点 +backend/app/services/zep_entity_reader.py:327: # 获取关联节点的基本信息 +backend/app/services/zep_entity_reader.py:358: 获取单个实体及其完整上下文(边和关联节点,带重试机制) +backend/app/services/zep_entity_reader.py:361: graph_id: 图谱ID +backend/app/services/zep_entity_reader.py:362: entity_uuid: 实体UUID +backend/app/services/zep_entity_reader.py:365: EntityNode或None +backend/app/services/zep_entity_reader.py:368: # 使用重试机制获取节点 +backend/app/services/zep_entity_reader.py:371: operation_name=f"获取节点详情(uuid={entity_uuid[:8]}...)" +backend/app/services/zep_entity_reader.py:377: # 获取节点的边 +backend/app/services/zep_entity_reader.py:380: # 获取所有节点用于关联查找 +backend/app/services/zep_entity_reader.py:384: # 处理相关边和节点 +backend/app/services/zep_entity_reader.py:406: # 获取关联节点信息 +backend/app/services/zep_entity_reader.py:439: 获取指定类型的所有实体 +backend/app/services/zep_entity_reader.py:442: graph_id: 图谱ID +backend/app/services/zep_entity_reader.py:443: entity_type: 实体类型(如 "Student", "PublicFigure" 等) +backend/app/services/zep_entity_reader.py:444: enrich_with_edges: 是否获取相关边信息 +backend/app/services/zep_entity_reader.py:447: 实体列表 +backend/app/services/zep_graph_memory_updater.py:2:Zep图谱记忆更新服务 +backend/app/services/zep_graph_memory_updater.py:3:将模拟中的Agent活动动态更新到Zep图谱中 +backend/app/services/zep_graph_memory_updater.py:26: """Agent活动记录""" +backend/app/services/zep_graph_memory_updater.py:37: 将活动转换为可以发送给Zep的文本描述 +backend/app/services/zep_graph_memory_updater.py:39: 采用自然语言描述格式,让Zep能够从中提取实体和关系 +backend/app/services/zep_graph_memory_updater.py:40: 不添加模拟相关的前缀,避免误导图谱更新 
+backend/app/services/zep_graph_memory_updater.py:42: # 根据不同的动作类型生成不同的描述 +backend/app/services/zep_graph_memory_updater.py:61: # 直接返回 "agent名称: 活动描述" 格式,不添加模拟前缀 +backend/app/services/zep_graph_memory_updater.py:67: return f"发布了一条帖子:「{content}」" +backend/app/services/zep_graph_memory_updater.py:68: return "发布了一条帖子" +backend/app/services/zep_graph_memory_updater.py:71: """点赞帖子 - 包含帖子原文和作者信息""" +backend/app/services/zep_graph_memory_updater.py:76: return f"点赞了{post_author}的帖子:「{post_content}」" +backend/app/services/zep_graph_memory_updater.py:78: return f"点赞了一条帖子:「{post_content}」" +backend/app/services/zep_graph_memory_updater.py:80: return f"点赞了{post_author}的一条帖子" +backend/app/services/zep_graph_memory_updater.py:81: return "点赞了一条帖子" +backend/app/services/zep_graph_memory_updater.py:84: """踩帖子 - 包含帖子原文和作者信息""" +backend/app/services/zep_graph_memory_updater.py:89: return f"踩了{post_author}的帖子:「{post_content}」" +backend/app/services/zep_graph_memory_updater.py:91: return f"踩了一条帖子:「{post_content}」" +backend/app/services/zep_graph_memory_updater.py:93: return f"踩了{post_author}的一条帖子" +backend/app/services/zep_graph_memory_updater.py:94: return "踩了一条帖子" +backend/app/services/zep_graph_memory_updater.py:97: """转发帖子 - 包含原帖内容和作者信息""" +backend/app/services/zep_graph_memory_updater.py:102: return f"转发了{original_author}的帖子:「{original_content}」" +backend/app/services/zep_graph_memory_updater.py:104: return f"转发了一条帖子:「{original_content}」" +backend/app/services/zep_graph_memory_updater.py:106: return f"转发了{original_author}的一条帖子" +backend/app/services/zep_graph_memory_updater.py:107: return "转发了一条帖子" +backend/app/services/zep_graph_memory_updater.py:110: """引用帖子 - 包含原帖内容、作者信息和引用评论""" +backend/app/services/zep_graph_memory_updater.py:117: base = f"引用了{original_author}的帖子「{original_content}」" +backend/app/services/zep_graph_memory_updater.py:119: base = f"引用了一条帖子「{original_content}」" +backend/app/services/zep_graph_memory_updater.py:121: base = f"引用了{original_author}的一条帖子" 
+backend/app/services/zep_graph_memory_updater.py:123: base = "引用了一条帖子" +backend/app/services/zep_graph_memory_updater.py:126: base += f",并评论道:「{quote_content}」" +backend/app/services/zep_graph_memory_updater.py:130: """关注用户 - 包含被关注用户的名称""" +backend/app/services/zep_graph_memory_updater.py:134: return f"关注了用户「{target_user_name}」" +backend/app/services/zep_graph_memory_updater.py:135: return "关注了一个用户" +backend/app/services/zep_graph_memory_updater.py:138: """发表评论 - 包含评论内容和所评论的帖子信息""" +backend/app/services/zep_graph_memory_updater.py:145: return f"在{post_author}的帖子「{post_content}」下评论道:「{content}」" +backend/app/services/zep_graph_memory_updater.py:147: return f"在帖子「{post_content}」下评论道:「{content}」" +backend/app/services/zep_graph_memory_updater.py:149: return f"在{post_author}的帖子下评论道:「{content}」" +backend/app/services/zep_graph_memory_updater.py:150: return f"评论道:「{content}」" +backend/app/services/zep_graph_memory_updater.py:151: return "发表了评论" +backend/app/services/zep_graph_memory_updater.py:154: """点赞评论 - 包含评论内容和作者信息""" +backend/app/services/zep_graph_memory_updater.py:159: return f"点赞了{comment_author}的评论:「{comment_content}」" +backend/app/services/zep_graph_memory_updater.py:161: return f"点赞了一条评论:「{comment_content}」" +backend/app/services/zep_graph_memory_updater.py:163: return f"点赞了{comment_author}的一条评论" +backend/app/services/zep_graph_memory_updater.py:164: return "点赞了一条评论" +backend/app/services/zep_graph_memory_updater.py:167: """踩评论 - 包含评论内容和作者信息""" +backend/app/services/zep_graph_memory_updater.py:172: return f"踩了{comment_author}的评论:「{comment_content}」" +backend/app/services/zep_graph_memory_updater.py:174: return f"踩了一条评论:「{comment_content}」" +backend/app/services/zep_graph_memory_updater.py:176: return f"踩了{comment_author}的一条评论" +backend/app/services/zep_graph_memory_updater.py:177: return "踩了一条评论" +backend/app/services/zep_graph_memory_updater.py:180: """搜索帖子 - 包含搜索关键词""" +backend/app/services/zep_graph_memory_updater.py:182: return f"搜索了「{query}」" if query 
else "进行了搜索" +backend/app/services/zep_graph_memory_updater.py:185: """搜索用户 - 包含搜索关键词""" +backend/app/services/zep_graph_memory_updater.py:187: return f"搜索了用户「{query}」" if query else "搜索了用户" +backend/app/services/zep_graph_memory_updater.py:190: """屏蔽用户 - 包含被屏蔽用户的名称""" +backend/app/services/zep_graph_memory_updater.py:194: return f"屏蔽了用户「{target_user_name}」" +backend/app/services/zep_graph_memory_updater.py:195: return "屏蔽了一个用户" +backend/app/services/zep_graph_memory_updater.py:198: # 对于未知的动作类型,生成通用描述 +backend/app/services/zep_graph_memory_updater.py:199: return f"执行了{self.action_type}操作" +backend/app/services/zep_graph_memory_updater.py:204: Zep图谱记忆更新器 +backend/app/services/zep_graph_memory_updater.py:206: 监控模拟的actions日志文件,将新的agent活动实时更新到Zep图谱中。 +backend/app/services/zep_graph_memory_updater.py:207: 按平台分组,每累积BATCH_SIZE条活动后批量发送到Zep。 +backend/app/services/zep_graph_memory_updater.py:209: 所有有意义的行为都会被更新到Zep,action_args中会包含完整的上下文信息: +backend/app/services/zep_graph_memory_updater.py:210: - 点赞/踩的帖子原文 +backend/app/services/zep_graph_memory_updater.py:211: - 转发/引用的帖子原文 +backend/app/services/zep_graph_memory_updater.py:212: - 关注/屏蔽的用户名 +backend/app/services/zep_graph_memory_updater.py:213: - 点赞/踩的评论原文 +backend/app/services/zep_graph_memory_updater.py:216: # 批量发送大小(每个平台累积多少条后发送) +backend/app/services/zep_graph_memory_updater.py:219: # 平台名称映射(用于控制台显示) +backend/app/services/zep_graph_memory_updater.py:221: 'twitter': '世界1', +backend/app/services/zep_graph_memory_updater.py:222: 'reddit': '世界2', +backend/app/services/zep_graph_memory_updater.py:225: # 发送间隔(秒),避免请求过快 +backend/app/services/zep_graph_memory_updater.py:228: # 重试配置 +backend/app/services/zep_graph_memory_updater.py:230: RETRY_DELAY = 2 # 秒 +backend/app/services/zep_graph_memory_updater.py:234: 初始化更新器 +backend/app/services/zep_graph_memory_updater.py:237: graph_id: Zep图谱ID +backend/app/services/zep_graph_memory_updater.py:238: api_key: Zep API Key(可选,默认从配置读取) +backend/app/services/zep_graph_memory_updater.py:243: # 
活动队列 +backend/app/services/zep_graph_memory_updater.py:246: # 按平台分组的活动缓冲区(每个平台各自累积到BATCH_SIZE后批量发送) +backend/app/services/zep_graph_memory_updater.py:253: # 控制标志 +backend/app/services/zep_graph_memory_updater.py:257: # 统计 +backend/app/services/zep_graph_memory_updater.py:258: self._total_activities = 0 # 实际添加到队列的活动数 +backend/app/services/zep_graph_memory_updater.py:259: self._total_sent = 0 # 成功发送到Zep的批次数 +backend/app/services/zep_graph_memory_updater.py:260: self._total_items_sent = 0 # 成功发送到Zep的活动条数 +backend/app/services/zep_graph_memory_updater.py:261: self._failed_count = 0 # 发送失败的批次数 +backend/app/services/zep_graph_memory_updater.py:262: self._skipped_count = 0 # 被过滤跳过的活动数(DO_NOTHING) +backend/app/services/zep_graph_memory_updater.py:267: """获取平台的显示名称""" +backend/app/services/zep_graph_memory_updater.py:271: """启动后台工作线程""" +backend/app/services/zep_graph_memory_updater.py:289: """停止后台工作线程""" +backend/app/services/zep_graph_memory_updater.py:292: # 发送剩余的活动 +backend/app/services/zep_graph_memory_updater.py:302: 添加一个agent活动到队列 +backend/app/services/zep_graph_memory_updater.py:304: 所有有意义的行为都会被添加到队列,包括: +backend/app/services/zep_graph_memory_updater.py:305: - CREATE_POST(发帖) +backend/app/services/zep_graph_memory_updater.py:306: - CREATE_COMMENT(评论) +backend/app/services/zep_graph_memory_updater.py:307: - QUOTE_POST(引用帖子) +backend/app/services/zep_graph_memory_updater.py:308: - SEARCH_POSTS(搜索帖子) +backend/app/services/zep_graph_memory_updater.py:309: - SEARCH_USER(搜索用户) +backend/app/services/zep_graph_memory_updater.py:310: - LIKE_POST/DISLIKE_POST(点赞/踩帖子) +backend/app/services/zep_graph_memory_updater.py:311: - REPOST(转发) +backend/app/services/zep_graph_memory_updater.py:312: - FOLLOW(关注) +backend/app/services/zep_graph_memory_updater.py:313: - MUTE(屏蔽) +backend/app/services/zep_graph_memory_updater.py:314: - LIKE_COMMENT/DISLIKE_COMMENT(点赞/踩评论) +backend/app/services/zep_graph_memory_updater.py:316: action_args中会包含完整的上下文信息(如帖子原文、用户名等)。 
+backend/app/services/zep_graph_memory_updater.py:319: activity: Agent活动记录 +backend/app/services/zep_graph_memory_updater.py:321: # 跳过DO_NOTHING类型的活动 +backend/app/services/zep_graph_memory_updater.py:332: 从字典数据添加活动 +backend/app/services/zep_graph_memory_updater.py:335: data: 从actions.jsonl解析的字典数据 +backend/app/services/zep_graph_memory_updater.py:336: platform: 平台名称 (twitter/reddit) +backend/app/services/zep_graph_memory_updater.py:338: # 跳过事件类型的条目 +backend/app/services/zep_graph_memory_updater.py:355: """后台工作循环 - 按平台批量发送活动到Zep""" +backend/app/services/zep_graph_memory_updater.py:359: # 尝试从队列获取活动(超时1秒) +backend/app/services/zep_graph_memory_updater.py:363: # 将活动添加到对应平台的缓冲区 +backend/app/services/zep_graph_memory_updater.py:370: # 检查该平台是否达到批量大小 +backend/app/services/zep_graph_memory_updater.py:374: # 释放锁后再发送 +backend/app/services/zep_graph_memory_updater.py:376: # 发送间隔,避免请求过快 +backend/app/services/zep_graph_memory_updater.py:388: 批量发送活动到Zep图谱(合并为一条文本) +backend/app/services/zep_graph_memory_updater.py:391: activities: Agent活动列表 +backend/app/services/zep_graph_memory_updater.py:392: platform: 平台名称 +backend/app/services/zep_graph_memory_updater.py:397: # 将多条活动合并为一条文本,用换行分隔 +backend/app/services/zep_graph_memory_updater.py:401: # 带重试的发送 +backend/app/services/zep_graph_memory_updater.py:426: """发送队列和缓冲区中剩余的活动""" +backend/app/services/zep_graph_memory_updater.py:427: # 首先处理队列中剩余的活动,添加到缓冲区 +backend/app/services/zep_graph_memory_updater.py:439: # 然后发送各平台缓冲区中剩余的活动(即使不足BATCH_SIZE条) +backend/app/services/zep_graph_memory_updater.py:446: # 清空所有缓冲区 +backend/app/services/zep_graph_memory_updater.py:451: """获取统计信息""" +backend/app/services/zep_graph_memory_updater.py:458: "total_activities": self._total_activities, # 添加到队列的活动总数 +backend/app/services/zep_graph_memory_updater.py:459: "batches_sent": self._total_sent, # 成功发送的批次数 +backend/app/services/zep_graph_memory_updater.py:460: "items_sent": self._total_items_sent, # 成功发送的活动条数 +backend/app/services/zep_graph_memory_updater.py:461: 
"failed_count": self._failed_count, # 发送失败的批次数 +backend/app/services/zep_graph_memory_updater.py:462: "skipped_count": self._skipped_count, # 被过滤跳过的活动数(DO_NOTHING) +backend/app/services/zep_graph_memory_updater.py:464: "buffer_sizes": buffer_sizes, # 各平台缓冲区大小 +backend/app/services/zep_graph_memory_updater.py:471: 管理多个模拟的Zep图谱记忆更新器 +backend/app/services/zep_graph_memory_updater.py:473: 每个模拟可以有自己的更新器实例 +backend/app/services/zep_graph_memory_updater.py:482: 为模拟创建图谱记忆更新器 +backend/app/services/zep_graph_memory_updater.py:485: simulation_id: 模拟ID +backend/app/services/zep_graph_memory_updater.py:486: graph_id: Zep图谱ID +backend/app/services/zep_graph_memory_updater.py:489: ZepGraphMemoryUpdater实例 +backend/app/services/zep_graph_memory_updater.py:492: # 如果已存在,先停止旧的 +backend/app/services/zep_graph_memory_updater.py:505: """获取模拟的更新器""" +backend/app/services/zep_graph_memory_updater.py:510: """停止并移除模拟的更新器""" +backend/app/services/zep_graph_memory_updater.py:517: # 防止 stop_all 重复调用的标志 +backend/app/services/zep_graph_memory_updater.py:522: """停止所有更新器""" +backend/app/services/zep_graph_memory_updater.py:523: # 防止重复调用 +backend/app/services/zep_graph_memory_updater.py:540: """获取所有更新器的统计信息""" +backend/app/services/zep_tools.py:2:Zep检索工具服务 +backend/app/services/zep_tools.py:3:封装图谱搜索、节点读取、边查询等工具,供Report Agent使用 +backend/app/services/zep_tools.py:5:核心检索工具(优化后): +backend/app/services/zep_tools.py:6:1. InsightForge(深度洞察检索)- 最强大的混合检索,自动生成子问题并多维度检索 +backend/app/services/zep_tools.py:7:2. PanoramaSearch(广度搜索)- 获取全貌,包括过期内容 +backend/app/services/zep_tools.py:8:3. 
QuickSearch(简单搜索)- 快速检索 +backend/app/services/zep_tools.py:29: """搜索结果""" +backend/app/services/zep_tools.py:46: """转换为文本格式,供LLM理解""" +backend/app/services/zep_tools.py:47: text_parts = [f"搜索查询: {self.query}", f"找到 {self.total_count} 条相关信息"] +backend/app/services/zep_tools.py:50: text_parts.append("\n### 相关事实:") +backend/app/services/zep_tools.py:59: """节点信息""" +backend/app/services/zep_tools.py:76: """转换为文本格式""" +backend/app/services/zep_tools.py:77: entity_type = next((l for l in self.labels if l not in ["Entity", "Node"]), "未知类型") +backend/app/services/zep_tools.py:78: return f"实体: {self.name} (类型: {entity_type})\n摘要: {self.summary}" +backend/app/services/zep_tools.py:83: """边信息""" +backend/app/services/zep_tools.py:91: # 时间信息 +backend/app/services/zep_tools.py:113: """转换为文本格式""" +backend/app/services/zep_tools.py:116: base_text = f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}" +backend/app/services/zep_tools.py:119: valid_at = self.valid_at or "未知" +backend/app/services/zep_tools.py:120: invalid_at = self.invalid_at or "至今" +backend/app/services/zep_tools.py:121: base_text += f"\n时效: {valid_at} - {invalid_at}" +backend/app/services/zep_tools.py:123: base_text += f" (已过期: {self.expired_at})" +backend/app/services/zep_tools.py:129: """是否已过期""" +backend/app/services/zep_tools.py:134: """是否已失效""" +backend/app/services/zep_tools.py:141: 深度洞察检索结果 (InsightForge) +backend/app/services/zep_tools.py:142: 包含多个子问题的检索结果,以及综合分析 +backend/app/services/zep_tools.py:148: # 各维度检索结果 +backend/app/services/zep_tools.py:149: semantic_facts: List[str] = field(default_factory=list) # 语义搜索结果 +backend/app/services/zep_tools.py:150: entity_insights: List[Dict[str, Any]] = field(default_factory=list) # 实体洞察 +backend/app/services/zep_tools.py:151: relationship_chains: List[str] = field(default_factory=list) # 关系链 +backend/app/services/zep_tools.py:153: # 统计信息 +backend/app/services/zep_tools.py:172: """转换为详细的文本格式,供LLM理解""" +backend/app/services/zep_tools.py:174: f"## 未来预测深度分析", 
+backend/app/services/zep_tools.py:175: f"分析问题: {self.query}", +backend/app/services/zep_tools.py:176: f"预测场景: {self.simulation_requirement}", +backend/app/services/zep_tools.py:177: f"\n### 预测数据统计", +backend/app/services/zep_tools.py:178: f"- 相关预测事实: {self.total_facts}条", +backend/app/services/zep_tools.py:179: f"- 涉及实体: {self.total_entities}个", +backend/app/services/zep_tools.py:180: f"- 关系链: {self.total_relationships}条" +backend/app/services/zep_tools.py:183: # 子问题 +backend/app/services/zep_tools.py:185: text_parts.append(f"\n### 分析的子问题") +backend/app/services/zep_tools.py:189: # 语义搜索结果 +backend/app/services/zep_tools.py:191: text_parts.append(f"\n### 【关键事实】(请在报告中引用这些原文)") +backend/app/services/zep_tools.py:195: # 实体洞察 +backend/app/services/zep_tools.py:197: text_parts.append(f"\n### 【核心实体】") +backend/app/services/zep_tools.py:199: text_parts.append(f"- **{entity.get('name', '未知')}** ({entity.get('type', '实体')})") +backend/app/services/zep_tools.py:201: text_parts.append(f" 摘要: \"{entity.get('summary')}\"") +backend/app/services/zep_tools.py:203: text_parts.append(f" 相关事实: {len(entity.get('related_facts', []))}条") +backend/app/services/zep_tools.py:205: # 关系链 +backend/app/services/zep_tools.py:207: text_parts.append(f"\n### 【关系链】") +backend/app/services/zep_tools.py:217: 广度搜索结果 (Panorama) +backend/app/services/zep_tools.py:218: 包含所有相关信息,包括过期内容 +backend/app/services/zep_tools.py:222: # 全部节点 +backend/app/services/zep_tools.py:224: # 全部边(包括过期的) +backend/app/services/zep_tools.py:226: # 当前有效的事实 +backend/app/services/zep_tools.py:228: # 已过期/失效的事实(历史记录) +backend/app/services/zep_tools.py:231: # 统计 +backend/app/services/zep_tools.py:251: """转换为文本格式(完整版本,不截断)""" +backend/app/services/zep_tools.py:253: f"## 广度搜索结果(未来全景视图)", +backend/app/services/zep_tools.py:254: f"查询: {self.query}", +backend/app/services/zep_tools.py:255: f"\n### 统计信息", +backend/app/services/zep_tools.py:256: f"- 总节点数: {self.total_nodes}", +backend/app/services/zep_tools.py:257: f"- 总边数: 
{self.total_edges}", +backend/app/services/zep_tools.py:258: f"- 当前有效事实: {self.active_count}条", +backend/app/services/zep_tools.py:259: f"- 历史/过期事实: {self.historical_count}条" +backend/app/services/zep_tools.py:262: # 当前有效的事实(完整输出,不截断) +backend/app/services/zep_tools.py:264: text_parts.append(f"\n### 【当前有效事实】(模拟结果原文)") +backend/app/services/zep_tools.py:268: # 历史/过期事实(完整输出,不截断) +backend/app/services/zep_tools.py:270: text_parts.append(f"\n### 【历史/过期事实】(演变过程记录)") +backend/app/services/zep_tools.py:274: # 关键实体(完整输出,不截断) +backend/app/services/zep_tools.py:276: text_parts.append(f"\n### 【涉及实体】") +backend/app/services/zep_tools.py:278: entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") +backend/app/services/zep_tools.py:286: """单个Agent的采访结果""" +backend/app/services/zep_tools.py:288: agent_role: str # 角色类型(如:学生、教师、媒体等) +backend/app/services/zep_tools.py:289: agent_bio: str # 简介 +backend/app/services/zep_tools.py:290: question: str # 采访问题 +backend/app/services/zep_tools.py:291: response: str # 采访回答 +backend/app/services/zep_tools.py:292: key_quotes: List[str] = field(default_factory=list) # 关键引言 +backend/app/services/zep_tools.py:306: # 显示完整的agent_bio,不截断 +backend/app/services/zep_tools.py:307: text += f"_简介: {self.agent_bio}_\n\n" +backend/app/services/zep_tools.py:311: text += "\n**关键引言:**\n" +backend/app/services/zep_tools.py:313: # 清理各种引号 +backend/app/services/zep_tools.py:317: # 去掉开头的标点 +backend/app/services/zep_tools.py:320: # 过滤包含问题编号的垃圾内容(问题1-9) +backend/app/services/zep_tools.py:328: # 截断过长内容(按句号截断,而非硬截断) +backend/app/services/zep_tools.py:343: 采访结果 (Interview) +backend/app/services/zep_tools.py:344: 包含多个模拟Agent的采访回答 +backend/app/services/zep_tools.py:346: interview_topic: str # 采访主题 +backend/app/services/zep_tools.py:347: interview_questions: List[str] # 采访问题列表 +backend/app/services/zep_tools.py:349: # 采访选择的Agent +backend/app/services/zep_tools.py:351: # 各Agent的采访回答 +backend/app/services/zep_tools.py:354: # 选择Agent的理由 
+backend/app/services/zep_tools.py:356: # 整合后的采访摘要 +backend/app/services/zep_tools.py:359: # 统计 +backend/app/services/zep_tools.py:376: """转换为详细的文本格式,供LLM理解和报告引用""" +backend/app/services/zep_tools.py:378: "## 深度采访报告", +backend/app/services/zep_tools.py:379: f"**采访主题:** {self.interview_topic}", +backend/app/services/zep_tools.py:380: f"**采访人数:** {self.interviewed_count} / {self.total_agents} 位模拟Agent", +backend/app/services/zep_tools.py:381: "\n### 采访对象选择理由", +backend/app/services/zep_tools.py:382: self.selection_reasoning or "(自动选择)", +backend/app/services/zep_tools.py:384: "\n### 采访实录", +backend/app/services/zep_tools.py:389: text_parts.append(f"\n#### 采访 #{i}: {interview.agent_name}") +backend/app/services/zep_tools.py:393: text_parts.append("(无采访记录)\n\n---") +backend/app/services/zep_tools.py:395: text_parts.append("\n### 采访摘要与核心观点") +backend/app/services/zep_tools.py:396: text_parts.append(self.summary or "(无摘要)") +backend/app/services/zep_tools.py:403: Zep检索工具服务 +backend/app/services/zep_tools.py:405: 【核心检索工具 - 优化后】 +backend/app/services/zep_tools.py:406: 1. insight_forge - 深度洞察检索(最强大,自动生成子问题,多维度检索) +backend/app/services/zep_tools.py:407: 2. panorama_search - 广度搜索(获取全貌,包括过期内容) +backend/app/services/zep_tools.py:408: 3. quick_search - 简单搜索(快速检索) +backend/app/services/zep_tools.py:409: 4. 
interview_agents - 深度采访(采访模拟Agent,获取多视角观点) +backend/app/services/zep_tools.py:411: 【基础工具】 +backend/app/services/zep_tools.py:412: - search_graph - 图谱语义搜索 +backend/app/services/zep_tools.py:413: - get_all_nodes - 获取图谱所有节点 +backend/app/services/zep_tools.py:414: - get_all_edges - 获取图谱所有边(含时间信息) +backend/app/services/zep_tools.py:415: - get_node_detail - 获取节点详细信息 +backend/app/services/zep_tools.py:416: - get_node_edges - 获取节点相关的边 +backend/app/services/zep_tools.py:417: - get_entities_by_type - 按类型获取实体 +backend/app/services/zep_tools.py:418: - get_entity_summary - 获取实体的关系摘要 +backend/app/services/zep_tools.py:421: # 重试配置 +backend/app/services/zep_tools.py:427: # LLM客户端用于InsightForge生成子问题 +backend/app/services/zep_tools.py:433: """延迟初始化LLM客户端""" +backend/app/services/zep_tools.py:439: """带重试机制的API调用(自动处理429限速)""" +backend/app/services/zep_tools.py:450: # 检测429限速错误,使用retry-after头部的等待时间 +backend/app/services/zep_tools.py:479: 图谱语义搜索 +backend/app/services/zep_tools.py:481: 使用混合搜索(语义+BM25)在图谱中搜索相关信息。 +backend/app/services/zep_tools.py:482: 如果Zep Cloud的search API不可用,则降级为本地关键词匹配。 +backend/app/services/zep_tools.py:485: graph_id: 图谱ID (Standalone Graph) +backend/app/services/zep_tools.py:486: query: 搜索查询 +backend/app/services/zep_tools.py:487: limit: 返回结果数量 +backend/app/services/zep_tools.py:488: scope: 搜索范围,"edges" 或 "nodes" +backend/app/services/zep_tools.py:491: SearchResult: 搜索结果 +backend/app/services/zep_tools.py:495: # 尝试使用Zep Cloud Search API +backend/app/services/zep_tools.py:504: operation_name=f"图谱搜索(graph={graph_id})" +backend/app/services/zep_tools.py:511: # 解析边搜索结果 +backend/app/services/zep_tools.py:524: # 解析节点搜索结果 +backend/app/services/zep_tools.py:533: # 节点摘要也算作事实 +backend/app/services/zep_tools.py:549: # 降级:使用本地关键词匹配搜索 +backend/app/services/zep_tools.py:560: 本地关键词匹配搜索(作为Zep Search API的降级方案) +backend/app/services/zep_tools.py:562: 获取所有边/节点,然后在本地进行关键词匹配 +backend/app/services/zep_tools.py:565: graph_id: 图谱ID +backend/app/services/zep_tools.py:566: query: 搜索查询 
+backend/app/services/zep_tools.py:567: limit: 返回结果数量 +backend/app/services/zep_tools.py:568: scope: 搜索范围 +backend/app/services/zep_tools.py:571: SearchResult: 搜索结果 +backend/app/services/zep_tools.py:579: # 提取查询关键词(简单分词) +backend/app/services/zep_tools.py:584: """计算文本与查询的匹配分数""" +backend/app/services/zep_tools.py:588: # 完全匹配查询 +backend/app/services/zep_tools.py:591: # 关键词匹配 +backend/app/services/zep_tools.py:600: # 获取所有边并匹配 +backend/app/services/zep_tools.py:608: # 按分数排序 +backend/app/services/zep_tools.py:623: # 获取所有节点并匹配 +backend/app/services/zep_tools.py:658: 获取图谱的所有节点(分页获取) +backend/app/services/zep_tools.py:661: graph_id: 图谱ID +backend/app/services/zep_tools.py:664: 节点列表 +backend/app/services/zep_tools.py:686: 获取图谱的所有边(分页获取,包含时间信息) +backend/app/services/zep_tools.py:689: graph_id: 图谱ID +backend/app/services/zep_tools.py:690: include_temporal: 是否包含时间信息(默认True) +backend/app/services/zep_tools.py:693: 边列表(包含created_at, valid_at, invalid_at, expired_at) +backend/app/services/zep_tools.py:710: # 添加时间信息 +backend/app/services/zep_tools.py:724: 获取单个节点的详细信息 +backend/app/services/zep_tools.py:727: node_uuid: 节点UUID +backend/app/services/zep_tools.py:730: 节点信息或None +backend/app/services/zep_tools.py:737: operation_name=f"获取节点详情(uuid={node_uuid[:8]}...)" +backend/app/services/zep_tools.py:756: 获取节点相关的所有边 +backend/app/services/zep_tools.py:758: 通过获取图谱所有边,然后过滤出与指定节点相关的边 +backend/app/services/zep_tools.py:761: graph_id: 图谱ID +backend/app/services/zep_tools.py:762: node_uuid: 节点UUID +backend/app/services/zep_tools.py:765: 边列表 +backend/app/services/zep_tools.py:770: # 获取图谱所有边,然后过滤 +backend/app/services/zep_tools.py:775: # 检查边是否与指定节点相关(作为源或目标) +backend/app/services/zep_tools.py:792: 按类型获取实体 +backend/app/services/zep_tools.py:795: graph_id: 图谱ID +backend/app/services/zep_tools.py:796: entity_type: 实体类型(如 Student, PublicFigure 等) +backend/app/services/zep_tools.py:799: 符合类型的实体列表 +backend/app/services/zep_tools.py:807: # 检查labels是否包含指定类型 +backend/app/services/zep_tools.py:820: 
获取指定实体的关系摘要 +backend/app/services/zep_tools.py:822: 搜索与该实体相关的所有信息,并生成摘要 +backend/app/services/zep_tools.py:825: graph_id: 图谱ID +backend/app/services/zep_tools.py:826: entity_name: 实体名称 +backend/app/services/zep_tools.py:829: 实体摘要信息 +backend/app/services/zep_tools.py:833: # 先搜索该实体相关的信息 +backend/app/services/zep_tools.py:840: # 尝试在所有节点中找到该实体 +backend/app/services/zep_tools.py:850: # 传入graph_id参数 +backend/app/services/zep_tools.py:863: 获取图谱的统计信息 +backend/app/services/zep_tools.py:866: graph_id: 图谱ID +backend/app/services/zep_tools.py:869: 统计信息 +backend/app/services/zep_tools.py:876: # 统计实体类型分布 +backend/app/services/zep_tools.py:883: # 统计关系类型分布 +backend/app/services/zep_tools.py:903: 获取模拟相关的上下文信息 +backend/app/services/zep_tools.py:905: 综合搜索与模拟需求相关的所有信息 +backend/app/services/zep_tools.py:908: graph_id: 图谱ID +backend/app/services/zep_tools.py:909: simulation_requirement: 模拟需求描述 +backend/app/services/zep_tools.py:910: limit: 每类信息的数量限制 +backend/app/services/zep_tools.py:913: 模拟上下文信息 +backend/app/services/zep_tools.py:917: # 搜索与模拟需求相关的信息 +backend/app/services/zep_tools.py:924: # 获取图谱统计 +backend/app/services/zep_tools.py:927: # 获取所有实体节点 +backend/app/services/zep_tools.py:930: # 筛选有实际类型的实体(非纯Entity节点) +backend/app/services/zep_tools.py:945: "entities": entities[:limit], # 限制数量 +backend/app/services/zep_tools.py:949: # ========== 核心检索工具(优化后) ========== +backend/app/services/zep_tools.py:960: 【InsightForge - 深度洞察检索】 +backend/app/services/zep_tools.py:962: 最强大的混合检索函数,自动分解问题并多维度检索: +backend/app/services/zep_tools.py:963: 1. 使用LLM将问题分解为多个子问题 +backend/app/services/zep_tools.py:964: 2. 对每个子问题进行语义搜索 +backend/app/services/zep_tools.py:965: 3. 提取相关实体并获取其详细信息 +backend/app/services/zep_tools.py:966: 4. 追踪关系链 +backend/app/services/zep_tools.py:967: 5. 
整合所有结果,生成深度洞察 +backend/app/services/zep_tools.py:970: graph_id: 图谱ID +backend/app/services/zep_tools.py:971: query: 用户问题 +backend/app/services/zep_tools.py:972: simulation_requirement: 模拟需求描述 +backend/app/services/zep_tools.py:973: report_context: 报告上下文(可选,用于更精准的子问题生成) +backend/app/services/zep_tools.py:974: max_sub_queries: 最大子问题数量 +backend/app/services/zep_tools.py:977: InsightForgeResult: 深度洞察检索结果 +backend/app/services/zep_tools.py:987: # Step 1: 使用LLM生成子问题 +backend/app/services/zep_tools.py:997: # Step 2: 对每个子问题进行语义搜索 +backend/app/services/zep_tools.py:1017: # 对原始问题也进行搜索 +backend/app/services/zep_tools.py:1032: # Step 3: 从边中提取相关实体UUID,只获取这些实体的信息(不获取全部节点) +backend/app/services/zep_tools.py:1043: # 获取所有相关实体的详情(不限制数量,完整输出) +backend/app/services/zep_tools.py:1045: node_map = {} # 用于后续关系链构建 +backend/app/services/zep_tools.py:1047: for uuid in list(entity_uuids): # 处理所有实体,不截断 +backend/app/services/zep_tools.py:1051: # 单独获取每个相关节点的信息 +backend/app/services/zep_tools.py:1055: entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") +backend/app/services/zep_tools.py:1057: # 获取该实体相关的所有事实(不截断) +backend/app/services/zep_tools.py:1068: "related_facts": related_facts # 完整输出,不截断 +backend/app/services/zep_tools.py:1077: # Step 4: 构建所有关系链(不限制数量) +backend/app/services/zep_tools.py:1079: for edge_data in all_edges: # 处理所有边,不截断 +backend/app/services/zep_tools.py:1106: 使用LLM生成子问题 +backend/app/services/zep_tools.py:1108: 将复杂问题分解为多个可以独立检索的子问题 +backend/app/services/zep_tools.py:1110: system_prompt = """你是一个专业的问题分析专家。你的任务是将一个复杂问题分解为多个可以在模拟世界中独立观察的子问题。 +backend/app/services/zep_tools.py:1112:要求: +backend/app/services/zep_tools.py:1113:1. 每个子问题应该足够具体,可以在模拟世界中找到相关的Agent行为或事件 +backend/app/services/zep_tools.py:1114:2. 子问题应该覆盖原问题的不同维度(如:谁、什么、为什么、怎么样、何时、何地) +backend/app/services/zep_tools.py:1115:3. 子问题应该与模拟场景相关 +backend/app/services/zep_tools.py:1116:4. 
返回JSON格式:{"sub_queries": ["子问题1", "子问题2", ...]}""" +backend/app/services/zep_tools.py:1118: user_prompt = f"""模拟需求背景: +backend/app/services/zep_tools.py:1121:{f"报告上下文:{report_context[:500]}" if report_context else ""} +backend/app/services/zep_tools.py:1123:请将以下问题分解为{max_queries}个子问题: +backend/app/services/zep_tools.py:1126:返回JSON格式的子问题列表。""" +backend/app/services/zep_tools.py:1138: # 确保是字符串列表 +backend/app/services/zep_tools.py:1143: # 降级:返回基于原问题的变体 +backend/app/services/zep_tools.py:1146: f"{query} 的主要参与者", +backend/app/services/zep_tools.py:1147: f"{query} 的原因和影响", +backend/app/services/zep_tools.py:1148: f"{query} 的发展过程" +backend/app/services/zep_tools.py:1159: 【PanoramaSearch - 广度搜索】 +backend/app/services/zep_tools.py:1161: 获取全貌视图,包括所有相关内容和历史/过期信息: +backend/app/services/zep_tools.py:1162: 1. 获取所有相关节点 +backend/app/services/zep_tools.py:1163: 2. 获取所有边(包括已过期/失效的) +backend/app/services/zep_tools.py:1164: 3. 分类整理当前有效和历史信息 +backend/app/services/zep_tools.py:1166: 这个工具适用于需要了解事件全貌、追踪演变过程的场景。 +backend/app/services/zep_tools.py:1169: graph_id: 图谱ID +backend/app/services/zep_tools.py:1170: query: 搜索查询(用于相关性排序) +backend/app/services/zep_tools.py:1171: include_expired: 是否包含过期内容(默认True) +backend/app/services/zep_tools.py:1172: limit: 返回结果数量限制 +backend/app/services/zep_tools.py:1175: PanoramaResult: 广度搜索结果 +backend/app/services/zep_tools.py:1181: # 获取所有节点 +backend/app/services/zep_tools.py:1187: # 获取所有边(包含时间信息) +backend/app/services/zep_tools.py:1192: # 分类事实 +backend/app/services/zep_tools.py:1200: # 为事实添加实体名称 +backend/app/services/zep_tools.py:1204: # 判断是否过期/失效 +backend/app/services/zep_tools.py:1208: # 历史/过期事实,添加时间标记 +backend/app/services/zep_tools.py:1209: valid_at = edge.valid_at or "未知" +backend/app/services/zep_tools.py:1210: invalid_at = edge.invalid_at or edge.expired_at or "未知" +backend/app/services/zep_tools.py:1214: # 当前有效事实 +backend/app/services/zep_tools.py:1217: # 基于查询进行相关性排序 +backend/app/services/zep_tools.py:1231: # 排序并限制数量 
+backend/app/services/zep_tools.py:1250: 【QuickSearch - 简单搜索】 +backend/app/services/zep_tools.py:1252: 快速、轻量级的检索工具: +backend/app/services/zep_tools.py:1253: 1. 直接调用Zep语义搜索 +backend/app/services/zep_tools.py:1254: 2. 返回最相关的结果 +backend/app/services/zep_tools.py:1255: 3. 适用于简单、直接的检索需求 +backend/app/services/zep_tools.py:1258: graph_id: 图谱ID +backend/app/services/zep_tools.py:1259: query: 搜索查询 +backend/app/services/zep_tools.py:1260: limit: 返回结果数量 +backend/app/services/zep_tools.py:1263: SearchResult: 搜索结果 +backend/app/services/zep_tools.py:1267: # 直接调用现有的search_graph方法 +backend/app/services/zep_tools.py:1287: 【InterviewAgents - 深度采访】 +backend/app/services/zep_tools.py:1289: 调用真实的OASIS采访API,采访模拟中正在运行的Agent: +backend/app/services/zep_tools.py:1290: 1. 自动读取人设文件,了解所有模拟Agent +backend/app/services/zep_tools.py:1291: 2. 使用LLM分析采访需求,智能选择最相关的Agent +backend/app/services/zep_tools.py:1292: 3. 使用LLM生成采访问题 +backend/app/services/zep_tools.py:1293: 4. 调用 /api/simulation/interview/batch 接口进行真实采访(双平台同时采访) +backend/app/services/zep_tools.py:1294: 5. 
整合所有采访结果,生成采访报告 +backend/app/services/zep_tools.py:1296: 【重要】此功能需要模拟环境处于运行状态(OASIS环境未关闭) +backend/app/services/zep_tools.py:1298: 【使用场景】 +backend/app/services/zep_tools.py:1299: - 需要从不同角色视角了解事件看法 +backend/app/services/zep_tools.py:1300: - 需要收集多方意见和观点 +backend/app/services/zep_tools.py:1301: - 需要获取模拟Agent的真实回答(非LLM模拟) +backend/app/services/zep_tools.py:1304: simulation_id: 模拟ID(用于定位人设文件和调用采访API) +backend/app/services/zep_tools.py:1305: interview_requirement: 采访需求描述(非结构化,如"了解学生对事件的看法") +backend/app/services/zep_tools.py:1306: simulation_requirement: 模拟需求背景(可选) +backend/app/services/zep_tools.py:1307: max_agents: 最多采访的Agent数量 +backend/app/services/zep_tools.py:1308: custom_questions: 自定义采访问题(可选,若不提供则自动生成) +backend/app/services/zep_tools.py:1311: InterviewResult: 采访结果 +backend/app/services/zep_tools.py:1322: # Step 1: 读取人设文件 +backend/app/services/zep_tools.py:1327: result.summary = "未找到可采访的Agent人设文件" +backend/app/services/zep_tools.py:1333: # Step 2: 使用LLM选择要采访的Agent(返回agent_id列表) +backend/app/services/zep_tools.py:1345: # Step 3: 生成采访问题(如果没有提供) +backend/app/services/zep_tools.py:1354: # 将问题合并为一个采访prompt +backend/app/services/zep_tools.py:1357: # 添加优化前缀,约束Agent回复格式 +backend/app/services/zep_tools.py:1359: "你正在接受一次采访。请结合你的人设、所有的过往记忆与行动," +backend/app/services/zep_tools.py:1360: "以纯文本方式直接回答以下问题。\n" +backend/app/services/zep_tools.py:1361: "回复要求:\n" +backend/app/services/zep_tools.py:1362: "1. 直接用自然语言回答,不要调用任何工具\n" +backend/app/services/zep_tools.py:1363: "2. 不要返回JSON格式或工具调用格式\n" +backend/app/services/zep_tools.py:1364: "3. 不要使用Markdown标题(如#、##、###)\n" +backend/app/services/zep_tools.py:1365: "4. 按问题编号逐一回答,每个回答以「问题X:」开头(X为问题编号)\n" +backend/app/services/zep_tools.py:1366: "5. 每个问题的回答之间用空行分隔\n" +backend/app/services/zep_tools.py:1367: "6. 
回答要有实质内容,每个问题至少回答2-3句话\n\n" +backend/app/services/zep_tools.py:1371: # Step 4: 调用真实的采访API(不指定platform,默认双平台同时采访) +backend/app/services/zep_tools.py:1373: # 构建批量采访列表(不指定platform,双平台采访) +backend/app/services/zep_tools.py:1378: "prompt": optimized_prompt # 使用优化后的prompt +backend/app/services/zep_tools.py:1379: # 不指定platform,API会在twitter和reddit两个平台都采访 +backend/app/services/zep_tools.py:1384: # 调用 SimulationRunner 的批量采访方法(不传platform,双平台采访) +backend/app/services/zep_tools.py:1388: platform=None, # 不指定platform,双平台采访 +backend/app/services/zep_tools.py:1389: timeout=180.0 # 双平台需要更长超时 +backend/app/services/zep_tools.py:1394: # 检查API调用是否成功 +backend/app/services/zep_tools.py:1396: error_msg = api_result.get("error", "未知错误") +backend/app/services/zep_tools.py:1398: result.summary = f"采访API调用失败:{error_msg}。请检查OASIS模拟环境状态。" +backend/app/services/zep_tools.py:1401: # Step 5: 解析API返回结果,构建AgentInterview对象 +backend/app/services/zep_tools.py:1402: # 双平台模式返回格式: {"twitter_0": {...}, "reddit_0": {...}, "twitter_1": {...}, ...} +backend/app/services/zep_tools.py:1409: agent_role = agent.get("profession", "未知") +backend/app/services/zep_tools.py:1412: # 获取该Agent在两个平台的采访结果 +backend/app/services/zep_tools.py:1419: # 清理可能的工具调用 JSON 包裹 +backend/app/services/zep_tools.py:1423: # 始终输出双平台标记 +backend/app/services/zep_tools.py:1424: twitter_text = twitter_response if twitter_response else "(该平台未获得回复)" +backend/app/services/zep_tools.py:1425: reddit_text = reddit_response if reddit_response else "(该平台未获得回复)" +backend/app/services/zep_tools.py:1426: response_text = f"【Twitter平台回答】\n{twitter_text}\n\n【Reddit平台回答】\n{reddit_text}" +backend/app/services/zep_tools.py:1428: # 提取关键引言(从两个平台的回答中) +backend/app/services/zep_tools.py:1432: # 清理响应文本:去掉标记、编号、Markdown 等干扰 +backend/app/services/zep_tools.py:1436: clean_text = re.sub(r'问题\d+[::]\s*', '', clean_text) +backend/app/services/zep_tools.py:1439: # 策略1(主): 提取完整的有实质内容的句子 +backend/app/services/zep_tools.py:1445: and not s.strip().startswith(('{', '问题')) 
+backend/app/services/zep_tools.py:1450: # 策略2(补充): 正确配对的中文引号「」内长文本 +backend/app/services/zep_tools.py:1459: agent_bio=agent_bio[:1000], # 扩大bio长度限制 +backend/app/services/zep_tools.py:1469: # 模拟环境未运行 +backend/app/services/zep_tools.py:1471: result.summary = f"采访失败:{str(e)}。模拟环境可能已关闭,请确保OASIS环境正在运行。" +backend/app/services/zep_tools.py:1477: result.summary = f"采访过程发生错误:{str(e)}" +backend/app/services/zep_tools.py:1480: # Step 6: 生成采访摘要 +backend/app/services/zep_tools.py:1492: """清理 Agent 回复中的 JSON 工具调用包裹,提取实际内容""" +backend/app/services/zep_tools.py:1512: """加载模拟的Agent人设文件""" +backend/app/services/zep_tools.py:1516: # 构建人设文件路径 +backend/app/services/zep_tools.py:1524: # 优先尝试读取Reddit JSON格式 +backend/app/services/zep_tools.py:1535: # 尝试读取Twitter CSV格式 +backend/app/services/zep_tools.py:1542: # CSV格式转换为统一格式 +backend/app/services/zep_tools.py:1548: "profession": "未知" +backend/app/services/zep_tools.py:1565: 使用LLM选择要采访的Agent +backend/app/services/zep_tools.py:1569: - selected_agents: 选中Agent的完整信息列表 +backend/app/services/zep_tools.py:1570: - selected_indices: 选中Agent的索引列表(用于API调用) +backend/app/services/zep_tools.py:1571: - reasoning: 选择理由 +backend/app/services/zep_tools.py:1574: # 构建Agent摘要列表 +backend/app/services/zep_tools.py:1580: "profession": profile.get("profession", "未知"), +backend/app/services/zep_tools.py:1586: system_prompt = """你是一个专业的采访策划专家。你的任务是根据采访需求,从模拟Agent列表中选择最适合采访的对象。 +backend/app/services/zep_tools.py:1588:选择标准: +backend/app/services/zep_tools.py:1589:1. Agent的身份/职业与采访主题相关 +backend/app/services/zep_tools.py:1590:2. Agent可能持有独特或有价值的观点 +backend/app/services/zep_tools.py:1591:3. 选择多样化的视角(如:支持方、反对方、中立方、专业人士等) +backend/app/services/zep_tools.py:1592:4. 
优先选择与事件直接相关的角色 +backend/app/services/zep_tools.py:1594:返回JSON格式: +backend/app/services/zep_tools.py:1596: "selected_indices": [选中Agent的索引列表], +backend/app/services/zep_tools.py:1597: "reasoning": "选择理由说明" +backend/app/services/zep_tools.py:1600: user_prompt = f"""采访需求: +backend/app/services/zep_tools.py:1603:模拟背景: +backend/app/services/zep_tools.py:1604:{simulation_requirement if simulation_requirement else "未提供"} +backend/app/services/zep_tools.py:1606:可选择的Agent列表(共{len(agent_summaries)}个): +backend/app/services/zep_tools.py:1609:请选择最多{max_agents}个最适合采访的Agent,并说明选择理由。""" +backend/app/services/zep_tools.py:1621: reasoning = response.get("reasoning", "基于相关性自动选择") +backend/app/services/zep_tools.py:1623: # 获取选中的Agent完整信息 +backend/app/services/zep_tools.py:1635: # 降级:选择前N个 +backend/app/services/zep_tools.py:1638: return selected, indices, "使用默认选择策略" +backend/app/services/zep_tools.py:1646: """使用LLM生成采访问题""" +backend/app/services/zep_tools.py:1648: agent_roles = [a.get("profession", "未知") for a in selected_agents] +backend/app/services/zep_tools.py:1650: system_prompt = """你是一个专业的记者/采访者。根据采访需求,生成3-5个深度采访问题。 +backend/app/services/zep_tools.py:1652:问题要求: +backend/app/services/zep_tools.py:1653:1. 开放性问题,鼓励详细回答 +backend/app/services/zep_tools.py:1654:2. 针对不同角色可能有不同答案 +backend/app/services/zep_tools.py:1655:3. 涵盖事实、观点、感受等多个维度 +backend/app/services/zep_tools.py:1656:4. 语言自然,像真实采访一样 +backend/app/services/zep_tools.py:1657:5. 每个问题控制在50字以内,简洁明了 +backend/app/services/zep_tools.py:1658:6. 
直接提问,不要包含背景说明或前缀 +backend/app/services/zep_tools.py:1660:返回JSON格式:{"questions": ["问题1", "问题2", ...]}""" +backend/app/services/zep_tools.py:1662: user_prompt = f"""采访需求:{interview_requirement} +backend/app/services/zep_tools.py:1664:模拟背景:{simulation_requirement if simulation_requirement else "未提供"} +backend/app/services/zep_tools.py:1666:采访对象角色:{', '.join(agent_roles)} +backend/app/services/zep_tools.py:1668:请生成3-5个采访问题。""" +backend/app/services/zep_tools.py:1679: return response.get("questions", [f"关于{interview_requirement},您有什么看法?"]) +backend/app/services/zep_tools.py:1684: f"关于{interview_requirement},您的观点是什么?", +backend/app/services/zep_tools.py:1685: "这件事对您或您所代表的群体有什么影响?", +backend/app/services/zep_tools.py:1686: "您认为应该如何解决或改进这个问题?" +backend/app/services/zep_tools.py:1694: """生成采访摘要""" +backend/app/services/zep_tools.py:1697: return "未完成任何采访" +backend/app/services/zep_tools.py:1699: # 收集所有采访内容 +backend/app/services/zep_tools.py:1704: system_prompt = """你是一个专业的新闻编辑。请根据多位受访者的回答,生成一份采访摘要。 +backend/app/services/zep_tools.py:1706:摘要要求: +backend/app/services/zep_tools.py:1707:1. 提炼各方主要观点 +backend/app/services/zep_tools.py:1708:2. 指出观点的共识和分歧 +backend/app/services/zep_tools.py:1709:3. 突出有价值的引言 +backend/app/services/zep_tools.py:1710:4. 客观中立,不偏袒任何一方 +backend/app/services/zep_tools.py:1711:5. 
控制在1000字内 +backend/app/services/zep_tools.py:1713:格式约束(必须遵守): +backend/app/services/zep_tools.py:1714:- 使用纯文本段落,用空行分隔不同部分 +backend/app/services/zep_tools.py:1715:- 不要使用Markdown标题(如#、##、###) +backend/app/services/zep_tools.py:1716:- 不要使用分割线(如---、***) +backend/app/services/zep_tools.py:1717:- 引用受访者原话时使用中文引号「」 +backend/app/services/zep_tools.py:1718:- 可以使用**加粗**标记关键词,但不要使用其他Markdown语法""" +backend/app/services/zep_tools.py:1720: user_prompt = f"""采访主题:{interview_requirement} +backend/app/services/zep_tools.py:1722:采访内容: +backend/app/services/zep_tools.py:1725:请生成采访摘要。""" +backend/app/services/zep_tools.py:1740: # 降级:简单拼接 +backend/app/services/zep_tools.py:1741: return f"共采访了{len(interviews)}位受访者,包括:" + "、".join([i.agent_name for i in interviews]) +backend/app/utils/__init__.py:2:工具模块 +backend/app/utils/file_parser.py:2:文件解析工具 +backend/app/utils/file_parser.py:3:支持PDF、Markdown、TXT文件的文本提取 +backend/app/utils/file_parser.py:13: 读取文本文件,UTF-8失败时自动探测编码。 +backend/app/utils/file_parser.py:15: 采用多级回退策略: +backend/app/utils/file_parser.py:16: 1. 首先尝试 UTF-8 解码 +backend/app/utils/file_parser.py:17: 2. 使用 charset_normalizer 检测编码 +backend/app/utils/file_parser.py:18: 3. 回退到 chardet 检测编码 +backend/app/utils/file_parser.py:19: 4. 
最终使用 UTF-8 + errors='replace' 兜底 +backend/app/utils/file_parser.py:22: file_path: 文件路径 +backend/app/utils/file_parser.py:25: 解码后的文本内容 +backend/app/utils/file_parser.py:29: # 首先尝试 UTF-8 +backend/app/utils/file_parser.py:35: # 尝试使用 charset_normalizer 检测编码 +backend/app/utils/file_parser.py:45: # 回退到 chardet +backend/app/utils/file_parser.py:54: # 最终兜底:使用 UTF-8 + replace +backend/app/utils/file_parser.py:62: """文件解析器""" +backend/app/utils/file_parser.py:69: 从文件中提取文本 +backend/app/utils/file_parser.py:72: file_path: 文件路径 +backend/app/utils/file_parser.py:75: 提取的文本内容 +backend/app/utils/file_parser.py:80: raise FileNotFoundError(f"文件不存在: {file_path}") +backend/app/utils/file_parser.py:85: raise ValueError(f"不支持的文件格式: {suffix}") +backend/app/utils/file_parser.py:94: raise ValueError(f"无法处理的文件格式: {suffix}") +backend/app/utils/file_parser.py:98: """从PDF提取文本""" +backend/app/utils/file_parser.py:102: raise ImportError("需要安装PyMuPDF: pip install PyMuPDF") +backend/app/utils/file_parser.py:115: """从Markdown提取文本,支持自动编码检测""" +backend/app/utils/file_parser.py:120: """从TXT提取文本,支持自动编码检测""" +backend/app/utils/file_parser.py:126: 从多个文件提取文本并合并 +backend/app/utils/file_parser.py:129: file_paths: 文件路径列表 +backend/app/utils/file_parser.py:132: 合并后的文本 +backend/app/utils/file_parser.py:140: all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}") +backend/app/utils/file_parser.py:142: all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===") +backend/app/utils/file_parser.py:153: 将文本分割成小块 +backend/app/utils/file_parser.py:156: text: 原始文本 +backend/app/utils/file_parser.py:157: chunk_size: 每块的字符数 +backend/app/utils/file_parser.py:158: overlap: 重叠字符数 +backend/app/utils/file_parser.py:161: 文本块列表 +backend/app/utils/file_parser.py:172: # 尝试在句子边界处分割 +backend/app/utils/file_parser.py:174: # 查找最近的句子结束符 +backend/app/utils/file_parser.py:185: # 下一个块从重叠位置开始 +backend/app/utils/llm_client.py:2:LLM客户端封装 +backend/app/utils/llm_client.py:3:统一使用OpenAI格式调用 +backend/app/utils/llm_client.py:16: """LLM客户端""" 
+backend/app/utils/llm_client.py:29: raise ValueError("LLM_API_KEY 未配置") +backend/app/utils/llm_client.py:41: 发送聊天请求 +backend/app/utils/llm_client.py:44: messages: 消息列表 +backend/app/utils/llm_client.py:45: temperature: 温度参数 +backend/app/utils/llm_client.py:46: max_tokens: 最大token数 +backend/app/utils/llm_client.py:47: response_format: 响应格式(如JSON模式) +backend/app/utils/llm_client.py:50: 模型响应文本 +backend/app/utils/llm_client.py:64: # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 +backend/app/utils/llm_client.py:82: # 清理markdown代码块标记 +backend/app/utils/llm_client.py:93: raise ValueError(f"LLM返回的JSON格式无效: {cleaned_response}") +backend/app/utils/locale.py:96: return lang_config.get('llmInstruction', '请使用中文回答。') +backend/app/utils/logger.py:2:日志配置模块 +backend/app/utils/logger.py:3:提供统一的日志管理,同时输出到控制台和文件 +backend/app/utils/logger.py:15: 确保 stdout/stderr 使用 UTF-8 编码 +backend/app/utils/logger.py:16: 解决 Windows 控制台中文乱码问题 +backend/app/utils/logger.py:19: # Windows 下重新配置标准输出为 UTF-8 +backend/app/utils/logger.py:26:# 日志目录 +backend/app/utils/logger.py:32: 设置日志器 +backend/app/utils/logger.py:35: name: 日志器名称 +backend/app/utils/logger.py:36: level: 日志级别 +backend/app/utils/logger.py:39: 配置好的日志器 +backend/app/utils/logger.py:41: # 确保日志目录存在 +backend/app/utils/logger.py:44: # 创建日志器 +backend/app/utils/logger.py:48: # 阻止日志向上传播到根 logger,避免重复输出 +backend/app/utils/logger.py:51: # 如果已经有处理器,不重复添加 +backend/app/utils/logger.py:55: # 日志格式 +backend/app/utils/logger.py:66: # 1. 文件处理器 - 详细日志(按日期命名,带轮转) +backend/app/utils/logger.py:77: # 2. 
控制台处理器 - 简洁日志(INFO及以上) +backend/app/utils/logger.py:78: # 确保 Windows 下使用 UTF-8 编码,避免中文乱码 +backend/app/utils/logger.py:84: # 添加处理器 +backend/app/utils/logger.py:93: 获取日志器(如果不存在则创建) +backend/app/utils/logger.py:96: name: 日志器名称 +backend/app/utils/logger.py:99: 日志器实例 +backend/app/utils/logger.py:107:# 创建默认日志器 +backend/app/utils/logger.py:111:# 便捷方法 +backend/app/utils/retry.py:2:API调用重试机制 +backend/app/utils/retry.py:3:用于处理LLM等外部API调用的重试逻辑 +backend/app/utils/retry.py:25: 带指数退避的重试装饰器 +backend/app/utils/retry.py:28: max_retries: 最大重试次数 +backend/app/utils/retry.py:29: initial_delay: 初始延迟(秒) +backend/app/utils/retry.py:30: max_delay: 最大延迟(秒) +backend/app/utils/retry.py:31: backoff_factor: 退避因子 +backend/app/utils/retry.py:32: jitter: 是否添加随机抖动 +backend/app/utils/retry.py:33: exceptions: 需要重试的异常类型 +backend/app/utils/retry.py:34: on_retry: 重试时的回调函数 (exception, retry_count) +backend/app/utils/retry.py:55: logger.error(f"函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") +backend/app/utils/retry.py:58: # 计算延迟 +backend/app/utils/retry.py:64: f"函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " +backend/app/utils/retry.py:65: f"{current_delay:.1f}秒后重试..." +backend/app/utils/retry.py:90: 异步版本的重试装饰器 +backend/app/utils/retry.py:108: logger.error(f"异步函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") +backend/app/utils/retry.py:116: f"异步函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " +backend/app/utils/retry.py:117: f"{current_delay:.1f}秒后重试..." 
+backend/app/utils/retry.py:134: 可重试的API客户端封装 +backend/app/utils/retry.py:157: 执行函数调用并在失败时重试 +backend/app/utils/retry.py:160: func: 要调用的函数 +backend/app/utils/retry.py:161: *args: 函数参数 +backend/app/utils/retry.py:162: exceptions: 需要重试的异常类型 +backend/app/utils/retry.py:163: **kwargs: 函数关键字参数 +backend/app/utils/retry.py:166: 函数返回值 +backend/app/utils/retry.py:179: logger.error(f"API调用在 {self.max_retries} 次重试后仍失败: {str(e)}") +backend/app/utils/retry.py:186: f"API调用第 {attempt + 1} 次尝试失败: {str(e)}, " +backend/app/utils/retry.py:187: f"{current_delay:.1f}秒后重试..." +backend/app/utils/retry.py:203: 批量调用并对每个失败项单独重试 +backend/app/utils/retry.py:206: items: 要处理的项目列表 +backend/app/utils/retry.py:207: process_func: 处理函数,接收单个item作为参数 +backend/app/utils/retry.py:208: exceptions: 需要重试的异常类型 +backend/app/utils/retry.py:209: continue_on_failure: 单项失败后是否继续处理其他项 +backend/app/utils/retry.py:212: (成功结果列表, 失败项列表) +backend/app/utils/retry.py:227: logger.error(f"处理第 {idx + 1} 项失败: {str(e)}") +backend/app/utils/zep_paging.py:1:"""Zep Graph 分页读取工具。 +backend/app/utils/zep_paging.py:3:Zep 的 node/edge 列表接口使用 UUID cursor 分页, +backend/app/utils/zep_paging.py:4:本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。 +backend/app/utils/zep_paging.py:33: """单页请求,失败时指数退避重试。自动处理429限速。""" +backend/app/utils/zep_paging.py:46: # 检测429限速,使用retry-after头部指定的等待时间 +backend/app/utils/zep_paging.py:68: """分页获取图谱节点,最多返回 max_items 条(默认 2000)。每页请求自带重试。""" +backend/app/utils/zep_paging.py:113: """分页获取图谱所有边,返回完整列表。每页请求自带重试。""" + +[frontend/src] (124 lines) +frontend/src/components/Step2EnvSetup.vue:680: if (newStage === '生成Agent人设' || newStage === 'generating_profiles') { +frontend/src/components/Step2EnvSetup.vue:682: } else if (newStage === '生成模拟配置' || newStage === 'generating_config') { +frontend/src/components/Step2EnvSetup.vue:689: } else if (newStage === '准备模拟脚本' || newStage === 'copying_scripts') { +frontend/src/components/Step3Simulation.vue:423: startError.value = res.error || '启动失败' +frontend/src/components/Step4Report.vue:555: const 
queryMatch = text.match(/分析问题:\s*(.+?)(?:\n|$)/) +frontend/src/components/Step4Report.vue:559: const reqMatch = text.match(/预测场景:\s*(.+?)(?:\n|$)/) +frontend/src/components/Step4Report.vue:562: // Extract counters from the "相关预测事实: X条" format. +frontend/src/components/Step4Report.vue:563: const factMatch = text.match(/相关预测事实:\s*(\d+)/) +frontend/src/components/Step4Report.vue:564: const entityMatch = text.match(/涉及实体:\s*(\d+)/) +frontend/src/components/Step4Report.vue:565: const relMatch = text.match(/关系链:\s*(\d+)/) +frontend/src/components/Step4Report.vue:571: const subQSection = text.match(/### 分析的子问题\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:578: const factsSection = text.match(/### 【关键事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:588: const entitySection = text.match(/### 【核心实体】\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:595: const summaryMatch = block.match(/摘要:\s*"?(.+?)"?(?:\n|$)/) +frontend/src/components/Step4Report.vue:596: const relatedMatch = block.match(/相关事实:\s*(\d+)/) +frontend/src/components/Step4Report.vue:607: const relSection = text.match(/### 【关系链】\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:636: const queryMatch = text.match(/查询:\s*(.+?)(?:\n|$)/) +frontend/src/components/Step4Report.vue:640: const nodesMatch = text.match(/总节点数:\s*(\d+)/) +frontend/src/components/Step4Report.vue:641: const edgesMatch = text.match(/总边数:\s*(\d+)/) +frontend/src/components/Step4Report.vue:642: const activeMatch = text.match(/当前有效事实:\s*(\d+)/) +frontend/src/components/Step4Report.vue:643: const histMatch = text.match(/历史\/过期事实:\s*(\d+)/) +frontend/src/components/Step4Report.vue:650: const activeSection = text.match(/### 【当前有效事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:661: const histSection = text.match(/### 【历史\/过期事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:671: const entitySection = 
text.match(/### 【涉及实体】\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:700: const topicMatch = text.match(/\*\*采访主题:\*\*\s*(.+?)(?:\n|$)/) +frontend/src/components/Step4Report.vue:703: // Extract the interview-count line, e.g. "5 / 9 位模拟Agent". +frontend/src/components/Step4Report.vue:704: const countMatch = text.match(/\*\*采访人数:\*\*\s*(\d+)\s*\/\s*(\d+)/) +frontend/src/components/Step4Report.vue:712: const reasonMatch = text.match(/### 采访对象选择理由\n([\s\S]*?)(?=\n---\n|\n### 采访实录)/) +frontend/src/components/Step4Report.vue:738: // Format 2: "- 选择(index ):" +frontend/src/components/Step4Report.vue:740: headerMatch = line.match(/^-\s*选择([^((]+)(?:[((]index\s*=?\s*\d+[))])?[::]\s*(.*)/) +frontend/src/components/Step4Report.vue:763: } else if (currentName && line.trim() && !line.match(/^未选|^综上|^最终选择/)) { +frontend/src/components/Step4Report.vue:779: const interviewBlocks = text.split(/#### 采访 #\d+:/).slice(1) +frontend/src/components/Step4Report.vue:795: // Extract the title (e.g. "学生", "教育从业者"). 
+frontend/src/components/Step4Report.vue:809: const bioMatch = block.match(/_简介:\s*([\s\S]*?)_\n/) +frontend/src/components/Step4Report.vue:832: const answerMatch = block.match(/\*\*A:\*\*\s*([\s\S]*?)(?=\*\*关键引言|$)/) +frontend/src/components/Step4Report.vue:837: const twitterMatch = answerText.match(/【Twitter平台回答】\n?([\s\S]*?)(?=【Reddit平台回答】|$)/) +frontend/src/components/Step4Report.vue:838: const redditMatch = answerText.match(/【Reddit平台回答】\n?([\s\S]*?)$/) +frontend/src/components/Step4Report.vue:850: if (interview.redditAnswer && interview.redditAnswer !== '(该平台未获得回复)') { +frontend/src/components/Step4Report.vue:854: if (interview.twitterAnswer && interview.twitterAnswer !== '(该平台未获得回复)') { +frontend/src/components/Step4Report.vue:864: const quotesMatch = block.match(/\*\*关键引言:\*\*\n([\s\S]*?)(?=\n---|\n####|$)/) +frontend/src/components/Step4Report.vue:886: const summaryMatch = text.match(/### 采访摘要与核心观点\n([\s\S]*?)$/) +frontend/src/components/Step4Report.vue:908: const queryMatch = text.match(/搜索查询:\s*(.+?)(?:\n|$)/) +frontend/src/components/Step4Report.vue:912: const countMatch = text.match(/找到\s*(\d+)\s*条/) +frontend/src/components/Step4Report.vue:916: const factsSection = text.match(/### 相关事实:\n([\s\S]*)$/) +frontend/src/components/Step4Report.vue:923: const edgesSection = text.match(/### 相关边:\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:936: const nodesSection = text.match(/### 相关节点:\n([\s\S]*?)(?=\n###|$)/) +frontend/src/components/Step4Report.vue:1325: return t === '(该平台未获得回复)' || t === '(该平台未获得回复)' || t === '[无回复]' +frontend/src/components/Step4Report.vue:1334: // 1. "问题X:" / "问题X:" — the newer Chinese-style format from the backend. +frontend/src/components/Step4Report.vue:1339: // Try the "问题X:" form first. 
+frontend/src/components/Step4Report.vue:1340: const cnPattern = /(?:^|[\r\n]+)问题(\d+)[::]\s*/g +frontend/src/components/Step4Report.vue:1364: .replace(/^问题\d+[::]\s*/, '') +frontend/src/components/Step4Report.vue:1464: h('div', { class: 'reason-label' }, '选择理由'), +frontend/src/components/Step4Report.vue:1774: return steps[0] || { noLabel: '--', title: '等待开始', status: 'todo', meta: '' } +frontend/src/components/Step4Report.vue:2005: if (log.includes('ERROR') || log.includes('错误')) return 'error' +frontend/src/components/Step4Report.vue:2006: if (log.includes('WARNING') || log.includes('警告')) return 'warning' +frontend/src/components/Step4Report.vue:2096: // Look for content after the Chinese "最终答案:" marker. +frontend/src/components/Step4Report.vue:2097: const chineseFinalMatch = response.match(/最终答案[::]\s*\n*([\s\S]*)$/i) +frontend/src/components/Step5Interaction.vue:721: .map(msg => `${msg.role === 'user' ? '提问者' : '你'}:${msg.content}`) +frontend/src/components/Step5Interaction.vue:723: prompt = `以下是我们之前的对话:\n${historyContext}\n\n现在我的新问题是:${message}` +frontend/src/views/Process.vue:10:
图谱构建
+frontend/src/views/Process.vue:26: 实时知识图谱 +frontend/src/views/Process.vue:30: {{ graphData.node_count || graphData.nodes?.length || 0 }} 节点 +frontend/src/views/Process.vue:32: {{ graphData.edge_count || graphData.edges?.length || 0 }} 关系 +frontend/src/views/Process.vue:36: