diff --git a/.kiro/specs/i18n-ci-guard/baseline.txt b/.kiro/specs/i18n-ci-guard/baseline.txt index e92f1a6e..94f44463 100644 --- a/.kiro/specs/i18n-ci-guard/baseline.txt +++ b/.kiro/specs/i18n-ci-guard/baseline.txt @@ -1,5 +1,5 @@ # Per-path CJK baseline for the i18n CI guard. # Format: \t. Sorted lexicographically. # Refresh via: python scripts/ci/i18n_cjk_guard.py --update-baseline -backend/app 2792 -frontend/src 902 +backend/app 307 +frontend/src 124 diff --git a/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md b/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md new file mode 100644 index 00000000..0e589d02 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/HANDOFF.md @@ -0,0 +1,78 @@ +# Handoff — `i18n-translate-backend-comments` (Issue #7) + +## Status +**Complete.** All in-scope Chinese docstrings and `#` comments under `backend/` have been translated to English. + +This second installment of the ticket-#7 cleanup builds on the first installment (PR #20) and finishes the remaining 12 files. Together, the two installments cover the full 35-file in-scope set. + +## Completed across both installments (35 files) + +### First installment (PR #20 — landed on `feat/i18n-6-externalize-backend-logs`, then merged here via `merge main` into this branch) +- **Root**: `backend/app/__init__.py`, `backend/app/config.py`, `backend/run.py` +- **API package init**: `backend/app/api/__init__.py` +- **Models** (full package): `backend/app/models/__init__.py`, `project.py`, `task.py` +- **Utils** (full package): `backend/app/utils/__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py`, `logger.py`, `retry.py`, `zep_paging.py` +- **Services** (partial): `backend/app/services/__init__.py`, `graph_builder.py`, `ontology_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `text_processor.py`, `zep_entity_reader.py` +- **Scripts** (partial): `backend/scripts/action_logger.py`, `backend/scripts/test_profile_format.py` + +### Second installment (this PR — finishes the ticket) +| File | Starting in-scope hits | Comment-the-obvious deletions | +| --- | --- | --- | +| `backend/app/api/graph.py` | 70 | 25 | +| `backend/app/api/report.py` | 104 | 11 | +| `backend/app/api/simulation.py` | 351 | ~25 | +| `backend/app/services/oasis_profile_generator.py` | 185 | ~14 | +| `backend/app/services/report_agent.py` | 335 | 8 | +| `backend/app/services/simulation_config_generator.py` | 148 | 0 | +| `backend/app/services/simulation_runner.py` | 277 | ~31 | +| `backend/app/services/zep_graph_memory_updater.py` | 97 | 5 | +| `backend/app/services/zep_tools.py` | 269 | 6 | +| `backend/scripts/run_parallel_simulation.py` | 227 | ~7 | +| `backend/scripts/run_reddit_simulation.py` | 75 | 12 | +| `backend/scripts/run_twitter_simulation.py` | 97 | 21 | +| **Total** | **2,235** | **~165** | + +After the pass, every file in the table reports zero in-scope hits from the AST scanner. + +## Remaining residuals (out of scope — owned by sibling tickets) +After this PR, the only files under `backend/` that still contain CJK characters do so exclusively inside string literals. These are owned by sibling tickets and are intentional residuals for this spec: + +- LLM prompt template strings: `oasis_profile_generator.py`, `ontology_generator.py`, `simulation_config_generator.py`, `report_agent.py` — owned by tickets #2 / #3 / #4 / #5. +- Runtime log strings, API response messages, exception arguments, CLI prints: distributed across `api/`, `services/`, `scripts/`, `utils/retry.py`, `utils/locale.py`, `run.py`, `app/config.py` — owned by ticket #6 (with follow-up tickets #18, #24 for residuals). +- Sample-data values returned to clients: `services/zep_tools.py`, `services/zep_graph_memory_updater.py`, `services/zep_entity_reader.py`, etc. + +The CJK CI guard (`scripts/ci/i18n_cjk_guard.py`) enforces that this set never grows; the per-path baseline at `.kiro/specs/i18n-ci-guard/baseline.txt` is updated as part of this PR to reflect the new (lower) count. + +## Verification methodology +The AST-aware scanner at `.kiro/specs/i18n-translate-backend-comments/scan_chinese.py` (committed in this branch) classifies every CJK-bearing line into one of three buckets: + +- `DOCSTRING` — line lies inside a module/class/function docstring (in scope). +- `COMMENT` — line contains a `#` and is not inside a docstring or string-literal span (in scope). +- `STRING` — line is part of a string-literal value (out of scope, owned by sibling tickets). + +For every translated file in this installment: + +1. `python3 -m py_compile ` succeeds. +2. The scanner reports `0` in-scope hits. +3. `git diff ` shows only docstring lines and `#` comment lines changed; no signature, import, decorator, expression, or string-literal byte changes. + +For two of the largest files (`api/simulation.py`, `report_agent.py`), the implementing agent additionally ran an AST-equivalence check (parsing both before and after, stripping docstrings, and confirming structural equality) to validate that no executable surface changed. + +## Test environment caveat +The repo's `uv sync` builds `tiktoken` from source, which requires a Rust toolchain. The sandbox running this implementation pass does not have Rust, so `cd backend && uv run python -m pytest scripts/test_profile_format.py` cannot be executed end-to-end here. Because the change set is comments-and-docstrings-only, runtime behavior cannot be affected; the syntactic-validity check (`py_compile` across all 12 files) stands in for the test run in this environment. + +A developer with the project's normal dev environment (Rust toolchain installed, full `uv sync` succeeded) should re-run `cd backend && uv run python -m pytest scripts/test_profile_format.py` against this branch before merging to confirm. + +## What is NOT changed +- No string literal anywhere in the touched files (verified by AST classification). +- No executable Python statement. +- No symbol renamed; `zep_*` legacy filenames preserved per steering rule. +- No file added or removed (other than the AST scanner inside `.kiro/specs/i18n-translate-backend-comments/`). +- No dependency added or version-bumped. + +## Branch & PR +- Branch: `docs/i18n-7-translate-backend-comments` (re-used from PR #20; that PR was merged into `feat/i18n-6-externalize-backend-logs` after `feat/i18n-6` had already merged into `main`, which orphaned PR #20's content from `main`). +- This PR re-targets the branch at `main`, including: the four prior commits from PR #20, a `Merge branch 'main'` commit (one conflict resolved in `services/ontology_generator.py` to combine PR #20's translated comment with main's English prompt-string), and the new commits for the 12 files completed here. +- Commits follow Conventional Commits in the form `docs(i18n): translate chinese docstrings/comments in backend/`. +- The PR description references issue #7 with `Closes #7`. +- No `Co-Authored-By:` watermarks. diff --git a/.kiro/specs/i18n-translate-backend-comments/design.md b/.kiro/specs/i18n-translate-backend-comments/design.md new file mode 100644 index 00000000..029150d5 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/design.md @@ -0,0 +1,316 @@ +# Design Document — `i18n-translate-backend-comments` + +## Overview +**Purpose**: Translate Chinese-language docstrings and `#` comments across `backend/` Python files into English, so that English-speaking maintainers can read and review the codebase without translation overhead. + +**Users**: Backend maintainers and code reviewers who do not read Chinese. + +**Impact**: Improves developer ergonomics and review throughput. No runtime, behavior, or interface change. Adjacent i18n tickets (#2/#3/#4/#5/#6), which own the string-literal Chinese, remain unaffected. + +### Goals +- Eliminate Chinese characters from docstrings and `#` comments under the in-scope paths. +- Preserve Google-style docstring shape and project formatting rules (4-space indent, ≤120 chars/line, double-quoted strings). +- Keep the diff comments-and-docstrings-only — no executable, string-literal, or symbol changes. + +### Non-Goals +- Translating Chinese inside string literals (prompt templates, `logger.{info,warning,error}` arguments, API responses, error messages). These are owned by issues #2/#3/#4/#5/#6. +- Refactoring code, reformatting style, or renaming symbols. +- Introducing new tooling, linters, or CI rules. +- Translating `backend/tests/test_locale*.py` (Chinese there is intentional test data inside string literals; outside ticket scope). + +## Boundary Commitments + +### This Spec Owns +- Comment and docstring text under: `backend/app/__init__.py`, `backend/app/config.py`, `backend/app/api/`, `backend/app/models/`, `backend/app/services/`, `backend/app/utils/`, `backend/run.py`, `backend/scripts/`. +- The decision rule for distinguishing docstrings from value strings (first-statement rule). +- The Chinese→English Google-style docstring key map. +- The verification workflow (residual `grep`, `pytest`, diff sanity check). + +### Out of Boundary +- All string-literal content, including triple-quoted strings used as values. +- Files under `backend/tests/`, `backend/.venv/`, and any non-Python file. +- Refactors, renames, formatting changes, or new dependencies. +- Front-end localization, locale JSON files, or i18n runtime behavior. + +### Allowed Dependencies +- The repository's Python source (read + write for in-scope files only). +- The existing test suite (`backend/scripts/test_profile_format.py`) for verification. +- The existing `grep`-based residual scan for verification. + +### Revalidation Triggers +- A new in-scope file added under the listed paths (would expand the file list). +- A change to `dev-guidelines.md` regarding docstring style (would change the key map or quote/indent rule). +- A merge of any adjacent i18n ticket (#2/#3/#4/#5/#6) that turns a string literal into a docstring or vice versa. + +## Architecture + +### Existing Architecture Analysis +This change touches only commentary; no architectural element of the backend is modified. The work spans the following packages: + +- `backend/app/__init__.py`, `backend/app/config.py` (Flask app and configuration entrypoint). +- `backend/app/api/` (Flask blueprints). +- `backend/app/models/` (`Project`, `Task` models). +- `backend/app/services/` (graph builder, simulation runner, report agent, etc.). +- `backend/app/utils/` (LLM client, file parser, retry, logger, locale, paging). +- `backend/run.py` (process entrypoint). +- `backend/scripts/` (simulation runners, profile-format test). + +### Architecture Pattern & Boundary Map + +```mermaid +graph TB + Discovery[Residual Grep Scan] + Plan[Per-Package Plan] + Translator[Translation Pass] + Verify[Verification Gate] + Commit[Per-Package Commit] + PR[Single PR to main] + + Discovery --> Plan + Plan --> Translator + Translator --> Verify + Verify -->|all checks pass| Commit + Verify -->|any check fails| Translator + Commit --> Plan + Commit -->|all packages done| PR +``` + +**Architecture Integration**: +- Selected pattern: **Iterative pass per package** with a verification gate after each pass. Linear, deterministic, low-coordination. +- Domain/feature boundaries: One pass per backend package; commits are package-scoped to keep review chunks small. +- Existing patterns preserved: 4-space indent, double-quoted strings, Google-style docstrings, `snake_case`, project file layout. +- New components rationale: None — no new code, no new files. +- Steering compliance: Conforms to repo-level coding rules and the commits ruleset. + +### Technology Stack + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Backend / Services | Python ≥3.11 | Source language whose docstrings/comments are being translated | No version change; no dependency change | +| Tooling | `git`, `grep`, `pytest` (existing) | Discovery, verification, regression check | No new tools | + +No frontend, data, messaging, or infrastructure layer is touched. + +## File Structure Plan + +### Directory Structure (no additions, no deletions) +``` +backend/ +├── app/ +│ ├── __init__.py # docstrings/comments only +│ ├── config.py # docstrings/comments only +│ ├── api/ # all *.py: docstrings/comments only +│ ├── models/ # all *.py: docstrings/comments only +│ ├── services/ # all *.py: docstrings/comments only +│ └── utils/ # all *.py: docstrings/comments only +├── run.py # docstrings/comments only +└── scripts/ # all *.py: docstrings/comments only +``` + +### Modified Files +The 37 in-scope files identified in `gap-analysis.md` are modified — comment and docstring lines only. No other paths are touched. + +## Translation Rules + +These rules drive the translation pass and the verification gate. They are normative; the implementation must follow them exactly. + +### Rule 1 — Docstring vs Value String Disambiguation +A triple-quoted string is treated as a **docstring** (in scope) iff it is the first statement of a module, class, or function body. All other triple-quoted strings are **values** (out of scope) and must not be modified. + +### Rule 2 — Translate Docstrings to English Google-style +- Translate Chinese narrative text to faithful English. +- Convert the following Chinese section keys to canonical English Google-style keys when present: + +| Chinese key | English key | +| --- | --- | +| `参数:` | `Args:` | +| `返回:` | `Returns:` | +| `异常:` | `Raises:` | +| `产生:` / `生成:` | `Yields:` | +| `示例:` | `Examples:` | +| `注意:` / `备注:` | `Note:` | + +- Preserve double-quoted triple-quoted form (`"""..."""`). +- Preserve indentation matching the surrounding scope. + +### Rule 3 — Translate Inline `#` Comments to English +- Translate the comment text to English. +- If the translated comment would merely restate the immediately following executable line (a redundant verb-phrase paraphrase), delete the comment. +- Preserve `TODO:` / `FIXME:` markers and any embedded ticket reference verbatim. +- Preserve trailing in-line comments on the same line as code (e.g. `PENDING = "pending" # waiting`). + +### Rule 4 — Style Compliance +- Keep every translated line ≤120 characters. +- Do not introduce trailing whitespace. +- Preserve the original indentation of each comment/docstring. +- Use double quotes for any docstring rewritten. + +### Rule 5 — Preservation +- Do not modify any executable Python statement. +- Do not modify any string literal (single-, double-, triple-quoted, f-string, raw, byte) that is not a docstring under Rule 1. The single exception is the docstring being rewritten under Rule 2: quote-style normalization to triple double-quoted form (`"""..."""`) is permitted on the docstring only, since it is the artifact under translation. +- Do not rename any symbol. + +## System Flows + +### Per-package iteration + +```mermaid +sequenceDiagram + participant Dev as Translator + participant Repo as Repo + participant Tests as Test Suite + Dev->>Repo: git checkout docs/i18n-7-translate-backend-comments + loop For each package in [models, utils, services, api, scripts, root] + Dev->>Repo: Translate docstrings/comments + Dev->>Repo: git diff --stat (sanity check) + Dev->>Tests: cd backend then uv run python -m pytest scripts/test_profile_format.py + Tests-->>Dev: pass / fail + Dev->>Repo: Re-run residual grep + Repo-->>Dev: residual hits (string-literal only) + Dev->>Repo: git commit -m "docs(i18n): translate chinese docstrings/comments in backend/" + end + Dev->>Repo: gh pr create -> single PR closing #7 +``` + +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1 | No Chinese in docstrings under in-scope paths | Translation Pass | Rule 1, Rule 2 | Per-package iteration | +| 1.2 | No Chinese in `#` comments under in-scope paths | Translation Pass | Rule 3 | Per-package iteration | +| 1.3 | Residual grep returns only string-literal Chinese | Verification Gate | Residual grep workflow | Per-package iteration | +| 1.4 | Google-style docstring shape preserved | Translation Pass | Rule 2 (key map) | — | +| 2.1 | No executable statement modified | Verification Gate | Rule 5 | Per-package iteration | +| 2.2 | No string literal modified | Verification Gate | Rule 1 (first-statement rule), Rule 5 | Per-package iteration | +| 2.3 | No symbol renamed | Verification Gate | Rule 5 | Per-package iteration | +| 2.4 | `pytest` passes | Verification Gate | Test suite invocation | Per-package iteration | +| 2.5 | Hunks touching code rejected | Verification Gate | `git diff --stat` review | Per-package iteration | +| 3.1 | Drop redundant comments | Translation Pass | Rule 3 | — | +| 3.2 | Translate the *why* faithfully | Translation Pass | Rule 3 | — | +| 3.3 | Preserve `TODO:`/`FIXME:` and ticket refs | Translation Pass | Rule 3 | — | +| 3.4 | No new comments introduced | Translation Pass | Rule 3 | — | +| 4.1 | ≤120 chars/line | Verification Gate | Rule 4 | — | +| 4.2 | No trailing whitespace | Verification Gate | Rule 4 | — | +| 4.3 | Preserve indentation | Translation Pass | Rule 4 | — | +| 4.4 | Double quotes on rewritten docstrings | Translation Pass | Rule 4 | — | +| 4.5 | Preserve 4-space indentation | Translation Pass | Rule 4 | — | +| 5.1 | Use grep for discovery | Verification Gate | Discovery scan | — | +| 5.2 | Re-run grep after each batch | Verification Gate | Residual grep workflow | Per-package iteration | +| 5.3 | Continue until non-string-literal residual cleared | Verification Gate | Rule 1 disambiguation | Per-package iteration | +| 5.4 | `git diff --stat` only in-scope paths | Verification Gate | Diff sanity check | Per-package iteration | +| 6.1 | Branch `docs/i18n-7-translate-backend-comments` | Tracking & Branching | `/done` skill | — | +| 6.2 | Reference issue #7 | Tracking & Branching | Commit/PR template | — | +| 6.3 | Conventional Commits `docs(i18n)` | Tracking & Branching | `.claude/rules/commits.md` | — | +| 6.4 | No unrelated changes | Verification Gate | Diff sanity check | — | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies (P0/P1) | Contracts | +|-----------|--------------|--------|--------------|--------------------------|-----------| +| Translation Pass | Process | Apply Rules 1–5 to one package's `*.py` | 1.1, 1.2, 1.4, 3.1, 3.2, 3.3, 3.4, 4.3, 4.4, 4.5 | None (manual + AI-assisted) | Process | +| Verification Gate | Process | Run residual grep, `pytest`, and diff sanity check after each package | 1.3, 2.1, 2.2, 2.3, 2.4, 2.5, 4.1, 4.2, 5.1, 5.2, 5.3, 5.4, 6.4 | `git`, `grep`, `pytest` (P0) | Process | +| Tracking & Branching | Process | Branching, commit messages, PR | 6.1, 6.2, 6.3 | `/done` skill, `gh` CLI (P0) | Process | + +### Process + +#### Translation Pass +| Field | Detail | +|-------|--------| +| Intent | Translate docstrings and `#` comments in one package without touching code or string literals | +| Requirements | 1.1, 1.2, 1.4, 3.1, 3.2, 3.3, 3.4, 4.3, 4.4, 4.5 | + +**Responsibilities & Constraints** +- Apply Rule 1 (first-statement disambiguation) before editing any triple-quoted string. +- Apply Rule 2 (key map) for any Chinese Google-style key encountered. +- Apply Rule 3 to inline comments; delete redundant ones. +- Operate on one package at a time; do not interleave packages. + +**Dependencies** +- Inbound: Verification Gate (provides feedback if a previous batch failed). +- Outbound: Verification Gate (hands off post-pass). +- External: None. + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Operates directly on the working tree on branch `docs/i18n-7-translate-backend-comments`. +- Validation: After each file is rewritten, sanity-check that the diff for that file shows changes only on comment/docstring lines. +- Risks: Accidental edit to a string-literal triple-quoted value — mitigated by Rule 1 + diff review. + +#### Verification Gate +| Field | Detail | +|-------|--------| +| Intent | Confirm a package's translation pass left runtime behavior intact | +| Requirements | 1.3, 2.1, 2.2, 2.3, 2.4, 2.5, 4.1, 4.2, 5.1, 5.2, 5.3, 5.4, 6.4 | + +**Responsibilities & Constraints** +- Re-run `grep -rln '[一-鿿]' backend/ --include='*.py'` after each package and confirm residual hits are limited to string-literal Chinese owned by adjacent tickets. +- Run `uv run python -m pytest backend/scripts/test_profile_format.py` and confirm exit 0. +- Run `git diff --stat` and confirm only in-scope file paths are listed. +- Spot-check a sample of changed files to confirm only comment/docstring lines changed. + +**Dependencies** +- Inbound: Translation Pass. +- Outbound: Tracking & Branching (commits) when all checks pass; loops back to Translation Pass otherwise. +- External: `git`, `grep`, `pytest` (P0 — required for verification). + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Run from the repo root; no environment variables required beyond what `uv run` already provides. +- Validation: All four checks (grep / pytest / diff scope / spot diff) must pass before committing. +- Risks: A flaky `pytest` run unrelated to this change would block progress — mitigated by reading the failure and re-running once. + +#### Tracking & Branching +| Field | Detail | +|-------|--------| +| Intent | Branch, commit, push, and open PR per project conventions | +| Requirements | 6.1, 6.2, 6.3 | + +**Responsibilities & Constraints** +- Branch name: `docs/i18n-7-translate-backend-comments`. +- Commit messages follow Conventional Commits with `docs(i18n)` scope (e.g. `docs(i18n): translate chinese docstrings/comments in backend/services`). +- PR closes #7 and references the spec. + +**Dependencies** +- Inbound: Verification Gate (only commits when all checks pass). +- External: `gh` CLI (P0), `/done` skill (P0). + +**Contracts**: Process [x] / Service [ ] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +**Implementation Notes** +- Integration: Use `/done` skill at the end to handle branch/push/PR uniformly. +- Validation: Confirm PR body references issue #7 with `Closes #7` and lists each commit. +- Risks: None. + +## Error Handling + +### Error Strategy +This is a build-time / source-edit task — there is no runtime error path. Errors are caught by the Verification Gate. + +### Error Categories and Responses +- **Translation slipped into a string literal**: caught by `git diff --stat` + spot diff. Response: revert that hunk, re-apply translation against the docstring/comment only. +- **Test suite fails after a pass**: caught by `pytest`. Response: read failure, identify which line was incorrectly modified (likely a string the translator misclassified as a docstring), revert that hunk, re-apply. +- **Residual grep returns non-string-literal Chinese**: caught by post-pass grep. Response: classify those hits as in-scope and translate them in the next sub-pass. +- **Line exceeds 120 chars after translation**: caught by spot diff. Response: reflow the comment/docstring without changing executable code. + +### Monitoring +None — this is a one-shot change. No production observability required. + +## Testing Strategy + +The repository's existing tests are the safety net. No new tests are added. + +### Default sections +- **Unit Tests**: Not applicable; nothing executable changes. +- **Integration Tests**: `uv run python -m pytest backend/scripts/test_profile_format.py` must continue to pass after each commit. +- **E2E/UI Tests**: Not applicable. +- **Verification checks (per package commit)**: + 1. Residual `grep -rln '[一-鿿]' backend/ --include='*.py'` (run from repo root) returns only files whose remaining Chinese is in string literals owned by adjacent tickets. + 2. `cd backend && uv run python -m pytest scripts/test_profile_format.py` exits 0. + 3. `git diff --stat HEAD~..HEAD` shows only in-scope file paths. + 4. Spot diff on three random changed files confirms only comment/docstring lines changed. + +## Supporting References (Optional) +- `gap-analysis.md` — full file enumeration and pattern survey. +- `research.md` — discovery log, alternatives, and decisions. diff --git a/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md b/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md new file mode 100644 index 00000000..34bc2270 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/gap-analysis.md @@ -0,0 +1,92 @@ +# Gap Analysis — `i18n-translate-backend-comments` + +## Scope Recap +- **Ticket**: salestech-group/MiroFish#7 +- **Goal**: Translate Chinese docstrings and `#` comments in `backend/` to English without behavior changes. +- **Blast radius**: Comments and docstrings only; runtime semantics preserved. + +## Current State Investigation + +### Discovered files +A scan with the regex `[一-鿿]` across `backend/**/*.py` (excluding `.venv`) returns **37 in-app files** plus 2 test files: + +| Area | Count | Files | +| --- | --- | --- | +| `backend/app/__init__.py` | 1 | `__init__.py` | +| `backend/app/config.py` | 1 | `config.py` | +| `backend/app/api/` | 4 | `__init__.py`, `graph.py`, `report.py`, `simulation.py` | +| `backend/app/models/` | 3 | `__init__.py`, `project.py`, `task.py` | +| `backend/app/services/` | 12 | `__init__.py`, `graph_builder.py`, `oasis_profile_generator.py`, `ontology_generator.py`, `report_agent.py`, `simulation_config_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `simulation_runner.py`, `text_processor.py`, `zep_entity_reader.py`, `zep_graph_memory_updater.py`, `zep_tools.py` | +| `backend/app/utils/` | 7 | `__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py`, `logger.py`, `retry.py`, `zep_paging.py` | +| `backend/run.py` | 1 | `run.py` | +| `backend/scripts/` | 5 | `action_logger.py`, `run_parallel_simulation.py`, `run_reddit_simulation.py`, `run_twitter_simulation.py`, `test_profile_format.py` | +| `backend/tests/` (extra, not in ticket file list) | 2 | `test_locale.py`, `test_locale_request_resolution.py` | + +Spot checks (`models/task.py`, `models/project.py`, `services/text_processor.py`, `utils/locale.py`): +- Module-level docstrings in Chinese (e.g. `"""任务状态管理"""`). +- Class/method docstrings in Chinese, often Google-shaped (`Args:` translated as `参数:`). +- Inline `#` comments tagging fields, sections, or restating obvious code (e.g. `# 标准化换行` above an `\n` normalization call). +- Status-enum trailing comments (e.g. `PENDING = "pending" # 等待中`). + +### Conventions to preserve +- Project guideline: 4-space indent, max 120 char/line, double-quoted strings (Python). +- Docstring style: Google-style per `dev-guidelines.md`. Existing files mix English-shape `Args:`/`Returns:` keys with Chinese descriptions, or use Chinese keys (`参数:`, `返回:`). Translate both to canonical Google-style English. +- File-level convention: `snake_case` filenames, Python `__init__.py` modules typically have a one-line module docstring. + +### Integration surfaces +None. This work touches only commentary; no API contracts, schemas, or imports change. + +## Requirements Feasibility + +| Requirement | Status | Notes | +| --- | --- | --- | +| R1 (coverage) | Feasible — straightforward | Files identified by `grep` rule. | +| R2 (behavior preservation) | Feasible | Achieved by limiting diffs to comment/docstring lines. Need to be careful with multi-line triple-quoted docstrings vs string literals (they are syntactically identical to strings — disambiguation: docstring is the *first* statement of a module/class/function body). | +| R3 (comment hygiene) | Feasible | Some judgment required; will adopt heuristic: drop comments whose translated form would be a single verb-phrase paraphrase of the next executable line. | +| R4 (style compliance) | Feasible | Watch line-length when translating dense Chinese to English (English is typically longer); rewrap as needed without changing executable code. | +| R5 (verification) | Feasible | The `grep -rln '[一-鿿]'` rule is reliable. Residual hits should land only in: prompt template strings (#2/#3/#4/#5), logger/API string literals (#6), and the `tests/test_locale*` files (intentional Chinese test data). | +| R6 (tracking/branching) | Feasible | Branch + commit conventions are standard for this repo; `/done` skill enforces them. | + +### Gaps and constraints +- **Constraint**: Triple-quoted strings used as values (not as docstrings) must NOT be edited if their content is in scope of issues #2–#6 (prompts/log messages/error messages). Disambiguation matters. +- **Constraint**: Chinese characters appearing inside f-string literal segments must remain. They are out of scope. +- **Unknown / Research Needed**: None — task is mechanical and well-bounded. + +### Adjacent specs / overlap with other tickets +- `i18n-externalize-backend-logs` (#6) owns translating `logger.{info,warning,error}` Chinese arguments and API response strings. +- `i18n-report-agent-prompts` (#5), and tickets #2/#3/#4 own prompt template strings. +- We must NOT touch any string literal that those tickets own. After this PR, residual `grep` hits should reduce by exactly the count of comments and docstrings translated and nothing else. +- The two `backend/tests/test_locale*.py` files are **not in the ticket's listed file scope**, and inspection shows their Chinese is exclusively in string literals (test data and a Unicode range check). They are out of scope by R1's enumerated paths and remain untouched. + +## Implementation Approach Options + +### Option A — Single-pass file-by-file translation (recommended) +- Walk the 37 in-scope files in a deterministic order (alphabetical), translating docstrings/comments per file, running the residual grep after each batch. +- Group commit by area (models, utils, services, api, scripts, root) to keep PR diff readable. +- ✅ Simple, low risk, easy to revert per-area. +- ✅ Maps directly to the requirements; easy to verify. +- ❌ Larger PR than option B, but ticket explicitly allows a single PR. + +### Option B — Multi-PR per package +- Split into one PR per package (`models/`, `utils/`, …). The ticket allows this. +- ✅ Smaller diffs to review. +- ❌ More overhead (multiple branches/PRs); not necessary for a mechanical change of this size. + +### Option C — Tooling-assisted bulk script +- Build a one-shot translation script (LLM-driven) that rewrites docstrings/comments. +- ✅ Could scale to other repos. +- ❌ Out of proportion for a single-ticket task; risk of errant edits to string literals; tooling itself becomes a deliverable to test and maintain. + +## Effort and Risk +- **Effort**: **M (3–7 days of focused work)** — 37 files, hundreds of comments. In an interactive AI-assisted run, this collapses to a few hours. +- **Risk**: **Low** — comments-only diff; covered by mechanical verification (grep + pytest); easy to rollback per file/area. + +## Recommendations for Design Phase + +- **Preferred approach**: Option A (single-pass file-by-file, package-grouped commits, single PR). +- **Key decisions to capture in design**: + - Order of traversal (proposed: `models/` → `utils/` → `services/` → `api/` → `scripts/` → root files `__init__.py`, `config.py`, `run.py`). + - Heuristic for "drops the obvious comment" (one-line rule). + - How to handle Google-style docstring keys: always translate `参数:` → `Args:`, `返回:` → `Returns:`, `异常:` → `Raises:`. + - Verification cadence: re-run the grep after each package batch. +- **Research items to carry forward**: None. diff --git a/.kiro/specs/i18n-translate-backend-comments/requirements.md b/.kiro/specs/i18n-translate-backend-comments/requirements.md new file mode 100644 index 00000000..39bff4f2 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/requirements.md @@ -0,0 +1,67 @@ +# Requirements Document + +## Introduction +This specification covers the developer-facing internationalization of `backend/` Python source: translating Chinese docstrings and inline comments to English so that English-speaking maintainers can read and review the code without translation overhead. The change is mechanical — no behavior, no public strings, no symbol names are modified. It is one of several i18n tickets (#2, #3, #4, #5, #6, #7); this spec covers ticket #7 only. + +## Boundary Context +- **In scope**: Translation of Chinese-language characters that appear in Python docstrings (module/class/function) and inline `#` comments under `backend/`. Removal of comments that merely restate the code. Preservation of `TODO:` / `FIXME:` markers and embedded ticket references. +- **Out of scope**: Chinese characters inside string literals (prompt templates, `logger.{info,warning,error}` arguments, API response bodies, error messages returned to clients) — these are tracked separately by issues #2/#3/#4/#5/#6. No refactoring, reformatting, renaming, or behavior changes. +- **Adjacent expectations**: Spec `i18n-externalize-backend-logs` (issue #6) and the prompt-translation specs handle string-literal Chinese; this spec must leave those untouched so the other tickets remain mergeable. + +## Requirements + +### Requirement 1: Translation Coverage of In-Scope Files +**Objective:** As a maintainer, I want every Chinese docstring and inline comment in the in-scope backend files translated to English, so that I can read and review the code without translation tools. + +#### Acceptance Criteria +1. The Backend Codebase shall contain no Chinese characters (Unicode range U+4E00–U+9FFF) inside Python docstrings under `backend/app/__init__.py`, `backend/app/config.py`, `backend/app/models/`, `backend/app/services/`, `backend/app/api/`, `backend/app/utils/`, `backend/run.py`, and `backend/scripts/`. +2. The Backend Codebase shall contain no Chinese characters inside Python `#` inline comments under the same paths. +3. When `grep -rln '[一-鿿]' backend/ --include='*.py'` is run after this change, the Backend Codebase shall return only files whose remaining Chinese is contained within string literals owned by issues #2/#3/#4/#5/#6. +4. When a docstring is translated, the Translator shall preserve Google-style docstring shape (`Args:`, `Returns:`, `Raises:`, `Yields:` sections) per `dev-guidelines.md`. + +### Requirement 2: Preservation of Code Behavior +**Objective:** As a maintainer, I want the translation to be comments-and-docstrings-only, so that runtime behavior is provably unchanged. + +#### Acceptance Criteria +1. The Translator shall not modify any executable Python statement (assignments, function calls, control flow, decorators, imports). +2. The Translator shall not modify any Python string literal (single-, double-, triple-quoted, f-string, raw, byte) regardless of whether it contains Chinese characters. +3. The Translator shall not rename any symbol (variable, function, class, module, parameter). +4. When `uv run python -m pytest backend/scripts/test_profile_format.py` is run after the change, the Backend Codebase shall exit with status 0. +5. If a diff line touches any non-comment, non-docstring code, the Translator shall reject that diff hunk and revise. + +### Requirement 3: Comment Quality Hygiene +**Objective:** As a maintainer, I want translated comments to add value, so that the codebase remains easy to read after the migration. + +#### Acceptance Criteria +1. When a Chinese comment merely restates the immediately following code (e.g. `# 初始化客户端` above `client = Client()`), the Translator shall delete the comment rather than translate it. +2. When a Chinese comment captures non-obvious *why* (constraints, workarounds, invariants), the Translator shall translate it to a faithful English equivalent. +3. The Translator shall preserve any `TODO:` / `FIXME:` marker and any embedded ticket reference (e.g. `#1234`, `PROJ-456`) verbatim within the translated comment. +4. The Translator shall not introduce new comments that did not exist (or had no Chinese equivalent) in the original source. + +### Requirement 4: Style and Format Compliance +**Objective:** As a maintainer, I want the translated output to comply with project style rules, so that no follow-up cleanup PR is needed. + +#### Acceptance Criteria +1. The Translator shall keep all translated docstrings and comments at or below 120 characters per line. +2. The Translator shall not introduce trailing whitespace on any line. +3. The Translator shall preserve the original indentation (tabs/spaces) of every comment and docstring. +4. The Translator shall use double quotes for any docstring it rewrites, matching the existing Python convention in the file. +5. Where a file already uses 4-space indentation, the Translator shall preserve that indentation. + +### Requirement 5: Discovery and Verification Workflow +**Objective:** As a reviewer, I want a reproducible discovery and verification workflow, so that I can confirm coverage and absence of regressions in CI or locally. + +#### Acceptance Criteria +1. The Translator shall enumerate candidate files using `grep -rln '[一-鿿]' backend/ --include='*.py'` before beginning work. +2. The Translator shall re-run the same `grep` after each batch and confirm the residual hits are limited to string-literal Chinese owned by adjacent tickets (#2/#3/#4/#5/#6). +3. When the residual `grep` hits include any non-string-literal Chinese, the Translator shall classify those hits as in-scope and continue translation until they are gone. +4. The Translator shall verify that `git diff --stat` only reports changes inside the in-scope file paths listed in Requirement 1. + +### Requirement 6: Tracking and Branching +**Objective:** As a release manager, I want the work tracked against ticket #7 on a dedicated branch, so that the PR remains scoped and traceable. + +#### Acceptance Criteria +1. The Translator shall produce changes on a branch named `docs/i18n-7-translate-backend-comments`. +2. The Translator shall reference issue `salestech-group/MiroFish#7` in commit messages or PR description. +3. When committing, the Translator shall use Conventional Commits with type `docs` and scope `i18n` (e.g. `docs(i18n): translate chinese docstrings/comments in backend/`). +4. The Translator shall not include unrelated changes (e.g. dependency bumps, config changes, refactors) in the resulting PR. diff --git a/.kiro/specs/i18n-translate-backend-comments/research.md b/.kiro/specs/i18n-translate-backend-comments/research.md new file mode 100644 index 00000000..c9d9ad4e --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/research.md @@ -0,0 +1,80 @@ +# Research & Design Decisions — `i18n-translate-backend-comments` + +## Summary +- **Feature**: `i18n-translate-backend-comments` +- **Discovery Scope**: Simple Addition (mechanical translation, no architectural change) +- **Key Findings**: + - 37 in-scope `backend/` Python files contain Chinese characters in docstrings or `#` comments. The full list is in `gap-analysis.md`. + - Existing docstrings mix English-shape Google-style keys (`Args:`/`Returns:`) with Chinese descriptions, and a smaller subset uses Chinese keys (`参数:`/`返回:`/`异常:`). Both patterns must converge to canonical English Google-style. + - Several `tests/test_locale*.py` files contain Chinese only inside string literals (intentional test data) and are out of scope by the ticket's enumerated paths. + +## Research Log + +### Discovery scan: where is Chinese in `backend/`? +- **Context**: Need a deterministic enumeration of files to translate. +- **Sources Consulted**: `grep`/Python-driven scan against `backend/**/*.py`. +- **Findings**: + - 37 in-app files (under `backend/app/`, `backend/run.py`, `backend/scripts/`). + - 2 additional test files in `backend/tests/` whose Chinese is only in string literals; not in ticket scope. + - `.venv/` matches are noise and excluded. +- **Implications**: The ticket-listed paths are exhaustive; no unexpected location. Order of traversal can be alphabetical within package groups. + +### Disambiguation: docstring vs string literal +- **Context**: A triple-quoted string is a docstring iff it is the first statement of a module, class, or function body. Otherwise it is a value (e.g. a prompt template) owned by adjacent tickets. +- **Sources Consulted**: Python language reference; spot inspection of `services/ontology_generator.py`, `services/report_agent.py`. +- **Findings**: + - In-scope files contain both kinds of triple-quoted strings. + - Translating only the *first-statement* triple-quoted string per scope keeps the change comments-and-docstrings-only. +- **Implications**: Translation pass must visually verify each triple-quoted string is the first statement before rewriting; otherwise leave it alone. + +### Google-style docstring conversions +- **Context**: `dev-guidelines.md` requires Google-style docstrings; existing Chinese docstrings sometimes use Chinese keys. +- **Findings**: The following key map applies: + - `参数:` → `Args:` + - `返回:` → `Returns:` + - `异常:` → `Raises:` + - `产生:` / `生成:` → `Yields:` + - `示例:` → `Example:` (or `Examples:`) + - `注意:` / `备注:` → `Note:` (or `Notes:`) +- **Implications**: Document this mapping in design.md so the implementation pass is mechanical. + +## Architecture Pattern Evaluation + +| Option | Description | Strengths | Risks / Limitations | Notes | +|--------|-------------|-----------|---------------------|-------| +| Manual file-by-file pass | Walk in alphabetical order, package-grouped commits | Predictable, easy to review per package | Human time required | Selected approach | +| Multi-PR per package | One PR per backend package | Smaller diffs to review | Higher overhead, more PR churn | Allowed by ticket but not required | +| Tooling-assisted bulk script | LLM-driven find-and-replace tool | Reusable | Risk of touching string literals; tool itself becomes a deliverable | Out of proportion | + +## Design Decisions + +### Decision: Single-pass, package-grouped commits, single PR +- **Context**: 37 files, mechanical change, ticket allows either single or split PRs. +- **Alternatives Considered**: + 1. Multi-PR per package — more granular review but higher overhead. + 2. Tooling-assisted bulk script — overkill for one ticket. +- **Selected Approach**: Single PR with one or more commits, grouped by package (`models/`, `utils/`, `services/`, `api/`, `scripts/`, root) so reviewers can read the diff one package at a time. +- **Rationale**: Mechanical change with low risk; ticket explicitly allows it; reduces PR overhead; `/done` produces one PR per branch by default. +- **Trade-offs**: One large PR, but partitioned by commit. Reviewer can use commit history to navigate. +- **Follow-up**: After each package commit, re-run residual `grep` and `pytest` to maintain the invariant. + +### Decision: First-statement disambiguation rule +- **Context**: Distinguish docstrings (in scope) from value strings (out of scope). +- **Selected Approach**: A triple-quoted string is treated as a docstring (in scope) only if it is the first statement of a module / class / function body. All other triple-quoted strings are values (out of scope). +- **Rationale**: Matches Python's own definition; keeps boundary with adjacent tickets unambiguous. + +### Decision: Drop comments that restate code +- **Context**: R3 requires deletion of comments whose translated form would merely paraphrase the next line. +- **Selected Approach**: Apply a one-line heuristic: if the translated comment would be a verb phrase that mirrors the immediately following executable line, delete the comment instead of writing it. +- **Rationale**: Aligns with project rule "comment the why, not the what". + +## Risks & Mitigations +- **Risk**: Accidental edit to a string literal (would belong to ticket #2/#3/#4/#5/#6) — **Mitigation**: After each package commit, run `git diff --stat` and a per-file diff sanity check; verify only `#` lines and docstring lines change. +- **Risk**: Tests failing because a string-shape changed — **Mitigation**: Run `uv run python -m pytest backend/scripts/test_profile_format.py` after each commit. +- **Risk**: Line length violations after English expansion — **Mitigation**: Reflow long English at <= 120 chars within the docstring/comment only; never reflow code. + +## References +- `dev-guidelines.md` — repo-level coding standards, Google-style docstring requirement. +- `.claude/rules/commits.md` — Conventional Commits standard for the commit message. +- Issue #7 — salestech-group/MiroFish: source ticket. +- Issues #2/#3/#4/#5/#6 — adjacent i18n tickets that own the string-literal Chinese. diff --git a/.kiro/specs/i18n-translate-backend-comments/scan_chinese.py b/.kiro/specs/i18n-translate-backend-comments/scan_chinese.py new file mode 100644 index 00000000..d7835870 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/scan_chinese.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""AST-aware classifier of Chinese characters in a Python source file. + +Usage:: + + python3 .kiro/specs/i18n-translate-backend-comments/scan_chinese.py + +Classifies every line containing CJK Unified Ideographs (U+4E00..U+9FFF) +into one of three buckets: + +* ``DOCSTRING`` — line lies within a module/class/function docstring (in + scope for ticket #7). +* ``COMMENT`` — line contains a ``#`` and is not inside a docstring or + a string literal span (in scope for ticket #7). +* ``STRING`` — line is part of a string literal value (out of scope — + owned by sibling tickets #2/#3/#4/#5/#6). + +Exit code is the count of in-scope hits (DOCSTRING + COMMENT). Stdout +lists each in-scope hit as `` : `` so callers can +inspect them. +""" + +from __future__ import annotations + +import ast +import pathlib +import re +import sys + +CJK_RE = re.compile(r"[一-鿿]") + + +def classify(path: pathlib.Path) -> int: + text = path.read_text(encoding="utf-8") + lines = text.split("\n") + tree = ast.parse(text) + + docstring_lines: set[int] = set() + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)): + ds = ast.get_docstring(node, clean=False) + if ds is None: + continue + body = node.body + if not body or not isinstance(body[0], ast.Expr): + continue + const = body[0].value + if isinstance(const, ast.Constant) and isinstance(const.value, str): + start = const.lineno + end = getattr(const, "end_lineno", start) + for ln in range(start, end + 1): + docstring_lines.add(ln) + + string_value_lines: set[int] = set() + for node in ast.walk(tree): + if isinstance(node, ast.Constant) and isinstance(node.value, str): + start = node.lineno + end = getattr(node, "end_lineno", start) + for ln in range(start, end + 1): + string_value_lines.add(ln) + + in_scope_count = 0 + for i, line in enumerate(lines, start=1): + if not CJK_RE.search(line): + continue + if i in docstring_lines: + print(f"{i:5d} DOCSTRING: {line.rstrip()[:120]}") + in_scope_count += 1 + elif i in string_value_lines: + # Out of scope: owned by sibling tickets. + pass + elif "#" in line: + print(f"{i:5d} COMMENT : {line.rstrip()[:120]}") + in_scope_count += 1 + # else: unclassified — treat as out of scope (STRING value spanning). + + return in_scope_count + + +def main(argv: list[str]) -> int: + if len(argv) < 2: + print("usage: scan_chinese.py ", file=sys.stderr) + return 2 + path = pathlib.Path(argv[1]) + in_scope = classify(path) + print(f"---", file=sys.stderr) + print(f"in-scope CJK hits in {path}: {in_scope}", file=sys.stderr) + return 0 if in_scope == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv)) diff --git a/.kiro/specs/i18n-translate-backend-comments/spec.json b/.kiro/specs/i18n-translate-backend-comments/spec.json new file mode 100644 index 00000000..38538b31 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/spec.json @@ -0,0 +1,24 @@ +{ + "feature_name": "i18n-translate-backend-comments", + "created_at": "2026-05-07T14:24:17Z", + "updated_at": "2026-05-07T14:26:00Z", + "language": "en", + "phase": "tasks-generated", + "ticket": 7, + "ticket_url": "https://github.com/salestech-group/MiroFish/issues/7", + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true +} diff --git a/.kiro/specs/i18n-translate-backend-comments/tasks.md b/.kiro/specs/i18n-translate-backend-comments/tasks.md new file mode 100644 index 00000000..6f0bb279 --- /dev/null +++ b/.kiro/specs/i18n-translate-backend-comments/tasks.md @@ -0,0 +1,97 @@ +# Implementation Plan + +## Foundation + +- [x] 1. Establish baseline and working branch +- [x] 1.1 Create translation working branch and capture baseline state + - Create branch `docs/i18n-7-translate-backend-comments` from `main`. + - Capture the baseline residual hits by running the discovery scan (the regex `[一-鿿]` against `backend/**/*.py`, excluding `.venv`); record the file list as the work queue. + - Run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm a green baseline before any edits. + - Observable: a fresh branch exists, the baseline file list of 37 in-scope files is captured, and the baseline pytest run passes. + - _Requirements: 5.1, 6.1_ + +## Core — Per-Package Translation + +- [x] 2. Translate Chinese docstrings and inline comments per package + +- [x] 2.1 (P) Translate `backend/app/models/` + - Translate Chinese module/class/function docstrings and `#` comments in `backend/app/models/__init__.py`, `backend/app/models/project.py`, and `backend/app/models/task.py`. + - Apply the docstring-vs-value disambiguation rule (first-statement only) so that no string literal is touched. + - Apply the Google-style key map (`参数:` → `Args:`, `返回:` → `Returns:`, `异常:` → `Raises:`, `产生:`/`生成:` → `Yields:`, `示例:` → `Examples:`, `注意:`/`备注:` → `Note:`). + - Drop comments that merely restate the next executable line; preserve `TODO:`/`FIXME:` and any embedded ticket reference verbatim. + - Re-run the residual scan and confirm `backend/app/models/` no longer has Chinese in non-string-literal positions. + - Re-run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/models/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/models/_ + +- [x] 2.2 (P) Translate `backend/app/utils/` + - Translate Chinese docstrings and `#` comments in `backend/app/utils/__init__.py`, `file_parser.py`, `llm_client.py`, `locale.py`, `logger.py`, `retry.py`, and `zep_paging.py`. + - Be especially careful with `locale.py` and `logger.py`: they intentionally route Chinese strings through their value paths; only docstrings and `#` comments are in scope. + - Apply Rules 1–5 from `design.md` (disambiguation, key map, comment hygiene, style, preservation). + - Re-run the residual scan and confirm `backend/app/utils/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/utils/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/utils/_ + +- [x] 2.3 (P) Translate `backend/app/services/` — complete (all 12 files; finished in this installment) + - Translate Chinese docstrings and `#` comments across all 12 service files: `__init__.py`, `graph_builder.py`, `ontology_generator.py`, `oasis_profile_generator.py`, `report_agent.py`, `simulation_config_generator.py`, `simulation_ipc.py`, `simulation_manager.py`, `simulation_runner.py`, `text_processor.py`, `zep_entity_reader.py`, `zep_graph_memory_updater.py`, `zep_tools.py`. + - Treat all triple-quoted prompt templates and value strings as out of scope (owned by issues #2/#3/#4/#5/#6) — only the first-statement docstrings of modules/classes/functions are in scope. + - Apply Rules 1–5 from `design.md`. + - Re-run the residual scan and confirm `backend/app/services/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/services/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/services/_ + +- [x] 2.4 (P) Translate `backend/app/api/` — complete (all 4 files; finished in this installment) + - Translate Chinese docstrings and `#` comments in `__init__.py`, `graph.py`, `report.py`, `simulation.py`. + - Treat any user-facing string-literal Chinese in API responses as out of scope (owned by issue #6). + - Apply Rules 1–5 from `design.md`. + - Re-run the residual scan and confirm `backend/app/api/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/api/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app/api/_ + +- [x] 2.5 (P) Translate `backend/scripts/` — complete (all 5 files; finished in this installment) + - Translate Chinese docstrings and `#` comments in `action_logger.py`, `run_parallel_simulation.py`, `run_reddit_simulation.py`, `run_twitter_simulation.py`, `test_profile_format.py`. + - Apply Rules 1–5 from `design.md`. + - Be especially careful with `test_profile_format.py`: any Chinese in test data string literals is out of scope; only docstrings and `#` comments are in scope. + - Re-run the residual scan and confirm `backend/scripts/` no longer has Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/scripts/*.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/scripts/_ + +- [x] 2.6 (P) Translate root backend files + - Translate Chinese docstrings and `#` comments in `backend/app/__init__.py`, `backend/app/config.py`, and `backend/run.py`. + - Apply Rules 1–5 from `design.md`. + - Be especially careful with `backend/app/config.py`: any Chinese in default-value string literals is out of scope; only docstrings and `#` comments are in scope. + - Re-run the residual scan and confirm these three files no longer have Chinese in non-string-literal positions. + - Re-run the pytest command and confirm exit 0. + - Observable: zero non-string-literal Chinese remains in `backend/app/__init__.py`, `backend/app/config.py`, and `backend/run.py`, and the test command exits 0. + - _Requirements: 1.1, 1.2, 1.4, 2.1, 2.2, 2.3, 2.4, 3.1, 3.2, 3.3, 3.4, 4.1, 4.2, 4.3, 4.4, 4.5_ + - _Boundary: backend/app (root), backend/run.py_ + +## Validation + +- [x] 3. Final verification and PR preparation + +- [x] 3.1 Run the final verification gate — scanner + py_compile pass on all 12 newly-translated files; CJK guard baseline updated (backend/app: 2792 → 307); pytest blocked by pre-existing env issues, see HANDOFF.md + - Run the residual scan one more time and confirm the only remaining hits are files where the Chinese is in string literals owned by issues #2/#3/#4/#5/#6, plus the intentional Chinese in `backend/tests/test_locale*.py`. + - Run `cd backend && uv run python -m pytest scripts/test_profile_format.py` and confirm exit 0. + - Run `git diff --stat origin/main...HEAD` and confirm only in-scope file paths under `backend/app/`, `backend/run.py`, and `backend/scripts/` are listed. + - Spot-check three random changed files with `git diff ` and confirm only `#` lines and docstring lines changed (no executable lines, no string-literal lines). + - Observable: residual scan, pytest, diff scope, and spot diff all pass. + - _Depends: 2.1, 2.2, 2.3, 2.4, 2.5, 2.6_ + - _Requirements: 1.3, 2.5, 5.1, 5.2, 5.3, 5.4, 6.4_ + +- [ ] 3.2 Open PR and reference ticket #7 + - Use `/done` to commit any remaining changes per Conventional Commits with type `docs` and scope `i18n` (e.g. `docs(i18n): translate chinese docstrings/comments in backend/`), push the branch, and open a PR. + - The PR body must include `Closes #7` and reference the spec at `.kiro/specs/i18n-translate-backend-comments/`. + - Verify the PR contains no unrelated changes (no dependency bumps, no config changes, no refactors). + - Observable: a PR exists on GitHub from `docs/i18n-7-translate-backend-comments` to `main` that closes #7 and contains only docstring/comment translation diffs. + - _Depends: 3.1_ + - _Requirements: 6.1, 6.2, 6.3, 6.4_ diff --git a/backend/app/__init__.py b/backend/app/__init__.py index 11857ef0..2d6519c2 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -1,12 +1,10 @@ -""" -MiroFish Backend - Flask应用工厂 -""" +"""MiroFish backend Flask application factory.""" import os import warnings -# 抑制 multiprocessing resource_tracker 的警告(来自第三方库如 transformers) -# 需要在所有其他导入之前设置 +# Silence multiprocessing.resource_tracker warnings emitted by some third-party +# libraries (e.g. transformers); must run before those modules are imported. warnings.filterwarnings("ignore", message=".*resource_tracker.*") from flask import Flask, request @@ -18,62 +16,65 @@ from .utils.locale import t def create_app(config_class=Config): - """Flask应用工厂函数""" + """Flask application factory.""" app = Flask(__name__) app.config.from_object(config_class) - - # 设置JSON编码:确保中文直接显示(而不是 \uXXXX 格式) - # Flask >= 2.3 使用 app.json.ensure_ascii,旧版本使用 JSON_AS_ASCII 配置 + + # Configure JSON encoding so non-ASCII characters render literally + # rather than as \uXXXX escape sequences. Flask >= 2.3 exposes + # ``app.json.ensure_ascii``; older versions use ``JSON_AS_ASCII``. if hasattr(app, 'json') and hasattr(app.json, 'ensure_ascii'): app.json.ensure_ascii = False - - # 设置日志 + + # Configure logging. logger = setup_logger('mirofish') - - # 只在 reloader 子进程中打印启动信息(避免 debug 模式下打印两次) + + # Only print startup banners in the reloader child process to avoid + # double-printing in debug mode. is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' debug_mode = app.config.get('DEBUG', False) should_log_startup = not debug_mode or is_reloader_process - + if should_log_startup: logger.info("=" * 50) logger.info(t("log.bootstrap.m001")) logger.info("=" * 50) - - # 启用CORS + + # Enable CORS. CORS(app, resources={r"/api/*": {"origins": "*"}}) - - # 注册模拟进程清理函数(确保服务器关闭时终止所有模拟进程) + + # Register simulation-process cleanup so all child processes are torn down + # when the Flask server shuts down. from .services.simulation_runner import SimulationRunner SimulationRunner.register_cleanup() if should_log_startup: logger.info(t("log.bootstrap.m002")) - - # 请求日志中间件 + + # Request-logging middleware. @app.before_request def log_request(): logger = get_logger('mirofish.request') logger.debug(t("log.bootstrap.m003", request=request.method, request_2=request.path)) if request.content_type and 'json' in request.content_type: logger.debug(t("log.bootstrap.m004", request=request.get_json(silent=True))) - + @app.after_request def log_response(response): logger = get_logger('mirofish.request') logger.debug(t("log.bootstrap.m005", response=response.status_code)) return response - - # 注册蓝图 + + # Register API blueprints. from .api import graph_bp, simulation_bp, report_bp app.register_blueprint(graph_bp, url_prefix='/api/graph') app.register_blueprint(simulation_bp, url_prefix='/api/simulation') app.register_blueprint(report_bp, url_prefix='/api/report') - - # 健康检查 + + # Health-check endpoint. @app.route('/health') def health(): return {'status': 'ok', 'service': 'MiroFish Backend'} - + # On startup: recover any projects stuck in graph_building (task was killed by restart) if should_log_startup: _recover_stuck_projects() diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index ffda743a..4326e4da 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -1,6 +1,4 @@ -""" -API路由模块 -""" +"""API blueprints package.""" from flask import Blueprint diff --git a/backend/app/api/graph.py b/backend/app/api/graph.py index 669b816e..6e3f45ff 100644 --- a/backend/app/api/graph.py +++ b/backend/app/api/graph.py @@ -1,6 +1,7 @@ """ -图谱相关API路由 -采用项目上下文机制,服务端持久化状态 +Graph-related API routes. + +Uses a project context mechanism with server-side state persistence. """ import os @@ -26,25 +27,22 @@ _graph_data_cache: dict = {} # graph_id -> {"data": ..., "ts": float} _graph_refresh_locks: dict = {} # graph_id -> threading.Lock (one refresh at a time) _GRAPH_CACHE_TTL = 300 # seconds before triggering a background refresh -# 获取日志器 logger = get_logger('mirofish.api') def allowed_file(filename: str) -> bool: - """检查文件扩展名是否允许""" + """Return True if the file extension is in the allowed list.""" if not filename or '.' not in filename: return False ext = os.path.splitext(filename)[1].lower().lstrip('.') return ext in Config.ALLOWED_EXTENSIONS -# ============== 项目管理接口 ============== +# ============== Project management endpoints ============== @graph_bp.route('/project/', methods=['GET']) def get_project(project_id: str): - """ - 获取项目详情 - """ + """Get project details.""" project = ProjectManager.get_project(project_id) if not project: @@ -61,9 +59,7 @@ def get_project(project_id: str): @graph_bp.route('/project/list', methods=['GET']) def list_projects(): - """ - 列出所有项目 - """ + """List all projects.""" limit = request.args.get('limit', 50, type=int) projects = ProjectManager.list_projects(limit=limit) @@ -76,9 +72,7 @@ def list_projects(): @graph_bp.route('/project/', methods=['DELETE']) def delete_project(project_id: str): - """ - 删除项目 - """ + """Delete a project.""" success = ProjectManager.delete_project(project_id) if not success: @@ -95,9 +89,7 @@ def delete_project(project_id: str): @graph_bp.route('/project//reset', methods=['POST']) def reset_project(project_id: str): - """ - 重置项目状态(用于重新构建图谱) - """ + """Reset project state (used to rebuild the graph from scratch).""" project = ProjectManager.get_project(project_id) if not project: @@ -106,7 +98,8 @@ def reset_project(project_id: str): "error": t("api.error.graph.m004", project_id=project_id) }), 404 - # 重置到本体已生成状态 + # Roll back to the "ontology generated" state so the next build can resume + # from the existing ontology rather than re-running ontology generation. if project.ontology: project.status = ProjectStatus.ONTOLOGY_GENERATED else: @@ -124,22 +117,21 @@ def reset_project(project_id: str): }) -# ============== 接口1:上传文件并生成本体 ============== +# ============== Endpoint 1: upload files and generate ontology ============== @graph_bp.route('/ontology/generate', methods=['POST']) def generate_ontology(): - """ - 接口1:上传文件,分析生成本体定义 - - 请求方式:multipart/form-data - - 参数: - files: 上传的文件(PDF/MD/TXT),可多个 - simulation_requirement: 模拟需求描述(必填) - project_name: 项目名称(可选) - additional_context: 额外说明(可选) - - 返回: + """Endpoint 1: upload files, analyze them, and generate an ontology definition. + + Request format: multipart/form-data. + + Args: + files: Uploaded files (PDF/MD/TXT); one or more. + simulation_requirement: Description of the simulation requirement (required). + project_name: Project name (optional). + additional_context: Additional context (optional). + + Returns: { "success": true, "data": { @@ -156,8 +148,7 @@ def generate_ontology(): """ try: logger.info(t("log.graph_api.m006")) - - # 获取参数 + simulation_requirement = request.form.get('simulation_requirement', '') project_name = request.form.get('project_name', 'Unnamed Project') additional_context = request.form.get('additional_context', '') @@ -171,7 +162,6 @@ def generate_ontology(): "error": t("api.error.graph.m009") }), 400 - # 获取上传的文件 uploaded_files = request.files.getlist('files') if not uploaded_files or all(not f.filename for f in uploaded_files): return jsonify({ @@ -179,18 +169,17 @@ def generate_ontology(): "error": t("api.error.graph.m010") }), 400 - # 创建项目 project = ProjectManager.create_project(name=project_name) project.simulation_requirement = simulation_requirement logger.info(t("log.graph_api.m011", project=project.project_id)) - # 保存文件并提取文本 + # Persist each uploaded file under the project's directory and pull its + # text out so the ontology generator has plain text to work with. document_texts = [] all_text = "" - + for file in uploaded_files: if file and file.filename and allowed_file(file.filename): - # 保存文件到项目目录 file_info = ProjectManager.save_file_to_project( project.project_id, file, @@ -201,7 +190,6 @@ def generate_ontology(): "size": file_info["size"] }) - # 提取文本 text = FileParser.extract_text(file_info["path"]) text = TextProcessor.preprocess_text(text) document_texts.append(text) @@ -214,12 +202,10 @@ def generate_ontology(): "error": t("api.error.graph.m012") }), 400 - # 保存提取的文本 project.total_text_length = len(all_text) ProjectManager.save_extracted_text(project.project_id, all_text) logger.info(t("log.graph_api.m013", len=len(all_text))) - # 生成本体 logger.info(t("log.graph_api.m014")) generator = OntologyGenerator() ontology = generator.generate( @@ -228,7 +214,6 @@ def generate_ontology(): additional_context=additional_context if additional_context else None ) - # 保存本体到项目 entity_count = len(ontology.get("entity_types", [])) edge_count = len(ontology.get("edge_types", [])) logger.info(t("log.graph_api.m015", entity_count=entity_count, edge_count=edge_count)) @@ -262,35 +247,33 @@ def generate_ontology(): }), 500 -# ============== 接口2:构建图谱 ============== +# ============== Endpoint 2: build graph ============== @graph_bp.route('/build', methods=['POST']) def build_graph(): - """ - 接口2:根据project_id构建图谱 - - 请求(JSON): + """Endpoint 2: build the graph for the given project_id. + + Request (JSON): { - "project_id": "proj_xxxx", // 必填,来自接口1 - "graph_name": "图谱名称", // 可选 - "chunk_size": 500, // 可选,默认500 - "chunk_overlap": 50 // 可选,默认50 + "project_id": "proj_xxxx", // required, from endpoint 1 + "graph_name": "Graph name", // optional + "chunk_size": 500, // optional, default 500 + "chunk_overlap": 50 // optional, default 50 } - - 返回: + + Returns: { "success": true, "data": { "project_id": "proj_xxxx", "task_id": "task_xxxx", - "message": "图谱构建任务已启动" + "message": "Graph build task started" } } """ try: logger.info(t("log.graph_api.m017")) - - # 检查配置 + errors = [] if not Config.NEO4J_PASSWORD: errors.append("NEO4J未配置") @@ -301,7 +284,6 @@ def build_graph(): "error": "配置错误: " + "; ".join(errors) }), 500 - # 解析请求 data = request.get_json() or {} project_id = data.get('project_id') logger.debug(t("log.graph_api.m019", project_id=project_id)) @@ -312,7 +294,6 @@ def build_graph(): "error": t("api.error.graph.m020") }), 400 - # 获取项目 project = ProjectManager.get_project(project_id) if not project: return jsonify({ @@ -320,8 +301,8 @@ def build_graph(): "error": t("api.error.graph.m021", project_id=project_id) }), 404 - # 检查项目状态 - force = data.get('force', False) # 强制重新构建 + # If True, abandon any existing build progress and rebuild from scratch. + force = data.get('force', False) if project.status == ProjectStatus.CREATED: return jsonify({ @@ -336,23 +317,20 @@ def build_graph(): "task_id": project.graph_build_task_id }), 400 - # 如果强制重建,重置状态 + # On a forced rebuild, drop any prior build artifacts so we restart cleanly. if force and project.status in [ProjectStatus.GRAPH_BUILDING, ProjectStatus.FAILED, ProjectStatus.GRAPH_COMPLETED]: project.status = ProjectStatus.ONTOLOGY_GENERATED project.graph_id = None project.graph_build_task_id = None project.error = None - # 获取配置 graph_name = data.get('graph_name', project.name or 'MiroFish Graph') chunk_size = data.get('chunk_size', project.chunk_size or Config.DEFAULT_CHUNK_SIZE) chunk_overlap = data.get('chunk_overlap', project.chunk_overlap or Config.DEFAULT_CHUNK_OVERLAP) - - # 更新项目配置 + project.chunk_size = chunk_size project.chunk_overlap = chunk_overlap - - # 获取提取的文本 + text = ProjectManager.get_extracted_text(project_id) if not text: return jsonify({ @@ -360,7 +338,6 @@ def build_graph(): "error": t("api.error.graph.m024") }), 400 - # 获取本体 ontology = project.ontology if not ontology: return jsonify({ @@ -368,17 +345,14 @@ def build_graph(): "error": t("api.error.graph.m025") }), 400 - # 创建异步任务 task_manager = TaskManager() task_id = task_manager.create_task(f"构建图谱: {graph_name}") logger.info(t("log.graph_api.m026", task_id=task_id, project_id=project_id)) - # 更新项目状态 project.status = ProjectStatus.GRAPH_BUILDING project.graph_build_task_id = task_id ProjectManager.save_project(project) - - # 启动后台任务 + def build_task(): build_logger = get_logger('mirofish.build') try: @@ -389,10 +363,8 @@ def build_graph(): message="初始化图谱构建服务..." ) - # 创建图谱构建服务 builder = GraphBuilderService() - - # 分块 + task_manager.update_task( task_id, message="文本分块中...", @@ -404,30 +376,27 @@ def build_graph(): overlap=chunk_overlap ) total_chunks = len(chunks) - - # 创建图谱 + task_manager.update_task( task_id, message="创建Zep图谱...", progress=10 ) graph_id = builder.create_graph(name=graph_name) - - # 更新项目的graph_id + project.graph_id = graph_id ProjectManager.save_project(project) - - # 设置本体 + task_manager.update_task( task_id, message="设置本体定义...", progress=15 ) builder.set_ontology(graph_id, ontology) - - # 添加文本(progress_callback 签名是 (msg, progress_ratio)) + + # Add text. The progress_callback signature is (msg, progress_ratio). def add_progress_callback(msg, progress_ratio): - progress = 15 + int(progress_ratio * 40) # 15% - 55% + progress = 15 + int(progress_ratio * 40) # maps ratio onto 15%-55% task_manager.update_task( task_id, message=msg, @@ -460,7 +429,7 @@ def build_graph(): skip_chunks=skip_chunks, ) - # 等待Zep处理完成(查询每个episode的processed状态) + # Wait for Zep to finish processing (poll each episode's processed flag). task_manager.update_task( task_id, message="等待Zep处理数据...", @@ -468,7 +437,7 @@ def build_graph(): ) def wait_progress_callback(msg, progress_ratio): - progress = 55 + int(progress_ratio * 35) # 55% - 90% + progress = 55 + int(progress_ratio * 35) # maps ratio onto 55%-90% task_manager.update_task( task_id, message=msg, @@ -476,16 +445,14 @@ def build_graph(): ) builder._wait_for_episodes(episode_uuids, wait_progress_callback) - - # 获取图谱数据 + task_manager.update_task( task_id, message="获取图谱数据...", progress=95 ) graph_data = builder.get_graph_data(graph_id) - - # 更新项目状态 + project.status = ProjectStatus.GRAPH_COMPLETED ProjectManager.save_project(project) @@ -498,8 +465,7 @@ def build_graph(): node_count=node_count, edge_count=edge_count, )) - - # 完成 + task_manager.update_task( task_id, status=TaskStatus.COMPLETED, @@ -515,7 +481,7 @@ def build_graph(): ) except Exception as e: - # 更新项目状态为失败 + # Mark the project as FAILED so the UI can surface the error. build_logger.error(t("log.graph_api.m029", task_id=task_id, e=str(e))) build_logger.debug(traceback.format_exc()) @@ -530,7 +496,6 @@ def build_graph(): error=traceback.format_exc() ) - # 启动后台线程 thread = threading.Thread(target=build_task, daemon=True) thread.start() @@ -551,13 +516,11 @@ def build_graph(): }), 500 -# ============== 任务查询接口 ============== +# ============== Task query endpoints ============== @graph_bp.route('/task/', methods=['GET']) def get_task(task_id: str): - """ - 查询任务状态 - """ + """Query the status of a task.""" task = TaskManager().get_task(task_id) if not task: @@ -574,9 +537,7 @@ def get_task(task_id: str): @graph_bp.route('/tasks', methods=['GET']) def list_tasks(): - """ - 列出所有任务 - """ + """List all tasks.""" tasks = TaskManager().list_tasks() return jsonify({ @@ -586,7 +547,7 @@ def list_tasks(): }) -# ============== 图谱数据接口 ============== +# ============== Graph data endpoints ============== def _refresh_graph_cache(graph_id: str): """Background thread: fetch graph data from Neo4j and update cache.""" @@ -613,11 +574,11 @@ def _refresh_graph_cache(graph_id: str): @graph_bp.route('/data/', methods=['GET']) def get_graph_data(graph_id: str): - """ - 获取图谱数据(节点和边)。 - - 有缓存且未过期:直接返回缓存,不调用 Zep - - 有缓存但已过期:立即返回旧缓存,后台异步刷新 - - 无缓存:后台线程拉取,返回 202 让前端稍后重试 + """Return graph data (nodes and edges). + + - Fresh cache: serve from cache without hitting Zep. + - Stale cache: return the old cache immediately and refresh in the background. + - No cache: kick off a background fetch and return 202 so the frontend retries. """ if not Config.NEO4J_PASSWORD: return jsonify({"success": False, "error": t("api.error.graph.m028")}), 500 @@ -645,9 +606,7 @@ def get_graph_data(graph_id: str): @graph_bp.route('/delete/', methods=['DELETE']) def delete_graph(graph_id: str): - """ - 删除Zep图谱 - """ + """Delete a Zep graph.""" try: if not Config.NEO4J_PASSWORD: return jsonify({ diff --git a/backend/app/api/report.py b/backend/app/api/report.py index 92f47df2..b437417e 100644 --- a/backend/app/api/report.py +++ b/backend/app/api/report.py @@ -1,6 +1,7 @@ """ -Report API路由 -提供模拟报告生成、获取、对话等接口 +Report API routes. + +Provides endpoints for generating, retrieving, and chatting about simulation reports. """ import os @@ -20,30 +21,30 @@ from ..utils.locale import t, get_locale, set_locale logger = get_logger('mirofish.api.report') -# ============== 报告生成接口 ============== +# ============== Report generation endpoints ============== @report_bp.route('/generate', methods=['POST']) def generate_report(): """ - 生成模拟分析报告(异步任务) - - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/report/generate/status 查询进度 - - 请求(JSON): + Generate a simulation analysis report (asynchronous task). + + This is a long-running operation. The endpoint returns a task_id immediately; + use GET /api/report/generate/status to poll progress. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "force_regenerate": false // 可选,强制重新生成 + "simulation_id": "sim_xxxx", // required, simulation ID + "force_regenerate": false // optional, force regeneration } - - 返回: + + Returns: { "success": true, "data": { "simulation_id": "sim_xxxx", "task_id": "task_xxxx", "status": "generating", - "message": "报告生成任务已启动" + "message": "Report generation task started" } } """ @@ -58,8 +59,7 @@ def generate_report(): }), 400 force_regenerate = data.get('force_regenerate', False) - - # 获取模拟信息 + manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -69,7 +69,7 @@ def generate_report(): "error": t('api.simulationNotFound', id=simulation_id) }), 404 - # 检查是否已有报告 + # Skip regeneration if a completed report already exists for this simulation. if not force_regenerate: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -84,7 +84,6 @@ def generate_report(): } }) - # 获取项目信息 project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ @@ -106,11 +105,11 @@ def generate_report(): "error": t('api.missingSimRequirement') }), 400 - # 提前生成 report_id,以便立即返回给前端 + # Generate report_id eagerly so the frontend can use it immediately + # (before the background task has actually persisted anything). import uuid report_id = f"report_{uuid.uuid4().hex[:12]}" - - # 创建异步任务 + task_manager = TaskManager() task_id = task_manager.create_task( task_type="report_generate", @@ -124,7 +123,6 @@ def generate_report(): # Capture locale before spawning background thread current_locale = get_locale() - # 定义后台任务 def run_generate(): set_locale(current_locale) try: @@ -134,15 +132,13 @@ def generate_report(): progress=0, message=t('api.initReportAgent') ) - - # 创建Report Agent + agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, simulation_requirement=simulation_requirement ) - - # 进度回调 + def progress_callback(stage, progress, message): task_manager.update_task( task_id, @@ -150,13 +146,13 @@ def generate_report(): message=f"[{stage}] {message}" ) - # 生成报告(传入预先生成的 report_id) + # Pass in the pre-generated report_id so the persisted report matches + # the id we already returned to the frontend. report = agent.generate_report( progress_callback=progress_callback, report_id=report_id ) - - # 保存报告 + ReportManager.save_report(report) if report.status == ReportStatus.COMPLETED: @@ -174,8 +170,7 @@ def generate_report(): except Exception as e: logger.error(t("log.report_api.m001", str=str(e))) task_manager.fail_task(task_id, str(e)) - - # 启动后台线程 + thread = threading.Thread(target=run_generate, daemon=True) thread.start() @@ -203,15 +198,15 @@ def generate_report(): @report_bp.route('/generate/status', methods=['POST']) def get_generate_status(): """ - 查询报告生成任务进度 - - 请求(JSON): + Query the progress of a report generation task. + + Request (JSON): { - "task_id": "task_xxxx", // 可选,generate返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID + "task_id": "task_xxxx", // optional, task_id returned by generate + "simulation_id": "sim_xxxx" // optional, simulation ID } - - 返回: + + Returns: { "success": true, "data": { @@ -228,7 +223,8 @@ def get_generate_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已有完成的报告 + # If simulation_id is provided, short-circuit when a completed report already exists + # so callers don't have to track a stale task_id after a successful run. if simulation_id: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -272,14 +268,14 @@ def get_generate_status(): }), 500 -# ============== 报告获取接口 ============== +# ============== Report retrieval endpoints ============== @report_bp.route('/', methods=['GET']) def get_report(report_id: str): """ - 获取报告详情 - - 返回: + Get report details. + + Returns: { "success": true, "data": { @@ -319,9 +315,9 @@ def get_report(report_id: str): @report_bp.route('/by-simulation/', methods=['GET']) def get_report_by_simulation(simulation_id: str): """ - 根据模拟ID获取报告 - - 返回: + Get the report for a given simulation ID. + + Returns: { "success": true, "data": { @@ -358,13 +354,13 @@ def get_report_by_simulation(simulation_id: str): @report_bp.route('/list', methods=['GET']) def list_reports(): """ - 列出所有报告 - - Query参数: - simulation_id: 按模拟ID过滤(可选) - limit: 返回数量限制(默认50) - - 返回: + List all reports. + + Query parameters: + simulation_id: optional filter by simulation ID. + limit: maximum number of reports to return (default 50). + + Returns: { "success": true, "data": [...], @@ -398,9 +394,9 @@ def list_reports(): @report_bp.route('//download', methods=['GET']) def download_report(report_id: str): """ - 下载报告(Markdown格式) - - 返回Markdown文件 + Download a report as a Markdown file. + + Returns the Markdown file as an attachment. """ try: report = ReportManager.get_report(report_id) @@ -414,7 +410,8 @@ def download_report(report_id: str): md_path = ReportManager._get_report_markdown_path(report_id) if not os.path.exists(md_path): - # 如果MD文件不存在,生成一个临时文件 + # MD file is missing on disk; materialize a temp file from the in-memory content + # so the download still succeeds for older reports that were never persisted. import tempfile with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(report.markdown_content) @@ -443,7 +440,7 @@ def download_report(report_id: str): @report_bp.route('/', methods=['DELETE']) def delete_report(report_id: str): - """删除报告""" + """Delete a report.""" try: success = ReportManager.delete_report(report_id) @@ -467,32 +464,33 @@ def delete_report(report_id: str): }), 500 -# ============== Report Agent对话接口 ============== +# ============== Report Agent chat endpoints ============== @report_bp.route('/chat', methods=['POST']) def chat_with_report_agent(): """ - 与Report Agent对话 - - Report Agent可以在对话中自主调用检索工具来回答问题 - - 请求(JSON): + Chat with the Report Agent. + + The Report Agent can autonomously invoke retrieval tools during the conversation + to answer the user's question. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "message": "请解释一下舆情走向", // 必填,用户消息 - "chat_history": [ // 可选,对话历史 + "simulation_id": "sim_xxxx", // required, simulation ID + "message": "Explain the sentiment trend", // required, user message + "chat_history": [ // optional, prior turns {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } - - 返回: + + Returns: { "success": true, "data": { - "response": "Agent回复...", - "tool_calls": [调用的工具列表], - "sources": [信息来源] + "response": "Agent reply...", + "tool_calls": [list of tools invoked], + "sources": [information sources] } } """ @@ -515,7 +513,6 @@ def chat_with_report_agent(): "error": t('api.requireMessage') }), 400 - # 获取模拟和项目信息 manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -540,8 +537,7 @@ def chat_with_report_agent(): }), 400 simulation_requirement = project.simulation_requirement or "" - - # 创建Agent并进行对话 + agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, @@ -564,22 +560,22 @@ def chat_with_report_agent(): }), 500 -# ============== 报告进度与分章节接口 ============== +# ============== Report progress and section endpoints ============== @report_bp.route('//progress', methods=['GET']) def get_report_progress(report_id: str): """ - 获取报告生成进度(实时) - - 返回: + Get real-time report generation progress. + + Returns: { "success": true, "data": { "status": "generating", "progress": 45, - "message": "正在生成章节: 关键发现", - "current_section": "关键发现", - "completed_sections": ["执行摘要", "模拟背景"], + "message": "Generating section: Key Findings", + "current_section": "Key Findings", + "completed_sections": ["Executive Summary", "Simulation Background"], "updated_at": "2025-12-09T..." } } @@ -610,11 +606,12 @@ def get_report_progress(report_id: str): @report_bp.route('//sections', methods=['GET']) def get_report_sections(report_id: str): """ - 获取已生成的章节列表(分章节输出) - - 前端可以轮询此接口获取已生成的章节内容,无需等待整个报告完成 - - 返回: + Get the list of sections generated so far (per-section streaming output). + + The frontend can poll this endpoint to render sections incrementally, + without waiting for the entire report to finish. + + Returns: { "success": true, "data": { @@ -623,7 +620,7 @@ def get_report_sections(report_id: str): { "filename": "section_01.md", "section_index": 1, - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." }, ... ], @@ -634,8 +631,7 @@ def get_report_sections(report_id: str): """ try: sections = ReportManager.get_generated_sections(report_id) - - # 获取报告状态 + report = ReportManager.get_report(report_id) is_complete = report is not None and report.status == ReportStatus.COMPLETED @@ -661,14 +657,14 @@ def get_report_sections(report_id: str): @report_bp.route('//section/', methods=['GET']) def get_single_section(report_id: str, section_index: int): """ - 获取单个章节内容 - - 返回: + Get the content of a single section. + + Returns: { "success": true, "data": { "filename": "section_01.md", - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." } } """ @@ -702,16 +698,16 @@ def get_single_section(report_id: str, section_index: int): }), 500 -# ============== 报告状态检查接口 ============== +# ============== Report status check endpoints ============== @report_bp.route('/check/', methods=['GET']) def check_report_status(simulation_id: str): """ - 检查模拟是否有报告,以及报告状态 - - 用于前端判断是否解锁Interview功能 - - 返回: + Check whether a simulation has a report, and report its status. + + Used by the frontend to decide whether to unlock the Interview feature. + + Returns: { "success": true, "data": { @@ -730,7 +726,7 @@ def check_report_status(simulation_id: str): report_status = report.status.value if report else None report_id = report.report_id if report else None - # 只有报告完成后才解锁interview + # Interview feature is only unlocked once a report has finished generating. interview_unlocked = has_report and report.status == ReportStatus.COMPLETED return jsonify({ @@ -753,22 +749,22 @@ def check_report_status(simulation_id: str): }), 500 -# ============== Agent 日志接口 ============== +# ============== Agent log endpoints ============== @report_bp.route('//agent-log', methods=['GET']) def get_agent_log(report_id: str): """ - 获取 Report Agent 的详细执行日志 - - 实时获取报告生成过程中的每一步动作,包括: - - 报告开始、规划开始/完成 - - 每个章节的开始、工具调用、LLM响应、完成 - - 报告完成或失败 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get the detailed execution log of the Report Agent. + + Streams every step the agent took while generating the report, including: + - Report start, planning start/complete. + - Per-section start, tool calls, LLM responses, and completion. + - Final report completion or failure. + + Query parameters: + from_line: line offset to start reading from (optional, default 0, for incremental polling). + + Returns: { "success": true, "data": { @@ -779,7 +775,7 @@ def get_agent_log(report_id: str): "report_id": "report_xxxx", "action": "tool_call", "stage": "generating", - "section_title": "执行摘要", + "section_title": "Executive Summary", "section_index": 1, "details": { "tool_name": "insight_forge", @@ -817,9 +813,9 @@ def get_agent_log(report_id: str): @report_bp.route('//agent-log/stream', methods=['GET']) def stream_agent_log(report_id: str): """ - 获取完整的 Agent 日志(一次性获取全部) - - 返回: + Get the full Agent log in one shot (no pagination). + + Returns: { "success": true, "data": { @@ -848,27 +844,27 @@ def stream_agent_log(report_id: str): }), 500 -# ============== 控制台日志接口 ============== +# ============== Console log endpoints ============== @report_bp.route('//console-log', methods=['GET']) def get_console_log(report_id: str): """ - 获取 Report Agent 的控制台输出日志 - - 实时获取报告生成过程中的控制台输出(INFO、WARNING等), - 这与 agent-log 接口返回的结构化 JSON 日志不同, - 是纯文本格式的控制台风格日志。 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get the Report Agent's console output log. + + Streams the console output produced during report generation (INFO, WARNING, etc.). + Unlike the structured JSON returned by the agent-log endpoint, this is plain-text + console-style output. + + Query parameters: + from_line: line offset to start reading from (optional, default 0, for incremental polling). + + Returns: { "success": true, "data": { "logs": [ - "[19:46:14] INFO: 搜索完成: 找到 15 条相关事实", - "[19:46:14] INFO: 图谱搜索: graph_id=xxx, query=...", + "[19:46:14] INFO: Search complete: found 15 relevant facts", + "[19:46:14] INFO: Graph search: graph_id=xxx, query=...", ... ], "total_lines": 100, @@ -899,9 +895,9 @@ def get_console_log(report_id: str): @report_bp.route('//console-log/stream', methods=['GET']) def stream_console_log(report_id: str): """ - 获取完整的控制台日志(一次性获取全部) - - 返回: + Get the full console log in one shot (no pagination). + + Returns: { "success": true, "data": { @@ -930,17 +926,17 @@ def stream_console_log(report_id: str): }), 500 -# ============== 工具调用接口(供调试使用)============== +# ============== Tool invocation endpoints (for debugging) ============== @report_bp.route('/tools/search', methods=['POST']) def search_graph_tool(): """ - 图谱搜索工具接口(供调试使用) - - 请求(JSON): + Graph search tool endpoint (for debugging). + + Request (JSON): { "graph_id": "mirofish_xxxx", - "query": "搜索查询", + "query": "search query", "limit": 10 } """ @@ -983,9 +979,9 @@ def search_graph_tool(): @report_bp.route('/tools/statistics', methods=['POST']) def get_graph_statistics_tool(): """ - 图谱统计工具接口(供调试使用) - - 请求(JSON): + Graph statistics tool endpoint (for debugging). + + Request (JSON): { "graph_id": "mirofish_xxxx" } diff --git a/backend/app/api/simulation.py b/backend/app/api/simulation.py index 4cc3018e..3507be16 100644 --- a/backend/app/api/simulation.py +++ b/backend/app/api/simulation.py @@ -1,6 +1,7 @@ -""" -模拟相关API路由 -Step2: Zep实体读取与过滤、OASIS模拟准备与运行(全程自动化) +"""Simulation-related API routes. + +Step 2: Zep entity reading/filtering, OASIS simulation preparation and execution +(end-to-end automated). """ import os @@ -20,41 +21,38 @@ from ..utils.locale import t logger = get_logger('mirofish.api.simulation') -# Interview prompt 优化前缀 -# 添加此前缀可以避免Agent调用工具,直接用文本回复 +# Prefix injection avoids agent tool-calls and forces a plain-text reply. INTERVIEW_PROMPT_PREFIX = "结合你的人设、所有的过往记忆与行动,不调用任何工具直接用文本回复我:" def optimize_interview_prompt(prompt: str) -> str: - """ - 优化Interview提问,添加前缀避免Agent调用工具 - + """Optimize an interview prompt by prepending the no-tool-call prefix. + Args: - prompt: 原始提问 - + prompt: Original prompt text. + Returns: - 优化后的提问 + Prompt with the prefix prepended (or unchanged if already prefixed). """ if not prompt: return prompt - # 避免重复添加前缀 if prompt.startswith(INTERVIEW_PROMPT_PREFIX): return prompt return f"{INTERVIEW_PROMPT_PREFIX}{prompt}" -# ============== 实体读取接口 ============== +# ============== Entity reading endpoints ============== @simulation_bp.route('/entities/', methods=['GET']) def get_graph_entities(graph_id: str): - """ - 获取图谱中的所有实体(已过滤) - - 只返回符合预定义实体类型的节点(Labels不只是Entity的节点) - - Query参数: - entity_types: 逗号分隔的实体类型列表(可选,用于进一步过滤) - enrich: 是否获取相关边信息(默认true) + """Return all (filtered) entities in the graph. + + Only nodes matching the predefined entity types are returned (i.e. nodes + whose labels include more than just `Entity`). + + Query params: + entity_types: Comma-separated entity-type list (optional, for further filtering). + enrich: Whether to include related edge info (default true). """ try: if not Config.NEO4J_PASSWORD: @@ -92,7 +90,7 @@ def get_graph_entities(graph_id: str): @simulation_bp.route('/entities//', methods=['GET']) def get_entity_detail(graph_id: str, entity_uuid: str): - """获取单个实体的详细信息""" + """Return details for a single entity.""" try: if not Config.NEO4J_PASSWORD: return jsonify({ @@ -125,7 +123,7 @@ def get_entity_detail(graph_id: str, entity_uuid: str): @simulation_bp.route('/entities//by-type/', methods=['GET']) def get_entities_by_type(graph_id: str, entity_type: str): - """获取指定类型的所有实体""" + """Return all entities of the given type.""" try: if not Config.NEO4J_PASSWORD: return jsonify({ @@ -160,24 +158,24 @@ def get_entities_by_type(graph_id: str, entity_type: str): }), 500 -# ============== 模拟管理接口 ============== +# ============== Simulation management endpoints ============== @simulation_bp.route('/create', methods=['POST']) def create_simulation(): - """ - 创建新的模拟 - - 注意:max_rounds等参数由LLM智能生成,无需手动设置 - - 请求(JSON): + """Create a new simulation. + + Note: parameters such as `max_rounds` are generated intelligently by the LLM + and do not need to be set manually. + + Request (JSON): { - "project_id": "proj_xxxx", // 必填 - "graph_id": "mirofish_xxxx", // 可选,如不提供则从project获取 - "enable_twitter": true, // 可选,默认true - "enable_reddit": true // 可选,默认true + "project_id": "proj_xxxx", // required + "graph_id": "mirofish_xxxx", // optional; falls back to the project's graph_id + "enable_twitter": true, // optional, default true + "enable_reddit": true // optional, default true } - - 返回: + + Response: { "success": true, "data": { @@ -238,39 +236,38 @@ def create_simulation(): def _check_simulation_prepared(simulation_id: str) -> tuple: - """ - 检查模拟是否已经准备完成 - - 检查条件: - 1. state.json 存在且 status 为 "ready" - 2. 必要文件存在:reddit_profiles.json, twitter_profiles.csv, simulation_config.json - - 注意:运行脚本(run_*.py)保留在 backend/scripts/ 目录,不再复制到模拟目录 - + """Check whether a simulation is already fully prepared. + + Conditions: + 1. `state.json` exists and `status` is "ready". + 2. Required files exist: `reddit_profiles.json`, `twitter_profiles.csv`, + `simulation_config.json`. + + Note: runner scripts (run_*.py) live under `backend/scripts/` and are no longer + copied into the simulation directory. + Args: - simulation_id: 模拟ID - + simulation_id: Simulation identifier. + Returns: (is_prepared: bool, info: dict) """ import os from ..config import Config - + simulation_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - - # 检查目录是否存在 + if not os.path.exists(simulation_dir): return False, {"reason": "模拟目录不存在"} - - # 必要文件列表(不包括脚本,脚本位于 backend/scripts/) + + # Required files (scripts are not included; they live in backend/scripts/). required_files = [ "state.json", "simulation_config.json", "reddit_profiles.json", "twitter_profiles.csv" ] - - # 检查文件是否存在 + existing_files = [] missing_files = [] for f in required_files: @@ -287,7 +284,6 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: "existing_files": existing_files } - # 检查state.json中的状态 state_file = os.path.join(simulation_dir, "state.json") try: import json @@ -296,31 +292,23 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: status = state_data.get("status", "") config_generated = state_data.get("config_generated", False) - - # 详细日志 + logger.debug(t("log.simulation_api.m013", simulation_id=simulation_id, status=status, config_generated=config_generated)) - - # 如果 config_generated=True 且文件存在,认为准备完成 - # 以下状态都说明准备工作已完成: - # - ready: 准备完成,可以运行 - # - preparing: 如果 config_generated=True 说明已完成 - # - running: 正在运行,说明准备早就完成了 - # - completed: 运行完成,说明准备早就完成了 - # - stopped: 已停止,说明准备早就完成了 - # - failed: 运行失败(但准备是完成的) + + # All these statuses imply preparation is finished (when config_generated is True): + # - ready / preparing / running / completed / stopped / failed. prepared_statuses = ["ready", "preparing", "running", "completed", "stopped", "failed"] if status in prepared_statuses and config_generated: - # 获取文件统计信息 profiles_file = os.path.join(simulation_dir, "reddit_profiles.json") config_file = os.path.join(simulation_dir, "simulation_config.json") - + profiles_count = 0 if os.path.exists(profiles_file): with open(profiles_file, 'r', encoding='utf-8') as f: profiles_data = json.load(f) profiles_count = len(profiles_data) if isinstance(profiles_data, list) else 0 - - # 如果状态是preparing但文件已完成,自动更新状态为ready + + # If status is "preparing" but the files are already complete, auto-promote to "ready". if status == "preparing": try: state_data["status"] = "ready" @@ -358,42 +346,41 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: @simulation_bp.route('/prepare', methods=['POST']) def prepare_simulation(): - """ - 准备模拟环境(异步任务,LLM智能生成所有参数) - - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/simulation/prepare/status 查询进度 - - 特性: - - 自动检测已完成的准备工作,避免重复生成 - - 如果已准备完成,直接返回已有结果 - - 支持强制重新生成(force_regenerate=true) - - 步骤: - 1. 检查是否已有完成的准备工作 - 2. 从Zep图谱读取并过滤实体 - 3. 为每个实体生成OASIS Agent Profile(带重试机制) - 4. LLM智能生成模拟配置(带重试机制) - 5. 保存配置文件和预设脚本 - - 请求(JSON): + """Prepare the simulation environment (async task; the LLM generates all params). + + This is a long-running operation. The endpoint returns a `task_id` immediately; + use `GET /api/simulation/prepare/status` to poll for progress. + + Features: + - Auto-detects completed preparation work and avoids duplicate generation. + - Returns existing results when preparation is already complete. + - Supports force regeneration via `force_regenerate=true`. + + Steps: + 1. Check whether preparation is already complete. + 2. Read and filter entities from the Zep graph. + 3. Generate an OASIS Agent profile per entity (with retry). + 4. LLM-generate the simulation configuration (with retry). + 5. Save the config files and preset scripts. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "entity_types": ["Student", "PublicFigure"], // 可选,指定实体类型 - "use_llm_for_profiles": true, // 可选,是否用LLM生成人设 - "parallel_profile_count": 5, // 可选,并行生成人设数量,默认5 - "force_regenerate": false // 可选,强制重新生成,默认false + "simulation_id": "sim_xxxx", // required + "entity_types": ["Student", "PublicFigure"], // optional + "use_llm_for_profiles": true, // optional + "parallel_profile_count": 5, // optional, default 5 + "force_regenerate": false // optional, default false } - - 返回: + + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", - "task_id": "task_xxxx", // 新任务时返回 + "task_id": "task_xxxx", // present for newly started tasks "status": "preparing|ready", - "message": "准备任务已启动|已有完成的准备工作", - "already_prepared": true|false // 是否已准备完成 + "message": "...", + "already_prepared": true|false } } """ @@ -421,11 +408,10 @@ def prepare_simulation(): "error": t("api.error.simulation.m019", simulation_id=simulation_id) }), 404 - # 检查是否强制重新生成 force_regenerate = data.get('force_regenerate', False) logger.info(t("log.simulation_api.m020", simulation_id=simulation_id, force_regenerate=force_regenerate)) - - # 检查是否已经准备完成(避免重复生成) + + # Skip regeneration if preparation is already complete. if not force_regenerate: logger.debug(t("log.simulation_api.m021", simulation_id=simulation_id)) is_prepared, prepare_info = _check_simulation_prepared(simulation_id) @@ -445,49 +431,43 @@ def prepare_simulation(): else: logger.info(t("log.simulation_api.m024", simulation_id=simulation_id)) - # 从项目获取必要信息 project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ "success": False, "error": t("api.error.simulation.m025", state=state.project_id) }), 404 - - # 获取模拟需求 + simulation_requirement = project.simulation_requirement or "" if not simulation_requirement: return jsonify({ "success": False, "error": t("api.error.simulation.m026") }), 400 - - # 获取文档文本 + document_text = ProjectManager.get_extracted_text(state.project_id) or "" - + entity_types_list = data.get('entity_types') use_llm_for_profiles = data.get('use_llm_for_profiles', True) parallel_profile_count = data.get('parallel_profile_count', 5) - - # ========== 同步获取实体数量(在后台任务启动前) ========== - # 这样前端在调用prepare后立即就能获取到预期Agent总数 + + # Synchronously fetch the entity count before starting the background task, + # so the frontend can immediately display the expected agent total. try: logger.info(t("log.simulation_api.m027", state=state.graph_id)) reader = ZepEntityReader() - # 快速读取实体(不需要边信息,只统计数量) filtered_preview = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=entity_types_list, - enrich_with_edges=False # 不获取边信息,加快速度 + enrich_with_edges=False # Skip edges for speed; only the count matters here. ) - # 保存实体数量到状态(供前端立即获取) state.entities_count = filtered_preview.filtered_count state.entity_types = list(filtered_preview.entity_types) logger.info(t("log.simulation_api.m028", filtered_preview=filtered_preview.filtered_count, filtered_preview_2=filtered_preview.entity_types)) except Exception as e: logger.warning(t("log.simulation_api.m029", e=e)) - # 失败不影响后续流程,后台任务会重新获取 - - # 创建异步任务 + # Failure here is non-fatal; the background task will re-read the entities. + task_manager = TaskManager() task_id = task_manager.create_task( task_type="simulation_prepare", @@ -497,11 +477,10 @@ def prepare_simulation(): } ) - # 更新模拟状态(包含预先获取的实体数量) + # Update simulation state (including the pre-fetched entity count). state.status = SimulationStatus.PREPARING manager._save_simulation_state(state) - - # 定义后台任务 + def run_prepare(): try: task_manager.update_task( @@ -511,23 +490,21 @@ def prepare_simulation(): message="开始准备模拟环境..." ) - # 准备模拟(带进度回调) - # 存储阶段进度详情 + # Per-stage progress detail (used by the progress callback below). stage_details = {} - + def progress_callback(stage, progress, message, **kwargs): - # 计算总进度 + # Map each stage to a slice of the overall 0-100 progress range. stage_weights = { - "reading": (0, 20), # 0-20% - "generating_profiles": (20, 70), # 20-70% - "generating_config": (70, 90), # 70-90% - "copying_scripts": (90, 100) # 90-100% + "reading": (0, 20), + "generating_profiles": (20, 70), + "generating_config": (70, 90), + "copying_scripts": (90, 100) } - + start, end = stage_weights.get(stage, (0, 100)) current_progress = int(start + (end - start) * progress / 100) - - # 构建详细进度信息 + stage_names = { "reading": "读取图谱实体", "generating_profiles": "生成Agent人设", @@ -537,8 +514,7 @@ def prepare_simulation(): stage_index = list(stage_weights.keys()).index(stage) + 1 if stage in stage_weights else 1 total_stages = len(stage_weights) - - # 更新阶段详情 + stage_details[stage] = { "stage_name": stage_names.get(stage, stage), "stage_progress": progress, @@ -546,8 +522,7 @@ def prepare_simulation(): "total": kwargs.get("total", 0), "item_name": kwargs.get("item_name", "") } - - # 构建详细进度信息 + detail = stage_details[stage] progress_detail_data = { "current_stage": stage, @@ -559,8 +534,8 @@ def prepare_simulation(): "total_items": detail["total"], "item_description": message } - - # 构建简洁消息 + + # Build a concise progress message. if detail["total"] > 0: detailed_message = ( f"[{stage_index}/{total_stages}] {stage_names.get(stage, stage)}: " @@ -586,24 +561,22 @@ def prepare_simulation(): parallel_profile_count=parallel_profile_count ) - # 任务完成 task_manager.complete_task( task_id, result=result_state.to_simple_dict() ) - + except Exception as e: logger.error(t("log.simulation_api.m030", str=str(e))) task_manager.fail_task(task_id, str(e)) - - # 更新模拟状态为失败 + + # Mark the simulation state as failed. state = manager.get_simulation(simulation_id) if state: state.status = SimulationStatus.FAILED state.error = str(e) manager._save_simulation_state(state) - - # 启动后台线程 + thread = threading.Thread(target=run_prepare, daemon=True) thread.start() @@ -615,8 +588,8 @@ def prepare_simulation(): "status": "preparing", "message": "准备任务已启动,请通过 /api/simulation/prepare/status 查询进度", "already_prepared": False, - "expected_entities_count": state.entities_count, # 预期的Agent总数 - "entity_types": state.entity_types # 实体类型列表 + "expected_entities_count": state.entities_count, # Expected total agent count. + "entity_types": state.entity_types # Entity-type list. } }) @@ -637,20 +610,19 @@ def prepare_simulation(): @simulation_bp.route('/prepare/status', methods=['POST']) def get_prepare_status(): - """ - 查询准备任务进度 - - 支持两种查询方式: - 1. 通过task_id查询正在进行的任务进度 - 2. 通过simulation_id检查是否已有完成的准备工作 - - 请求(JSON): + """Query progress for a preparation task. + + Two query modes are supported: + 1. By `task_id` — return live progress for an in-flight task. + 2. By `simulation_id` — check whether preparation has already finished. + + Request (JSON): { - "task_id": "task_xxxx", // 可选,prepare返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID(用于检查已完成的准备) + "task_id": "task_xxxx", // optional; the task_id returned by /prepare + "simulation_id": "sim_xxxx" // optional; checks for existing complete prep } - - 返回: + + Response: { "success": true, "data": { @@ -658,8 +630,8 @@ def get_prepare_status(): "status": "processing|completed|ready", "progress": 45, "message": "...", - "already_prepared": true|false, // 是否已有完成的准备 - "prepare_info": {...} // 已准备完成时的详细信息 + "already_prepared": true|false, // whether prep is already complete + "prepare_info": {...} // details when prep is complete } } """ @@ -671,7 +643,7 @@ def get_prepare_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已准备完成 + # If simulation_id is provided, first check if prep is already complete. if simulation_id: is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -687,10 +659,10 @@ def get_prepare_status(): } }) - # 如果没有task_id,返回错误 + # No task_id provided. if not task_id: if simulation_id: - # 有simulation_id但未准备完成 + # simulation_id provided but prep is not complete. return jsonify({ "success": True, "data": { @@ -710,7 +682,7 @@ def get_prepare_status(): task = task_manager.get_task(task_id) if not task: - # 任务不存在,但如果有simulation_id,检查是否已准备完成 + # Task is missing; if simulation_id is given, check whether prep is already complete. if simulation_id: is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -750,7 +722,7 @@ def get_prepare_status(): @simulation_bp.route('/', methods=['GET']) def get_simulation(simulation_id: str): - """获取模拟状态""" + """Return the current simulation state.""" try: manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -763,7 +735,7 @@ def get_simulation(simulation_id: str): result = state.to_dict() - # 如果模拟已准备好,附加运行说明 + # Attach run instructions when the simulation is ready. if state.status == SimulationStatus.READY: result["run_instructions"] = manager.get_run_instructions(simulation_id) @@ -783,11 +755,10 @@ def get_simulation(simulation_id: str): @simulation_bp.route('/list', methods=['GET']) def list_simulations(): - """ - 列出所有模拟 - - Query参数: - project_id: 按项目ID过滤(可选) + """List all simulations. + + Query params: + project_id: Filter by project ID (optional). """ try: project_id = request.args.get('project_id') @@ -811,23 +782,22 @@ def list_simulations(): def _get_report_id_for_simulation(simulation_id: str) -> str: - """ - 获取 simulation 对应的最新 report_id - - 遍历 reports 目录,找出 simulation_id 匹配的 report, - 如果有多个则返回最新的(按 created_at 排序) - + """Return the latest report_id associated with a simulation. + + Walks the reports directory, finds reports whose simulation_id matches, + and returns the most recent one (sorted by created_at). + Args: - simulation_id: 模拟ID - + simulation_id: Simulation identifier. + Returns: - report_id 或 None + report_id, or None if no matching report exists. """ import json from datetime import datetime - - # reports 目录路径:backend/uploads/reports - # __file__ 是 app/api/simulation.py,需要向上两级到 backend/ + + # Reports directory: backend/uploads/reports. + # __file__ is app/api/simulation.py, so we go up two levels to reach backend/. reports_dir = os.path.join(os.path.dirname(__file__), '../../uploads/reports') if not os.path.exists(reports_dir): return None @@ -860,7 +830,7 @@ def _get_report_id_for_simulation(simulation_id: str) -> str: if not matching_reports: return None - # 按创建时间倒序排序,返回最新的 + # Sort by creation time descending and return the most recent. matching_reports.sort(key=lambda x: x.get("created_at", ""), reverse=True) return matching_reports[0].get("report_id") @@ -871,23 +841,23 @@ def _get_report_id_for_simulation(simulation_id: str) -> str: @simulation_bp.route('/history', methods=['GET']) def get_simulation_history(): - """ - 获取历史模拟列表(带项目详情) - - 用于首页历史项目展示,返回包含项目名称、描述等丰富信息的模拟列表 - - Query参数: - limit: 返回数量限制(默认20) - - 返回: + """Return historical simulations (with project details). + + Used by the homepage to display past projects. Returns a list of simulations + enriched with project name, description, and other metadata. + + Query params: + limit: Maximum number of items to return (default 20). + + Response: { "success": true, "data": [ { "simulation_id": "sim_xxxx", "project_id": "proj_xxxx", - "project_name": "武大舆情分析", - "simulation_requirement": "如果武汉大学发布...", + "project_name": "...", + "simulation_requirement": "...", "status": "completed", "entities_count": 68, "profiles_count": 68, @@ -910,56 +880,54 @@ def get_simulation_history(): manager = SimulationManager() simulations = manager.list_simulations()[:limit] - # 增强模拟数据,只从 Simulation 文件读取 + # Enrich simulation data using only the Simulation files. enriched_simulations = [] for sim in simulations: sim_dict = sim.to_dict() - - # 获取模拟配置信息(从 simulation_config.json 读取 simulation_requirement) + + # Read simulation_requirement from simulation_config.json. config = manager.get_simulation_config(sim.simulation_id) if config: sim_dict["simulation_requirement"] = config.get("simulation_requirement", "") time_config = config.get("time_config", {}) sim_dict["total_simulation_hours"] = time_config.get("total_simulation_hours", 0) - # 推荐轮数(后备值) + # Recommended round count (used as a fallback). recommended_rounds = int( - time_config.get("total_simulation_hours", 0) * 60 / + time_config.get("total_simulation_hours", 0) * 60 / max(time_config.get("minutes_per_round", 60), 1) ) else: sim_dict["simulation_requirement"] = "" sim_dict["total_simulation_hours"] = 0 recommended_rounds = 0 - - # 获取运行状态(从 run_state.json 读取用户设置的实际轮数) + + # Read user-set total_rounds from run_state.json. run_state = SimulationRunner.get_run_state(sim.simulation_id) if run_state: sim_dict["current_round"] = run_state.current_round sim_dict["runner_status"] = run_state.runner_status.value - # 使用用户设置的 total_rounds,若无则使用推荐轮数 + # Prefer the user-set total_rounds; fall back to the recommended count. sim_dict["total_rounds"] = run_state.total_rounds if run_state.total_rounds > 0 else recommended_rounds else: sim_dict["current_round"] = 0 sim_dict["runner_status"] = "idle" sim_dict["total_rounds"] = recommended_rounds - - # 获取关联项目的文件列表(最多3个) + + # Up to three files from the associated project. project = ProjectManager.get_project(sim.project_id) if project and hasattr(project, 'files') and project.files: sim_dict["files"] = [ - {"filename": f.get("filename", "未知文件")} + {"filename": f.get("filename", "未知文件")} for f in project.files[:3] ] else: sim_dict["files"] = [] - - # 获取关联的 report_id(查找该 simulation 最新的 report) + + # Latest report_id linked to this simulation. sim_dict["report_id"] = _get_report_id_for_simulation(sim.simulation_id) - - # 添加版本号 + sim_dict["version"] = "v1.0.2" - - # 格式化日期 + try: created_date = sim_dict.get("created_at", "")[:10] sim_dict["created_date"] = created_date @@ -985,11 +953,10 @@ def get_simulation_history(): @simulation_bp.route('//profiles', methods=['GET']) def get_simulation_profiles(simulation_id: str): - """ - 获取模拟的Agent Profile - - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) + """Return the agent profiles for a simulation. + + Query params: + platform: Platform (reddit/twitter, default reddit). """ try: platform = request.args.get('platform', 'reddit') @@ -1023,26 +990,25 @@ def get_simulation_profiles(simulation_id: str): @simulation_bp.route('//profiles/realtime', methods=['GET']) def get_simulation_profiles_realtime(simulation_id: str): - """ - 实时获取模拟的Agent Profile(用于在生成过程中实时查看进度) - - 与 /profiles 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) - - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) - - 返回: + """Return agent profiles in real time (for live progress during generation). + + Differs from /profiles in that: + - Reads files directly, bypassing SimulationManager. + - Suitable for live viewing while generation is still running. + - Returns extra metadata (file mtime, is_generating, etc.). + + Query params: + platform: Platform (reddit/twitter, default reddit). + + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", "platform": "reddit", "count": 15, - "total_expected": 93, // 预期总数(如果有) - "is_generating": true, // 是否正在生成 + "total_expected": 93, // expected total (if known) + "is_generating": true, // whether generation is in progress "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", "profiles": [...] @@ -1056,31 +1022,27 @@ def get_simulation_profiles_realtime(simulation_id: str): try: platform = request.args.get('platform', 'reddit') - # 获取模拟目录 sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - + if not os.path.exists(sim_dir): return jsonify({ "success": False, "error": t("api.error.simulation.m041", simulation_id=simulation_id) }), 404 - - # 确定文件路径 + if platform == "reddit": profiles_file = os.path.join(sim_dir, "reddit_profiles.json") else: profiles_file = os.path.join(sim_dir, "twitter_profiles.csv") - - # 检查文件是否存在 + file_exists = os.path.exists(profiles_file) profiles = [] file_modified_at = None if file_exists: - # 获取文件修改时间 file_stat = os.stat(profiles_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() - + try: if platform == "reddit": with open(profiles_file, 'r', encoding='utf-8') as f: @@ -1092,8 +1054,8 @@ def get_simulation_profiles_realtime(simulation_id: str): except (json.JSONDecodeError, Exception) as e: logger.warning(t("log.simulation_api.m042", e=e)) profiles = [] - - # 检查是否正在生成(通过 state.json 判断) + + # Use state.json to detect whether generation is in progress. is_generating = False total_expected = None @@ -1133,25 +1095,24 @@ def get_simulation_profiles_realtime(simulation_id: str): @simulation_bp.route('//config/realtime', methods=['GET']) def get_simulation_config_realtime(simulation_id: str): - """ - 实时获取模拟配置(用于在生成过程中实时查看进度) - - 与 /config 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) - - 即使配置还没生成完也能返回部分信息 - - 返回: + """Return the simulation config in real time (for live progress during generation). + + Differs from /config in that: + - Reads the file directly, bypassing SimulationManager. + - Suitable for live viewing while generation is still running. + - Returns extra metadata (file mtime, is_generating, etc.). + - Returns partial information even if generation has not finished. + + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", - "is_generating": true, // 是否正在生成 - "generation_stage": "generating_config", // 当前生成阶段 - "config": {...} // 配置内容(如果存在) + "is_generating": true, // generation in progress + "generation_stage": "generating_config", // current stage + "config": {...} // config content, if any } } """ @@ -1159,25 +1120,21 @@ def get_simulation_config_realtime(simulation_id: str): from datetime import datetime try: - # 获取模拟目录 sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - + if not os.path.exists(sim_dir): return jsonify({ "success": False, "error": t("api.error.simulation.m044", simulation_id=simulation_id) }), 404 - - # 配置文件路径 + config_file = os.path.join(sim_dir, "simulation_config.json") - - # 检查文件是否存在 + file_exists = os.path.exists(config_file) config = None file_modified_at = None - + if file_exists: - # 获取文件修改时间 file_stat = os.stat(config_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() @@ -1187,8 +1144,8 @@ def get_simulation_config_realtime(simulation_id: str): except (json.JSONDecodeError, Exception) as e: logger.warning(t("log.simulation_api.m045", e=e)) config = None - - # 检查是否正在生成(通过 state.json 判断) + + # Use state.json to detect whether generation is in progress. is_generating = False generation_stage = None config_generated = False @@ -1201,8 +1158,8 @@ def get_simulation_config_realtime(simulation_id: str): status = state_data.get("status", "") is_generating = status == "preparing" config_generated = state_data.get("config_generated", False) - - # 判断当前阶段 + + # Derive the current stage. if is_generating: if state_data.get("profiles_generated", False): generation_stage = "generating_config" @@ -1212,8 +1169,7 @@ def get_simulation_config_realtime(simulation_id: str): generation_stage = "completed" except Exception: pass - - # 构建返回数据 + response_data = { "simulation_id": simulation_id, "file_exists": file_exists, @@ -1223,8 +1179,8 @@ def get_simulation_config_realtime(simulation_id: str): "config_generated": config_generated, "config": config } - - # 如果配置存在,提取一些关键统计信息 + + # When config is present, surface a few key summary stats. if config: response_data["summary"] = { "total_agents": len(config.get("agent_configs", [])), @@ -1253,15 +1209,14 @@ def get_simulation_config_realtime(simulation_id: str): @simulation_bp.route('//config', methods=['GET']) def get_simulation_config(simulation_id: str): - """ - 获取模拟配置(LLM智能生成的完整配置) - - 返回包含: - - time_config: 时间配置(模拟时长、轮次、高峰/低谷时段) - - agent_configs: 每个Agent的活动配置(活跃度、发言频率、立场等) - - event_config: 事件配置(初始帖子、热点话题) - - platform_configs: 平台配置 - - generation_reasoning: LLM的配置推理说明 + """Return the simulation config (the full LLM-generated config). + + Returns: + - time_config: Time configuration (sim length, rounds, peak/off-peak windows). + - agent_configs: Per-agent activity configuration (activity, posting rate, stance). + - event_config: Event configuration (initial posts, hot topics). + - platform_configs: Platform configuration. + - generation_reasoning: The LLM's reasoning notes for the config. """ try: manager = SimulationManager() @@ -1289,7 +1244,7 @@ def get_simulation_config(simulation_id: str): @simulation_bp.route('//config/download', methods=['GET']) def download_simulation_config(simulation_id: str): - """下载模拟配置文件""" + """Download the simulation config file.""" try: manager = SimulationManager() sim_dir = manager._get_simulation_dir(simulation_id) @@ -1318,20 +1273,19 @@ def download_simulation_config(simulation_id: str): @simulation_bp.route('/script//download', methods=['GET']) def download_simulation_script(script_name: str): - """ - 下载模拟运行脚本文件(通用脚本,位于 backend/scripts/) - - script_name可选值: + """Download a simulation runner script (shared scripts in backend/scripts/). + + Allowed values for script_name: - run_twitter_simulation.py - run_reddit_simulation.py - run_parallel_simulation.py - action_logger.py """ try: - # 脚本位于 backend/scripts/ 目录 + # Scripts live in the backend/scripts/ directory. scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - - # 验证脚本名称 + + # Allow only known script names. allowed_scripts = [ "run_twitter_simulation.py", "run_reddit_simulation.py", @@ -1368,19 +1322,18 @@ def download_simulation_script(script_name: str): }), 500 -# ============== Profile生成接口(独立使用) ============== +# ============== Standalone profile generation endpoints ============== @simulation_bp.route('/generate-profiles', methods=['POST']) def generate_profiles(): - """ - 直接从图谱生成OASIS Agent Profile(不创建模拟) - - 请求(JSON): + """Generate OASIS agent profiles directly from a graph (without creating a simulation). + + Request (JSON): { - "graph_id": "mirofish_xxxx", // 必填 - "entity_types": ["Student"], // 可选 - "use_llm": true, // 可选 - "platform": "reddit" // 可选 + "graph_id": "mirofish_xxxx", // required + "entity_types": ["Student"], // optional + "use_llm": true, // optional + "platform": "reddit" // optional } """ try: @@ -1442,35 +1395,34 @@ def generate_profiles(): }), 500 -# ============== 模拟运行控制接口 ============== +# ============== Simulation run-control endpoints ============== @simulation_bp.route('/start', methods=['POST']) def start_simulation(): - """ - 开始运行模拟 + """Start running a simulation. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "parallel", // 可选: twitter / reddit / parallel (默认) - "max_rounds": 100, // 可选: 最大模拟轮数,用于截断过长的模拟 - "enable_graph_memory_update": false, // 可选: 是否将Agent活动动态更新到Zep图谱记忆 - "force": false // 可选: 强制重新开始(会停止运行中的模拟并清理日志) + "simulation_id": "sim_xxxx", // required + "platform": "parallel", // optional: twitter / reddit / parallel (default) + "max_rounds": 100, // optional: max simulation rounds (truncate long sims) + "enable_graph_memory_update": false, // optional: stream agent activity into Zep memory + "force": false // optional: force restart (stops running sim, clears logs) } - 关于 force 参数: - - 启用后,如果模拟正在运行或已完成,会先停止并清理运行日志 - - 清理的内容包括:run_state.json, actions.jsonl, simulation.log 等 - - 不会清理配置文件(simulation_config.json)和 profile 文件 - - 适用于需要重新运行模拟的场景 + About `force`: + - When enabled, if the simulation is running or completed, it is stopped and run logs are cleared. + - Cleared artefacts: run_state.json, actions.jsonl, simulation.log, etc. + - Config files (simulation_config.json) and profiles are NOT cleared. + - Use this when you need to re-run a simulation from scratch. - 关于 enable_graph_memory_update: - - 启用后,模拟中所有Agent的活动(发帖、评论、点赞等)都会实时更新到Zep图谱 - - 这可以让图谱"记住"模拟过程,用于后续分析或AI对话 - - 需要模拟关联的项目有有效的 graph_id - - 采用批量更新机制,减少API调用次数 + About `enable_graph_memory_update`: + - When enabled, all agent activity (posts, comments, likes, etc.) is pushed into the Zep graph + in real time, so the graph "remembers" the simulation for later analysis or chat. + - Requires the linked project to have a valid graph_id. + - Uses batch updates to reduce API calls. - 返回: + Response: { "success": true, "data": { @@ -1480,8 +1432,8 @@ def start_simulation(): "twitter_running": true, "reddit_running": true, "started_at": "2025-12-01T10:00:00", - "graph_memory_update_enabled": true, // 是否启用了图谱记忆更新 - "force_restarted": true // 是否是强制重新开始 + "graph_memory_update_enabled": true, // graph memory update was enabled + "force_restarted": true // restart was forced } } """ @@ -1496,11 +1448,10 @@ def start_simulation(): }), 400 platform = data.get('platform', 'parallel') - max_rounds = data.get('max_rounds') # 可选:最大模拟轮数 - enable_graph_memory_update = data.get('enable_graph_memory_update', False) # 可选:是否启用图谱记忆更新 - force = data.get('force', False) # 可选:强制重新开始 + max_rounds = data.get('max_rounds') # optional: max simulation rounds + enable_graph_memory_update = data.get('enable_graph_memory_update', False) # optional: enable graph memory update + force = data.get('force', False) # optional: force restart - # 验证 max_rounds 参数 if max_rounds is not None: try: max_rounds = int(max_rounds) @@ -1521,7 +1472,7 @@ def start_simulation(): "error": t("api.error.simulation.m060", platform=platform) }), 400 - # 检查模拟是否已准备好 + # Verify the simulation is ready. manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -1532,21 +1483,19 @@ def start_simulation(): }), 404 force_restarted = False - - # 智能处理状态:如果准备工作已完成,允许重新启动 + + # If preparation is complete, allow re-starting even from a non-READY status. if state.status != SimulationStatus.READY: - # 检查准备工作是否已完成 is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: - # 准备工作已完成,检查是否有正在运行的进程 + # Preparation is complete; check whether a process is still running. if state.status == SimulationStatus.RUNNING: - # 检查模拟进程是否真的在运行 run_state = SimulationRunner.get_run_state(simulation_id) if run_state and run_state.runner_status.value == "running": - # 进程确实在运行 + # The process is genuinely running. if force: - # 强制模式:停止运行中的模拟 + # Force mode: stop the running simulation. logger.info(t("log.simulation_api.m062", simulation_id=simulation_id)) try: SimulationRunner.stop_simulation(simulation_id) @@ -1558,7 +1507,7 @@ def start_simulation(): "error": t("api.error.simulation.m064") }), 400 - # 如果是强制模式,清理运行日志 + # When forcing, also clear run logs. if force: logger.info(t("log.simulation_api.m065", simulation_id=simulation_id)) cleanup_result = SimulationRunner.cleanup_simulation_logs(simulation_id) @@ -1566,37 +1515,35 @@ def start_simulation(): logger.warning(t("log.simulation_api.m066", cleanup_result=cleanup_result.get('errors'))) force_restarted = True - # 进程不存在或已结束,重置状态为 ready + # Process is gone or finished; reset status to ready. logger.info(t("log.simulation_api.m067", simulation_id=simulation_id, state=state.status.value)) state.status = SimulationStatus.READY manager._save_simulation_state(state) else: - # 准备工作未完成 + # Preparation has not finished. return jsonify({ "success": False, "error": t("api.error.simulation.m068", state=state.status.value) }), 400 - - # 获取图谱ID(用于图谱记忆更新) + + # Resolve graph_id (used by graph memory update). graph_id = None if enable_graph_memory_update: - # 从模拟状态或项目中获取 graph_id graph_id = state.graph_id if not graph_id: - # 尝试从项目中获取 + # Fall back to the project's graph_id. project = ProjectManager.get_project(state.project_id) if project: graph_id = project.graph_id - + if not graph_id: return jsonify({ "success": False, "error": t("api.error.simulation.m069") }), 400 - + logger.info(t("log.simulation_api.m070", simulation_id=simulation_id, graph_id=graph_id)) - - # 启动模拟 + run_state = SimulationRunner.start_simulation( simulation_id=simulation_id, platform=platform, @@ -1604,8 +1551,7 @@ def start_simulation(): enable_graph_memory_update=enable_graph_memory_update, graph_id=graph_id ) - - # 更新模拟状态 + state.status = SimulationStatus.RUNNING manager._save_simulation_state(state) @@ -1639,15 +1585,14 @@ def start_simulation(): @simulation_bp.route('/stop', methods=['POST']) def stop_simulation(): - """ - 停止模拟 - - 请求(JSON): + """Stop a simulation. + + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // required } - - 返回: + + Response: { "success": true, "data": { @@ -1668,8 +1613,7 @@ def stop_simulation(): }), 400 run_state = SimulationRunner.stop_simulation(simulation_id) - - # 更新模拟状态 + manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: @@ -1696,14 +1640,13 @@ def stop_simulation(): }), 500 -# ============== 实时状态监控接口 ============== +# ============== Real-time status monitoring endpoints ============== @simulation_bp.route('//run-status', methods=['GET']) def get_run_status(simulation_id: str): - """ - 获取模拟运行实时状态(用于前端轮询) - - 返回: + """Return real-time simulation run status (for frontend polling). + + Response: { "success": true, "data": { @@ -1758,15 +1701,14 @@ def get_run_status(simulation_id: str): @simulation_bp.route('//run-status/detail', methods=['GET']) def get_run_status_detail(simulation_id: str): - """ - 获取模拟运行详细状态(包含所有动作) - - 用于前端展示实时动态 - - Query参数: - platform: 过滤平台(twitter/reddit,可选) - - 返回: + """Return detailed simulation run status (including all actions). + + Used by the frontend for live activity views. + + Query params: + platform: Filter platform (twitter/reddit, optional). + + Response: { "success": true, "data": { @@ -1788,8 +1730,8 @@ def get_run_status_detail(simulation_id: str): }, ... ], - "twitter_actions": [...], # Twitter 平台的所有动作 - "reddit_actions": [...] # Reddit 平台的所有动作 + "twitter_actions": [...], # All actions on the Twitter platform + "reddit_actions": [...] # All actions on the Reddit platform } } """ @@ -1809,38 +1751,35 @@ def get_run_status_detail(simulation_id: str): } }) - # 获取完整的动作列表 all_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform=platform_filter ) - - # 分平台获取动作 + + # Per-platform action lists. twitter_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform="twitter" ) if not platform_filter or platform_filter == "twitter" else [] - + reddit_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform="reddit" ) if not platform_filter or platform_filter == "reddit" else [] - - # 获取当前轮次的动作(recent_actions 只展示最新一轮) + + # `recent_actions` only surfaces the latest round. current_round = run_state.current_round recent_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform=platform_filter, round_num=current_round ) if current_round > 0 else [] - - # 获取基础状态信息 + result = run_state.to_dict() result["all_actions"] = [a.to_dict() for a in all_actions] result["twitter_actions"] = [a.to_dict() for a in twitter_actions] result["reddit_actions"] = [a.to_dict() for a in reddit_actions] result["rounds_count"] = len(run_state.rounds) - # recent_actions 只展示当前最新一轮两个平台的内容 result["recent_actions"] = [a.to_dict() for a in recent_actions] return jsonify({ @@ -1859,17 +1798,16 @@ def get_run_status_detail(simulation_id: str): @simulation_bp.route('//actions', methods=['GET']) def get_simulation_actions(simulation_id: str): - """ - 获取模拟中的Agent动作历史 - - Query参数: - limit: 返回数量(默认100) - offset: 偏移量(默认0) - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent ID - round_num: 过滤轮次 - - 返回: + """Return the agent action history for a simulation. + + Query params: + limit: Number of items to return (default 100). + offset: Offset (default 0). + platform: Filter platform (twitter/reddit). + agent_id: Filter agent ID. + round_num: Filter round. + + Response: { "success": true, "data": { @@ -1913,16 +1851,16 @@ def get_simulation_actions(simulation_id: str): @simulation_bp.route('//timeline', methods=['GET']) def get_simulation_timeline(simulation_id: str): - """ - 获取模拟时间线(按轮次汇总) - - 用于前端展示进度条和时间线视图 - - Query参数: - start_round: 起始轮次(默认0) - end_round: 结束轮次(默认全部) - - 返回每轮的汇总信息 + """Return the simulation timeline (summary per round). + + Used by the frontend for the progress bar and timeline view. + + Query params: + start_round: Starting round (default 0). + end_round: Ending round (default: all). + + Returns: + Per-round summary info. """ try: start_round = request.args.get('start_round', 0, type=int) @@ -1953,10 +1891,9 @@ def get_simulation_timeline(simulation_id: str): @simulation_bp.route('//agent-stats', methods=['GET']) def get_agent_stats(simulation_id: str): - """ - 获取每个Agent的统计信息 - - 用于前端展示Agent活跃度排行、动作分布等 + """Return per-agent statistics. + + Used by the frontend to show agent activity rankings, action distribution, etc. """ try: stats = SimulationRunner.get_agent_stats(simulation_id) @@ -1978,19 +1915,19 @@ def get_agent_stats(simulation_id: str): }), 500 -# ============== 数据库查询接口 ============== +# ============== Database query endpoints ============== @simulation_bp.route('//posts', methods=['GET']) def get_simulation_posts(simulation_id: str): - """ - 获取模拟中的帖子 - - Query参数: - platform: 平台类型(twitter/reddit) - limit: 返回数量(默认50) - offset: 偏移量 - - 返回帖子列表(从SQLite数据库读取) + """Return the posts created in a simulation. + + Query params: + platform: Platform (twitter/reddit). + limit: Number of items to return (default 50). + offset: Offset. + + Returns: + List of posts (read from the SQLite database). """ try: platform = request.args.get('platform', 'reddit') @@ -2060,13 +1997,12 @@ def get_simulation_posts(simulation_id: str): @simulation_bp.route('//comments', methods=['GET']) def get_simulation_comments(simulation_id: str): - """ - 获取模拟中的评论(仅Reddit) - - Query参数: - post_id: 过滤帖子ID(可选) - limit: 返回数量 - offset: 偏移量 + """Return comments from a simulation (Reddit only). + + Query params: + post_id: Filter by post ID (optional). + limit: Number of items to return. + offset: Offset. """ try: post_id = request.args.get('post_id') @@ -2133,31 +2069,31 @@ def get_simulation_comments(simulation_id: str): }), 500 -# ============== Interview 采访接口 ============== +# ============== Interview endpoints ============== @simulation_bp.route('/interview', methods=['POST']) def interview_agent(): - """ - 采访单个Agent + """Interview a single agent. - 注意:此功能需要模拟环境处于运行状态(完成模拟循环后进入等待命令模式) + Note: requires the simulation environment to be running (i.e. the sim loop has + finished and the runner is in command-wait mode). - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "agent_id": 0, // 必填,Agent ID - "prompt": "你对这件事有什么看法?", // 必填,采访问题 - "platform": "twitter", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟同时采访两个平台 - "timeout": 60 // 可选,超时时间(秒),默认60 + "simulation_id": "sim_xxxx", // required + "agent_id": 0, // required + "prompt": "...", // required, interview question + "platform": "twitter", // optional (twitter/reddit) + // omit -> dual-platform sims interview both platforms + "timeout": 60 // optional, timeout in seconds, default 60 } - 返回(不指定platform,双平台模式): + Response (when `platform` is omitted; dual-platform mode): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "...", "result": { "agent_id": 0, "prompt": "...", @@ -2170,15 +2106,15 @@ def interview_agent(): } } - 返回(指定platform): + Response (when `platform` is specified): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "...", "result": { "agent_id": 0, - "response": "我认为...", + "response": "...", "platform": "twitter", "timestamp": "2025-12-08T10:00:00" }, @@ -2192,7 +2128,7 @@ def interview_agent(): simulation_id = data.get('simulation_id') agent_id = data.get('agent_id') prompt = data.get('prompt') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter / reddit / None timeout = data.get('timeout', 60) if not simulation_id: @@ -2213,21 +2149,19 @@ def interview_agent(): "error": t("api.error.simulation.m083") }), 400 - # 验证platform参数 if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t("api.error.simulation.m084") }), 400 - - # 检查环境状态 + if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t("api.error.simulation.m085") }), 400 - - # 优化prompt,添加前缀避免Agent调用工具 + + # Inject the no-tool-call prefix into the prompt. optimized_prompt = optimize_interview_prompt(prompt) result = SimulationRunner.interview_agent( @@ -2266,31 +2200,30 @@ def interview_agent(): @simulation_bp.route('/interview/batch', methods=['POST']) def interview_agents_batch(): - """ - 批量采访多个Agent + """Interview multiple agents in batch. - 注意:此功能需要模拟环境处于运行状态 + Note: requires the simulation environment to be running. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "interviews": [ // 必填,采访列表 + "simulation_id": "sim_xxxx", // required + "interviews": [ // required { "agent_id": 0, - "prompt": "你对A有什么看法?", - "platform": "twitter" // 可选,指定该Agent的采访平台 + "prompt": "...", + "platform": "twitter" // optional, per-agent platform override }, { "agent_id": 1, - "prompt": "你对B有什么看法?" // 不指定platform则使用默认值 + "prompt": "..." // omit `platform` to use the default } ], - "platform": "reddit", // 可选,默认平台(被每项的platform覆盖) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 120 // 可选,超时时间(秒),默认120 + "platform": "reddit", // optional default platform (overridden by each item's platform) + // omit -> dual-platform sims interview each agent on both platforms + "timeout": 120 // optional, timeout in seconds, default 120 } - 返回: + Response: { "success": true, "data": { @@ -2313,7 +2246,7 @@ def interview_agents_batch(): simulation_id = data.get('simulation_id') interviews = data.get('interviews') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter / reddit / None timeout = data.get('timeout', 120) if not simulation_id: @@ -2328,14 +2261,13 @@ def interview_agents_batch(): "error": t("api.error.simulation.m089") }), 400 - # 验证platform参数 if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t("api.error.simulation.m090") }), 400 - # 验证每个采访项 + # Validate each interview item. for i, interview in enumerate(interviews): if 'agent_id' not in interview: return jsonify({ @@ -2347,7 +2279,7 @@ def interview_agents_batch(): "success": False, "error": t("api.error.simulation.m092", i=i + 1) }), 400 - # 验证每项的platform(如果有) + # Validate each item's platform (if present). item_platform = interview.get('platform') if item_platform and item_platform not in ("twitter", "reddit"): return jsonify({ @@ -2355,14 +2287,13 @@ def interview_agents_batch(): "error": t("api.error.simulation.m093", i=i + 1) }), 400 - # 检查环境状态 if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t("api.error.simulation.m094") }), 400 - # 优化每个采访项的prompt,添加前缀避免Agent调用工具 + # Inject the no-tool-call prefix into every interview prompt. optimized_interviews = [] for interview in interviews: optimized_interview = interview.copy() @@ -2404,21 +2335,20 @@ def interview_agents_batch(): @simulation_bp.route('/interview/all', methods=['POST']) def interview_all_agents(): - """ - 全局采访 - 使用相同问题采访所有Agent + """Global interview — ask the same question of every agent. - 注意:此功能需要模拟环境处于运行状态 + Note: requires the simulation environment to be running. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "prompt": "你对这件事整体有什么看法?", // 必填,采访问题(所有Agent使用相同问题) - "platform": "reddit", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 180 // 可选,超时时间(秒),默认180 + "simulation_id": "sim_xxxx", // required + "prompt": "...", // required, the same question for every agent + "platform": "reddit", // optional (twitter/reddit) + // omit -> dual-platform sims interview each agent on both platforms + "timeout": 180 // optional, timeout in seconds, default 180 } - 返回: + Response: { "success": true, "data": { @@ -2440,7 +2370,7 @@ def interview_all_agents(): simulation_id = data.get('simulation_id') prompt = data.get('prompt') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # optional: twitter / reddit / None timeout = data.get('timeout', 180) if not simulation_id: @@ -2455,21 +2385,19 @@ def interview_all_agents(): "error": t("api.error.simulation.m098") }), 400 - # 验证platform参数 if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, "error": t("api.error.simulation.m099") }), 400 - # 检查环境状态 if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, "error": t("api.error.simulation.m100") }), 400 - # 优化prompt,添加前缀避免Agent调用工具 + # Inject the no-tool-call prefix into the prompt. optimized_prompt = optimize_interview_prompt(prompt) result = SimulationRunner.interview_all_agents( @@ -2507,21 +2435,20 @@ def interview_all_agents(): @simulation_bp.route('/interview/history', methods=['POST']) def get_interview_history(): - """ - 获取Interview历史记录 + """Return interview history. - 从模拟数据库中读取所有Interview记录 + Reads all interview records from the simulation database. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "reddit", // 可选,平台类型(reddit/twitter) - // 不指定则返回两个平台的所有历史 - "agent_id": 0, // 可选,只获取该Agent的采访历史 - "limit": 100 // 可选,返回数量,默认100 + "simulation_id": "sim_xxxx", // required + "platform": "reddit", // optional (reddit/twitter) + // omit -> return history for both platforms + "agent_id": 0, // optional, restrict to one agent + "limit": 100 // optional, default 100 } - 返回: + Response: { "success": true, "data": { @@ -2529,8 +2456,8 @@ def get_interview_history(): "history": [ { "agent_id": 0, - "response": "我认为...", - "prompt": "你对这件事有什么看法?", + "response": "...", + "prompt": "...", "timestamp": "2025-12-08T10:00:00", "platform": "reddit" }, @@ -2543,7 +2470,7 @@ def get_interview_history(): data = request.get_json() or {} simulation_id = data.get('simulation_id') - platform = data.get('platform') # 不指定则返回两个平台的历史 + platform = data.get('platform') # When omitted, returns history for both platforms. agent_id = data.get('agent_id') limit = data.get('limit', 100) @@ -2579,17 +2506,17 @@ def get_interview_history(): @simulation_bp.route('/env-status', methods=['POST']) def get_env_status(): - """ - 获取模拟环境状态 + """Return the simulation environment status. - 检查模拟环境是否存活(可以接收Interview命令) + Checks whether the simulation environment is alive (i.e. able to accept + interview commands). - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // required } - 返回: + Response: { "success": true, "data": { @@ -2597,7 +2524,7 @@ def get_env_status(): "env_alive": true, "twitter_available": true, "reddit_available": true, - "message": "环境正在运行,可以接收Interview命令" + "message": "..." } } """ @@ -2613,8 +2540,7 @@ def get_env_status(): }), 400 env_alive = SimulationRunner.check_env_alive(simulation_id) - - # 获取更详细的状态信息 + env_status = SimulationRunner.get_env_status_detail(simulation_id) if env_alive: @@ -2644,25 +2570,25 @@ def get_env_status(): @simulation_bp.route('/close-env', methods=['POST']) def close_simulation_env(): - """ - 关闭模拟环境 - - 向模拟发送关闭环境命令,使其优雅退出等待命令模式。 - - 注意:这不同于 /stop 接口,/stop 会强制终止进程, - 而此接口会让模拟优雅地关闭环境并退出。 - - 请求(JSON): + """Close the simulation environment. + + Sends a "close-env" command to the simulation so it can gracefully exit + command-wait mode. + + Note: this is different from `/stop`, which kills the process. This + endpoint asks the simulation to shut down its environment cleanly. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "timeout": 30 // 可选,超时时间(秒),默认30 + "simulation_id": "sim_xxxx", // required + "timeout": 30 // optional, timeout in seconds, default 30 } - - 返回: + + Response: { "success": true, "data": { - "message": "环境关闭命令已发送", + "message": "...", "result": {...}, "timestamp": "2025-12-08T10:00:01" } @@ -2685,7 +2611,6 @@ def close_simulation_env(): timeout=timeout ) - # 更新模拟状态 manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: diff --git a/backend/app/config.py b/backend/app/config.py index e6939c78..ab0867d3 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,38 +1,40 @@ -""" -配置管理 -统一从项目根目录的 .env 文件加载配置 +"""Configuration management. + +Loads configuration values from the project-root ``.env`` file. """ import os from dotenv import load_dotenv -# 加载项目根目录的 .env 文件 -# 路径: MiroFish/.env (相对于 backend/app/config.py) +# Load the project-root .env file. +# Path: MiroFish/.env (relative to backend/app/config.py). project_root_env = os.path.join(os.path.dirname(__file__), '../../.env') if os.path.exists(project_root_env): load_dotenv(project_root_env, override=True) else: - # 如果根目录没有 .env,尝试加载环境变量(用于生产环境) + # If the project root has no .env, fall back to the process environment + # (used in production deployments). load_dotenv(override=True) class Config: - """Flask配置类""" - - # Flask配置 + """Flask configuration class.""" + + # Flask settings. SECRET_KEY = os.environ.get('SECRET_KEY', 'mirofish-secret-key') DEBUG = os.environ.get('FLASK_DEBUG', 'True').lower() == 'true' - - # JSON配置 - 禁用ASCII转义,让中文直接显示(而不是 \uXXXX 格式) + + # JSON settings: disable ASCII escaping so non-ASCII output renders literally + # rather than as \uXXXX escape sequences. JSON_AS_ASCII = False - - # LLM配置(统一使用OpenAI格式) + + # LLM settings (called via the OpenAI-compatible API surface). LLM_API_KEY = os.environ.get('LLM_API_KEY') LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'https://api.openai.com/v1') LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'gpt-4o-mini') - - # Neo4j + Graphiti配置(替代 Zep Cloud) + + # Neo4j + Graphiti settings (replacement for Zep Cloud). NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687') NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j') NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'mirofish123') @@ -50,23 +52,23 @@ class Config: EMBEDDING_API_KEY = os.environ.get('EMBEDDING_API_KEY') EMBEDDING_BASE_URL = os.environ.get('EMBEDDING_BASE_URL') - # Zep配置(保留兼容性,已废弃) + # Zep settings (kept for backwards compatibility; deprecated). ZEP_API_KEY = os.environ.get('ZEP_API_KEY', '') - - # 文件上传配置 + + # File upload settings. MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} - - # 文本处理配置 - DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 - DEFAULT_CHUNK_OVERLAP = 50 # 默认重叠大小 - - # OASIS模拟配置 + + # Text processing settings. + DEFAULT_CHUNK_SIZE = 500 # default chunk size in characters + DEFAULT_CHUNK_OVERLAP = 50 # default overlap in characters + + # OASIS simulation settings. OASIS_DEFAULT_MAX_ROUNDS = int(os.environ.get('OASIS_DEFAULT_MAX_ROUNDS', '10')) OASIS_SIMULATION_DATA_DIR = os.path.join(os.path.dirname(__file__), '../uploads/simulations') - - # OASIS平台可用动作配置 + + # OASIS per-platform allowed action lists. OASIS_TWITTER_ACTIONS = [ 'CREATE_POST', 'LIKE_POST', 'REPOST', 'FOLLOW', 'DO_NOTHING', 'QUOTE_POST' ] @@ -76,14 +78,14 @@ class Config: 'TREND', 'REFRESH', 'DO_NOTHING', 'FOLLOW', 'MUTE' ] - # Report Agent配置 + # Report agent settings. REPORT_AGENT_MAX_TOOL_CALLS = int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) - + @classmethod def validate(cls): - """验证必要配置""" + """Validate that required configuration values are present.""" errors = [] if not cls.LLM_API_KEY: errors.append("LLM_API_KEY 未配置") diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 55bec619..b5118d01 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -1,6 +1,4 @@ -""" -数据模型模块 -""" +"""Data model package.""" from .task import TaskManager, TaskStatus from .project import Project, ProjectStatus, ProjectManager diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 08978937..81d9a3e7 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -1,6 +1,7 @@ -""" -项目上下文管理 -用于在服务端持久化项目状态,避免前端在接口间传递大量数据 +"""Project context management. + +Persists project state on the server so the frontend does not have to round-trip +large blobs of context between API calls. """ import os @@ -15,45 +16,45 @@ from ..config import Config class ProjectStatus(str, Enum): - """项目状态""" - CREATED = "created" # 刚创建,文件已上传 - ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成 - GRAPH_BUILDING = "graph_building" # 图谱构建中 - GRAPH_COMPLETED = "graph_completed" # 图谱构建完成 - FAILED = "failed" # 失败 + """Project lifecycle status.""" + CREATED = "created" # just created, files uploaded + ONTOLOGY_GENERATED = "ontology_generated" # ontology has been generated + GRAPH_BUILDING = "graph_building" # graph build in progress + GRAPH_COMPLETED = "graph_completed" # graph build finished + FAILED = "failed" # build failed @dataclass class Project: - """项目数据模型""" + """Project data model.""" project_id: str name: str status: ProjectStatus created_at: str updated_at: str - - # 文件信息 + + # File information files: List[Dict[str, str]] = field(default_factory=list) # [{filename, path, size}] total_text_length: int = 0 - - # 本体信息(接口1生成后填充) + + # Ontology information (filled in after step 1 generates it) ontology: Optional[Dict[str, Any]] = None analysis_summary: Optional[str] = None - - # 图谱信息(接口2完成后填充) + + # Graph information (filled in after step 2 finishes) graph_id: Optional[str] = None graph_build_task_id: Optional[str] = None - - # 配置 + + # Configuration simulation_requirement: Optional[str] = None chunk_size: int = 500 chunk_overlap: int = 50 - - # 错误信息 + + # Error message when status == FAILED error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Serialize the project to a JSON-friendly dict.""" return { "project_id": self.project_id, "name": self.name, @@ -71,14 +72,14 @@ class Project: "chunk_overlap": self.chunk_overlap, "error": self.error } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'Project': - """从字典创建""" + """Reconstruct a project from its serialized dict.""" status = data.get('status', 'created') if isinstance(status, str): status = ProjectStatus(status) - + return cls( project_id=data['project_id'], name=data.get('name', 'Unnamed Project'), @@ -99,52 +100,51 @@ class Project: class ProjectManager: - """项目管理器 - 负责项目的持久化存储和检索""" - - # 项目存储根目录 + """Project manager: handles persistence and retrieval of projects on disk.""" + + # Root directory for project storage PROJECTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'projects') - + @classmethod def _ensure_projects_dir(cls): - """确保项目目录存在""" + """Ensure the projects root directory exists.""" os.makedirs(cls.PROJECTS_DIR, exist_ok=True) - + @classmethod def _get_project_dir(cls, project_id: str) -> str: - """获取项目目录路径""" + """Return the on-disk directory for a project.""" return os.path.join(cls.PROJECTS_DIR, project_id) - + @classmethod def _get_project_meta_path(cls, project_id: str) -> str: - """获取项目元数据文件路径""" + """Return the path to a project's metadata JSON file.""" return os.path.join(cls._get_project_dir(project_id), 'project.json') - + @classmethod def _get_project_files_dir(cls, project_id: str) -> str: - """获取项目文件存储目录""" + """Return the directory where project source files are stored.""" return os.path.join(cls._get_project_dir(project_id), 'files') - + @classmethod def _get_project_text_path(cls, project_id: str) -> str: - """获取项目提取文本存储路径""" + """Return the path to a project's extracted text file.""" return os.path.join(cls._get_project_dir(project_id), 'extracted_text.txt') - + @classmethod def create_project(cls, name: str = "Unnamed Project") -> Project: - """ - 创建新项目 - + """Create a new project. + Args: - name: 项目名称 - + name: Display name for the project. + Returns: - 新创建的Project对象 + The newly created ``Project`` instance. """ cls._ensure_projects_dir() - + project_id = f"proj_{uuid.uuid4().hex[:12]}" now = datetime.now().isoformat() - + project = Project( project_id=project_id, name=name, @@ -152,154 +152,147 @@ class ProjectManager: created_at=now, updated_at=now ) - - # 创建项目目录结构 + + # Create the on-disk project directory layout project_dir = cls._get_project_dir(project_id) files_dir = cls._get_project_files_dir(project_id) os.makedirs(project_dir, exist_ok=True) os.makedirs(files_dir, exist_ok=True) - - # 保存项目元数据 + + # Persist project metadata cls.save_project(project) - + return project - + @classmethod def save_project(cls, project: Project) -> None: - """保存项目元数据""" + """Persist project metadata to disk.""" project.updated_at = datetime.now().isoformat() meta_path = cls._get_project_meta_path(project.project_id) - + with open(meta_path, 'w', encoding='utf-8') as f: json.dump(project.to_dict(), f, ensure_ascii=False, indent=2) - + @classmethod def get_project(cls, project_id: str) -> Optional[Project]: - """ - 获取项目 - + """Load a project by id. + Args: - project_id: 项目ID - + project_id: Project identifier. + Returns: - Project对象,如果不存在返回None + The ``Project`` if it exists, otherwise ``None``. """ meta_path = cls._get_project_meta_path(project_id) - + if not os.path.exists(meta_path): return None - + with open(meta_path, 'r', encoding='utf-8') as f: data = json.load(f) - + return Project.from_dict(data) - + @classmethod def list_projects(cls, limit: int = 50) -> List[Project]: - """ - 列出所有项目 - + """List existing projects, newest first. + Args: - limit: 返回数量限制 - + limit: Maximum number of projects to return. + Returns: - 项目列表,按创建时间倒序 + Projects ordered by ``created_at`` descending. """ cls._ensure_projects_dir() - + projects = [] for project_id in os.listdir(cls.PROJECTS_DIR): project = cls.get_project(project_id) if project: projects.append(project) - - # 按创建时间倒序排序 + projects.sort(key=lambda p: p.created_at, reverse=True) - + return projects[:limit] - + @classmethod def delete_project(cls, project_id: str) -> bool: - """ - 删除项目及其所有文件 - + """Delete a project and all of its files. + Args: - project_id: 项目ID - + project_id: Project identifier. + Returns: - 是否删除成功 + ``True`` if the project existed and was removed, ``False`` otherwise. """ project_dir = cls._get_project_dir(project_id) - + if not os.path.exists(project_dir): return False - + shutil.rmtree(project_dir) return True - + @classmethod def save_file_to_project(cls, project_id: str, file_storage, original_filename: str) -> Dict[str, str]: - """ - 保存上传的文件到项目目录 - + """Save an uploaded file under the project's files directory. + Args: - project_id: 项目ID - file_storage: Flask的FileStorage对象 - original_filename: 原始文件名 - + project_id: Project identifier. + file_storage: Flask ``FileStorage`` object from the request. + original_filename: The user-supplied filename. + Returns: - 文件信息字典 {filename, path, size} + Dict describing the saved file: ``{original_filename, saved_filename, path, size}``. """ files_dir = cls._get_project_files_dir(project_id) os.makedirs(files_dir, exist_ok=True) - - # 生成安全的文件名 + + # Generate a safe randomized filename to avoid collisions ext = os.path.splitext(original_filename)[1].lower() safe_filename = f"{uuid.uuid4().hex[:8]}{ext}" file_path = os.path.join(files_dir, safe_filename) - - # 保存文件 + file_storage.save(file_path) - - # 获取文件大小 + file_size = os.path.getsize(file_path) - + return { "original_filename": original_filename, "saved_filename": safe_filename, "path": file_path, "size": file_size } - + @classmethod def save_extracted_text(cls, project_id: str, text: str) -> None: - """保存提取的文本""" + """Persist the project's extracted full text to disk.""" text_path = cls._get_project_text_path(project_id) with open(text_path, 'w', encoding='utf-8') as f: f.write(text) - + @classmethod def get_extracted_text(cls, project_id: str) -> Optional[str]: - """获取提取的文本""" + """Read back the project's extracted full text, or ``None`` if absent.""" text_path = cls._get_project_text_path(project_id) - + if not os.path.exists(text_path): return None - + with open(text_path, 'r', encoding='utf-8') as f: return f.read() - + @classmethod def get_project_files(cls, project_id: str) -> List[str]: - """获取项目的所有文件路径""" + """Return the on-disk paths of all files in the project.""" files_dir = cls._get_project_files_dir(project_id) - + if not os.path.exists(files_dir): return [] - + return [ - os.path.join(files_dir, f) - for f in os.listdir(files_dir) + os.path.join(files_dir, f) + for f in os.listdir(files_dir) if os.path.isfile(os.path.join(files_dir, f)) ] diff --git a/backend/app/models/task.py b/backend/app/models/task.py index dfebed23..c36290f1 100644 --- a/backend/app/models/task.py +++ b/backend/app/models/task.py @@ -1,6 +1,6 @@ -""" -任务状态管理 -用于跟踪长时间运行的任务(如图谱构建) +"""Task state management. + +Tracks long-running tasks (e.g. graph build) so callers can poll progress. """ import uuid @@ -14,30 +14,30 @@ from ..utils.locale import t class TaskStatus(str, Enum): - """任务状态枚举""" - PENDING = "pending" # 等待中 - PROCESSING = "processing" # 处理中 - COMPLETED = "completed" # 已完成 - FAILED = "failed" # 失败 + """Task status enum.""" + PENDING = "pending" # waiting + PROCESSING = "processing" # in progress + COMPLETED = "completed" # finished successfully + FAILED = "failed" # finished with error @dataclass class Task: - """任务数据类""" + """Task data class.""" task_id: str task_type: str status: TaskStatus created_at: datetime updated_at: datetime - progress: int = 0 # 总进度百分比 0-100 - message: str = "" # 状态消息 - result: Optional[Dict] = None # 任务结果 - error: Optional[str] = None # 错误信息 - metadata: Dict = field(default_factory=dict) # 额外元数据 - progress_detail: Dict = field(default_factory=dict) # 详细进度信息 - + progress: int = 0 # overall progress percentage 0-100 + message: str = "" # human-readable status message + result: Optional[Dict] = None # task result payload + error: Optional[str] = None # error message when failed + metadata: Dict = field(default_factory=dict) # arbitrary caller metadata + progress_detail: Dict = field(default_factory=dict) # fine-grained progress info + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Serialize the task to a JSON-friendly dict.""" return { "task_id": self.task_id, "task_type": self.task_type, @@ -54,16 +54,12 @@ class Task: class TaskManager: - """ - 任务管理器 - 线程安全的任务状态管理 - """ - + """Thread-safe singleton task registry.""" + _instance = None _lock = threading.Lock() - + def __new__(cls): - """单例模式""" if cls._instance is None: with cls._lock: if cls._instance is None: @@ -71,21 +67,20 @@ class TaskManager: cls._instance._tasks: Dict[str, Task] = {} cls._instance._task_lock = threading.Lock() return cls._instance - + def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str: - """ - 创建新任务 - + """Create a new task. + Args: - task_type: 任务类型 - metadata: 额外元数据 - + task_type: Task type identifier. + metadata: Optional caller-supplied metadata. + Returns: - 任务ID + The newly created task id. """ task_id = str(uuid.uuid4()) now = datetime.now() - + task = Task( task_id=task_id, task_type=task_type, @@ -94,17 +89,17 @@ class TaskManager: updated_at=now, metadata=metadata or {} ) - + with self._task_lock: self._tasks[task_id] = task - + return task_id - + def get_task(self, task_id: str) -> Optional[Task]: - """获取任务""" + """Return the task for ``task_id`` or ``None`` if unknown.""" with self._task_lock: return self._tasks.get(task_id) - + def update_task( self, task_id: str, @@ -115,17 +110,16 @@ class TaskManager: error: Optional[str] = None, progress_detail: Optional[Dict] = None ): - """ - 更新任务状态 - + """Update mutable fields on an existing task. + Args: - task_id: 任务ID - status: 新状态 - progress: 进度 - message: 消息 - result: 结果 - error: 错误信息 - progress_detail: 详细进度信息 + task_id: Task id to update. + status: New status, if changing. + progress: New overall progress (0-100), if changing. + message: New status message, if changing. + result: New result payload, if changing. + error: New error message, if changing. + progress_detail: New fine-grained progress info, if changing. """ with self._task_lock: task = self._tasks.get(task_id) @@ -143,9 +137,9 @@ class TaskManager: task.error = error if progress_detail is not None: task.progress_detail = progress_detail - + def complete_task(self, task_id: str, result: Dict): - """标记任务完成""" + """Mark a task as completed and attach the result.""" self.update_task( task_id, status=TaskStatus.COMPLETED, @@ -153,29 +147,29 @@ class TaskManager: message=t('progress.taskComplete'), result=result ) - + def fail_task(self, task_id: str, error: str): - """标记任务失败""" + """Mark a task as failed and attach the error message.""" self.update_task( task_id, status=TaskStatus.FAILED, message=t('progress.taskFailed'), error=error ) - + def list_tasks(self, task_type: Optional[str] = None) -> list: - """列出任务""" + """List tasks, optionally filtered by ``task_type``, newest first.""" with self._task_lock: tasks = list(self._tasks.values()) if task_type: tasks = [t for t in tasks if t.task_type == task_type] return [t.to_dict() for t in sorted(tasks, key=lambda x: x.created_at, reverse=True)] - + def cleanup_old_tasks(self, max_age_hours: int = 24): - """清理旧任务""" + """Drop completed/failed tasks older than ``max_age_hours``.""" from datetime import timedelta cutoff = datetime.now() - timedelta(hours=max_age_hours) - + with self._task_lock: old_ids = [ tid for tid, task in self._tasks.items() diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py index 8db85d86..b0d4018a 100644 --- a/backend/app/services/__init__.py +++ b/backend/app/services/__init__.py @@ -1,6 +1,4 @@ -""" -业务服务模块 -""" +"""Business services package.""" from .ontology_generator import OntologyGenerator from .graph_builder import GraphBuilderService diff --git a/backend/app/services/graph_builder.py b/backend/app/services/graph_builder.py index 57262ab5..c21f44cb 100644 --- a/backend/app/services/graph_builder.py +++ b/backend/app/services/graph_builder.py @@ -1,6 +1,7 @@ -""" -图谱构建服务 -接口2:使用Zep API构建Standalone Graph +"""Graph build service. + +Pipeline step 2: build the project's standalone knowledge graph through the +Zep/Graphiti API. """ import os @@ -69,7 +70,7 @@ def _classify_entity_type(name: str, summary: str, ontology: Optional[Dict]) -> @dataclass class GraphInfo: - """图谱信息""" + """Summary information about a built graph.""" graph_id: str node_count: int edge_count: int @@ -85,10 +86,7 @@ class GraphInfo: class GraphBuilderService: - """ - 图谱构建服务 - 负责调用Zep API构建知识图谱 - """ + """Drives knowledge-graph construction via the Zep/Graphiti API.""" def __init__(self, api_key: Optional[str] = None): self.client = GraphitiAdapter() @@ -103,21 +101,20 @@ class GraphBuilderService: chunk_overlap: int = 50, batch_size: int = 3 ) -> str: - """ - 异步构建图谱 - + """Kick off a graph build asynchronously. + Args: - text: 输入文本 - ontology: 本体定义(来自接口1的输出) - graph_name: 图谱名称 - chunk_size: 文本块大小 - chunk_overlap: 块重叠大小 - batch_size: 每批发送的块数量 - + text: Source text to ingest. + ontology: Ontology definition (the output of pipeline step 1). + graph_name: Display name for the graph. + chunk_size: Characters per text chunk. + chunk_overlap: Overlap (in characters) between consecutive chunks. + batch_size: Number of chunks pushed to Zep per batch. + Returns: - 任务ID + The id of the task tracking the build. """ - # 创建任务 + # Register a task to track build progress. task_id = self.task_manager.create_task( task_type="graph_build", metadata={ @@ -130,7 +127,7 @@ class GraphBuilderService: # Capture locale before spawning background thread current_locale = get_locale() - # 在后台线程中执行构建 + # Run the build on a background thread so the request returns immediately. thread = threading.Thread( target=self._build_graph_worker, args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size, current_locale) @@ -151,7 +148,7 @@ class GraphBuilderService: batch_size: int, locale: str = 'zh' ): - """图谱构建工作线程""" + """Background worker that performs the graph build.""" set_locale(locale) try: self.task_manager.update_task( @@ -161,7 +158,7 @@ class GraphBuilderService: message=t('progress.startBuildingGraph') ) - # 1. 创建图谱 + # 1. Create the graph. graph_id = self.create_graph(graph_name) self.task_manager.update_task( task_id, @@ -169,7 +166,7 @@ class GraphBuilderService: message=t('progress.graphCreated', graphId=graph_id) ) - # 2. 设置本体 + # 2. Set the ontology. self.set_ontology(graph_id, ontology) self.task_manager.update_task( task_id, @@ -177,7 +174,7 @@ class GraphBuilderService: message=t('progress.ontologySet') ) - # 3. 文本分块 + # 3. Split source text into chunks. chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap) total_chunks = len(chunks) self.task_manager.update_task( @@ -186,7 +183,7 @@ class GraphBuilderService: message=t('progress.textSplit', count=total_chunks) ) - # 4. 分批发送数据 + # 4. Push chunks to the graph in batches. episode_uuids = self.add_text_batches( graph_id, chunks, batch_size, lambda msg, prog: self.task_manager.update_task( @@ -196,7 +193,7 @@ class GraphBuilderService: ) ) - # 5. 等待Zep处理完成 + # 5. Wait for Zep to finish processing the episodes. self.task_manager.update_task( task_id, progress=60, @@ -212,7 +209,7 @@ class GraphBuilderService: ) ) - # 6. 获取图谱信息 + # 6. Fetch the final graph metadata. self.task_manager.update_task( task_id, progress=90, @@ -220,8 +217,7 @@ class GraphBuilderService: ) graph_info = self._get_graph_info(graph_id) - - # 完成 + self.task_manager.complete_task(task_id, { "graph_id": graph_id, "graph_info": graph_info.to_dict(), @@ -234,7 +230,7 @@ class GraphBuilderService: self.task_manager.fail_task(task_id, error_msg) def create_graph(self, name: str) -> str: - """创建Zep图谱(公开方法)""" + """Create a new Zep graph and return its id (public API).""" graph_id = f"mirofish_{uuid.uuid4().hex[:16]}" self.client.graph.create( @@ -246,7 +242,7 @@ class GraphBuilderService: return graph_id def set_ontology(self, graph_id: str, ontology: Dict[str, Any]): - """设置图谱本体提示(Graphiti自动提取实体,本体作为提示存储)""" + """Register the ontology with the graph (Graphiti uses it as an extraction prompt).""" self.client.graph.set_ontology( graph_ids=[graph_id], entities=ontology.get("entity_types"), @@ -261,8 +257,11 @@ class GraphBuilderService: progress_callback: Optional[Callable] = None, skip_chunks: int = 0, ) -> List[str]: - """分批添加文本到图谱,返回所有 episode 的 uuid 列表。 - skip_chunks: 跳过已处理的块数(用于断点续传)。""" + """Push chunks to the graph in batches; returns the uuids of all episodes added. + + Args: + skip_chunks: Number of chunks to skip (used for resume-after-restart). + """ episode_uuids = [] total_chunks = len(chunks) @@ -279,27 +278,26 @@ class GraphBuilderService: ) - # 构建episode数据 + # Build the per-episode payload structures expected by the client. episodes = [ type('Episode', (), {'data': chunk, 'type': 'text'})() for chunk in batch_chunks ] - # 发送到Zep try: batch_result = self.client.graph.add_batch( graph_id=graph_id, episodes=episodes ) - - # 收集返回的 episode uuid + + # Collect the uuids returned for each episode. if batch_result and isinstance(batch_result, list): for ep in batch_result: ep_uuid = getattr(ep, 'uuid_', None) or getattr(ep, 'uuid', None) if ep_uuid: episode_uuids.append(ep_uuid) - - # 避免请求过快 + + # Throttle to avoid overwhelming the upstream API. time.sleep(1) except Exception as e: @@ -315,7 +313,7 @@ class GraphBuilderService: progress_callback: Optional[Callable] = None, timeout: int = 600 ): - """等待所有 episode 处理完成(通过查询每个 episode 的 processed 状态)""" + """Poll each episode until Zep marks it processed, or the timeout expires.""" if not episode_uuids: if progress_callback: progress_callback(t('progress.noEpisodesWait'), 1.0) @@ -338,18 +336,18 @@ class GraphBuilderService: ) break - # 检查每个 episode 的处理状态 + # Check the processing state of each pending episode. for ep_uuid in list(pending_episodes): try: episode = self.client.graph.episode.get(uuid_=ep_uuid) is_processed = getattr(episode, 'processed', False) - + if is_processed: pending_episodes.remove(ep_uuid) completed_count += 1 - + except Exception as e: - # 忽略单个查询错误,继续 + # Tolerate a single failed query; the next loop iteration retries. pass elapsed = int(time.time() - start_time) @@ -360,20 +358,17 @@ class GraphBuilderService: ) if pending_episodes: - time.sleep(3) # 每3秒检查一次 + time.sleep(3) # poll every 3 seconds if progress_callback: progress_callback(t('progress.processingComplete', completed=completed_count, total=total_episodes), 1.0) def _get_graph_info(self, graph_id: str) -> GraphInfo: - """获取图谱信息""" - # 获取节点(分页) + """Fetch summary info (counts and entity types) for a graph.""" nodes = fetch_all_nodes(self.client, graph_id) - - # 获取边(分页) edges = fetch_all_edges(self.client, graph_id) - # 统计实体类型 + # Tally distinct entity types across all nodes. entity_types = set() for node in nodes: if node.labels: @@ -389,26 +384,24 @@ class GraphBuilderService: ) def get_graph_data(self, graph_id: str, ontology: Optional[Dict] = None) -> Dict[str, Any]: - """ - 获取完整图谱数据(包含详细信息) - + """Return the full graph payload including timestamps, attributes, and edges. + Args: - graph_id: 图谱ID - + graph_id: Graph identifier. + Returns: - 包含nodes和edges的字典,包括时间信息、属性等详细数据 + Dict with ``nodes``, ``edges``, and aggregate counts. """ nodes = fetch_all_nodes(self.client, graph_id) edges = fetch_all_edges(self.client, graph_id) - # 创建节点映射用于获取节点名称 + # Build a uuid->name map so edge endpoints can be labeled. node_map = {} for node in nodes: node_map[node.uuid_] = node.name or "" - + nodes_data = [] for node in nodes: - # 获取创建时间 created_at = getattr(node, 'created_at', None) if created_at: created_at = str(created_at) @@ -429,20 +422,18 @@ class GraphBuilderService: edges_data = [] for edge in edges: - # 获取时间信息 created_at = getattr(edge, 'created_at', None) valid_at = getattr(edge, 'valid_at', None) invalid_at = getattr(edge, 'invalid_at', None) expired_at = getattr(edge, 'expired_at', None) - - # 获取 episodes + + # Normalize the episode list (the field may be missing or a single id). episodes = getattr(edge, 'episodes', None) or getattr(edge, 'episode_ids', None) if episodes and not isinstance(episodes, list): episodes = [str(episodes)] elif episodes: episodes = [str(e) for e in episodes] - - # 获取 fact_type + fact_type = getattr(edge, 'fact_type', None) or edge.name or "" edges_data.append({ @@ -471,6 +462,6 @@ class GraphBuilderService: } def delete_graph(self, graph_id: str): - """删除图谱""" + """Delete a graph by id.""" self.client.graph.delete(graph_id=graph_id) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 98236ffd..e6cd57c6 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -1,11 +1,13 @@ """ -OASIS Agent Profile生成器 -将Zep图谱中的实体转换为OASIS模拟平台所需的Agent Profile格式 +OASIS Agent Profile generator. -优化改进: -1. 调用Zep检索功能二次丰富节点信息 -2. 优化提示词生成非常详细的人设 -3. 区分个人实体和抽象群体实体 +Converts entities from the Zep graph into the Agent Profile format required by +the OASIS simulation platform. + +Improvements: +1. Call Zep retrieval to further enrich node information. +2. Optimized prompts that produce highly detailed personas. +3. Distinguishes individual entities from abstract group entities. """ import json @@ -28,38 +30,38 @@ logger = get_logger('mirofish.oasis_profile') @dataclass class OasisAgentProfile: - """OASIS Agent Profile数据结构""" - # 通用字段 + """OASIS Agent Profile data structure.""" + # Common fields user_id: int user_name: str name: str bio: str persona: str - - # 可选字段 - Reddit风格 + + # Optional fields - Reddit style karma: int = 1000 - - # 可选字段 - Twitter风格 + + # Optional fields - Twitter style friend_count: int = 100 follower_count: int = 150 statuses_count: int = 500 - - # 额外人设信息 + + # Additional persona information age: Optional[int] = None gender: Optional[str] = None mbti: Optional[str] = None country: Optional[str] = None profession: Optional[str] = None interested_topics: List[str] = field(default_factory=list) - - # 来源实体信息 + + # Source entity information source_entity_uuid: Optional[str] = None source_entity_type: Optional[str] = None created_at: str = field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d")) def to_reddit_format(self) -> Dict[str, Any]: - """转换为Reddit平台格式""" + """Convert to Reddit platform format.""" profile = { "user_id": self.user_id, "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) @@ -69,8 +71,7 @@ class OasisAgentProfile: "karma": self.karma, "created_at": self.created_at, } - - # 添加额外人设信息(如果有) + if self.age: profile["age"] = self.age if self.gender: @@ -83,11 +84,11 @@ class OasisAgentProfile: profile["profession"] = self.profession if self.interested_topics: profile["interested_topics"] = self.interested_topics - + return profile - + def to_twitter_format(self) -> Dict[str, Any]: - """转换为Twitter平台格式""" + """Convert to Twitter platform format.""" profile = { "user_id": self.user_id, "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) @@ -99,8 +100,7 @@ class OasisAgentProfile: "statuses_count": self.statuses_count, "created_at": self.created_at, } - - # 添加额外人设信息 + if self.age: profile["age"] = self.age if self.gender: @@ -117,7 +117,7 @@ class OasisAgentProfile: return profile def to_dict(self) -> Dict[str, Any]: - """转换为完整字典格式""" + """Convert to a full dictionary representation.""" return { "user_id": self.user_id, "user_name": self.user_name, @@ -141,40 +141,39 @@ class OasisAgentProfile: class OasisProfileGenerator: + """OASIS Profile generator. + + Converts entities from the Zep graph into the Agent Profiles required by + the OASIS simulation. + + Highlights: + 1. Uses Zep graph retrieval to gather richer context. + 2. Produces highly detailed personas (basic info, career history, traits, + social-media behavior, etc.). + 3. Distinguishes individual entities from group/institution entities. """ - OASIS Profile生成器 - - 将Zep图谱中的实体转换为OASIS模拟所需的Agent Profile - - 优化特性: - 1. 调用Zep图谱检索功能获取更丰富的上下文 - 2. 生成非常详细的人设(包括基本信息、职业经历、性格特征、社交媒体行为等) - 3. 区分个人实体和抽象群体实体 - """ - - # MBTI类型列表 + MBTI_TYPES = [ "INTJ", "INTP", "ENTJ", "ENTP", "INFJ", "INFP", "ENFJ", "ENFP", "ISTJ", "ISFJ", "ESTJ", "ESFJ", "ISTP", "ISFP", "ESTP", "ESFP" ] - - # 常见国家列表 + COUNTRIES = [ - "China", "US", "UK", "Japan", "Germany", "France", + "China", "US", "UK", "Japan", "Germany", "France", "Canada", "Australia", "Brazil", "India", "South Korea" ] - - # 个人类型实体(需要生成具体人设) + + # Individual entity types — generate a concrete persona for each. INDIVIDUAL_ENTITY_TYPES = [ - "student", "alumni", "professor", "person", "publicfigure", + "student", "alumni", "professor", "person", "publicfigure", "expert", "faculty", "official", "journalist", "activist" ] - - # 群体/机构类型实体(需要生成群体代表人设) + + # Group / institution entity types — generate a representative-account persona. GROUP_ENTITY_TYPES = [ - "university", "governmentagency", "organization", "ngo", + "university", "governmentagency", "organization", "ngo", "mediaoutlet", "company", "institution", "group", "community" ] @@ -207,28 +206,24 @@ class OasisProfileGenerator: user_id: int, use_llm: bool = True ) -> OasisAgentProfile: - """ - 从Zep实体生成OASIS Agent Profile - + """Generate an OASIS Agent Profile from a Zep entity. + Args: - entity: Zep实体节点 - user_id: 用户ID(用于OASIS) - use_llm: 是否使用LLM生成详细人设 - + entity: The Zep entity node. + user_id: The OASIS user id to assign. + use_llm: Whether to use the LLM to generate a detailed persona. + Returns: OasisAgentProfile """ entity_type = entity.get_entity_type() or "Entity" - - # 基础信息 + name = entity.name user_name = self._generate_username(name) - - # 构建上下文信息 + context = self._build_entity_context(entity) - + if use_llm: - # 使用LLM生成详细人设 profile_data = self._generate_profile_with_llm( entity_name=name, entity_type=entity_type, @@ -237,7 +232,6 @@ class OasisProfileGenerator: context=context ) else: - # 使用规则生成基础人设 profile_data = self._generate_profile_rule_based( entity_name=name, entity_type=entity_type, @@ -266,27 +260,27 @@ class OasisProfileGenerator: ) def _generate_username(self, name: str) -> str: - """生成用户名""" - # 移除特殊字符,转换为小写 + """Generate a username from an entity name.""" + # Strip special characters and lowercase the name. username = name.lower().replace(" ", "_") username = ''.join(c for c in username if c.isalnum() or c == '_') - - # 添加随机后缀避免重复 + + # Append a random numeric suffix to avoid collisions. suffix = random.randint(100, 999) return f"{username}_{suffix}" def _search_zep_for_entity(self, entity: EntityNode) -> Dict[str, Any]: - """ - 使用Zep图谱混合搜索功能获取实体相关的丰富信息 - - Zep没有内置混合搜索接口,需要分别搜索edges和nodes然后合并结果。 - 使用并行请求同时搜索,提高效率。 - + """Use Zep hybrid graph search to gather rich context for an entity. + + Zep does not expose a built-in hybrid search endpoint, so we search + edges and nodes separately and merge the results. The two searches + run in parallel for throughput. + Args: - entity: 实体节点对象 - + entity: The entity node to search around. + Returns: - 包含facts, node_summaries, context的字典 + A dict with keys ``facts``, ``node_summaries`` and ``context``. """ import concurrent.futures @@ -301,7 +295,7 @@ class OasisProfileGenerator: "context": "" } - # 必须有graph_id才能进行搜索 + # A graph_id is required for any retrieval. if not self.graph_id: logger.debug(t("log.profile_generator.m001")) return results @@ -309,7 +303,7 @@ class OasisProfileGenerator: comprehensive_query = t('progress.zepSearchQuery', name=entity_name) def search_edges(): - """搜索边(事实/关系)- 带重试机制""" + """Search edges (facts / relationships) with retries.""" max_retries = 3 last_exception = None delay = 2.0 @@ -333,7 +327,7 @@ class OasisProfileGenerator: return None def search_nodes(): - """搜索节点(实体摘要)- 带重试机制""" + """Search nodes (entity summaries) with retries.""" max_retries = 3 last_exception = None delay = 2.0 @@ -357,24 +351,23 @@ class OasisProfileGenerator: return None try: - # 并行执行edges和nodes搜索 + # Run edge and node searches in parallel. with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: edge_future = executor.submit(search_edges) node_future = executor.submit(search_nodes) - - # 获取结果 + edge_result = edge_future.result(timeout=30) node_result = node_future.result(timeout=30) - - # 处理边搜索结果 + + # Process edge-search results. all_facts = set() if edge_result and hasattr(edge_result, 'edges') and edge_result.edges: for edge in edge_result.edges: if hasattr(edge, 'fact') and edge.fact: all_facts.add(edge.fact) results["facts"] = list(all_facts) - - # 处理节点搜索结果 + + # Process node-search results. all_summaries = set() if node_result and hasattr(node_result, 'nodes') and node_result.nodes: for node in node_result.nodes: @@ -383,8 +376,8 @@ class OasisProfileGenerator: if hasattr(node, 'name') and node.name and node.name != entity_name: all_summaries.add(f"相关实体: {node.name}") results["node_summaries"] = list(all_summaries) - - # 构建综合上下文 + + # Assemble the combined context block. context_parts = [] if results["facts"]: context_parts.append("事实信息:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) @@ -402,17 +395,16 @@ class OasisProfileGenerator: return results def _build_entity_context(self, entity: EntityNode) -> str: - """ - 构建实体的完整上下文信息 - - 包括: - 1. 实体本身的边信息(事实) - 2. 关联节点的详细信息 - 3. Zep混合检索到的丰富信息 + """Assemble the full context block for an entity. + + Includes: + 1. The entity's own edge information (facts). + 2. Detailed information about related nodes. + 3. Additional context retrieved from Zep hybrid search. """ context_parts = [] - - # 1. 添加实体属性信息 + + # 1. Entity attributes. if entity.attributes: attrs = [] for key, value in entity.attributes.items(): @@ -421,11 +413,11 @@ class OasisProfileGenerator: if attrs: context_parts.append("### 实体属性\n" + "\n".join(attrs)) - # 2. 添加相关边信息(事实/关系) + # 2. Related edges (facts / relationships). existing_facts = set() if entity.related_edges: relationships = [] - for edge in entity.related_edges: # 不限制数量 + for edge in entity.related_edges: # No cap on count. fact = edge.get("fact", "") edge_name = edge.get("edge_name", "") direction = edge.get("direction", "") @@ -442,15 +434,15 @@ class OasisProfileGenerator: if relationships: context_parts.append("### 相关事实和关系\n" + "\n".join(relationships)) - # 3. 添加关联节点的详细信息 + # 3. Detailed information for related nodes. if entity.related_nodes: related_info = [] - for node in entity.related_nodes: # 不限制数量 + for node in entity.related_nodes: # No cap on count. node_name = node.get("name", "") node_labels = node.get("labels", []) node_summary = node.get("summary", "") - - # 过滤掉默认标签 + + # Drop the default labels added by the graph store. custom_labels = [l for l in node_labels if l not in ["Entity", "Node"]] label_str = f" ({', '.join(custom_labels)})" if custom_labels else "" @@ -462,11 +454,11 @@ class OasisProfileGenerator: if related_info: context_parts.append("### 关联实体信息\n" + "\n".join(related_info)) - # 4. 使用Zep混合检索获取更丰富的信息 + # 4. Augment with Zep hybrid retrieval. zep_results = self._search_zep_for_entity(entity) - + if zep_results.get("facts"): - # 去重:排除已存在的事实 + # Deduplicate against already-known facts. new_facts = [f for f in zep_results["facts"] if f not in existing_facts] if new_facts: context_parts.append("### Zep检索到的事实信息\n" + "\n".join(f"- {f}" for f in new_facts[:15])) @@ -477,11 +469,11 @@ class OasisProfileGenerator: return "\n\n".join(context_parts) def _is_individual_entity(self, entity_type: str) -> bool: - """判断是否是个人类型实体""" + """Return True if the entity type represents an individual.""" return entity_type.lower() in self.INDIVIDUAL_ENTITY_TYPES - + def _is_group_entity(self, entity_type: str) -> bool: - """判断是否是群体/机构类型实体""" + """Return True if the entity type represents a group or institution.""" return entity_type.lower() in self.GROUP_ENTITY_TYPES def _generate_profile_with_llm( @@ -492,14 +484,13 @@ class OasisProfileGenerator: entity_attributes: Dict[str, Any], context: str ) -> Dict[str, Any]: + """Generate a highly detailed persona using the LLM. + + Branches on entity type: + - Individual entities: produces a concrete persona for a person. + - Group / institution entities: produces a representative-account persona. """ - 使用LLM生成非常详细的人设 - - 根据实体类型区分: - - 个人实体:生成具体的人物设定 - - 群体/机构实体:生成代表性账号设定 - """ - + is_individual = self._is_individual_entity(entity_type) if is_individual: @@ -511,7 +502,7 @@ class OasisProfileGenerator: entity_name, entity_type, entity_summary, entity_attributes, context ) - # 尝试多次生成,直到成功或达到最大重试次数 + # Retry generation up to max_attempts times. max_attempts = 3 last_error = None @@ -524,23 +515,23 @@ class OasisProfileGenerator: {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # Lower the temperature on each retry. + # No max_tokens cap so the LLM can produce a full persona. ) - + content = response.choices[0].message.content - - # 检查是否被截断(finish_reason不是'stop') + + # Detect truncation (finish_reason other than 'stop'). finish_reason = response.choices[0].finish_reason if finish_reason == 'length': logger.warning(t("log.profile_generator.m009", attempt=attempt + 1)) content = self._fix_truncated_json(content) - # 尝试解析JSON + # Parse the JSON payload. try: result = json.loads(content) - - # 验证必需字段 + + # Backfill required fields when missing. if "bio" not in result or not result["bio"]: result["bio"] = entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}" if "persona" not in result or not result["persona"]: @@ -550,8 +541,8 @@ class OasisProfileGenerator: except json.JSONDecodeError as je: logger.warning(t("log.profile_generator.m010", attempt=attempt + 1, str=str(je)[:80])) - - # 尝试修复JSON + + # Attempt to repair the JSON. result = self._try_fix_json(content, entity_name, entity_type, entity_summary) if result.get("_fixed"): del result["_fixed"] @@ -563,7 +554,7 @@ class OasisProfileGenerator: logger.warning(t("log.profile_generator.m011", attempt=attempt + 1, str=str(e)[:80])) last_error = e import time - time.sleep(1 * (attempt + 1)) # 指数退避 + time.sleep(1 * (attempt + 1)) # Exponential backoff. logger.warning(t("log.profile_generator.m012", max_attempts=max_attempts, last_error=last_error)) return self._generate_profile_rule_based( @@ -571,79 +562,78 @@ class OasisProfileGenerator: ) def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON(输出被max_tokens限制截断)""" + """Repair JSON output truncated by a max_tokens limit.""" import re - - # 如果JSON被截断,尝试闭合它 + + # Trim whitespace before closing the structure. content = content.strip() - - # 计算未闭合的括号 + + # Count unbalanced brackets and braces. open_braces = content.count('{') - content.count('}') open_brackets = content.count('[') - content.count(']') - - # 检查是否有未闭合的字符串 - # 简单检查:如果最后一个引号后没有逗号或闭合括号,可能是字符串被截断 + + # Heuristic: if the last char is not a quote, comma, or closing bracket, + # the trailing string value was likely truncated mid-token. if content and content[-1] not in '",}]': - # 尝试闭合字符串 + # Close the dangling string. content += '"' - - # 闭合括号 + + # Close any open brackets and braces. content += ']' * open_brackets content += '}' * open_braces - + return content def _try_fix_json(self, content: str, entity_name: str, entity_type: str, entity_summary: str = "") -> Dict[str, Any]: - """尝试修复损坏的JSON""" + """Best-effort repair of damaged JSON output.""" import re - - # 1. 首先尝试修复被截断的情况 + + # 1. Repair truncation first. content = self._fix_truncated_json(content) - - # 2. 尝试提取JSON部分 + + # 2. Extract the JSON object span. json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - - # 3. 处理字符串中的换行符问题 - # 找到所有字符串值并替换其中的换行符 + + # 3. Fix newlines inside string values. def fix_string_newlines(match): s = match.group(0) - # 替换字符串内的实际换行符为空格 + # Replace literal newlines inside string values with spaces. s = s.replace('\n', ' ').replace('\r', ' ') - # 替换多余空格 + # Collapse runs of whitespace. s = re.sub(r'\s+', ' ', s) return s - - # 匹配JSON字符串值 + + # Match JSON string values. json_str = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', fix_string_newlines, json_str) - - # 4. 尝试解析 + + # 4. Try to parse. try: result = json.loads(json_str) result["_fixed"] = True return result except json.JSONDecodeError as e: - # 5. 如果还是失败,尝试更激进的修复 + # 5. Fall back to a more aggressive repair pass. try: - # 移除所有控制字符 + # Strip control characters. json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) - # 替换所有连续空白 + # Collapse all consecutive whitespace. json_str = re.sub(r'\s+', ' ', json_str) result = json.loads(json_str) result["_fixed"] = True return result except: pass - - # 6. 尝试从内容中提取部分信息 + + # 6. Last resort: scrape partial fields out of the content. bio_match = re.search(r'"bio"\s*:\s*"([^"]*)"', content) - persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # 可能被截断 + persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # May be truncated. bio = bio_match.group(1) if bio_match else (entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}") persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name}是一个{entity_type}。") - # 如果提取到了有意义的内容,标记为已修复 + # If we recovered something meaningful, mark the result as fixed. if bio_match or persona_match: logger.info(t("log.profile_generator.m013")) return { @@ -652,7 +642,7 @@ class OasisProfileGenerator: "_fixed": True } - # 7. 完全失败,返回基础结构 + # 7. Total failure: return a minimal fallback structure. logger.warning(t("log.profile_generator.m014")) return { "bio": entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}", @@ -660,7 +650,7 @@ class OasisProfileGenerator: } def _get_system_prompt(self, is_individual: bool) -> str: - """获取系统提示词""" + """Return the system prompt for persona generation.""" base_prompt = "You are an expert in social-media user-persona generation. Produce detailed, realistic personas for opinion simulation that faithfully reflect existing real-world conditions. You MUST return valid JSON; no string value may contain unescaped newlines." return f"{base_prompt}\n\n{get_language_instruction()}" @@ -672,7 +662,7 @@ class OasisProfileGenerator: entity_attributes: Dict[str, Any], context: str ) -> str: - """构建个人实体的详细人设提示词""" + """Build the detailed persona prompt for an individual entity.""" attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" context_str = context[:3000] if context else "No additional context" @@ -721,7 +711,7 @@ Important: entity_attributes: Dict[str, Any], context: str ) -> str: - """构建群体/机构实体的详细人设提示词""" + """Build the detailed persona prompt for a group or institution entity.""" attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" context_str = context[:3000] if context else "No additional context" @@ -768,9 +758,9 @@ Important: entity_summary: str, entity_attributes: Dict[str, Any] ) -> Dict[str, Any]: - """使用规则生成基础人设""" - - # 根据实体类型生成不同的人设 + """Rule-based fallback that generates a basic persona.""" + + # Branch on entity type to pick a persona shape. entity_type_lower = entity_type.lower() if entity_type_lower in ["student", "alumni"]: @@ -822,7 +812,7 @@ Important: } else: - # 默认人设 + # Default persona for unrecognised entity types. return { "bio": entity_summary[:150] if entity_summary else f"{entity_type}: {entity_name}", "persona": entity_summary or f"{entity_name} is a {entity_type.lower()} participating in social discussions.", @@ -835,7 +825,7 @@ Important: } def set_graph_id(self, graph_id: str): - """设置图谱ID用于Zep检索""" + """Set the graph id used for Zep retrieval.""" self.graph_id = graph_id def generate_profiles_from_entities( @@ -848,53 +838,51 @@ Important: realtime_output_path: Optional[str] = None, output_platform: str = "reddit" ) -> List[OasisAgentProfile]: - """ - 批量从实体生成Agent Profile(支持并行生成) - + """Batch-generate Agent Profiles from entities (in parallel). + Args: - entities: 实体列表 - use_llm: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (current, total, message) - graph_id: 图谱ID,用于Zep检索获取更丰富上下文 - parallel_count: 并行生成数量,默认5 - realtime_output_path: 实时写入的文件路径(如果提供,每生成一个就写入一次) - output_platform: 输出平台格式 ("reddit" 或 "twitter") - + entities: The entities to convert. + use_llm: Whether to use the LLM to generate detailed personas. + progress_callback: Progress callback ``(current, total, message)``. + graph_id: Graph id used for Zep retrieval to gather richer context. + parallel_count: Number of profiles to generate concurrently (default 5). + realtime_output_path: If set, profiles are flushed to this path after + each successful generation. + output_platform: Output platform format, ``"reddit"`` or ``"twitter"``. + Returns: - Agent Profile列表 + The generated list of Agent Profiles. """ import concurrent.futures from threading import Lock - # 设置graph_id用于Zep检索 + # Set the graph id used for Zep retrieval. if graph_id: self.graph_id = graph_id - + total = len(entities) - profiles = [None] * total # 预分配列表保持顺序 - completed_count = [0] # 使用列表以便在闭包中修改 + profiles = [None] * total # Preallocate to keep insertion order. + completed_count = [0] # List wrapper so closures can mutate the count. lock = Lock() - - # 实时写入文件的辅助函数 + def save_profiles_realtime(): - """实时保存已生成的 profiles 到文件""" + """Flush the profiles generated so far to ``realtime_output_path``.""" if not realtime_output_path: return with lock: - # 过滤出已生成的 profiles existing_profiles = [p for p in profiles if p is not None] if not existing_profiles: return - + try: if output_platform == "reddit": - # Reddit JSON 格式 + # Reddit JSON format. profiles_data = [p.to_reddit_format() for p in existing_profiles] with open(realtime_output_path, 'w', encoding='utf-8') as f: json.dump(profiles_data, f, ensure_ascii=False, indent=2) else: - # Twitter CSV 格式 + # Twitter CSV format. import csv profiles_data = [p.to_twitter_format() for p in existing_profiles] if profiles_data: @@ -910,7 +898,7 @@ Important: current_locale = get_locale() def generate_single_profile(idx: int, entity: EntityNode) -> tuple: - """生成单个profile的工作函数""" + """Worker function that generates a single profile.""" set_locale(current_locale) entity_type = entity.get_entity_type() or "Entity" @@ -921,14 +909,14 @@ Important: use_llm=use_llm ) - # 实时输出生成的人设到控制台和日志 + # Stream the generated persona to the console and log. self._print_generated_profile(entity.name, entity_type, profile) return idx, profile, None except Exception as e: logger.error(t("log.profile_generator.m016", entity=entity.name, str=str(e))) - # 创建一个基础profile + # Build a minimal fallback profile. fallback_profile = OasisAgentProfile( user_id=idx, user_name=self._generate_username(entity.name), @@ -945,15 +933,13 @@ Important: print(t("log.profile_generator.m024", total=total, parallel_count=parallel_count)) print(f"{'='*60}\n") - # 使用线程池并行执行 + # Run generation across a thread pool. with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_count) as executor: - # 提交所有任务 future_to_entity = { executor.submit(generate_single_profile, idx, entity): (idx, entity) for idx, entity in enumerate(entities) } - - # 收集结果 + for future in concurrent.futures.as_completed(future_to_entity): idx, entity = future_to_entity[future] entity_type = entity.get_entity_type() or "Entity" @@ -966,9 +952,9 @@ Important: completed_count[0] += 1 current = completed_count[0] - # 实时写入文件 + # Flush profiles to disk in real time. save_profiles_realtime() - + if progress_callback: progress_callback( current, @@ -994,7 +980,7 @@ Important: source_entity_uuid=entity.uuid, source_entity_type=entity_type, ) - # 实时写入文件(即使是备用人设) + # Flush profiles to disk even when only the fallback was produced. save_profiles_realtime() print(f"\n{'='*60}") @@ -1004,10 +990,10 @@ Important: return profiles def _print_generated_profile(self, entity_name: str, entity_type: str, profile: OasisAgentProfile): - """实时输出生成的人设到控制台(完整内容,不截断)""" + """Stream the generated persona to the console (full content, untruncated).""" separator = "-" * 70 - - # 构建完整输出内容(不截断) + + # Assemble the full output (no truncation). topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else '无' output_lines = [ @@ -1031,7 +1017,8 @@ Important: output = "\n".join(output_lines) - # 只输出到控制台(避免重复,logger不再输出完整内容) + # Print to the console only — the logger no longer emits the full content + # to avoid duplicate output. print(output) def save_profiles( @@ -1040,17 +1027,16 @@ Important: file_path: str, platform: str = "reddit" ): - """ - 保存Profile到文件(根据平台选择正确格式) - - OASIS平台格式要求: - - Twitter: CSV格式 - - Reddit: JSON格式 - + """Save profiles to a file using the platform-specific format. + + OASIS format requirements: + - Twitter: CSV format. + - Reddit: JSON format. + Args: - profiles: Profile列表 - file_path: 文件路径 - platform: 平台类型 ("reddit" 或 "twitter") + profiles: The profiles to save. + file_path: Destination file path. + platform: Platform type, ``"reddit"`` or ``"twitter"``. """ if platform == "twitter": self._save_twitter_csv(profiles, file_path) @@ -1058,74 +1044,73 @@ Important: self._save_reddit_json(profiles, file_path) def _save_twitter_csv(self, profiles: List[OasisAgentProfile], file_path: str): - """ - 保存Twitter Profile为CSV格式(符合OASIS官方要求) - - OASIS Twitter要求的CSV字段: - - user_id: 用户ID(根据CSV顺序从0开始) - - name: 用户真实姓名 - - username: 系统中的用户名 - - user_char: 详细人设描述(注入到LLM系统提示中,指导Agent行为) - - description: 简短的公开简介(显示在用户资料页面) - - user_char vs description 区别: - - user_char: 内部使用,LLM系统提示,决定Agent如何思考和行动 - - description: 外部显示,其他用户可见的简介 + """Save Twitter profiles as CSV (matches OASIS's official format). + + Required CSV fields for OASIS Twitter: + - user_id: User id (zero-indexed by CSV row order). + - name: User's real-world display name. + - username: System username. + - user_char: Detailed persona text injected into the LLM system prompt + to drive agent behavior. + - description: Short public bio shown on the profile page. + + ``user_char`` vs ``description``: + - user_char: Internal — LLM system prompt that controls how the agent + thinks and acts. + - description: External — short bio visible to other users. """ import csv - - # 确保文件扩展名是.csv + + # Ensure the file extension is .csv. if not file_path.endswith('.csv'): file_path = file_path.replace('.json', '.csv') - + with open(file_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) - - # 写入OASIS要求的表头 + + # Write the OASIS-required header row. headers = ['user_id', 'name', 'username', 'user_char', 'description'] writer.writerow(headers) - - # 写入数据行 + for idx, profile in enumerate(profiles): - # user_char: 完整人设(bio + persona),用于LLM系统提示 + # user_char: full persona (bio + persona), used in the LLM system prompt. user_char = profile.bio if profile.persona and profile.persona != profile.bio: user_char = f"{profile.bio} {profile.persona}" - # 处理换行符(CSV中用空格替代) + # Replace newlines with spaces for CSV compatibility. user_char = user_char.replace('\n', ' ').replace('\r', ' ') - - # description: 简短简介,用于外部显示 + + # description: short bio used for external display. description = profile.bio.replace('\n', ' ').replace('\r', ' ') - + row = [ - idx, # user_id: 从0开始的顺序ID - profile.name, # name: 真实姓名 - profile.user_name, # username: 用户名 - user_char, # user_char: 完整人设(内部LLM使用) - description # description: 简短简介(外部显示) + idx, # user_id: zero-based sequential id + profile.name, # name: real-world display name + profile.user_name, # username: system username + user_char, # user_char: full persona (internal LLM use) + description # description: short bio (external display) ] writer.writerow(row) logger.info(t("log.profile_generator.m021", len=len(profiles), file_path=file_path)) def _normalize_gender(self, gender: Optional[str]) -> str: - """ - 标准化gender字段为OASIS要求的英文格式 - - OASIS要求: male, female, other + """Normalize the gender field into the English form required by OASIS. + + OASIS requires one of: ``male``, ``female``, ``other``. """ if not gender: return "other" - + gender_lower = gender.lower().strip() - - # 中文映射 + + # Mapping from Chinese values to the English literals. gender_map = { "男": "male", "女": "female", "机构": "other", "其他": "other", - # 英文已有 + # Already in English — pass through. "male": "male", "female": "female", "other": "other", @@ -1134,42 +1119,43 @@ Important: return gender_map.get(gender_lower, "other") def _save_reddit_json(self, profiles: List[OasisAgentProfile], file_path: str): - """ - 保存Reddit Profile为JSON格式 - - 使用与 to_reddit_format() 一致的格式,确保 OASIS 能正确读取。 - 必须包含 user_id 字段,这是 OASIS agent_graph.get_agent() 匹配的关键! - - 必需字段: - - user_id: 用户ID(整数,用于匹配 initial_posts 中的 poster_agent_id) - - username: 用户名 - - name: 显示名称 - - bio: 简介 - - persona: 详细人设 - - age: 年龄(整数) - - gender: "male", "female", 或 "other" - - mbti: MBTI类型 - - country: 国家 + """Save Reddit profiles as JSON. + + Uses the same shape as ``to_reddit_format()`` to ensure OASIS can read + the file. The ``user_id`` field is mandatory — it is what + ``agent_graph.get_agent()`` matches against. + + Required fields: + - user_id: User id (integer; matches ``poster_agent_id`` in + ``initial_posts``). + - username: System username. + - name: Display name. + - bio: Short bio. + - persona: Detailed persona. + - age: Age (integer). + - gender: One of ``"male"``, ``"female"``, ``"other"``. + - mbti: MBTI type. + - country: Country. """ data = [] for idx, profile in enumerate(profiles): - # 使用与 to_reddit_format() 一致的格式 + # Match the shape of to_reddit_format(). item = { - "user_id": profile.user_id if profile.user_id is not None else idx, # 关键:必须包含 user_id + "user_id": profile.user_id if profile.user_id is not None else idx, # Critical: must include user_id. "username": profile.user_name, "name": profile.name, "bio": profile.bio[:150] if profile.bio else f"{profile.name}", "persona": profile.persona or f"{profile.name} is a participant in social discussions.", "karma": profile.karma if profile.karma else 1000, "created_at": profile.created_at, - # OASIS必需字段 - 确保都有默认值 + # OASIS-required fields — make sure each has a default. "age": profile.age if profile.age else 30, "gender": self._normalize_gender(profile.gender), "mbti": profile.mbti if profile.mbti else "ISTJ", "country": profile.country if profile.country else "中国", } - - # 可选字段 + + # Optional fields. if profile.profession: item["profession"] = profile.profession if profile.interested_topics: @@ -1182,14 +1168,14 @@ Important: logger.info(t("log.profile_generator.m022", len=len(profiles), file_path=file_path)) - # 保留旧方法名作为别名,保持向后兼容 + # Retained as an alias for the old method name (backwards compatibility). def save_profiles_to_json( self, profiles: List[OasisAgentProfile], file_path: str, platform: str = "reddit" ): - """[已废弃] 请使用 save_profiles() 方法""" + """[Deprecated] Use ``save_profiles()`` instead.""" logger.warning(t("log.profile_generator.m023")) self.save_profiles(profiles, file_path, platform) diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index e6b4a234..0b995554 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -1,6 +1,7 @@ -""" -本体生成服务 -接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 +"""Ontology generation service. + +Pipeline step 1: analyze the source text and propose entity and relationship +types that fit a social-media opinion simulation. """ import json @@ -14,19 +15,19 @@ logger = logging.getLogger(__name__) def _to_pascal_case(name: str) -> str: - """将任意格式的名称转换为 PascalCase(如 'works_for' -> 'WorksFor', 'person' -> 'Person')""" - # 按非字母数字字符分割 + """Convert an arbitrary identifier to PascalCase (e.g. ``works_for`` -> ``WorksFor``).""" + # Split on non-alphanumeric separators first. parts = re.split(r'[^a-zA-Z0-9]+', name) - # 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case']) + # Then split on camelCase boundaries (e.g. ``camelCase`` -> ``['camel', 'Case']``). words = [] for part in parts: words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_')) - # 每个词首字母大写,过滤空串 + # Title-case each non-empty word and concatenate. result = ''.join(word.capitalize() for word in words if word) return result if result else 'Unknown' -# 本体生成的系统提示词 +# System prompt template for ontology generation. ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge-graph ontology designer. Your task is to analyze the supplied text and simulation requirement and design entity types and relationship types suitable for a **social-media public-opinion simulation**. **Important: you must output valid JSON data and nothing else.** @@ -174,10 +175,7 @@ B. **Concrete types (8 entries, designed from the text content)**: class OntologyGenerator: - """ - 本体生成器 - 分析文本内容,生成实体和关系类型定义 - """ + """Generate an entity- and edge-type ontology from arbitrary input text.""" def __init__(self, llm_client: Optional[LLMClient] = None): self.llm_client = llm_client or LLMClient() @@ -188,18 +186,17 @@ class OntologyGenerator: simulation_requirement: str, additional_context: Optional[str] = None ) -> Dict[str, Any]: - """ - 生成本体定义 - + """Generate an ontology definition. + Args: - document_texts: 文档文本列表 - simulation_requirement: 模拟需求描述 - additional_context: 额外上下文 - + document_texts: Source document text segments. + simulation_requirement: Description of the simulation goal. + additional_context: Optional supplemental context. + Returns: - 本体定义(entity_types, edge_types等) + The ontology dict with ``entity_types``, ``edge_types``, and a summary. """ - # 构建用户消息 + # Compose the user message that frames the LLM request. user_message = self._build_user_message( document_texts, simulation_requirement, @@ -213,19 +210,19 @@ class OntologyGenerator: {"role": "user", "content": user_message} ] - # 调用LLM + # Invoke the LLM. result = self.llm_client.chat_json( messages=messages, temperature=0.3, max_tokens=4096 ) - # 验证和后处理 + # Validate the LLM response and post-process it. result = self._validate_and_process(result) return result - # 传给 LLM 的文本最大长度(5万字) + # Maximum length of source text passed to the LLM (50k characters). MAX_TEXT_LENGTH_FOR_LLM = 50000 def _build_user_message( @@ -234,13 +231,14 @@ class OntologyGenerator: simulation_requirement: str, additional_context: Optional[str] ) -> str: - """构建用户消息""" - - # 合并文本 + """Build the user-message string for the ontology LLM call.""" + + # Concatenate the source documents into a single string. combined_text = "\n\n---\n\n".join(document_texts) original_length = len(combined_text) - - # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) + + # If the combined text exceeds the LLM input cap, truncate it for the + # LLM call only. The full text is still used for graph construction. if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] combined_text += f"\n\n...(original text is {original_length} characters; only the first {self.MAX_TEXT_LENGTH_FOR_LLM} characters were used for ontology analysis)..." @@ -275,9 +273,9 @@ Based on the content above, design entity types and relationship types suitable return message def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: - """验证和后处理结果""" - - # 确保必要字段存在 + """Validate and post-process the LLM-generated ontology dict.""" + + # Ensure required top-level fields exist. if "entity_types" not in result: result["entity_types"] = [] if "edge_types" not in result: @@ -285,11 +283,12 @@ Based on the content above, design entity types and relationship types suitable if "analysis_summary" not in result: result["analysis_summary"] = "" - # 验证实体类型 - # 记录原始名称到 PascalCase 的映射,用于后续修正 edge 的 source_targets 引用 + # Validate entity types. + # Track original-name -> PascalCase mapping so edge source_targets + # references can be fixed up consistently below. entity_name_map = {} for entity in result["entity_types"]: - # 强制将 entity name 转为 PascalCase(Zep API 要求) + # Force entity names to PascalCase (required by the Zep API). if "name" in entity: original_name = entity["name"] entity["name"] = _to_pascal_case(original_name) @@ -300,19 +299,20 @@ Based on the content above, design entity types and relationship types suitable entity["attributes"] = [] if "examples" not in entity: entity["examples"] = [] - # 确保description不超过100字符 + # Truncate descriptions longer than 100 characters. if len(entity.get("description", "")) > 100: entity["description"] = entity["description"][:97] + "..." - - # 验证关系类型 + + # Validate edge types. for edge in result["edge_types"]: - # 强制将 edge name 转为 SCREAMING_SNAKE_CASE(Zep API 要求) + # Force edge names to SCREAMING_SNAKE_CASE (required by the Zep API). if "name" in edge: original_name = edge["name"] edge["name"] = original_name.upper() if edge["name"] != original_name: logger.warning(f"Edge type name '{original_name}' auto-converted to '{edge['name']}'") - # 修正 source_targets 中的实体名称引用,与转换后的 PascalCase 保持一致 + # Rewrite source_targets entity-name references to match the + # PascalCase-normalized entity names. for st in edge.get("source_targets", []): if st.get("source") in entity_name_map: st["source"] = entity_name_map[st["source"]] @@ -325,11 +325,11 @@ Based on the content above, design entity types and relationship types suitable if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." - # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 + # Zep API caps: at most 10 custom entity types and 10 custom edge types. MAX_ENTITY_TYPES = 10 MAX_EDGE_TYPES = 10 - # 去重:按 name 去重,保留首次出现的 + # Deduplicate by name, keeping the first occurrence. seen_names = set() deduped = [] for entity in result["entity_types"]: @@ -341,7 +341,7 @@ Based on the content above, design entity types and relationship types suitable logger.warning(f"Duplicate entity type '{name}' removed during validation") result["entity_types"] = deduped - # 兜底类型定义 + # Fallback entity-type definitions used when the LLM omits them. person_fallback = { "name": "Person", "description": "Any individual person not fitting other specific person types.", @@ -362,33 +362,31 @@ Based on the content above, design entity types and relationship types suitable "examples": ["small business", "community group"] } - # 检查是否已有兜底类型 + # Check whether the fallback types are already present. entity_names = {e["name"] for e in result["entity_types"]} has_person = "Person" in entity_names has_organization = "Organization" in entity_names - - # 需要添加的兜底类型 + + # Collect missing fallback types to add below. fallbacks_to_add = [] if not has_person: fallbacks_to_add.append(person_fallback) if not has_organization: fallbacks_to_add.append(organization_fallback) - + if fallbacks_to_add: current_count = len(result["entity_types"]) needed_slots = len(fallbacks_to_add) - - # 如果添加后会超过 10 个,需要移除一些现有类型 + + # If adding the fallbacks would exceed the cap, drop some existing types. if current_count + needed_slots > MAX_ENTITY_TYPES: - # 计算需要移除多少个 to_remove = current_count + needed_slots - MAX_ENTITY_TYPES - # 从末尾移除(保留前面更重要的具体类型) + # Drop trailing types first; the more specific types come earlier. result["entity_types"] = result["entity_types"][:-to_remove] - - # 添加兜底类型 + result["entity_types"].extend(fallbacks_to_add) - - # 最终确保不超过限制(防御性编程) + + # Defensive cap enforcement: hard-trim if anything slipped through. if len(result["entity_types"]) > MAX_ENTITY_TYPES: result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES] @@ -398,14 +396,13 @@ Based on the content above, design entity types and relationship types suitable return result def generate_python_code(self, ontology: Dict[str, Any]) -> str: - """ - 将本体定义转换为Python代码(类似ontology.py) - + """Render the ontology definition as Python source code. + Args: - ontology: 本体定义 - + ontology: Ontology definition dict. + Returns: - Python代码字符串 + Python source code as a single string. """ code_lines = [ '"""', @@ -421,7 +418,7 @@ Based on the content above, design entity types and relationship types suitable '', ] - # 生成实体类型 + # Emit each entity type as a Python class. for entity in ontology.get("entity_types", []): name = entity["name"] desc = entity.get("description", f"A {name} entity.") @@ -447,10 +444,10 @@ Based on the content above, design entity types and relationship types suitable code_lines.append('# ============== 关系类型定义 ==============') code_lines.append('') - # 生成关系类型 + # Emit each edge type as a Python class. for edge in ontology.get("edge_types", []): name = edge["name"] - # 转换为PascalCase类名 + # Convert SCREAMING_SNAKE_CASE -> PascalCase for the class name. class_name = ''.join(word.capitalize() for word in name.split('_')) desc = edge.get("description", f"A {name} relationship.") @@ -472,7 +469,7 @@ Based on the content above, design entity types and relationship types suitable code_lines.append('') code_lines.append('') - # 生成类型字典 + # Emit the type registries. code_lines.append('# ============== 类型配置 ==============') code_lines.append('') code_lines.append('ENTITY_TYPES = {') @@ -489,7 +486,7 @@ Based on the content above, design entity types and relationship types suitable code_lines.append('}') code_lines.append('') - # 生成边的source_targets映射 + # Emit the edge source_targets map. code_lines.append('EDGE_SOURCE_TARGETS = {') for edge in ontology.get("edge_types", []): name = edge["name"] diff --git a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py index ddba4e9d..db0ddd50 100644 --- a/backend/app/services/report_agent.py +++ b/backend/app/services/report_agent.py @@ -1,12 +1,13 @@ """ -Report Agent服务 -使用LangChain + Zep实现ReACT模式的模拟报告生成 +Report Agent service. -功能: -1. 根据模拟需求和Zep图谱信息生成报告 -2. 先规划目录结构,然后分段生成 -3. 每段采用ReACT多轮思考与反思模式 -4. 支持与用户对话,在对话中自主调用检索工具 +Implements ReACT-style simulation report generation using LangChain + Zep. + +Features: +1. Generate a report from the simulation requirement and the Zep knowledge graph. +2. Plan the table of contents first, then generate one section at a time. +3. Each section uses a ReACT multi-round thought and reflection loop. +4. Support a chat mode that can autonomously invoke retrieval tools. """ import os @@ -35,18 +36,19 @@ logger = get_logger('mirofish.report_agent') class ReportLogger: """ - Report Agent 详细日志记录器 - - 在报告文件夹中生成 agent_log.jsonl 文件,记录每一步详细动作。 - 每行是一个完整的 JSON 对象,包含时间戳、动作类型、详细内容等。 + Detailed log recorder for the Report Agent. + + Writes an ``agent_log.jsonl`` file inside the report folder that captures every + step of agent activity. Each line is a complete JSON object containing a + timestamp, the action type, and the detailed payload. """ - + def __init__(self, report_id: str): """ - 初始化日志记录器 - + Initialize the log recorder. + Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID used to determine the log file path. """ self.report_id = report_id self.log_file_path = os.path.join( @@ -56,12 +58,12 @@ class ReportLogger: self._ensure_log_file() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure the directory for the log file exists.""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) - + def _get_elapsed_time(self) -> float: - """获取从开始到现在的耗时(秒)""" + """Return the elapsed time in seconds since start.""" return (datetime.now() - self.start_time).total_seconds() def log( @@ -73,14 +75,15 @@ class ReportLogger: section_index: int = None ): """ - 记录一条日志 - + Record a single log entry. + Args: - action: 动作类型,如 'start', 'tool_call', 'llm_response', 'section_complete' 等 - stage: 当前阶段,如 'planning', 'generating', 'completed' - details: 详细内容字典,不截断 - section_title: 当前章节标题(可选) - section_index: 当前章节索引(可选) + action: Action type, e.g. ``"start"``, ``"tool_call"``, ``"llm_response"``, + ``"section_complete"``, etc. + stage: Current stage, e.g. ``"planning"``, ``"generating"``, ``"completed"``. + details: Detail payload dict; never truncated. + section_title: Title of the current section (optional). + section_index: Index of the current section (optional). """ log_entry = { "timestamp": datetime.now().isoformat(), @@ -93,12 +96,11 @@ class ReportLogger: "details": details } - # 追加写入 JSONL 文件 with open(self.log_file_path, 'a', encoding='utf-8') as f: f.write(json.dumps(log_entry, ensure_ascii=False) + '\n') def log_start(self, simulation_id: str, graph_id: str, simulation_requirement: str): - """记录报告生成开始""" + """Record the start of a report generation run.""" self.log( action="report_start", stage="pending", @@ -111,7 +113,7 @@ class ReportLogger: ) def log_planning_start(self): - """记录大纲规划开始""" + """Record the start of outline planning.""" self.log( action="planning_start", stage="planning", @@ -119,7 +121,7 @@ class ReportLogger: ) def log_planning_context(self, context: Dict[str, Any]): - """记录规划时获取的上下文信息""" + """Record the context retrieved during planning.""" self.log( action="planning_context", stage="planning", @@ -130,7 +132,7 @@ class ReportLogger: ) def log_planning_complete(self, outline_dict: Dict[str, Any]): - """记录大纲规划完成""" + """Record the completion of outline planning.""" self.log( action="planning_complete", stage="planning", @@ -141,7 +143,7 @@ class ReportLogger: ) def log_section_start(self, section_title: str, section_index: int): - """记录章节生成开始""" + """Record the start of section generation.""" self.log( action="section_start", stage="generating", @@ -151,7 +153,7 @@ class ReportLogger: ) def log_react_thought(self, section_title: str, section_index: int, iteration: int, thought: str): - """记录 ReACT 思考过程""" + """Record a ReACT thought step.""" self.log( action="react_thought", stage="generating", @@ -172,7 +174,7 @@ class ReportLogger: parameters: Dict[str, Any], iteration: int ): - """记录工具调用""" + """Record a tool invocation.""" self.log( action="tool_call", stage="generating", @@ -194,7 +196,7 @@ class ReportLogger: result: str, iteration: int ): - """记录工具调用结果(完整内容,不截断)""" + """Record a tool-call result (full content, never truncated).""" self.log( action="tool_result", stage="generating", @@ -203,7 +205,7 @@ class ReportLogger: details={ "iteration": iteration, "tool_name": tool_name, - "result": result, # 完整结果,不截断 + "result": result, # Full result, no truncation. "result_length": len(result), "message": t('report.toolResult', toolName=tool_name) } @@ -218,7 +220,7 @@ class ReportLogger: has_tool_calls: bool, has_final_answer: bool ): - """记录 LLM 响应(完整内容,不截断)""" + """Record an LLM response (full content, never truncated).""" self.log( action="llm_response", stage="generating", @@ -226,7 +228,7 @@ class ReportLogger: section_index=section_index, details={ "iteration": iteration, - "response": response, # 完整响应,不截断 + "response": response, # Full response, no truncation. "response_length": len(response), "has_tool_calls": has_tool_calls, "has_final_answer": has_final_answer, @@ -241,14 +243,14 @@ class ReportLogger: content: str, tool_calls_count: int ): - """记录章节内容生成完成(仅记录内容,不代表整个章节完成)""" + """Record completion of section-content generation (content only; not full section completion).""" self.log( action="section_content", stage="generating", section_title=section_title, section_index=section_index, details={ - "content": content, # 完整内容,不截断 + "content": content, # Full content, no truncation. "content_length": len(content), "tool_calls_count": tool_calls_count, "message": t('report.sectionContentDone', title=section_title) @@ -262,9 +264,10 @@ class ReportLogger: full_content: str ): """ - 记录章节生成完成 + Record full completion of a section. - 前端应监听此日志来判断一个章节是否真正完成,并获取完整内容 + The frontend should listen for this log entry to detect when a section is + truly finished and to retrieve its full content. """ self.log( action="section_complete", @@ -279,7 +282,7 @@ class ReportLogger: ) def log_report_complete(self, total_sections: int, total_time_seconds: float): - """记录报告生成完成""" + """Record completion of the entire report.""" self.log( action="report_complete", stage="completed", @@ -291,7 +294,7 @@ class ReportLogger: ) def log_error(self, error_message: str, stage: str, section_title: str = None): - """记录错误""" + """Record an error.""" self.log( action="error", stage=stage, @@ -306,18 +309,19 @@ class ReportLogger: class ReportConsoleLogger: """ - Report Agent 控制台日志记录器 - - 将控制台风格的日志(INFO、WARNING等)写入报告文件夹中的 console_log.txt 文件。 - 这些日志与 agent_log.jsonl 不同,是纯文本格式的控制台输出。 + Console-style log recorder for the Report Agent. + + Mirrors console-style log output (INFO, WARNING, etc.) into a + ``console_log.txt`` file in the report folder. These are plain-text console + logs, distinct from the structured ``agent_log.jsonl`` entries. """ - + def __init__(self, report_id: str): """ - 初始化控制台日志记录器 - + Initialize the console log recorder. + Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID used to determine the log file path. """ self.report_id = report_id self.log_file_path = os.path.join( @@ -328,43 +332,41 @@ class ReportConsoleLogger: self._setup_file_handler() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure the directory for the log file exists.""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) - + def _setup_file_handler(self): - """设置文件处理器,将日志同时写入文件""" + """Set up the file handler so log records are also written to disk.""" import logging - - # 创建文件处理器 + self._file_handler = logging.FileHandler( self.log_file_path, mode='a', encoding='utf-8' ) self._file_handler.setLevel(logging.INFO) - - # 使用与控制台相同的简洁格式 + + # Use the same compact format as the console handler. formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) self._file_handler.setFormatter(formatter) - - # 添加到 report_agent 相关的 logger + loggers_to_attach = [ 'mirofish.report_agent', 'mirofish.zep_tools', ] - + for logger_name in loggers_to_attach: target_logger = logging.getLogger(logger_name) - # 避免重复添加 + # Guard against attaching the same handler twice. if self._file_handler not in target_logger.handlers: target_logger.addHandler(self._file_handler) def close(self): - """关闭文件处理器并从 logger 中移除""" + """Close the file handler and detach it from the loggers.""" import logging if self._file_handler: @@ -382,12 +384,12 @@ class ReportConsoleLogger: self._file_handler = None def __del__(self): - """析构时确保关闭文件处理器""" + """Ensure the file handler is closed on destruction.""" self.close() class ReportStatus(str, Enum): - """报告状态""" + """Report status.""" PENDING = "pending" PLANNING = "planning" GENERATING = "generating" @@ -397,7 +399,7 @@ class ReportStatus(str, Enum): @dataclass class ReportSection: - """报告章节""" + """A single report section.""" title: str content: str = "" @@ -408,7 +410,7 @@ class ReportSection: } def to_markdown(self, level: int = 2) -> str: - """转换为Markdown格式""" + """Convert to Markdown format.""" md = f"{'#' * level} {self.title}\n\n" if self.content: md += f"{self.content}\n\n" @@ -417,7 +419,7 @@ class ReportSection: @dataclass class ReportOutline: - """报告大纲""" + """Report outline.""" title: str summary: str sections: List[ReportSection] @@ -430,7 +432,7 @@ class ReportOutline: } def to_markdown(self) -> str: - """转换为Markdown格式""" + """Convert to Markdown format.""" md = f"# {self.title}\n\n" md += f"> {self.summary}\n\n" for section in self.sections: @@ -440,7 +442,7 @@ class ReportOutline: @dataclass class Report: - """完整报告""" + """Full report.""" report_id: str simulation_id: str graph_id: str @@ -468,10 +470,10 @@ class Report: # ═══════════════════════════════════════════════════════════════ -# Prompt 模板常量 +# Prompt template constants # ═══════════════════════════════════════════════════════════════ -# ── 工具描述 ── +# ── Tool descriptions ── TOOL_DESC_INSIGHT_FORGE = """\ [Deep Insight Retrieval — Powerful Analytical Tool] @@ -547,7 +549,7 @@ How it works: [IMPORTANT] A running OASIS simulation environment is required to use this tool!""" -# ── 大纲规划 prompt ── +# ── Outline planning prompt ── PLAN_SYSTEM_PROMPT = """\ You are an expert author of "Future Prediction Reports" with a god's-eye view of the simulated world — you can observe the behavior, statements, and interactions of every agent in the simulation. @@ -610,7 +612,7 @@ Based on these prediction outcomes, design the most appropriate section structur [Reminder] Section count: minimum 2, maximum 5; keep the content tight and focused on the core prediction findings.""" -# ── 章节生成 prompt ── +# ── Section generation prompt ── SECTION_SYSTEM_PROMPT_TEMPLATE = """\ You are an expert author of "Future Prediction Reports" and you are currently writing one section of the report. @@ -791,7 +793,7 @@ Get started: 2. Then call a tool (Action) to retrieve the simulated data. 3. Once you have gathered enough information, output the body prefixed with Final Answer: (plain body, no headings).""" -# ── ReACT 循环内消息模板 ── +# ── In-loop ReACT message templates ── REACT_OBSERVATION_TEMPLATE = """\ Observation (retrieval result): @@ -858,27 +860,29 @@ CHAT_OBSERVATION_SUFFIX = "\n\nPlease answer the question concisely." # ═══════════════════════════════════════════════════════════════ -# ReportAgent 主类 +# ReportAgent main class # ═══════════════════════════════════════════════════════════════ class ReportAgent: """ - Report Agent - 模拟报告生成Agent + Report Agent — simulation report generator. - 采用ReACT(Reasoning + Acting)模式: - 1. 规划阶段:分析模拟需求,规划报告目录结构 - 2. 生成阶段:逐章节生成内容,每章节可多次调用工具获取信息 - 3. 反思阶段:检查内容完整性和准确性 + Uses a ReACT (Reasoning + Acting) loop: + 1. Planning stage: analyze the simulation requirement and plan the report's + table of contents. + 2. Generation stage: generate each section sequentially; each section may + call retrieval tools multiple times. + 3. Reflection stage: verify content completeness and accuracy. """ - - # 最大工具调用次数(每个章节) + + # Per-section maximum number of tool calls. MAX_TOOL_CALLS_PER_SECTION = 5 - - # 最大反思轮数 + + # Maximum number of reflection rounds. MAX_REFLECTION_ROUNDS = 3 - - # 对话中的最大工具调用次数 + + # Maximum number of tool calls allowed in chat mode. MAX_TOOL_CALLS_PER_CHAT = 2 def __init__( @@ -890,14 +894,14 @@ class ReportAgent: zep_tools: Optional[ZepToolsService] = None ): """ - 初始化Report Agent - + Initialize the Report Agent. + Args: - graph_id: 图谱ID - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述 - llm_client: LLM客户端(可选) - zep_tools: Zep工具服务(可选) + graph_id: Graph ID. + simulation_id: Simulation ID. + simulation_requirement: Description of the simulation requirement. + llm_client: Optional LLM client. + zep_tools: Optional Zep tools service. """ self.graph_id = graph_id self.simulation_id = simulation_id @@ -906,18 +910,16 @@ class ReportAgent: self.llm = llm_client or LLMClient() self.zep_tools = zep_tools or ZepToolsService() - # 工具定义 self.tools = self._define_tools() - - # 日志记录器(在 generate_report 中初始化) + + # Loggers are lazily initialized inside generate_report. self.report_logger: Optional[ReportLogger] = None - # 控制台日志记录器(在 generate_report 中初始化) self.console_logger: Optional[ReportConsoleLogger] = None logger.info(t('report.agentInitDone', graphId=graph_id, simulationId=simulation_id)) def _define_tools(self) -> Dict[str, Dict[str, Any]]: - """定义可用工具""" + """Define the tools available to the agent.""" return { "insight_forge": { "name": "insight_forge", @@ -955,15 +957,15 @@ class ReportAgent: def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_context: str = "") -> str: """ - 执行工具调用 - + Execute a tool call. + Args: - tool_name: 工具名称 - parameters: 工具参数 - report_context: 报告上下文(用于InsightForge) - + tool_name: Tool name. + parameters: Tool parameters. + report_context: Report context (used by InsightForge). + Returns: - 工具执行结果(文本格式) + The tool execution result as text. """ logger.info(t('report.executingTool', toolName=tool_name, params=parameters)) @@ -980,7 +982,7 @@ class ReportAgent: return result.to_text() elif tool_name == "panorama_search": - # 广度搜索 - 获取全貌 + # Wide-angle search — get the full picture. query = parameters.get("query", "") include_expired = parameters.get("include_expired", True) if isinstance(include_expired, str): @@ -993,7 +995,7 @@ class ReportAgent: return result.to_text() elif tool_name == "quick_search": - # 简单搜索 - 快速检索 + # Lightweight search — fast retrieval. query = parameters.get("query", "") limit = parameters.get("limit", 10) if isinstance(limit, str): @@ -1006,7 +1008,7 @@ class ReportAgent: return result.to_text() elif tool_name == "interview_agents": - # 深度采访 - 调用真实的OASIS采访API获取模拟Agent的回答(双平台) + # Deep interview — call the real OASIS interview API to query the simulated agents on both platforms. interview_topic = parameters.get("interview_topic", parameters.get("query", "")) max_agents = parameters.get("max_agents", 5) if isinstance(max_agents, str): @@ -1020,10 +1022,10 @@ class ReportAgent: ) return result.to_text() - # ========== 向后兼容的旧工具(内部重定向到新工具) ========== - + # ========== Backward-compatible legacy tools (internally redirect to the new tools). ========== + elif tool_name == "search_graph": - # 重定向到 quick_search + # Redirect to quick_search. logger.info(t('report.redirectToQuickSearch')) return self._execute_tool("quick_search", parameters, report_context) @@ -1040,7 +1042,7 @@ class ReportAgent: return json.dumps(result, ensure_ascii=False, indent=2) elif tool_name == "get_simulation_context": - # 重定向到 insight_forge,因为它更强大 + # Redirect to insight_forge — it's the more powerful tool. logger.info(t('report.redirectToInsightForge')) query = parameters.get("query", self.simulation_requirement) return self._execute_tool("insight_forge", {"query": query}, report_context) @@ -1061,20 +1063,20 @@ class ReportAgent: logger.error(t('report.toolExecFailed', toolName=tool_name, error=str(e))) return f"Tool execution failed: {str(e)}" - # 合法的工具名称集合,用于裸 JSON 兜底解析时校验 + # Set of legal tool names; used to validate naked-JSON fallback parses. VALID_TOOL_NAMES = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: """ - 从LLM响应中解析工具调用 + Parse tool calls from an LLM response. - 支持的格式(按优先级): - 1. {"name": "tool_name", "parameters": {...}} - 2. 裸 JSON(响应整体或单行就是一个工具调用 JSON) + Supported formats (in priority order): + 1. ``{"name": "tool_name", "parameters": {...}}`` + 2. Naked JSON (the whole response, or a single line, is the tool-call JSON). """ tool_calls = [] - # 格式1: XML风格(标准格式) + # Format 1: XML-style (canonical format). xml_pattern = r'\s*(\{.*?\})\s*' for match in re.finditer(xml_pattern, response, re.DOTALL): try: @@ -1086,8 +1088,8 @@ class ReportAgent: if tool_calls: return tool_calls - # 格式2: 兜底 - LLM 直接输出裸 JSON(没包 标签) - # 只在格式1未匹配时尝试,避免误匹配正文中的 JSON + # Format 2: fallback — the LLM emits naked JSON without a wrapper. + # Only tried when format 1 did not match, to avoid mis-matching JSON embedded in body text. stripped = response.strip() if stripped.startswith('{') and stripped.endswith('}'): try: @@ -1098,7 +1100,7 @@ class ReportAgent: except json.JSONDecodeError: pass - # 响应可能包含思考文字 + 裸 JSON,尝试提取最后一个 JSON 对象 + # The response may include reasoning text plus naked JSON; try to extract the trailing JSON object. json_pattern = r'(\{"(?:name|tool)"\s*:.*?\})\s*$' match = re.search(json_pattern, stripped, re.DOTALL) if match: @@ -1112,11 +1114,11 @@ class ReportAgent: return tool_calls def _is_valid_tool_call(self, data: dict) -> bool: - """校验解析出的 JSON 是否是合法的工具调用""" - # 支持 {"name": ..., "parameters": ...} 和 {"tool": ..., "params": ...} 两种键名 + """Check that a parsed JSON object is a valid tool call.""" + # Accept both {"name": ..., "parameters": ...} and {"tool": ..., "params": ...}. tool_name = data.get("name") or data.get("tool") if tool_name and tool_name in self.VALID_TOOL_NAMES: - # 统一键名为 name / parameters + # Normalize the key names to ``name`` / ``parameters``. if "tool" in data: data["name"] = data.pop("tool") if "params" in data and "parameters" not in data: @@ -1125,7 +1127,7 @@ class ReportAgent: return False def _get_tools_description(self) -> str: - """生成工具描述文本""" + """Build the descriptive tool-listing text.""" desc_parts = ["Available tools:"] for name, tool in self.tools.items(): params_desc = ", ".join([f"{k}: {v}" for k, v in tool["parameters"].items()]) @@ -1139,22 +1141,23 @@ class ReportAgent: progress_callback: Optional[Callable] = None ) -> ReportOutline: """ - 规划报告大纲 - - 使用LLM分析模拟需求,规划报告的目录结构 - + Plan the report outline. + + Use the LLM to analyze the simulation requirement and plan the report's + table of contents. + Args: - progress_callback: 进度回调函数 - + progress_callback: Progress callback function. + Returns: - ReportOutline: 报告大纲 + ReportOutline: The report outline. """ logger.info(t('report.startPlanningOutline')) if progress_callback: progress_callback("planning", 0, t('progress.analyzingRequirements')) - # 首先获取模拟上下文 + # First fetch the simulation context. context = self.zep_tools.get_simulation_context( graph_id=self.graph_id, simulation_requirement=self.simulation_requirement @@ -1185,7 +1188,7 @@ class ReportAgent: if progress_callback: progress_callback("planning", 80, t('progress.parsingOutline')) - # 解析大纲 + # Parse the outline. sections = [] for section_data in response.get("sections", []): sections.append(ReportSection( @@ -1207,7 +1210,7 @@ class ReportAgent: except Exception as e: logger.error(t('report.outlinePlanFailed', error=str(e))) - # 返回默认大纲(3个章节,作为fallback) + # Return a default 3-section fallback outline. return ReportOutline( title="Future Prediction Report", summary="Trend and risk analysis grounded in simulation predictions.", @@ -1227,28 +1230,27 @@ class ReportAgent: section_index: int = 0 ) -> str: """ - 使用ReACT模式生成单个章节内容 - - ReACT循环: - 1. Thought(思考)- 分析需要什么信息 - 2. Action(行动)- 调用工具获取信息 - 3. Observation(观察)- 分析工具返回结果 - 4. 重复直到信息足够或达到最大次数 - 5. Final Answer(最终回答)- 生成章节内容 - + Generate a single section's content using the ReACT pattern. + + ReACT loop: + 1. Thought — analyze what information is needed. + 2. Action — call a tool to fetch information. + 3. Observation — analyze the tool result. + 4. Repeat until enough information has been gathered or the cap is hit. + 5. Final Answer — emit the section content. + Args: - section: 要生成的章节 - outline: 完整大纲 - previous_sections: 之前章节的内容(用于保持连贯性) - progress_callback: 进度回调 - section_index: 章节索引(用于日志记录) - + section: The section to generate. + outline: The full outline. + previous_sections: Content of previously generated sections (for continuity). + progress_callback: Progress callback. + section_index: Section index (used for logging). + Returns: - 章节内容(Markdown格式) + The section content in Markdown format. """ logger.info(t('report.reactGenerateSection', title=section.title)) - # 记录章节开始日志 if self.report_logger: self.report_logger.log_section_start(section.title, section_index) @@ -1261,11 +1263,11 @@ class ReportAgent: ) system_prompt = f"{system_prompt}\n\n{get_language_instruction()}" - # 构建用户prompt - 每个已完成章节各传入最大4000字 + # Build the user prompt — pass at most 4000 chars per completed section. if previous_sections: previous_parts = [] for sec in previous_sections: - # 每个章节最多4000字 + # Cap at 4000 chars per section. truncated = sec[:4000] + "..." if len(sec) > 4000 else sec previous_parts.append(truncated) previous_content = "\n\n---\n\n".join(previous_parts) @@ -1282,15 +1284,15 @@ class ReportAgent: {"role": "user", "content": user_prompt} ] - # ReACT循环 + # ReACT loop. tool_calls_count = 0 - max_iterations = 5 # 最大迭代轮数 - min_tool_calls = 3 # 最少工具调用次数 - conflict_retries = 0 # 工具调用与Final Answer同时出现的连续冲突次数 - used_tools = set() # 记录已调用过的工具名 + max_iterations = 5 # Max iteration rounds. + min_tool_calls = 3 # Minimum required tool-call count. + conflict_retries = 0 # Number of consecutive tool-call + Final-Answer conflicts. + used_tools = set() # Tracks the names of tools already invoked. all_tools = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} - # 报告上下文,用于InsightForge的子问题生成 + # Report context, used by InsightForge to drive sub-question generation. report_context = f"Section title: {section.title}\nSimulation requirement: {self.simulation_requirement}" for iteration in range(max_iterations): @@ -1301,32 +1303,31 @@ class ReportAgent: t('progress.deepSearchAndWrite', current=tool_calls_count, max=self.MAX_TOOL_CALLS_PER_SECTION) ) - # 调用LLM response = self.llm.chat( messages=messages, temperature=0.5, max_tokens=4096 ) - # 检查 LLM 返回是否为 None(API 异常或内容为空) + # Guard against a None response (API error or empty content). if response is None: logger.warning(t('report.sectionIterNone', title=section.title, iteration=iteration + 1)) - # 如果还有迭代次数,添加消息并重试 + # If iterations remain, append a nudge and retry. if iteration < max_iterations - 1: messages.append({"role": "assistant", "content": "(empty response)"}) messages.append({"role": "user", "content": "Please continue generating content."}) continue - # 最后一次迭代也返回 None,跳出循环进入强制收尾 + # Last iteration also returned None — break out into the forced wrap-up. break logger.debug(t("log.report_agent.m001", response=response[:200])) - # 解析一次,复用结果 + # Parse once; reuse the result downstream. tool_calls = self._parse_tool_calls(response) has_tool_calls = bool(tool_calls) has_final_answer = "Final Answer:" in response - # ── 冲突处理:LLM 同时输出了工具调用和 Final Answer ── + # ── Conflict handling: LLM produced both a tool call and a Final Answer. ── if has_tool_calls and has_final_answer: conflict_retries += 1 logger.warning( @@ -1334,7 +1335,7 @@ class ReportAgent: ) if conflict_retries <= 2: - # 前两次:丢弃本次响应,要求 LLM 重新回复 + # First two strikes: drop the response and ask the LLM to retry. messages.append({"role": "assistant", "content": response}) messages.append({ "role": "user", @@ -1348,7 +1349,7 @@ class ReportAgent: }) continue else: - # 第三次:降级处理,截断到第一个工具调用,强制执行 + # Third strike: degrade — truncate at the first tool call and execute it. logger.warning( t('report.sectionConflictDowngrade', title=section.title, conflictCount=conflict_retries) ) @@ -1360,7 +1361,6 @@ class ReportAgent: has_final_answer = False conflict_retries = 0 - # 记录 LLM 响应日志 if self.report_logger: self.report_logger.log_llm_response( section_title=section.title, @@ -1371,9 +1371,9 @@ class ReportAgent: has_final_answer=has_final_answer ) - # ── 情况1:LLM 输出了 Final Answer ── + # ── Case 1: LLM produced a Final Answer. ── if has_final_answer: - # 工具调用次数不足,拒绝并要求继续调工具 + # Not enough tool calls yet — refuse and ask the agent to keep retrieving. if tool_calls_count < min_tool_calls: messages.append({"role": "assistant", "content": response}) unused_tools = all_tools - used_tools @@ -1388,7 +1388,7 @@ class ReportAgent: }) continue - # 正常结束 + # Normal termination. final_answer = response.split("Final Answer:")[-1].strip() logger.info(t('report.sectionGenDone', title=section.title, count=tool_calls_count)) @@ -1401,9 +1401,9 @@ class ReportAgent: ) return final_answer - # ── 情况2:LLM 尝试调用工具 ── + # ── Case 2: LLM tried to call a tool. ── if has_tool_calls: - # 工具额度已耗尽 → 明确告知,要求输出 Final Answer + # Tool budget exhausted → tell the agent explicitly and demand a Final Answer. if tool_calls_count >= self.MAX_TOOL_CALLS_PER_SECTION: messages.append({"role": "assistant", "content": response}) messages.append({ @@ -1415,7 +1415,7 @@ class ReportAgent: }) continue - # 只执行第一个工具调用 + # Only execute the first tool call. call = tool_calls[0] if len(tool_calls) > 1: logger.info(t('report.multiToolOnlyFirst', total=len(tool_calls), toolName=call['name'])) @@ -1447,7 +1447,7 @@ class ReportAgent: tool_calls_count += 1 used_tools.add(call['name']) - # 构建未使用工具提示 + # Build the "unused tools" hint. unused_tools = all_tools - used_tools unused_hint = "" if unused_tools and tool_calls_count < self.MAX_TOOL_CALLS_PER_SECTION: @@ -1467,11 +1467,11 @@ class ReportAgent: }) continue - # ── 情况3:既没有工具调用,也没有 Final Answer ── + # ── Case 3: neither a tool call nor a Final Answer. ── messages.append({"role": "assistant", "content": response}) if tool_calls_count < min_tool_calls: - # 工具调用次数不足,推荐未用过的工具 + # Not enough tool calls yet — suggest the unused tools. unused_tools = all_tools - used_tools unused_hint = f"(These tools have not been used yet — try them: {', '.join(unused_tools)})" if unused_tools else "" @@ -1485,8 +1485,8 @@ class ReportAgent: }) continue - # 工具调用已足够,LLM 输出了内容但没带 "Final Answer:" 前缀 - # 直接将这段内容作为最终答案,不再空转 + # Enough tool calls already; the LLM emitted content without the "Final Answer:" prefix. + # Treat the content as the final answer rather than spinning further. logger.info(t('report.sectionNoPrefix', title=section.title, count=tool_calls_count)) final_answer = response.strip() @@ -1499,7 +1499,7 @@ class ReportAgent: ) return final_answer - # 达到最大迭代次数,强制生成内容 + # Reached the iteration cap — force the content out. logger.warning(t('report.sectionMaxIter', title=section.title)) messages.append({"role": "user", "content": REACT_FORCE_FINAL_MSG}) @@ -1509,7 +1509,7 @@ class ReportAgent: max_tokens=4096 ) - # 检查强制收尾时 LLM 返回是否为 None + # Guard against a None response on the forced wrap-up call. if response is None: logger.error(t('report.sectionForceFailed', title=section.title)) final_answer = t('report.sectionGenFailedContent') @@ -1518,7 +1518,6 @@ class ReportAgent: else: final_answer = response - # 记录章节内容生成完成日志 if self.report_logger: self.report_logger.log_section_content( section_title=section.title, @@ -1526,7 +1525,7 @@ class ReportAgent: content=final_answer, tool_calls_count=tool_calls_count ) - + return final_answer def generate_report( @@ -1535,29 +1534,32 @@ class ReportAgent: report_id: Optional[str] = None ) -> Report: """ - 生成完整报告(分章节实时输出) - - 每个章节生成完成后立即保存到文件夹,不需要等待整个报告完成。 - 文件结构: - reports/{report_id}/ - meta.json - 报告元信息 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 - ... - full_report.md - 完整报告 - + Generate the full report, streaming each section out as it finishes. + + Each section is saved to disk as soon as it is generated; the caller does + not have to wait for the whole report to complete. + + File layout:: + + reports/{report_id}/ + meta.json - Report metadata. + outline.json - Report outline. + progress.json - Generation progress. + section_01.md - Section 1. + section_02.md - Section 2. + ... + full_report.md - Full report. + Args: - progress_callback: 进度回调函数 (stage, progress, message) - report_id: 报告ID(可选,如果不传则自动生成) - + progress_callback: Progress callback ``(stage, progress, message)``. + report_id: Optional report ID; auto-generated if not provided. + Returns: - Report: 完整报告 + Report: The completed report object. """ import uuid - # 如果没有传入 report_id,则自动生成 + # Auto-generate a report_id if the caller didn't supply one. if not report_id: report_id = f"report_{uuid.uuid4().hex[:12]}" start_time = datetime.now() @@ -1571,14 +1573,14 @@ class ReportAgent: created_at=datetime.now().isoformat() ) - # 已完成的章节标题列表(用于进度追踪) + # Titles of sections that have already been completed (used for progress tracking). completed_section_titles = [] - + try: - # 初始化:创建报告文件夹并保存初始状态 + # Bootstrap: create the report folder and persist the initial state. ReportManager._ensure_report_folder(report_id) - - # 初始化日志记录器(结构化日志 agent_log.jsonl) + + # Initialize the structured logger (agent_log.jsonl). self.report_logger = ReportLogger(report_id) self.report_logger.log_start( simulation_id=self.simulation_id, @@ -1586,7 +1588,7 @@ class ReportAgent: simulation_requirement=self.simulation_requirement ) - # 初始化控制台日志记录器(console_log.txt) + # Initialize the console logger (console_log.txt). self.console_logger = ReportConsoleLogger(report_id) ReportManager.update_progress( @@ -1595,14 +1597,13 @@ class ReportAgent: ) ReportManager.save_report(report) - # 阶段1: 规划大纲 + # Stage 1: plan the outline. report.status = ReportStatus.PLANNING ReportManager.update_progress( report_id, "planning", 5, t('progress.startPlanningOutline'), completed_sections=[] ) - # 记录规划开始日志 self.report_logger.log_planning_start() if progress_callback: @@ -1614,10 +1615,9 @@ class ReportAgent: ) report.outline = outline - # 记录规划完成日志 self.report_logger.log_planning_complete(outline.to_dict()) - - # 保存大纲到文件 + + # Persist the outline to disk. ReportManager.save_outline(report_id, outline) ReportManager.update_progress( report_id, "planning", 15, t('progress.outlineDone', count=len(outline.sections)), @@ -1627,17 +1627,17 @@ class ReportAgent: logger.info(t('report.outlineSavedToFile', reportId=report_id)) - # 阶段2: 逐章节生成(分章节保存) + # Stage 2: generate the report section by section, saving each as it completes. report.status = ReportStatus.GENERATING - + total_sections = len(outline.sections) - generated_sections = [] # 保存内容用于上下文 + generated_sections = [] # Keep the content around for context. for i, section in enumerate(outline.sections): section_num = i + 1 base_progress = 20 + int((i / total_sections) * 70) - # 更新进度 + # Update progress. ReportManager.update_progress( report_id, "generating", base_progress, t('progress.generatingSection', title=section.title, current=section_num, total=total_sections), @@ -1652,7 +1652,7 @@ class ReportAgent: t('progress.generatingSection', title=section.title, current=section_num, total=total_sections) ) - # 生成主章节内容 + # Generate the main section body. section_content = self._generate_section_react( section=section, outline=outline, @@ -1669,11 +1669,10 @@ class ReportAgent: section.content = section_content generated_sections.append(f"## {section.title}\n\n{section_content}") - # 保存章节 + # Persist the section. ReportManager.save_section(report_id, section_num, section) completed_section_titles.append(section.title) - # 记录章节完成日志 full_section_content = f"## {section.title}\n\n{section_content}" if self.report_logger: @@ -1684,17 +1683,17 @@ class ReportAgent: ) logger.info(t('report.sectionSaved', reportId=report_id, sectionNum=f"{section_num:02d}")) - - # 更新进度 + + # Update progress. ReportManager.update_progress( - report_id, "generating", + report_id, "generating", base_progress + int(70 / total_sections), t('progress.sectionDone', title=section.title), current_section=None, completed_sections=completed_section_titles ) - # 阶段3: 组装完整报告 + # Stage 3: assemble the full report. if progress_callback: progress_callback("generating", 95, t('progress.assemblingReport')) @@ -1703,22 +1702,21 @@ class ReportAgent: completed_sections=completed_section_titles ) - # 使用ReportManager组装完整报告 + # Assemble the full report via ReportManager. report.markdown_content = ReportManager.assemble_full_report(report_id, outline) report.status = ReportStatus.COMPLETED report.completed_at = datetime.now().isoformat() - # 计算总耗时 + # Compute total elapsed time. total_time_seconds = (datetime.now() - start_time).total_seconds() - - # 记录报告完成日志 + if self.report_logger: self.report_logger.log_report_complete( total_sections=total_sections, total_time_seconds=total_time_seconds ) - # 保存最终报告 + # Save the final report. ReportManager.save_report(report) ReportManager.update_progress( report_id, "completed", 100, t('progress.reportComplete'), @@ -1730,23 +1728,22 @@ class ReportAgent: logger.info(t('report.reportGenDone', reportId=report_id)) - # 关闭控制台日志记录器 + # Close the console logger. if self.console_logger: self.console_logger.close() self.console_logger = None - + return report - + except Exception as e: logger.error(t('report.reportGenFailed', error=str(e))) report.status = ReportStatus.FAILED report.error = str(e) - - # 记录错误日志 + if self.report_logger: self.report_logger.log_error(str(e), "failed") - - # 保存失败状态 + + # Persist the failed status. try: ReportManager.save_report(report) ReportManager.update_progress( @@ -1754,9 +1751,9 @@ class ReportAgent: completed_sections=completed_section_titles ) except Exception: - pass # 忽略保存失败的错误 - - # 关闭控制台日志记录器 + pass # Ignore failures while persisting the failure state. + + # Close the console logger. if self.console_logger: self.console_logger.close() self.console_logger = None @@ -1769,31 +1766,32 @@ class ReportAgent: chat_history: List[Dict[str, str]] = None ) -> Dict[str, Any]: """ - 与Report Agent对话 - - 在对话中Agent可以自主调用检索工具来回答问题 - + Chat with the Report Agent. + + In chat mode the agent can autonomously call retrieval tools to answer + the user's question. + Args: - message: 用户消息 - chat_history: 对话历史 - + message: User message. + chat_history: Conversation history. + Returns: - { - "response": "Agent回复", - "tool_calls": [调用的工具列表], - "sources": [信息来源] - } + ``{ + "response": "Agent reply", + "tool_calls": [list of tools that were invoked], + "sources": [information sources] + }`` """ logger.info(t('report.agentChat', message=message[:50])) chat_history = chat_history or [] - # 获取已生成的报告内容 + # Fetch the already-generated report content. report_content = "" try: report = ReportManager.get_report_by_simulation(self.simulation_id) if report and report.markdown_content: - # 限制报告长度,避免上下文过长 + # Cap the report length to keep the context window manageable. report_content = report.markdown_content[:15000] if len(report.markdown_content) > 15000: report_content += "\n\n... [report content truncated] ..." @@ -1807,22 +1805,22 @@ class ReportAgent: ) system_prompt = f"{system_prompt}\n\n{get_language_instruction()}" - # 构建消息 + # Build the messages list. messages = [{"role": "system", "content": system_prompt}] - - # 添加历史对话 - for h in chat_history[-10:]: # 限制历史长度 + + # Append conversation history. + for h in chat_history[-10:]: # Cap the history length. messages.append(h) - - # 添加用户消息 + + # Append the user's new message. messages.append({ - "role": "user", + "role": "user", "content": message }) - - # ReACT循环(简化版) + + # Simplified ReACT loop. tool_calls_made = [] - max_iterations = 2 # 减少迭代轮数 + max_iterations = 2 # Fewer iterations than the section loop. for iteration in range(max_iterations): response = self.llm.chat( @@ -1830,11 +1828,11 @@ class ReportAgent: temperature=0.5 ) - # 解析工具调用 + # Parse tool calls. tool_calls = self._parse_tool_calls(response) - + if not tool_calls: - # 没有工具调用,直接返回响应 + # No tool calls — return the response directly. clean_response = re.sub(r'.*?', '', response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1844,19 +1842,19 @@ class ReportAgent: "sources": [tc.get("parameters", {}).get("query", "") for tc in tool_calls_made] } - # 执行工具调用(限制数量) + # Execute tool calls (with a hard cap). tool_results = [] - for call in tool_calls[:1]: # 每轮最多执行1次工具调用 + for call in tool_calls[:1]: # At most one tool call per iteration. if len(tool_calls_made) >= self.MAX_TOOL_CALLS_PER_CHAT: break result = self._execute_tool(call["name"], call.get("parameters", {})) tool_results.append({ "tool": call["name"], - "result": result[:1500] # 限制结果长度 + "result": result[:1500] # Cap the result length. }) tool_calls_made.append(call) - - # 将结果添加到消息 + + # Append the result back into the message stream. messages.append({"role": "assistant", "content": response}) observation = "\n".join([f"[{r['tool']} result]\n{r['result']}" for r in tool_results]) messages.append({ @@ -1864,13 +1862,13 @@ class ReportAgent: "content": observation + CHAT_OBSERVATION_SUFFIX }) - # 达到最大迭代,获取最终响应 + # Iteration cap reached — fetch a final response. final_response = self.llm.chat( messages=messages, temperature=0.5 ) - # 清理响应 + # Clean up the response. clean_response = re.sub(r'.*?', '', final_response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1883,96 +1881,99 @@ class ReportAgent: class ReportManager: """ - 报告管理器 - - 负责报告的持久化存储和检索 - - 文件结构(分章节输出): - reports/ - {report_id}/ - meta.json - 报告元信息和状态 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 - ... - full_report.md - 完整报告 + Report manager. + + Handles persistence and retrieval of reports. + + File layout (one folder per report):: + + reports/ + {report_id}/ + meta.json - Report metadata and status. + outline.json - Report outline. + progress.json - Generation progress. + section_01.md - Section 1. + section_02.md - Section 2. + ... + full_report.md - Full report. """ - - # 报告存储目录 + + # Root directory where reports are stored. REPORTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'reports') - + @classmethod def _ensure_reports_dir(cls): - """确保报告根目录存在""" + """Ensure the reports root directory exists.""" os.makedirs(cls.REPORTS_DIR, exist_ok=True) @classmethod def _get_report_folder(cls, report_id: str) -> str: - """获取报告文件夹路径""" + """Return the report folder path.""" return os.path.join(cls.REPORTS_DIR, report_id) @classmethod def _ensure_report_folder(cls, report_id: str) -> str: - """确保报告文件夹存在并返回路径""" + """Ensure the report folder exists and return its path.""" folder = cls._get_report_folder(report_id) os.makedirs(folder, exist_ok=True) return folder @classmethod def _get_report_path(cls, report_id: str) -> str: - """获取报告元信息文件路径""" + """Return the path of the report metadata file.""" return os.path.join(cls._get_report_folder(report_id), "meta.json") @classmethod def _get_report_markdown_path(cls, report_id: str) -> str: - """获取完整报告Markdown文件路径""" + """Return the path of the full-report Markdown file.""" return os.path.join(cls._get_report_folder(report_id), "full_report.md") @classmethod def _get_outline_path(cls, report_id: str) -> str: - """获取大纲文件路径""" + """Return the path of the outline file.""" return os.path.join(cls._get_report_folder(report_id), "outline.json") @classmethod def _get_progress_path(cls, report_id: str) -> str: - """获取进度文件路径""" + """Return the path of the progress file.""" return os.path.join(cls._get_report_folder(report_id), "progress.json") @classmethod def _get_section_path(cls, report_id: str, section_index: int) -> str: - """获取章节Markdown文件路径""" + """Return the path of the section Markdown file.""" return os.path.join(cls._get_report_folder(report_id), f"section_{section_index:02d}.md") @classmethod def _get_agent_log_path(cls, report_id: str) -> str: - """获取 Agent 日志文件路径""" + """Return the path of the Agent log file.""" return os.path.join(cls._get_report_folder(report_id), "agent_log.jsonl") @classmethod def _get_console_log_path(cls, report_id: str) -> str: - """获取控制台日志文件路径""" + """Return the path of the console log file.""" return os.path.join(cls._get_report_folder(report_id), "console_log.txt") @classmethod def get_console_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取控制台日志内容 - - 这是报告生成过程中的控制台输出日志(INFO、WARNING等), - 与 agent_log.jsonl 的结构化日志不同。 - + Read the console log content. + + These are the console-style log records (INFO, WARNING, etc.) emitted + during report generation, distinct from the structured + ``agent_log.jsonl`` entries. + Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID. + from_line: Line number to start reading from (0 = from the start); + used for incremental fetches. + Returns: - { - "logs": [日志行列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 - } + ``{ + "logs": [list of log lines], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether more log content is still available + }`` """ log_path = cls._get_console_log_path(report_id) @@ -1991,26 +1992,26 @@ class ReportManager: for i, line in enumerate(f): total_lines = i + 1 if i >= from_line: - # 保留原始日志行,去掉末尾换行符 + # Preserve the original log line, stripping trailing newlines. logs.append(line.rstrip('\n\r')) - + return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # Already at end-of-file. } - + @classmethod def get_console_log_stream(cls, report_id: str) -> List[str]: """ - 获取完整的控制台日志(一次性获取全部) - + Fetch the entire console log in one call. + Args: - report_id: 报告ID - + report_id: Report ID. + Returns: - 日志行列表 + List of log lines. """ result = cls.get_console_log(report_id, from_line=0) return result["logs"] @@ -2018,19 +2019,20 @@ class ReportManager: @classmethod def get_agent_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取 Agent 日志内容 - + Read the Agent log content. + Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID. + from_line: Line number to start reading from (0 = from the start); + used for incremental fetches. + Returns: - { - "logs": [日志条目列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 - } + ``{ + "logs": [list of log entries], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether more log content is still available + }`` """ log_path = cls._get_agent_log_path(report_id) @@ -2053,26 +2055,26 @@ class ReportManager: log_entry = json.loads(line.strip()) logs.append(log_entry) except json.JSONDecodeError: - # 跳过解析失败的行 + # Skip lines that fail to parse. continue - + return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # Already at end-of-file. } - + @classmethod def get_agent_log_stream(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取完整的 Agent 日志(用于一次性获取全部) - + Fetch the entire Agent log in one call. + Args: - report_id: 报告ID - + report_id: Report ID. + Returns: - 日志条目列表 + List of log entries. """ result = cls.get_agent_log(report_id, from_line=0) return result["logs"] @@ -2080,9 +2082,9 @@ class ReportManager: @classmethod def save_outline(cls, report_id: str, outline: ReportOutline) -> None: """ - 保存报告大纲 - - 在规划阶段完成后立即调用 + Persist the report outline. + + Called as soon as the planning stage finishes. """ cls._ensure_report_folder(report_id) @@ -2099,27 +2101,28 @@ class ReportManager: section: ReportSection ) -> str: """ - 保存单个章节 + Persist a single section. - 在每个章节生成完成后立即调用,实现分章节输出 + Called as soon as each section finishes generating to provide streamed, + section-by-section output. Args: - report_id: 报告ID - section_index: 章节索引(从1开始) - section: 章节对象 + report_id: Report ID. + section_index: Section index (1-based). + section: The section object. Returns: - 保存的文件路径 + The path of the saved file. """ cls._ensure_report_folder(report_id) - # 构建章节Markdown内容 - 清理可能存在的重复标题 + # Build the section Markdown — strip any duplicate title lines. cleaned_content = cls._clean_section_content(section.content, section.title) md_content = f"## {section.title}\n\n" if cleaned_content: md_content += f"{cleaned_content}\n\n" - # 保存文件 + # Persist the file. file_suffix = f"section_{section_index:02d}.md" file_path = os.path.join(cls._get_report_folder(report_id), file_suffix) with open(file_path, 'w', encoding='utf-8') as f: @@ -2131,17 +2134,17 @@ class ReportManager: @classmethod def _clean_section_content(cls, content: str, section_title: str) -> str: """ - 清理章节内容 - - 1. 移除内容开头与章节标题重复的Markdown标题行 - 2. 将所有 ### 及以下级别的标题转换为粗体文本 - + Clean a section's content. + + 1. Remove a leading Markdown heading line that duplicates the section title. + 2. Convert any ``###`` or deeper headings to bold text. + Args: - content: 原始内容 - section_title: 章节标题 - + content: Raw content. + section_title: Section title. + Returns: - 清理后的内容 + The cleaned content. """ import re @@ -2156,26 +2159,26 @@ class ReportManager: for i, line in enumerate(lines): stripped = line.strip() - # 检查是否是Markdown标题行 + # Detect a Markdown heading line. heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - + if heading_match: level = len(heading_match.group(1)) title_text = heading_match.group(2).strip() - - # 检查是否是与章节标题重复的标题(跳过前5行内的重复) + + # Drop a heading that duplicates the section title (only check the first 5 lines). if i < 5: if title_text == section_title or title_text.replace(' ', '') == section_title.replace(' ', ''): skip_next_empty = True continue - - # 将所有级别的标题(#, ##, ###, ####等)转换为粗体 - # 因为章节标题由系统添加,内容中不应有任何标题 + + # Convert headings of every level (#, ##, ###, ####, etc.) into bold text, + # because the section title is added by the system and the body should have no headings. cleaned_lines.append(f"**{title_text}**") - cleaned_lines.append("") # 添加空行 + cleaned_lines.append("") # Append a blank line. continue - - # 如果上一行是被跳过的标题,且当前行为空,也跳过 + + # Skip the blank line that immediately follows a dropped heading. if skip_next_empty and stripped == '': skip_next_empty = False continue @@ -2183,14 +2186,14 @@ class ReportManager: skip_next_empty = False cleaned_lines.append(line) - # 移除开头的空行 + # Strip leading blank lines. while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) - - # 移除开头的分隔线 + + # Strip leading horizontal-rule lines. while cleaned_lines and cleaned_lines[0].strip() in ['---', '***', '___']: cleaned_lines.pop(0) - # 同时移除分隔线后的空行 + # Also strip blank lines that follow the rule. while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) @@ -2207,9 +2210,9 @@ class ReportManager: completed_sections: List[str] = None ) -> None: """ - 更新报告生成进度 - - 前端可以通过读取progress.json获取实时进度 + Update report-generation progress. + + The frontend reads ``progress.json`` to display realtime progress. """ cls._ensure_report_folder(report_id) @@ -2227,7 +2230,7 @@ class ReportManager: @classmethod def get_progress(cls, report_id: str) -> Optional[Dict[str, Any]]: - """获取报告生成进度""" + """Return the report's generation progress.""" path = cls._get_progress_path(report_id) if not os.path.exists(path): @@ -2239,9 +2242,9 @@ class ReportManager: @classmethod def get_generated_sections(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取已生成的章节列表 - - 返回所有已保存的章节文件信息 + Return the list of sections that have already been generated. + + The result describes each section file that has been saved so far. """ folder = cls._get_report_folder(report_id) @@ -2255,7 +2258,7 @@ class ReportManager: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - # 从文件名解析章节索引 + # Derive the section index from the filename. parts = filename.replace('.md', '').split('_') section_index = int(parts[1]) @@ -2270,26 +2273,27 @@ class ReportManager: @classmethod def assemble_full_report(cls, report_id: str, outline: ReportOutline) -> str: """ - 组装完整报告 - - 从已保存的章节文件组装完整报告,并进行标题清理 + Assemble the full report. + + Combines all saved section files into the complete report and applies + title-cleanup post-processing. """ folder = cls._get_report_folder(report_id) - # 构建报告头部 + # Build the report header. md_content = f"# {outline.title}\n\n" md_content += f"> {outline.summary}\n\n" md_content += f"---\n\n" - - # 按顺序读取所有章节文件 + + # Read every section file in order. sections = cls.get_generated_sections(report_id) for section_info in sections: md_content += section_info["content"] - - # 后处理:清理整个报告的标题问题 + + # Post-process to fix heading issues across the whole report. md_content = cls._post_process_report(md_content, outline) - - # 保存完整报告 + + # Persist the full report. full_path = cls._get_report_markdown_path(report_id) with open(full_path, 'w', encoding='utf-8') as f: f.write(md_content) @@ -2300,18 +2304,19 @@ class ReportManager: @classmethod def _post_process_report(cls, content: str, outline: ReportOutline) -> str: """ - 后处理报告内容 - - 1. 移除重复的标题 - 2. 保留报告主标题(#)和章节标题(##),移除其他级别的标题(###, ####等) - 3. 清理多余的空行和分隔线 - + Post-process the report content. + + 1. Remove duplicate headings. + 2. Keep the report's main heading (``#``) and section headings (``##``); + drop any deeper headings (``###``, ``####``, etc.). + 3. Tidy up extra blank lines and horizontal rules. + Args: - content: 原始报告内容 - outline: 报告大纲 - + content: Raw report content. + outline: Report outline. + Returns: - 处理后的内容 + The processed content. """ import re @@ -2319,7 +2324,7 @@ class ReportManager: processed_lines = [] prev_was_heading = False - # 收集大纲中的所有章节标题 + # Collect every section title from the outline. section_titles = set() for section in outline.sections: section_titles.add(section.title) @@ -2329,14 +2334,14 @@ class ReportManager: line = lines[i] stripped = line.strip() - # 检查是否是标题行 + # Detect a heading line. heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) - + if heading_match: level = len(heading_match.group(1)) title = heading_match.group(2).strip() - - # 检查是否是重复标题(在连续5行内出现相同内容的标题) + + # Detect a duplicate heading — same text appearing within the previous 5 lines. is_duplicate = False for j in range(max(0, len(processed_lines) - 5), len(processed_lines)): prev_line = processed_lines[j].strip() @@ -2348,43 +2353,43 @@ class ReportManager: break if is_duplicate: - # 跳过重复标题及其后的空行 + # Skip the duplicate heading and any blank lines that follow it. i += 1 while i < len(lines) and lines[i].strip() == '': i += 1 continue - - # 标题层级处理: - # - # (level=1) 只保留报告主标题 - # - ## (level=2) 保留章节标题 - # - ### 及以下 (level>=3) 转换为粗体文本 - + + # Heading-level handling: + # - # (level=1): keep only the report's main heading. + # - ## (level=2): keep section headings. + # - ### and deeper (level>=3): convert to bold text. + if level == 1: if title == outline.title: - # 保留报告主标题 + # Keep the report's main heading. processed_lines.append(line) prev_was_heading = True elif title in section_titles: - # 章节标题错误使用了#,修正为## + # A section heading mistakenly used ``#``; rewrite it to ``##``. processed_lines.append(f"## {title}") prev_was_heading = True else: - # 其他一级标题转为粗体 + # Other H1 headings become bold text. processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False elif level == 2: if title in section_titles or title == outline.title: - # 保留章节标题 + # Keep the section heading. processed_lines.append(line) prev_was_heading = True else: - # 非章节的二级标题转为粗体 + # Non-section H2 headings become bold text. processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False else: - # ### 及以下级别的标题转换为粗体文本 + # H3 and deeper headings become bold text. processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False @@ -2393,12 +2398,12 @@ class ReportManager: continue elif stripped == '---' and prev_was_heading: - # 跳过标题后紧跟的分隔线 + # Drop a horizontal rule that immediately follows a heading. i += 1 continue - + elif stripped == '' and prev_was_heading: - # 标题后只保留一个空行 + # Keep at most one blank line after a heading. if processed_lines and processed_lines[-1].strip() != '': processed_lines.append(line) prev_was_heading = False @@ -2409,7 +2414,7 @@ class ReportManager: i += 1 - # 清理连续的多个空行(保留最多2个) + # Collapse consecutive blank lines, keeping at most two. result_lines = [] empty_count = 0 for line in processed_lines: @@ -2425,18 +2430,18 @@ class ReportManager: @classmethod def save_report(cls, report: Report) -> None: - """保存报告元信息和完整报告""" + """Persist the report metadata and the full report.""" cls._ensure_report_folder(report.report_id) - - # 保存元信息JSON + + # Save the metadata JSON. with open(cls._get_report_path(report.report_id), 'w', encoding='utf-8') as f: json.dump(report.to_dict(), f, ensure_ascii=False, indent=2) - - # 保存大纲 + + # Save the outline. if report.outline: cls.save_outline(report.report_id, report.outline) - - # 保存完整Markdown报告 + + # Save the full Markdown report. if report.markdown_content: with open(cls._get_report_markdown_path(report.report_id), 'w', encoding='utf-8') as f: f.write(report.markdown_content) @@ -2445,11 +2450,11 @@ class ReportManager: @classmethod def get_report(cls, report_id: str) -> Optional[Report]: - """获取报告""" + """Fetch a report.""" path = cls._get_report_path(report_id) if not os.path.exists(path): - # 兼容旧格式:检查直接存储在reports目录下的文件 + # Legacy format: check for a file stored directly under the reports root. old_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") if os.path.exists(old_path): path = old_path @@ -2459,7 +2464,7 @@ class ReportManager: with open(path, 'r', encoding='utf-8') as f: data = json.load(f) - # 重建Report对象 + # Reconstruct the Report object. outline = None if data.get('outline'): outline_data = data['outline'] @@ -2475,7 +2480,7 @@ class ReportManager: sections=sections ) - # 如果markdown_content为空,尝试从full_report.md读取 + # When markdown_content is empty, fall back to reading full_report.md. markdown_content = data.get('markdown_content', '') if not markdown_content: full_report_path = cls._get_report_markdown_path(report_id) @@ -2498,66 +2503,66 @@ class ReportManager: @classmethod def get_report_by_simulation(cls, simulation_id: str) -> Optional[Report]: - """根据模拟ID获取报告""" + """Look up a report by its simulation ID.""" cls._ensure_reports_dir() for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # New format: folder. if os.path.isdir(item_path): report = cls.get_report(item) if report and report.simulation_id == simulation_id: return report - # 兼容旧格式:JSON文件 + # Legacy format: JSON file. elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) if report and report.simulation_id == simulation_id: return report - + return None - + @classmethod def list_reports(cls, simulation_id: Optional[str] = None, limit: int = 50) -> List[Report]: - """列出报告""" + """List reports.""" cls._ensure_reports_dir() - + reports = [] for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # New format: folder. if os.path.isdir(item_path): report = cls.get_report(item) if report: if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - # 兼容旧格式:JSON文件 + # Legacy format: JSON file. elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) if report: if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - - # 按创建时间倒序 + + # Sort by creation time, newest first. reports.sort(key=lambda r: r.created_at, reverse=True) return reports[:limit] @classmethod def delete_report(cls, report_id: str) -> bool: - """删除报告(整个文件夹)""" + """Delete a report (the entire folder).""" import shutil folder_path = cls._get_report_folder(report_id) - # 新格式:删除整个文件夹 + # New format: remove the entire folder. if os.path.exists(folder_path) and os.path.isdir(folder_path): shutil.rmtree(folder_path) logger.info(t('report.reportFolderDeleted', reportId=report_id)) return True - - # 兼容旧格式:删除单独的文件 + + # Legacy format: remove the standalone files. deleted = False old_json_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") old_md_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.md") diff --git a/backend/app/services/simulation_config_generator.py b/backend/app/services/simulation_config_generator.py index 9eab7432..1b0bc582 100644 --- a/backend/app/services/simulation_config_generator.py +++ b/backend/app/services/simulation_config_generator.py @@ -1,13 +1,16 @@ """ -模拟配置智能生成器 -使用LLM根据模拟需求、文档内容、图谱信息自动生成细致的模拟参数 -实现全程自动化,无需人工设置参数 +Intelligent simulation-configuration generator. -采用分步生成策略,避免一次性生成过长内容导致失败: -1. 生成时间配置 -2. 生成事件配置 -3. 分批生成Agent配置 -4. 生成平台配置 +Uses an LLM to derive detailed simulation parameters from the simulation +requirement, document content, and knowledge-graph information, fully +automating parameter setup without manual intervention. + +Employs a step-wise generation strategy to avoid failures caused by +producing too much content in a single call: +1. Generate time configuration +2. Generate event configuration +3. Generate agent configurations in batches +4. Generate platform configuration """ import json @@ -25,156 +28,156 @@ from .zep_entity_reader import EntityNode, ZepEntityReader logger = get_logger('mirofish.simulation_config') -# 中国作息时间配置(北京时间) +# Daily-rhythm config for China (Beijing time, UTC+8). CHINA_TIMEZONE_CONFIG = { - # 深夜时段(几乎无人活动) + # Late-night hours: almost no activity. "dead_hours": [0, 1, 2, 3, 4, 5], - # 早间时段(逐渐醒来) + # Morning hours: gradually waking up. "morning_hours": [6, 7, 8], - # 工作时段 + # Working hours. "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - # 晚间高峰(最活跃) + # Evening peak: most active. "peak_hours": [19, 20, 21, 22], - # 夜间时段(活跃度下降) + # Late-evening hours: activity declining. "night_hours": [23], - # 活跃度系数 + # Activity multipliers. "activity_multipliers": { - "dead": 0.05, # 凌晨几乎无人 - "morning": 0.4, # 早间逐渐活跃 - "work": 0.7, # 工作时段中等 - "peak": 1.5, # 晚间高峰 - "night": 0.5 # 深夜下降 + "dead": 0.05, # Overnight: almost no one online. + "morning": 0.4, # Morning ramp-up. + "work": 0.7, # Working hours: moderate activity. + "peak": 1.5, # Evening peak. + "night": 0.5 # Late-night decline. } } @dataclass class AgentActivityConfig: - """单个Agent的活动配置""" + """Activity configuration for a single agent.""" agent_id: int entity_uuid: str entity_name: str entity_type: str - - # 活跃度配置 (0.0-1.0) - activity_level: float = 0.5 # 整体活跃度 - - # 发言频率(每小时预期发言次数) + + # Activity configuration (0.0-1.0). + activity_level: float = 0.5 # Overall activity level. + + # Posting frequency (expected posts per hour). posts_per_hour: float = 1.0 comments_per_hour: float = 2.0 - - # 活跃时间段(24小时制,0-23) + + # Active hours (24-hour clock, 0-23). active_hours: List[int] = field(default_factory=lambda: list(range(8, 23))) - - # 响应速度(对热点事件的反应延迟,单位:模拟分钟) + + # Response speed: latency to react to hot events, in simulated minutes. response_delay_min: int = 5 response_delay_max: int = 60 - - # 情感倾向 (-1.0到1.0,负面到正面) + + # Sentiment bias (-1.0 to 1.0, negative to positive). sentiment_bias: float = 0.0 - - # 立场(对特定话题的态度) + + # Stance: attitude toward a given topic. stance: str = "neutral" # supportive, opposing, neutral, observer - - # 影响力权重(决定其发言被其他Agent看到的概率) + + # Influence weight: probability of an agent's post being seen by others. influence_weight: float = 1.0 @dataclass class TimeSimulationConfig: - """时间模拟配置(基于中国人作息习惯)""" - # 模拟总时长(模拟小时数) - total_simulation_hours: int = 72 # 默认模拟72小时(3天) - - # 每轮代表的时间(模拟分钟)- 默认60分钟(1小时),加快时间流速 + """Time-simulation configuration (modelled on a Chinese daily rhythm).""" + # Total simulated duration (simulated hours). + total_simulation_hours: int = 72 # Default: 72 simulated hours (3 days). + + # Time represented by each round (simulated minutes); default 60 (1 hour) to speed up the simulated clock. minutes_per_round: int = 60 - - # 每小时激活的Agent数量范围 + + # Range of agents activated per hour. agents_per_hour_min: int = 5 agents_per_hour_max: int = 20 - - # 高峰时段(晚间19-22点,中国人最活跃的时间) + + # Peak hours (evenings 19:00-22:00, most active for the modelled audience). peak_hours: List[int] = field(default_factory=lambda: [19, 20, 21, 22]) peak_activity_multiplier: float = 1.5 - - # 低谷时段(凌晨0-5点,几乎无人活动) + + # Off-peak hours (00:00-05:00, almost no activity). off_peak_hours: List[int] = field(default_factory=lambda: [0, 1, 2, 3, 4, 5]) - off_peak_activity_multiplier: float = 0.05 # 凌晨活跃度极低 - - # 早间时段 + off_peak_activity_multiplier: float = 0.05 # Overnight activity is very low. + + # Morning hours. morning_hours: List[int] = field(default_factory=lambda: [6, 7, 8]) morning_activity_multiplier: float = 0.4 - - # 工作时段 + + # Working hours. work_hours: List[int] = field(default_factory=lambda: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18]) work_activity_multiplier: float = 0.7 @dataclass class EventConfig: - """事件配置""" - # 初始事件(模拟开始时的触发事件) + """Event configuration.""" + # Initial events: triggers fired when the simulation begins. initial_posts: List[Dict[str, Any]] = field(default_factory=list) - - # 定时事件(在特定时间触发的事件) + + # Scheduled events: events fired at specific times. scheduled_events: List[Dict[str, Any]] = field(default_factory=list) - - # 热点话题关键词 + + # Hot-topic keywords. hot_topics: List[str] = field(default_factory=list) - - # 舆论引导方向 + + # Narrative direction for public-opinion guidance. narrative_direction: str = "" @dataclass class PlatformConfig: - """平台特定配置""" + """Platform-specific configuration.""" platform: str # twitter or reddit - - # 推荐算法权重 - recency_weight: float = 0.4 # 时间新鲜度 - popularity_weight: float = 0.3 # 热度 - relevance_weight: float = 0.3 # 相关性 - - # 病毒传播阈值(达到多少互动后触发扩散) + + # Recommendation-algorithm weights. + recency_weight: float = 0.4 # Recency. + popularity_weight: float = 0.3 # Popularity. + relevance_weight: float = 0.3 # Relevance. + + # Viral-spread threshold: number of interactions required to trigger spreading. viral_threshold: int = 10 - - # 回声室效应强度(相似观点聚集程度) + + # Echo-chamber strength: how strongly similar viewpoints cluster together. echo_chamber_strength: float = 0.5 @dataclass class SimulationParameters: - """完整的模拟参数配置""" - # 基础信息 + """Complete simulation-parameter configuration.""" + # Basic identifiers. simulation_id: str project_id: str graph_id: str simulation_requirement: str - - # 时间配置 + + # Time configuration. time_config: TimeSimulationConfig = field(default_factory=TimeSimulationConfig) - - # Agent配置列表 + + # Agent configuration list. agent_configs: List[AgentActivityConfig] = field(default_factory=list) - - # 事件配置 + + # Event configuration. event_config: EventConfig = field(default_factory=EventConfig) - - # 平台配置 + + # Platform configurations. twitter_config: Optional[PlatformConfig] = None reddit_config: Optional[PlatformConfig] = None - - # LLM配置 + + # LLM configuration. llm_model: str = "" llm_base_url: str = "" - - # 生成元数据 + + # Generation metadata. generated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - generation_reasoning: str = "" # LLM的推理说明 - + generation_reasoning: str = "" # LLM-provided rationale. + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Return the parameters as a dictionary.""" time_dict = asdict(self.time_config) return { "simulation_id": self.simulation_id, @@ -193,34 +196,35 @@ class SimulationParameters: } def to_json(self, indent: int = 2) -> str: - """转换为JSON字符串""" + """Return the parameters as a JSON string.""" return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent) class SimulationConfigGenerator: """ - 模拟配置智能生成器 - - 使用LLM分析模拟需求、文档内容、图谱实体信息, - 自动生成最佳的模拟参数配置 - - 采用分步生成策略: - 1. 生成时间配置和事件配置(轻量级) - 2. 分批生成Agent配置(每批10-20个) - 3. 生成平台配置 + Intelligent simulation-configuration generator. + + Uses an LLM to analyse the simulation requirement, document content, + and graph entity information to automatically derive the best + simulation parameter configuration. + + Step-wise generation strategy: + 1. Generate time and event configurations (lightweight). + 2. Generate agent configurations in batches (10-20 per batch). + 3. Generate platform configuration. """ - - # 上下文最大字符数 + + # Maximum context length (characters). MAX_CONTEXT_LENGTH = 50000 - # 每批生成的Agent数量 + # Number of agents generated per batch. AGENTS_PER_BATCH = 15 - - # 各步骤的上下文截断长度(字符数) - TIME_CONFIG_CONTEXT_LENGTH = 10000 # 时间配置 - EVENT_CONFIG_CONTEXT_LENGTH = 8000 # 事件配置 - ENTITY_SUMMARY_LENGTH = 300 # 实体摘要 - AGENT_SUMMARY_LENGTH = 300 # Agent配置中的实体摘要 - ENTITIES_PER_TYPE_DISPLAY = 20 # 每类实体显示数量 + + # Per-step context truncation lengths (characters). + TIME_CONFIG_CONTEXT_LENGTH = 10000 # Time configuration. + EVENT_CONFIG_CONTEXT_LENGTH = 8000 # Event configuration. + ENTITY_SUMMARY_LENGTH = 300 # Entity summary. + AGENT_SUMMARY_LENGTH = 300 # Entity summary used in agent configs. + ENTITIES_PER_TYPE_DISPLAY = 20 # Number of entities displayed per type. def __init__( self, @@ -252,28 +256,27 @@ class SimulationConfigGenerator: enable_reddit: bool = True, progress_callback: Optional[Callable[[int, int, str], None]] = None, ) -> SimulationParameters: - """ - 智能生成完整的模拟配置(分步生成) - + """Intelligently generate a complete simulation configuration (step-wise). + Args: - simulation_id: 模拟ID - project_id: 项目ID - graph_id: 图谱ID - simulation_requirement: 模拟需求描述 - document_text: 原始文档内容 - entities: 过滤后的实体列表 - enable_twitter: 是否启用Twitter - enable_reddit: 是否启用Reddit - progress_callback: 进度回调函数(current_step, total_steps, message) - + simulation_id: Simulation ID. + project_id: Project ID. + graph_id: Graph ID. + simulation_requirement: Description of the simulation requirement. + document_text: Original document content. + entities: Filtered list of entities. + enable_twitter: Whether to enable Twitter. + enable_reddit: Whether to enable Reddit. + progress_callback: Progress callback (current_step, total_steps, message). + Returns: - SimulationParameters: 完整的模拟参数 + SimulationParameters: The complete simulation parameters. """ logger.info(t("log.simulation_config.m001", simulation_id=simulation_id, len=len(entities))) - # 计算总步骤数 + # Compute total step count. num_batches = math.ceil(len(entities) / self.AGENTS_PER_BATCH) - total_steps = 3 + num_batches # 时间配置 + 事件配置 + N批Agent + 平台配置 + total_steps = 3 + num_batches # Time config + event config + N agent batches + platform config. current_step = 0 def report_progress(step: int, message: str): @@ -283,7 +286,7 @@ class SimulationConfigGenerator: progress_callback(step, total_steps, message) logger.info(f"[{step}/{total_steps}] {message}") - # 1. 构建基础上下文信息 + # 1. Build base context information. context = self._build_context( simulation_requirement=simulation_requirement, document_text=document_text, @@ -292,20 +295,20 @@ class SimulationConfigGenerator: reasoning_parts = [] - # ========== 步骤1: 生成时间配置 ========== + # ========== Step 1: generate time configuration ========== report_progress(1, t('progress.generatingTimeConfig')) num_entities = len(entities) time_config_result = self._generate_time_config(context, num_entities) time_config = self._parse_time_config(time_config_result, num_entities) reasoning_parts.append(f"{t('progress.timeConfigLabel')}: {time_config_result.get('reasoning', t('common.success'))}") - # ========== 步骤2: 生成事件配置 ========== + # ========== Step 2: generate event configuration ========== report_progress(2, t('progress.generatingEventConfig')) event_config_result = self._generate_event_config(context, simulation_requirement, entities) event_config = self._parse_event_config(event_config_result) reasoning_parts.append(f"{t('progress.eventConfigLabel')}: {event_config_result.get('reasoning', t('common.success'))}") - # ========== 步骤3-N: 分批生成Agent配置 ========== + # ========== Steps 3-N: generate agent configurations in batches ========== all_agent_configs = [] for batch_idx in range(num_batches): start_idx = batch_idx * self.AGENTS_PER_BATCH @@ -327,13 +330,13 @@ class SimulationConfigGenerator: reasoning_parts.append(t('progress.agentConfigResult', count=len(all_agent_configs))) - # ========== 为初始帖子分配发布者 Agent ========== + # ========== Assign poster agents to initial posts ========== logger.info(t("log.simulation_config.m002")) event_config = self._assign_initial_post_agents(event_config, all_agent_configs) assigned_count = len([p for p in event_config.initial_posts if p.get("poster_agent_id") is not None]) reasoning_parts.append(t('progress.postAssignResult', count=assigned_count)) - # ========== 最后一步: 生成平台配置 ========== + # ========== Final step: generate platform configuration ========== report_progress(total_steps, t('progress.generatingPlatformConfig')) twitter_config = None reddit_config = None @@ -358,7 +361,7 @@ class SimulationConfigGenerator: echo_chamber_strength=0.6 ) - # 构建最终参数 + # Build final parameters. params = SimulationParameters( simulation_id=simulation_id, project_id=project_id, @@ -384,19 +387,19 @@ class SimulationConfigGenerator: document_text: str, entities: List[EntityNode] ) -> str: - """构建LLM上下文,截断到最大长度""" - - # 实体摘要 + """Build the LLM context, truncated to the maximum length.""" + + # Entity summary. entity_summary = self._summarize_entities(entities) - # 构建上下文 + # Build the context. context_parts = [ f"## Simulation Requirement\n{simulation_requirement}", f"\n## Entities ({len(entities)})\n{entity_summary}", ] current_length = sum(len(p) for p in context_parts) - remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # 留500字符余量 + remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # Reserve 500-char headroom. if remaining_length > 0 and document_text: doc_text = document_text[:remaining_length] @@ -407,10 +410,10 @@ class SimulationConfigGenerator: return "\n".join(context_parts) def _summarize_entities(self, entities: List[EntityNode]) -> str: - """生成实体摘要""" + """Generate an entity summary.""" lines = [] - - # 按类型分组 + + # Group by type. by_type: Dict[str, List[EntityNode]] = {} for e in entities: t = e.get_entity_type() or "Unknown" @@ -420,7 +423,7 @@ class SimulationConfigGenerator: for entity_type, type_entities in by_type.items(): lines.append(f"\n### {entity_type} ({len(type_entities)})") - # 使用配置的显示数量和摘要长度 + # Use configured display count and summary length. display_count = self.ENTITIES_PER_TYPE_DISPLAY summary_len = self.ENTITY_SUMMARY_LENGTH for e in type_entities[:display_count]: @@ -432,7 +435,7 @@ class SimulationConfigGenerator: return "\n".join(lines) def _call_llm_with_retry(self, prompt: str, system_prompt: str) -> Dict[str, Any]: - """带重试的LLM调用,包含JSON修复逻辑""" + """LLM call with retries, including JSON repair logic.""" import re max_attempts = 3 @@ -447,25 +450,25 @@ class SimulationConfigGenerator: {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # Lower temperature on each retry. + # max_tokens is intentionally unset so the LLM can use its full budget. ) - + content = response.choices[0].message.content finish_reason = response.choices[0].finish_reason - - # 检查是否被截断 + + # Detect truncation. if finish_reason == 'length': logger.warning(t("log.simulation_config.m004", attempt=attempt + 1)) content = self._fix_truncated_json(content) - # 尝试解析JSON + # Attempt to parse JSON. try: return json.loads(content) except json.JSONDecodeError as e: logger.warning(t("log.simulation_config.m005", attempt=attempt + 1, str=str(e)[:80])) - - # 尝试修复JSON + + # Attempt to repair the JSON. fixed = self._try_fix_config_json(content) if fixed: return fixed @@ -481,36 +484,36 @@ class SimulationConfigGenerator: raise last_error or Exception("LLM调用失败") def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON""" + """Repair truncated JSON.""" content = content.strip() - - # 计算未闭合的括号 + + # Count unclosed brackets. open_braces = content.count('{') - content.count('}') open_brackets = content.count('[') - content.count(']') - - # 检查是否有未闭合的字符串 + + # Check for an unclosed string. if content and content[-1] not in '",}]': content += '"' - - # 闭合括号 + + # Close brackets. content += ']' * open_brackets content += '}' * open_braces return content def _try_fix_config_json(self, content: str) -> Optional[Dict[str, Any]]: - """尝试修复配置JSON""" + """Attempt to repair a configuration JSON payload.""" import re - - # 修复被截断的情况 + + # Repair truncation first. content = self._fix_truncated_json(content) - - # 提取JSON部分 + + # Extract the JSON portion. json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - - # 移除字符串中的换行符 + + # Remove line breaks from inside strings. def fix_string(match): s = match.group(0) s = s.replace('\n', ' ').replace('\r', ' ') @@ -522,7 +525,7 @@ class SimulationConfigGenerator: try: return json.loads(json_str) except: - # 尝试移除所有控制字符 + # Strip all control characters and try again. json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) json_str = re.sub(r'\s+', ' ', json_str) try: @@ -533,11 +536,11 @@ class SimulationConfigGenerator: return None def _generate_time_config(self, context: str, num_entities: int) -> Dict[str, Any]: - """生成时间配置""" - # 使用配置的上下文截断长度 + """Generate the time configuration.""" + # Use the configured context truncation length. context_truncated = context[:self.TIME_CONFIG_CONTEXT_LENGTH] - - # 计算最大允许值(80%的agent数) + + # Compute the upper bound (90% of the agent count). max_agents_allowed = max(1, int(num_entities * 0.9)) prompt = f"""Based on the simulation requirement below, generate a time-simulation configuration. @@ -595,10 +598,10 @@ Field guide: return self._get_default_time_config(num_entities) def _get_default_time_config(self, num_entities: int) -> Dict[str, Any]: - """获取默认时间配置(中国人作息)""" + """Return the default time configuration (Chinese daily rhythm).""" return { "total_simulation_hours": 72, - "minutes_per_round": 60, # 每轮1小时,加快时间流速 + "minutes_per_round": 60, # 1 hour per round to speed up the simulated clock. "agents_per_hour_min": max(1, num_entities // 15), "agents_per_hour_max": max(5, num_entities // 5), "peak_hours": [19, 20, 21, 22], @@ -609,12 +612,12 @@ Field guide: } def _parse_time_config(self, result: Dict[str, Any], num_entities: int) -> TimeSimulationConfig: - """解析时间配置结果,并验证agents_per_hour值不超过总agent数""" - # 获取原始值 + """Parse the time-configuration result and ensure agents_per_hour values do not exceed the total agent count.""" + # Pull raw values. agents_per_hour_min = result.get("agents_per_hour_min", max(1, num_entities // 15)) agents_per_hour_max = result.get("agents_per_hour_max", max(5, num_entities // 5)) - - # 验证并修正:确保不超过总agent数 + + # Validate and correct: ensure values do not exceed the total agent count. if agents_per_hour_min > num_entities: logger.warning(t("log.simulation_config.m008", agents_per_hour_min=agents_per_hour_min, num_entities=num_entities)) agents_per_hour_min = max(1, num_entities // 10) @@ -623,19 +626,19 @@ Field guide: logger.warning(t("log.simulation_config.m009", agents_per_hour_max=agents_per_hour_max, num_entities=num_entities)) agents_per_hour_max = max(agents_per_hour_min + 1, num_entities // 2) - # 确保 min < max + # Ensure min < max. if agents_per_hour_min >= agents_per_hour_max: agents_per_hour_min = max(1, agents_per_hour_max // 2) logger.warning(t("log.simulation_config.m010", agents_per_hour_min=agents_per_hour_min)) return TimeSimulationConfig( total_simulation_hours=result.get("total_simulation_hours", 72), - minutes_per_round=result.get("minutes_per_round", 60), # 默认每轮1小时 + minutes_per_round=result.get("minutes_per_round", 60), # Default: 1 simulated hour per round. agents_per_hour_min=agents_per_hour_min, agents_per_hour_max=agents_per_hour_max, peak_hours=result.get("peak_hours", [19, 20, 21, 22]), off_peak_hours=result.get("off_peak_hours", [0, 1, 2, 3, 4, 5]), - off_peak_activity_multiplier=0.05, # 凌晨几乎无人 + off_peak_activity_multiplier=0.05, # Overnight: almost no one online. morning_hours=result.get("morning_hours", [6, 7, 8]), morning_activity_multiplier=0.4, work_hours=result.get("work_hours", list(range(9, 19))), @@ -649,14 +652,14 @@ Field guide: simulation_requirement: str, entities: List[EntityNode] ) -> Dict[str, Any]: - """生成事件配置""" - - # 获取可用的实体类型列表,供 LLM 参考 + """Generate the event configuration.""" + + # Build the list of available entity types for the LLM to reference. entity_types_available = list(set( e.get_entity_type() or "Unknown" for e in entities )) - - # 为每种类型列出代表性实体名称 + + # Collect representative entity names per type. type_examples = {} for e in entities: etype = e.get_entity_type() or "Unknown" @@ -670,7 +673,7 @@ Field guide: for t, examples in type_examples.items() ]) - # 使用配置的上下文截断长度 + # Use the configured context truncation length. context_truncated = context[:self.EVENT_CONFIG_CONTEXT_LENGTH] prompt = f"""Based on the simulation requirement below, generate an event configuration. @@ -717,7 +720,7 @@ Return strict JSON (no markdown): } def _parse_event_config(self, result: Dict[str, Any]) -> EventConfig: - """解析事件配置结果""" + """Parse the event-configuration result.""" return EventConfig( initial_posts=result.get("initial_posts", []), scheduled_events=[], @@ -730,15 +733,15 @@ Return strict JSON (no markdown): event_config: EventConfig, agent_configs: List[AgentActivityConfig] ) -> EventConfig: - """ - 为初始帖子分配合适的发布者 Agent - - 根据每个帖子的 poster_type 匹配最合适的 agent_id + """Assign a suitable poster agent to each initial post. + + Matches the most appropriate agent_id for each post based on its + poster_type. """ if not event_config.initial_posts: return event_config - - # 按实体类型建立 agent 索引 + + # Build an agent index keyed by entity type. agents_by_type: Dict[str, List[AgentActivityConfig]] = {} for agent in agent_configs: etype = agent.entity_type.lower() @@ -746,7 +749,7 @@ Return strict JSON (no markdown): agents_by_type[etype] = [] agents_by_type[etype].append(agent) - # 类型映射表(处理 LLM 可能输出的不同格式) + # Type alias map (handles the different formats the LLM might emit). type_aliases = { "official": ["official", "university", "governmentagency", "government"], "university": ["university", "official"], @@ -758,7 +761,7 @@ Return strict JSON (no markdown): "person": ["person", "student", "alumni"], } - # 记录每种类型已使用的 agent 索引,避免重复使用同一个 agent + # Track the next agent index used per type to avoid reusing the same agent twice. used_indices: Dict[str, int] = {} updated_posts = [] @@ -766,17 +769,17 @@ Return strict JSON (no markdown): poster_type = post.get("poster_type", "").lower() content = post.get("content", "") - # 尝试找到匹配的 agent + # Try to find a matching agent. matched_agent_id = None - - # 1. 直接匹配 + + # 1. Direct match. if poster_type in agents_by_type: agents = agents_by_type[poster_type] idx = used_indices.get(poster_type, 0) % len(agents) matched_agent_id = agents[idx].agent_id used_indices[poster_type] = idx + 1 else: - # 2. 使用别名匹配 + # 2. Match via aliases. for alias_key, aliases in type_aliases.items(): if poster_type in aliases or alias_key == poster_type: for alias in aliases: @@ -789,11 +792,11 @@ Return strict JSON (no markdown): if matched_agent_id is not None: break - # 3. 如果仍未找到,使用影响力最高的 agent + # 3. If still unresolved, fall back to the most influential agent. if matched_agent_id is None: logger.warning(t("log.simulation_config.m012", poster_type=poster_type)) if agent_configs: - # 按影响力排序,选择影响力最高的 + # Sort by influence and pick the highest. sorted_agents = sorted(agent_configs, key=lambda a: a.influence_weight, reverse=True) matched_agent_id = sorted_agents[0].agent_id else: @@ -817,9 +820,9 @@ Return strict JSON (no markdown): start_idx: int, simulation_requirement: str ) -> List[AgentActivityConfig]: - """分批生成Agent配置""" - - # 构建实体信息(使用配置的摘要长度) + """Generate agent configurations in batches.""" + + # Build entity information (using the configured summary length). entity_list = [] summary_len = self.AGENT_SUMMARY_LENGTH for i, e in enumerate(entities): @@ -876,13 +879,13 @@ Return strict JSON (no markdown): logger.warning(t("log.simulation_config.m014", e=e)) llm_configs = {} - # 构建AgentActivityConfig对象 + # Build AgentActivityConfig objects. configs = [] for i, entity in enumerate(entities): agent_id = start_idx + i cfg = llm_configs.get(agent_id, {}) - - # 如果LLM没有生成,使用规则生成 + + # If the LLM did not produce a config, fall back to rule-based generation. if not cfg: cfg = self._generate_agent_config_by_rule(entity) @@ -906,16 +909,16 @@ Return strict JSON (no markdown): return configs def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: - """基于规则生成单个Agent配置(中国人作息)""" + """Rule-based generation for a single agent's configuration (Chinese daily rhythm).""" entity_type = (entity.get_entity_type() or "Unknown").lower() - + if entity_type in ["university", "governmentagency", "ngo"]: - # 官方机构:工作时间活动,低频率,高影响力 + # Official institutions: active during working hours, low frequency, high influence. return { "activity_level": 0.2, "posts_per_hour": 0.1, "comments_per_hour": 0.05, - "active_hours": list(range(9, 18)), # 9:00-17:59 + "active_hours": list(range(9, 18)), # 09:00-17:59 "response_delay_min": 60, "response_delay_max": 240, "sentiment_bias": 0.0, @@ -923,12 +926,12 @@ Return strict JSON (no markdown): "influence_weight": 3.0 } elif entity_type in ["mediaoutlet"]: - # 媒体:全天活动,中等频率,高影响力 + # Media: active throughout the day, medium frequency, high influence. return { "activity_level": 0.5, "posts_per_hour": 0.8, "comments_per_hour": 0.3, - "active_hours": list(range(7, 24)), # 7:00-23:59 + "active_hours": list(range(7, 24)), # 07:00-23:59 "response_delay_min": 5, "response_delay_max": 30, "sentiment_bias": 0.0, @@ -936,12 +939,12 @@ Return strict JSON (no markdown): "influence_weight": 2.5 } elif entity_type in ["professor", "expert", "official"]: - # 专家/教授:工作+晚间活动,中等频率 + # Experts / professors: active during work and evening, medium frequency. return { "activity_level": 0.4, "posts_per_hour": 0.3, "comments_per_hour": 0.5, - "active_hours": list(range(8, 22)), # 8:00-21:59 + "active_hours": list(range(8, 22)), # 08:00-21:59 "response_delay_min": 15, "response_delay_max": 90, "sentiment_bias": 0.0, @@ -949,12 +952,12 @@ Return strict JSON (no markdown): "influence_weight": 2.0 } elif entity_type in ["student"]: - # 学生:晚间为主,高频率 + # Students: mostly evening, high frequency. return { "activity_level": 0.8, "posts_per_hour": 0.6, "comments_per_hour": 1.5, - "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 上午+晚间 + "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Morning + evening. "response_delay_min": 1, "response_delay_max": 15, "sentiment_bias": 0.0, @@ -962,12 +965,12 @@ Return strict JSON (no markdown): "influence_weight": 0.8 } elif entity_type in ["alumni"]: - # 校友:晚间为主 + # Alumni: mostly evening. return { "activity_level": 0.6, "posts_per_hour": 0.4, "comments_per_hour": 0.8, - "active_hours": [12, 13, 19, 20, 21, 22, 23], # 午休+晚间 + "active_hours": [12, 13, 19, 20, 21, 22, 23], # Lunch break + evening. "response_delay_min": 5, "response_delay_max": 30, "sentiment_bias": 0.0, @@ -975,12 +978,12 @@ Return strict JSON (no markdown): "influence_weight": 1.0 } else: - # 普通人:晚间高峰 + # General public: evening peak. return { "activity_level": 0.7, "posts_per_hour": 0.5, "comments_per_hour": 1.2, - "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 白天+晚间 + "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Daytime + evening. "response_delay_min": 2, "response_delay_max": 20, "sentiment_bias": 0.0, diff --git a/backend/app/services/simulation_ipc.py b/backend/app/services/simulation_ipc.py index be2eac32..68428b8f 100644 --- a/backend/app/services/simulation_ipc.py +++ b/backend/app/services/simulation_ipc.py @@ -1,11 +1,12 @@ -""" -模拟IPC通信模块 -用于Flask后端和模拟脚本之间的进程间通信 +"""Simulation IPC module. -通过文件系统实现简单的命令/响应模式: -1. Flask写入命令到 commands/ 目录 -2. 模拟脚本轮询命令目录,执行命令并写入响应到 responses/ 目录 -3. Flask轮询响应目录获取结果 +Inter-process communication between the Flask backend and the simulation +subprocess. Implements a simple file-system command/response pattern: + +1. Flask writes commands into ``commands/``. +2. The simulation script polls for commands, executes them, and writes + responses into ``responses/``. +3. Flask polls the responses directory for results. """ import os @@ -24,14 +25,14 @@ logger = get_logger('mirofish.simulation_ipc') class CommandType(str, Enum): - """命令类型""" - INTERVIEW = "interview" # 单个Agent采访 - BATCH_INTERVIEW = "batch_interview" # 批量采访 - CLOSE_ENV = "close_env" # 关闭环境 + """IPC command types.""" + INTERVIEW = "interview" # interview a single agent + BATCH_INTERVIEW = "batch_interview" # interview multiple agents at once + CLOSE_ENV = "close_env" # tear down the environment class CommandStatus(str, Enum): - """命令状态""" + """IPC command status.""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" @@ -40,12 +41,12 @@ class CommandStatus(str, Enum): @dataclass class IPCCommand: - """IPC命令""" + """A command sent over the IPC channel.""" command_id: str command_type: CommandType args: Dict[str, Any] timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, Any]: return { "command_id": self.command_id, @@ -53,7 +54,7 @@ class IPCCommand: "args": self.args, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCCommand': return cls( @@ -66,13 +67,13 @@ class IPCCommand: @dataclass class IPCResponse: - """IPC响应""" + """A response returned over the IPC channel.""" command_id: str status: CommandStatus result: Optional[Dict[str, Any]] = None error: Optional[str] = None timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) - + def to_dict(self) -> Dict[str, Any]: return { "command_id": self.command_id, @@ -81,7 +82,7 @@ class IPCResponse: "error": self.error, "timestamp": self.timestamp } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'IPCResponse': return cls( @@ -94,27 +95,25 @@ class IPCResponse: class SimulationIPCClient: + """IPC client used by the Flask side. + + Sends commands to the simulation process and waits for responses. """ - 模拟IPC客户端(Flask端使用) - - 用于向模拟进程发送命令并等待响应 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化IPC客户端 - + """Initialize the IPC client. + Args: - simulation_dir: 模拟数据目录 + simulation_dir: Directory holding the simulation's IPC files. """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure both directories exist before use. os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - + def send_command( self, command_type: CommandType, @@ -122,20 +121,19 @@ class SimulationIPCClient: timeout: float = 60.0, poll_interval: float = 0.5 ) -> IPCResponse: - """ - 发送命令并等待响应 - + """Send a command and wait for the response. + Args: - command_type: 命令类型 - args: 命令参数 - timeout: 超时时间(秒) - poll_interval: 轮询间隔(秒) - + command_type: Command type to send. + args: Command arguments. + timeout: Timeout in seconds. + poll_interval: Polling interval in seconds. + Returns: - IPCResponse - + The ``IPCResponse``. + Raises: - TimeoutError: 等待响应超时 + TimeoutError: When no response arrives before ``timeout``. """ command_id = str(uuid.uuid4()) command = IPCCommand( @@ -143,50 +141,50 @@ class SimulationIPCClient: command_type=command_type, args=args ) - - # 写入命令文件 + + # Write the command file. command_file = os.path.join(self.commands_dir, f"{command_id}.json") with open(command_file, 'w', encoding='utf-8') as f: json.dump(command.to_dict(), f, ensure_ascii=False, indent=2) - + logger.info(t("log.simulation_ipc.m001", command_type=command_type.value, command_id=command_id)) - - # 等待响应 + + # Poll for the response file. response_file = os.path.join(self.responses_dir, f"{command_id}.json") start_time = time.time() - + while time.time() - start_time < timeout: if os.path.exists(response_file): try: with open(response_file, 'r', encoding='utf-8') as f: response_data = json.load(f) response = IPCResponse.from_dict(response_data) - - # 清理命令和响应文件 + + # Clean up command and response files after successful read. try: os.remove(command_file) os.remove(response_file) except OSError: pass - + logger.info(t("log.simulation_ipc.m002", command_id=command_id, response=response.status.value)) return response except (json.JSONDecodeError, KeyError) as e: logger.warning(t("log.simulation_ipc.m003", e=e)) - + time.sleep(poll_interval) - - # 超时 + + # Timed out waiting for the response. logger.error(t("log.simulation_ipc.m004", command_id=command_id)) - - # 清理命令文件 + + # Clean up the unanswered command file. try: os.remove(command_file) except OSError: pass - + raise TimeoutError(f"等待命令响应超时 ({timeout}秒)") - + def send_interview( self, agent_id: int, @@ -194,20 +192,19 @@ class SimulationIPCClient: platform: str = None, timeout: float = 60.0 ) -> IPCResponse: - """ - 发送单个Agent采访命令 - + """Send a single-agent interview command. + Args: - agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,单平台模拟时采访该平台 - timeout: 超时时间 - + agent_id: Agent id to interview. + prompt: Interview question. + platform: Optional platform selector. + - ``"twitter"``: interview only on Twitter. + - ``"reddit"``: interview only on Reddit. + - ``None``: dual-platform if applicable, else the single active platform. + timeout: Timeout in seconds. + Returns: - IPCResponse,result字段包含采访结果 + ``IPCResponse`` whose ``result`` carries the interview response. """ args = { "agent_id": agent_id, @@ -215,69 +212,66 @@ class SimulationIPCClient: } if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.INTERVIEW, args=args, timeout=timeout ) - + def send_batch_interview( self, interviews: List[Dict[str, Any]], platform: str = None, timeout: float = 120.0 ) -> IPCResponse: - """ - 发送批量采访命令 - + """Send a batched interview command. + Args: - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间 - + interviews: List of items shaped ``{"agent_id": int, "prompt": str, "platform": str?}``. + platform: Default platform; per-item ``platform`` overrides this. + - ``"twitter"``: default to Twitter. + - ``"reddit"``: default to Reddit. + - ``None``: dual-platform interview when applicable. + timeout: Timeout in seconds. + Returns: - IPCResponse,result字段包含所有采访结果 + ``IPCResponse`` whose ``result`` carries every interview response. """ args = {"interviews": interviews} if platform: args["platform"] = platform - + return self.send_command( command_type=CommandType.BATCH_INTERVIEW, args=args, timeout=timeout ) - + def send_close_env(self, timeout: float = 30.0) -> IPCResponse: - """ - 发送关闭环境命令 - + """Send a tear-down-environment command. + Args: - timeout: 超时时间 - + timeout: Timeout in seconds. + Returns: - IPCResponse + ``IPCResponse``. """ return self.send_command( command_type=CommandType.CLOSE_ENV, args={}, timeout=timeout ) - + def check_env_alive(self) -> bool: - """ - 检查模拟环境是否存活 - - 通过检查 env_status.json 文件来判断 + """Return ``True`` if the simulation environment reports as alive. + + Reads ``env_status.json`` written by the IPC server side. """ status_file = os.path.join(self.simulation_dir, "env_status.json") if not os.path.exists(status_file): return False - + try: with open(status_file, 'r', encoding='utf-8') as f: status = json.load(f) @@ -287,68 +281,65 @@ class SimulationIPCClient: class SimulationIPCServer: + """IPC server used by the simulation script. + + Polls the commands directory, executes commands, and writes responses. """ - 模拟IPC服务器(模拟脚本端使用) - - 轮询命令目录,执行命令并返回响应 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化IPC服务器 - + """Initialize the IPC server. + Args: - simulation_dir: 模拟数据目录 + simulation_dir: Directory holding the simulation's IPC files. """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - - # 确保目录存在 + + # Ensure both directories exist before use. os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - - # 环境状态 + + # Server-running flag. self._running = False - + def start(self): - """标记服务器为运行状态""" + """Mark the server as alive and persist the state.""" self._running = True self._update_env_status("alive") - + def stop(self): - """标记服务器为停止状态""" + """Mark the server as stopped and persist the state.""" self._running = False self._update_env_status("stopped") - + def _update_env_status(self, status: str): - """更新环境状态文件""" + """Update the persistent environment-status file.""" status_file = os.path.join(self.simulation_dir, "env_status.json") with open(status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, "timestamp": datetime.now().isoformat() }, f, ensure_ascii=False, indent=2) - + def poll_commands(self) -> Optional[IPCCommand]: - """ - 轮询命令目录,返回第一个待处理的命令 - + """Poll the commands directory and return the next pending command. + Returns: - IPCCommand 或 None + ``IPCCommand`` or ``None`` if no pending commands remain. """ if not os.path.exists(self.commands_dir): return None - - # 按时间排序获取命令文件 + + # Sort by mtime so we process commands in arrival order. command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): filepath = os.path.join(self.commands_dir, filename) command_files.append((filepath, os.path.getmtime(filepath))) - + command_files.sort(key=lambda x: x[1]) - + for filepath, _ in command_files: try: with open(filepath, 'r', encoding='utf-8') as f: @@ -357,37 +348,36 @@ class SimulationIPCServer: except (json.JSONDecodeError, KeyError, OSError) as e: logger.warning(t("log.simulation_ipc.m005", filepath=filepath, e=e)) continue - + return None - + def send_response(self, response: IPCResponse): - """ - 发送响应 - + """Write a response file. + Args: - response: IPC响应 + response: The response to send. """ response_file = os.path.join(self.responses_dir, f"{response.command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response.to_dict(), f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Delete the matching command file. command_file = os.path.join(self.commands_dir, f"{response.command_id}.json") try: os.remove(command_file) except OSError: pass - + def send_success(self, command_id: str, result: Dict[str, Any]): - """发送成功响应""" + """Send a success response.""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.COMPLETED, result=result )) - + def send_error(self, command_id: str, error: str): - """发送错误响应""" + """Send a failure response.""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.FAILED, diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 2f297e2c..b1af480f 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -1,7 +1,7 @@ -""" -OASIS模拟管理器 -管理Twitter和Reddit双平台并行模拟 -使用预设脚本 + LLM智能生成配置参数 +"""OASIS simulation manager. + +Drives parallel Twitter + Reddit simulations using preset scripts plus +LLM-generated configuration parameters. """ import os @@ -23,60 +23,60 @@ logger = get_logger('mirofish.simulation') class SimulationStatus(str, Enum): - """模拟状态""" + """Simulation lifecycle status.""" CREATED = "created" PREPARING = "preparing" READY = "ready" RUNNING = "running" PAUSED = "paused" - STOPPED = "stopped" # 模拟被手动停止 - COMPLETED = "completed" # 模拟自然完成 + STOPPED = "stopped" # manually stopped + COMPLETED = "completed" # finished naturally FAILED = "failed" class PlatformType(str, Enum): - """平台类型""" + """Simulated platform types.""" TWITTER = "twitter" REDDIT = "reddit" @dataclass class SimulationState: - """模拟状态""" + """In-memory + persisted state for a single simulation.""" simulation_id: str project_id: str graph_id: str - - # 平台启用状态 + + # Per-platform enable flags. enable_twitter: bool = True enable_reddit: bool = True - - # 状态 + + # Lifecycle status. status: SimulationStatus = SimulationStatus.CREATED - - # 准备阶段数据 + + # Counters captured during the prepare phase. entities_count: int = 0 profiles_count: int = 0 entity_types: List[str] = field(default_factory=list) - - # 配置生成信息 + + # Information about the auto-generated config. config_generated: bool = False config_reasoning: str = "" - - # 运行时数据 + + # Runtime data. current_round: int = 0 twitter_status: str = "not_started" reddit_status: str = "not_started" - - # 时间戳 + + # Timestamps. created_at: str = field(default_factory=lambda: datetime.now().isoformat()) updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - - # 错误信息 + + # Error message when status == FAILED. error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """完整状态字典(内部使用)""" + """Full state dict (used for persistence and internal callers).""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -96,9 +96,9 @@ class SimulationState: "updated_at": self.updated_at, "error": self.error, } - + def to_simple_dict(self) -> Dict[str, Any]: - """简化状态字典(API返回使用)""" + """Simplified state dict (used for API responses).""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -113,61 +113,60 @@ class SimulationState: class SimulationManager: + """Simulation manager. + + Core responsibilities: + 1. Read entities from the Zep graph and filter to the configured types. + 2. Generate OASIS agent profiles per entity. + 3. Use the LLM to generate simulation configuration parameters. + 4. Materialize the files the preset scripts expect. """ - 模拟管理器 - - 核心功能: - 1. 从Zep图谱读取实体并过滤 - 2. 生成OASIS Agent Profile - 3. 使用LLM智能生成模拟配置参数 - 4. 准备预设脚本所需的所有文件 - """ - - # 模拟数据存储目录 + + # Root directory for persisted simulation data. SIMULATION_DATA_DIR = os.path.join( - os.path.dirname(__file__), + os.path.dirname(__file__), '../../uploads/simulations' ) - + def __init__(self): - # 确保目录存在 + # Ensure the simulation data directory exists. os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - - # 内存中的模拟状态缓存 + + # In-memory cache of simulation state objects. self._simulations: Dict[str, SimulationState] = {} - + def _get_simulation_dir(self, simulation_id: str) -> str: - """获取模拟数据目录""" + """Return the on-disk directory for a simulation, creating if missing.""" sim_dir = os.path.join(self.SIMULATION_DATA_DIR, simulation_id) os.makedirs(sim_dir, exist_ok=True) return sim_dir - + def _save_simulation_state(self, state: SimulationState): - """保存模拟状态到文件""" + """Persist a simulation state to disk and update the cache.""" sim_dir = self._get_simulation_dir(state.simulation_id) state_file = os.path.join(sim_dir, "state.json") - + state.updated_at = datetime.now().isoformat() - + with open(state_file, 'w', encoding='utf-8') as f: json.dump(state.to_dict(), f, ensure_ascii=False, indent=2) - + self._simulations[state.simulation_id] = state - + def _load_simulation_state(self, simulation_id: str) -> Optional[SimulationState]: - """从文件加载模拟状态""" + """Load a simulation state from disk (or cache) by id.""" if simulation_id in self._simulations: return self._simulations[simulation_id] - + sim_dir = self._get_simulation_dir(simulation_id) state_file = os.path.join(sim_dir, "state.json") - + if not os.path.exists(state_file): return None - + with open(state_file, 'r', encoding='utf-8') as f: data = json.load(f) - + state = SimulationState( simulation_id=simulation_id, project_id=data.get("project_id", ""), @@ -187,10 +186,10 @@ class SimulationManager: updated_at=data.get("updated_at", datetime.now().isoformat()), error=data.get("error"), ) - + self._simulations[simulation_id] = state return state - + def create_simulation( self, project_id: str, @@ -198,21 +197,20 @@ class SimulationManager: enable_twitter: bool = True, enable_reddit: bool = True, ) -> SimulationState: - """ - 创建新的模拟 - + """Create a new simulation in the ``CREATED`` state. + Args: - project_id: 项目ID - graph_id: Zep图谱ID - enable_twitter: 是否启用Twitter模拟 - enable_reddit: 是否启用Reddit模拟 - + project_id: Owning project id. + graph_id: Source Zep graph id. + enable_twitter: When ``True``, the Twitter simulation runs. + enable_reddit: When ``True``, the Reddit simulation runs. + Returns: - SimulationState + The created ``SimulationState``. """ import uuid simulation_id = f"sim_{uuid.uuid4().hex[:12]}" - + state = SimulationState( simulation_id=simulation_id, project_id=project_id, @@ -221,12 +219,12 @@ class SimulationManager: enable_reddit=enable_reddit, status=SimulationStatus.CREATED, ) - + self._save_simulation_state(state) logger.info(t("log.simulation_manager.m001", simulation_id=simulation_id, project_id=project_id, graph_id=graph_id)) - + return state - + def prepare_simulation( self, simulation_id: str, @@ -237,56 +235,55 @@ class SimulationManager: progress_callback: Optional[callable] = None, parallel_profile_count: int = 3 ) -> SimulationState: - """ - 准备模拟环境(全程自动化) - - 步骤: - 1. 从Zep图谱读取并过滤实体 - 2. 为每个实体生成OASIS Agent Profile(可选LLM增强,支持并行) - 3. 使用LLM智能生成模拟配置参数(时间、活跃度、发言频率等) - 4. 保存配置文件和Profile文件 - 5. 复制预设脚本到模拟目录 - + """Prepare the simulation environment end-to-end. + + Steps: + 1. Read and filter entities from the graph. + 2. Generate OASIS agent profiles (optional LLM enrichment, parallel-capable). + 3. Use the LLM to produce simulation parameters (timing, activity, posting frequency). + 4. Save the configuration and profile files. + 5. Copy preset scripts into the simulation directory. + Args: - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述(用于LLM生成配置) - document_text: 原始文档内容(用于LLM理解背景) - defined_entity_types: 预定义的实体类型(可选) - use_llm_for_profiles: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (stage, progress, message) - parallel_profile_count: 并行生成人设的数量,默认3 - + simulation_id: Simulation id. + simulation_requirement: Free-text description of the simulation goal. + document_text: Raw source document text passed to the LLM for context. + defined_entity_types: Optional list of allowed entity types. + use_llm_for_profiles: When ``True``, enrich profiles via the LLM. + progress_callback: Optional callback ``(stage, progress, message, **extras)``. + parallel_profile_count: Number of profile generations to run in parallel. + Returns: - SimulationState + The updated ``SimulationState``. """ state = self._load_simulation_state(simulation_id) if not state: raise ValueError(f"模拟不存在: {simulation_id}") - + try: state.status = SimulationStatus.PREPARING self._save_simulation_state(state) - + sim_dir = self._get_simulation_dir(simulation_id) - - # ========== 阶段1: 读取并过滤实体 ========== + + # ========== Stage 1: read and filter entities ========== if progress_callback: progress_callback("reading", 0, t('progress.connectingZepGraph')) - + reader = ZepEntityReader() - + if progress_callback: progress_callback("reading", 30, t('progress.readingNodeData')) - + filtered = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=defined_entity_types, enrich_with_edges=True ) - + state.entities_count = filtered.filtered_count state.entity_types = list(filtered.entity_types) - + if progress_callback: progress_callback( "reading", 100, @@ -294,16 +291,16 @@ class SimulationManager: current=filtered.filtered_count, total=filtered.filtered_count ) - + if filtered.filtered_count == 0: state.status = SimulationStatus.FAILED state.error = "没有找到符合条件的实体,请检查图谱是否正确构建" self._save_simulation_state(state) return state - - # ========== 阶段2: 生成Agent Profile ========== + + # ========== Stage 2: generate agent profiles ========== total_entities = len(filtered.entities) - + if progress_callback: progress_callback( "generating_profiles", 0, @@ -311,22 +308,22 @@ class SimulationManager: current=0, total=total_entities ) - - # 传入graph_id以启用Zep检索功能,获取更丰富的上下文 + + # Pass the graph_id so the generator can use Zep retrieval for richer context. generator = OasisProfileGenerator(graph_id=state.graph_id) - + def profile_progress(current, total, msg): if progress_callback: progress_callback( - "generating_profiles", - int(current / total * 100), + "generating_profiles", + int(current / total * 100), msg, current=current, total=total, item_name=msg ) - - # 设置实时保存的文件路径(优先使用 Reddit JSON 格式) + + # Configure the realtime save target (prefer Reddit JSON if Reddit is enabled). realtime_output_path = None realtime_platform = "reddit" if state.enable_reddit: @@ -335,21 +332,21 @@ class SimulationManager: elif state.enable_twitter: realtime_output_path = os.path.join(sim_dir, "twitter_profiles.csv") realtime_platform = "twitter" - + profiles = generator.generate_profiles_from_entities( entities=filtered.entities, use_llm=use_llm_for_profiles, progress_callback=profile_progress, - graph_id=state.graph_id, # 传入graph_id用于Zep检索 - parallel_count=parallel_profile_count, # 并行生成数量 - realtime_output_path=realtime_output_path, # 实时保存路径 - output_platform=realtime_platform # 输出格式 + graph_id=state.graph_id, # used for Zep retrieval enrichment + parallel_count=parallel_profile_count, + realtime_output_path=realtime_output_path, + output_platform=realtime_platform ) - + state.profiles_count = len(profiles) - - # 保存Profile文件(注意:Twitter使用CSV格式,Reddit使用JSON格式) - # Reddit 已经在生成过程中实时保存了,这里再保存一次确保完整性 + + # Save profile files. Reddit also writes JSON during generation; this is + # a final consistency write. Twitter requires CSV per OASIS conventions. if progress_callback: progress_callback( "generating_profiles", 95, @@ -357,22 +354,22 @@ class SimulationManager: current=total_entities, total=total_entities ) - + if state.enable_reddit: generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "reddit_profiles.json"), platform="reddit" ) - + if state.enable_twitter: - # Twitter使用CSV格式!这是OASIS的要求 + # Twitter uses CSV format — required by OASIS. generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "twitter_profiles.csv"), platform="twitter" ) - + if progress_callback: progress_callback( "generating_profiles", 100, @@ -380,8 +377,8 @@ class SimulationManager: current=len(profiles), total=len(profiles) ) - - # ========== 阶段3: LLM智能生成模拟配置 ========== + + # ========== Stage 3: LLM-driven simulation config ========== if progress_callback: progress_callback( "generating_config", 0, @@ -389,9 +386,9 @@ class SimulationManager: current=0, total=3 ) - + config_generator = SimulationConfigGenerator() - + if progress_callback: progress_callback( "generating_config", 30, @@ -399,7 +396,7 @@ class SimulationManager: current=1, total=3 ) - + sim_params = config_generator.generate_config( simulation_id=simulation_id, project_id=state.project_id, @@ -410,7 +407,7 @@ class SimulationManager: enable_twitter=state.enable_twitter, enable_reddit=state.enable_reddit ) - + if progress_callback: progress_callback( "generating_config", 70, @@ -418,15 +415,15 @@ class SimulationManager: current=2, total=3 ) - - # 保存配置文件 + + # Save the configuration file. config_path = os.path.join(sim_dir, "simulation_config.json") with open(config_path, 'w', encoding='utf-8') as f: f.write(sim_params.to_json()) - + state.config_generated = True state.config_reasoning = sim_params.generation_reasoning - + if progress_callback: progress_callback( "generating_config", 100, @@ -434,18 +431,17 @@ class SimulationManager: current=3, total=3 ) - - # 注意:运行脚本保留在 backend/scripts/ 目录,不再复制到模拟目录 - # 启动模拟时,simulation_runner 会从 scripts/ 目录运行脚本 - - # 更新状态 + + # The runtime scripts now live under backend/scripts/; we no longer copy + # them per-simulation. simulation_runner invokes them in place. + state.status = SimulationStatus.READY self._save_simulation_state(state) - + logger.info(t("log.simulation_manager.m002", simulation_id=simulation_id, state=state.entities_count, state_2=state.profiles_count)) - + return state - + except Exception as e: logger.error(t("log.simulation_manager.m003", simulation_id=simulation_id, str=str(e))) import traceback @@ -454,61 +450,61 @@ class SimulationManager: state.error = str(e) self._save_simulation_state(state) raise - + def get_simulation(self, simulation_id: str) -> Optional[SimulationState]: - """获取模拟状态""" + """Return the simulation's state, or ``None`` if unknown.""" return self._load_simulation_state(simulation_id) - + def list_simulations(self, project_id: Optional[str] = None) -> List[SimulationState]: - """列出所有模拟""" + """List all simulations, optionally filtered by ``project_id``.""" simulations = [] - + if os.path.exists(self.SIMULATION_DATA_DIR): for sim_id in os.listdir(self.SIMULATION_DATA_DIR): - # 跳过隐藏文件(如 .DS_Store)和非目录文件 + # Skip dotfiles (e.g. .DS_Store) and non-directories. sim_path = os.path.join(self.SIMULATION_DATA_DIR, sim_id) if sim_id.startswith('.') or not os.path.isdir(sim_path): continue - + state = self._load_simulation_state(sim_id) if state: if project_id is None or state.project_id == project_id: simulations.append(state) - + return simulations - + def get_profiles(self, simulation_id: str, platform: str = "reddit") -> List[Dict[str, Any]]: - """获取模拟的Agent Profile""" + """Return the persisted agent profiles for a platform.""" state = self._load_simulation_state(simulation_id) if not state: raise ValueError(f"模拟不存在: {simulation_id}") - + sim_dir = self._get_simulation_dir(simulation_id) profile_path = os.path.join(sim_dir, f"{platform}_profiles.json") - + if not os.path.exists(profile_path): return [] - + with open(profile_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_simulation_config(self, simulation_id: str) -> Optional[Dict[str, Any]]: - """获取模拟配置""" + """Return the persisted simulation config dict, or ``None`` if absent.""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") - + if not os.path.exists(config_path): return None - + with open(config_path, 'r', encoding='utf-8') as f: return json.load(f) - + def get_run_instructions(self, simulation_id: str) -> Dict[str, str]: - """获取运行说明""" + """Return shell commands and instructions to launch the simulation manually.""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - + return { "simulation_dir": sim_dir, "scripts_dir": scripts_dir, diff --git a/backend/app/services/simulation_runner.py b/backend/app/services/simulation_runner.py index 3afd2278..524f7446 100644 --- a/backend/app/services/simulation_runner.py +++ b/backend/app/services/simulation_runner.py @@ -1,6 +1,7 @@ """ -OASIS模拟运行器 -在后台运行模拟并记录每个Agent的动作,支持实时状态监控 +OASIS simulation runner. + +Runs the simulation in the background, records each agent's actions, and supports real-time status monitoring. """ import os @@ -26,15 +27,14 @@ from .simulation_ipc import SimulationIPCClient, CommandType, IPCResponse logger = get_logger('mirofish.simulation_runner') -# 标记是否已注册清理函数 +# Tracks whether the cleanup handler has been registered (guards against double registration in Flask reloader). _cleanup_registered = False -# 平台检测 IS_WINDOWS = sys.platform == 'win32' class RunnerStatus(str, Enum): - """运行器状态""" + """Runner lifecycle states.""" IDLE = "idle" STARTING = "starting" RUNNING = "running" @@ -47,7 +47,7 @@ class RunnerStatus(str, Enum): @dataclass class AgentAction: - """Agent动作记录""" + """A single recorded agent action.""" round_num: int timestamp: str platform: str # twitter / reddit @@ -74,7 +74,7 @@ class AgentAction: @dataclass class RoundSummary: - """每轮摘要""" + """Per-round summary statistics.""" round_num: int start_time: str end_time: Optional[str] = None @@ -100,52 +100,47 @@ class RoundSummary: @dataclass class SimulationRunState: - """模拟运行状态(实时)""" + """Live runtime state for a simulation.""" simulation_id: str runner_status: RunnerStatus = RunnerStatus.IDLE - - # 进度信息 + current_round: int = 0 total_rounds: int = 0 simulated_hours: int = 0 total_simulation_hours: int = 0 - - # 各平台独立轮次和模拟时间(用于双平台并行显示) + + # Per-platform round and simulated-time counters (used when both platforms run in parallel). twitter_current_round: int = 0 reddit_current_round: int = 0 twitter_simulated_hours: int = 0 reddit_simulated_hours: int = 0 - - # 平台状态 + twitter_running: bool = False reddit_running: bool = False twitter_actions_count: int = 0 reddit_actions_count: int = 0 - - # 平台完成状态(通过检测 actions.jsonl 中的 simulation_end 事件) + + # Per-platform completion flags, set when a simulation_end event is observed in actions.jsonl. twitter_completed: bool = False reddit_completed: bool = False - - # 每轮摘要 + rounds: List[RoundSummary] = field(default_factory=list) - - # 最近动作(用于前端实时展示) + + # Recent actions buffer; surfaced to the frontend for the live feed. recent_actions: List[AgentAction] = field(default_factory=list) max_recent_actions: int = 50 - - # 时间戳 + started_at: Optional[str] = None updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) completed_at: Optional[str] = None - - # 错误信息 + error: Optional[str] = None - - # 进程ID(用于停止) + + # Main subprocess PID — captured so the process can later be stopped. process_pid: Optional[int] = None - + def add_action(self, action: AgentAction): - """添加动作到最近动作列表""" + """Prepend an action to the recent-actions buffer and update counters.""" self.recent_actions.insert(0, action) if len(self.recent_actions) > self.max_recent_actions: self.recent_actions = self.recent_actions[:self.max_recent_actions] @@ -166,7 +161,7 @@ class SimulationRunState: "simulated_hours": self.simulated_hours, "total_simulation_hours": self.total_simulation_hours, "progress_percent": round(self.current_round / max(self.total_rounds, 1) * 100, 1), - # 各平台独立轮次和时间 + # Per-platform round and simulated-time counters. "twitter_current_round": self.twitter_current_round, "reddit_current_round": self.reddit_current_round, "twitter_simulated_hours": self.twitter_simulated_hours, @@ -186,7 +181,7 @@ class SimulationRunState: } def to_detail_dict(self) -> Dict[str, Any]: - """包含最近动作的详细信息""" + """Return the dict form of the state including recent actions.""" result = self.to_dict() result["recent_actions"] = [a.to_dict() for a in self.recent_actions] result["rounds_count"] = len(self.rounds) @@ -195,53 +190,50 @@ class SimulationRunState: class SimulationRunner: """ - 模拟运行器 - - 负责: - 1. 在后台进程中运行OASIS模拟 - 2. 解析运行日志,记录每个Agent的动作 - 3. 提供实时状态查询接口 - 4. 支持暂停/停止/恢复操作 + Simulation runner. + + Responsibilities: + 1. Run the OASIS simulation in a background subprocess. + 2. Parse the run logs and record each agent's actions. + 3. Provide real-time status query interfaces. + 4. Support pause/stop/resume operations. """ - - # 运行状态存储目录 + RUN_STATE_DIR = os.path.join( os.path.dirname(__file__), '../../uploads/simulations' ) - - # 脚本目录 + SCRIPTS_DIR = os.path.join( os.path.dirname(__file__), '../../scripts' ) - - # 内存中的运行状态 + + # In-memory caches of runtime state, processes, queues, monitor threads, and log file handles. _run_states: Dict[str, SimulationRunState] = {} _processes: Dict[str, subprocess.Popen] = {} _action_queues: Dict[str, Queue] = {} _monitor_threads: Dict[str, threading.Thread] = {} - _stdout_files: Dict[str, Any] = {} # 存储 stdout 文件句柄 - _stderr_files: Dict[str, Any] = {} # 存储 stderr 文件句柄 - - # 图谱记忆更新配置 - _graph_memory_enabled: Dict[str, bool] = {} # simulation_id -> enabled - + _stdout_files: Dict[str, Any] = {} + _stderr_files: Dict[str, Any] = {} + + # Graph-memory-update flag per simulation_id. + _graph_memory_enabled: Dict[str, bool] = {} + @classmethod def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """获取运行状态""" + """Return the cached run state, falling back to disk if not loaded yet.""" if simulation_id in cls._run_states: return cls._run_states[simulation_id] - - # 尝试从文件加载 + state = cls._load_run_state(simulation_id) if state: cls._run_states[simulation_id] = state return state - + @classmethod def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """从文件加载运行状态""" + """Load run state from the on-disk JSON snapshot.""" state_file = os.path.join(cls.RUN_STATE_DIR, simulation_id, "run_state.json") if not os.path.exists(state_file): return None @@ -257,7 +249,7 @@ class SimulationRunner: total_rounds=data.get("total_rounds", 0), simulated_hours=data.get("simulated_hours", 0), total_simulation_hours=data.get("total_simulation_hours", 0), - # 各平台独立轮次和时间 + # Per-platform round and simulated-time counters. twitter_current_round=data.get("twitter_current_round", 0), reddit_current_round=data.get("reddit_current_round", 0), twitter_simulated_hours=data.get("twitter_simulated_hours", 0), @@ -275,7 +267,7 @@ class SimulationRunner: process_pid=data.get("process_pid"), ) - # 加载最近动作 + # Restore the recent-actions buffer. actions_data = data.get("recent_actions", []) for a in actions_data: state.recent_actions.append(AgentAction( @@ -297,7 +289,7 @@ class SimulationRunner: @classmethod def _save_run_state(cls, state: SimulationRunState): - """保存运行状态到文件""" + """Persist the run state to its JSON snapshot file.""" sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) os.makedirs(sim_dir, exist_ok=True) state_file = os.path.join(sim_dir, "run_state.json") @@ -314,29 +306,29 @@ class SimulationRunner: cls, simulation_id: str, platform: str = "parallel", # twitter / reddit / parallel - max_rounds: int = None, # 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: bool = False, # 是否将活动更新到Zep图谱 - graph_id: str = None # Zep图谱ID(启用图谱更新时必需) + max_rounds: int = None, # Optional cap on simulation rounds (truncates overly long runs). + enable_graph_memory_update: bool = False, # Whether to push activity into the Zep graph. + graph_id: str = None # Zep graph ID (required when graph-memory updates are enabled). ) -> SimulationRunState: """ - 启动模拟 - + Start the simulation. + Args: - simulation_id: 模拟ID - platform: 运行平台 (twitter/reddit/parallel) - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: 是否将Agent活动动态更新到Zep图谱 - graph_id: Zep图谱ID(启用图谱更新时必需) - + simulation_id: Simulation ID. + platform: Platform to run (twitter/reddit/parallel). + max_rounds: Optional cap on simulation rounds (truncates overly long runs). + enable_graph_memory_update: Whether to push agent activity to the Zep graph in real time. + graph_id: Zep graph ID (required when graph-memory updates are enabled). + Returns: SimulationRunState """ - # 检查是否已在运行 + # Refuse to start a duplicate run for the same simulation_id. existing = cls.get_run_state(simulation_id) if existing and existing.runner_status in [RunnerStatus.RUNNING, RunnerStatus.STARTING]: raise ValueError(f"模拟已在运行中: {simulation_id}") - # 加载模拟配置 + # Load the simulation configuration written during preparation. sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") @@ -346,13 +338,13 @@ class SimulationRunner: with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) - # 初始化运行状态 + # Compute total rounds from time-window settings. time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = int(total_hours * 60 / minutes_per_round) - # 如果指定了最大轮数,则截断 + # If a cap was provided, clamp total_rounds. if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) @@ -369,7 +361,7 @@ class SimulationRunner: cls._save_run_state(state) - # 如果启用图谱记忆更新,创建更新器 + # Spin up a graph-memory updater if requested. if enable_graph_memory_update: if not graph_id: raise ValueError("启用图谱记忆更新时必须提供 graph_id") @@ -384,7 +376,7 @@ class SimulationRunner: else: cls._graph_memory_enabled[simulation_id] = False - # 确定运行哪个脚本(脚本位于 backend/scripts/ 目录) + # Pick the entry script (lives in backend/scripts/) based on the requested platform. if platform == "twitter": script_name = "run_twitter_simulation.py" state.twitter_running = True @@ -401,55 +393,52 @@ class SimulationRunner: if not os.path.exists(script_path): raise ValueError(f"脚本不存在: {script_path}") - # 创建动作队列 action_queue = Queue() cls._action_queues[simulation_id] = action_queue - - # 启动模拟进程 + try: - # 构建运行命令,使用完整路径 - # 新的日志结构: - # twitter/actions.jsonl - Twitter 动作日志 - # reddit/actions.jsonl - Reddit 动作日志 - # simulation.log - 主进程日志 - + # Log layout written by the subprocess: + # twitter/actions.jsonl - Twitter action log + # reddit/actions.jsonl - Reddit action log + # simulation.log - main-process log + cmd = [ - sys.executable, # Python解释器 + sys.executable, script_path, - "--config", config_path, # 使用完整配置文件路径 + "--config", config_path, ] - - # 如果指定了最大轮数,添加到命令行参数 + if max_rounds is not None and max_rounds > 0: cmd.extend(["--max-rounds", str(max_rounds)]) - - # 创建主日志文件,避免 stdout/stderr 管道缓冲区满导致进程阻塞 + + # Redirect stdout/stderr to a file so a full pipe buffer cannot block the subprocess. main_log_path = os.path.join(sim_dir, "simulation.log") main_log_file = open(main_log_path, 'w', encoding='utf-8') - - # 设置子进程环境变量,确保 Windows 上使用 UTF-8 编码 - # 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 + + # Force UTF-8 in the child so third-party libs (e.g. OASIS) that open files without an + # explicit encoding work correctly on Windows. env = os.environ.copy() - env['PYTHONUTF8'] = '1' # Python 3.7+ 支持,让所有 open() 默认使用 UTF-8 - env['PYTHONIOENCODING'] = 'utf-8' # 确保 stdout/stderr 使用 UTF-8 - - # 设置工作目录为模拟目录(数据库等文件会生成在此) - # 使用 start_new_session=True 创建新的进程组,确保可以通过 os.killpg 终止所有子进程 + env['PYTHONUTF8'] = '1' + env['PYTHONIOENCODING'] = 'utf-8' + + # cwd is the simulation directory so generated artifacts (databases, etc.) land there. + # start_new_session=True creates a fresh process group so os.killpg can terminate the + # entire tree on shutdown. process = subprocess.Popen( cmd, cwd=sim_dir, stdout=main_log_file, - stderr=subprocess.STDOUT, # stderr 也写入同一个文件 + stderr=subprocess.STDOUT, text=True, - encoding='utf-8', # 显式指定编码 + encoding='utf-8', bufsize=1, - env=env, # 传递带有 UTF-8 设置的环境变量 - start_new_session=True, # 创建新进程组,确保服务器关闭时能终止所有相关进程 + env=env, + start_new_session=True, ) - - # 保存文件句柄以便后续关闭 + + # Retain the log file handle so it can be closed after the subprocess exits. cls._stdout_files[simulation_id] = main_log_file - cls._stderr_files[simulation_id] = None # 不再需要单独的 stderr + cls._stderr_files[simulation_id] = None state.process_pid = process.pid state.runner_status = RunnerStatus.RUNNING @@ -459,7 +448,7 @@ class SimulationRunner: # Capture locale before spawning monitor thread current_locale = get_locale() - # 启动监控线程 + # Spawn the log-tailing monitor thread. monitor_thread = threading.Thread( target=cls._monitor_simulation, args=(simulation_id, current_locale), @@ -480,11 +469,10 @@ class SimulationRunner: @classmethod def _monitor_simulation(cls, simulation_id: str, locale: str = 'zh'): - """监控模拟进程,解析动作日志""" + """Monitor the simulation process and tail its per-platform action logs.""" set_locale(locale) sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - - # 新的日志结构:分平台的动作日志 + twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_actions_log = os.path.join(sim_dir, "reddit", "actions.jsonl") @@ -498,30 +486,26 @@ class SimulationRunner: reddit_position = 0 try: - while process.poll() is None: # 进程仍在运行 - # 读取 Twitter 动作日志 + while process.poll() is None: if os.path.exists(twitter_actions_log): twitter_position = cls._read_action_log( twitter_actions_log, twitter_position, state, "twitter" ) - - # 读取 Reddit 动作日志 + if os.path.exists(reddit_actions_log): reddit_position = cls._read_action_log( reddit_actions_log, reddit_position, state, "reddit" ) - - # 更新状态 + cls._save_run_state(state) time.sleep(2) - - # 进程结束后,最后读取一次日志 + + # Drain any log lines written between the last poll and the process exit. if os.path.exists(twitter_actions_log): cls._read_action_log(twitter_actions_log, twitter_position, state, "twitter") if os.path.exists(reddit_actions_log): cls._read_action_log(reddit_actions_log, reddit_position, state, "reddit") - - # 进程结束 + exit_code = process.returncode if exit_code == 0: @@ -530,13 +514,13 @@ class SimulationRunner: logger.info(t("log.simulation_runner.m006", simulation_id=simulation_id)) else: state.runner_status = RunnerStatus.FAILED - # 从主日志文件读取错误信息 + # Pull the tail of the main log so the failure context is surfaced in state.error. main_log_path = os.path.join(sim_dir, "simulation.log") error_info = "" try: if os.path.exists(main_log_path): with open(main_log_path, 'r', encoding='utf-8') as f: - error_info = f.read()[-2000:] # 取最后2000字符 + error_info = f.read()[-2000:] # keep only the last 2000 chars except Exception: pass state.error = f"进程退出码: {exit_code}, 错误: {error_info}" @@ -553,7 +537,7 @@ class SimulationRunner: cls._save_run_state(state) finally: - # 停止图谱记忆更新器 + # Tear down the graph-memory updater, if we started one. if cls._graph_memory_enabled.get(simulation_id, False): try: ZepGraphMemoryManager.stop_updater(simulation_id) @@ -561,12 +545,11 @@ class SimulationRunner: except Exception as e: logger.error(t("log.simulation_runner.m010", e=e)) cls._graph_memory_enabled.pop(simulation_id, None) - - # 清理进程资源 + cls._processes.pop(simulation_id, None) cls._action_queues.pop(simulation_id, None) - - # 关闭日志文件句柄 + + # Close the retained log file handles. if simulation_id in cls._stdout_files: try: cls._stdout_files[simulation_id].close() @@ -589,18 +572,17 @@ class SimulationRunner: platform: str ) -> int: """ - 读取动作日志文件 - + Read new entries from a per-platform action log. + Args: - log_path: 日志文件路径 - position: 上次读取位置 - state: 运行状态对象 - platform: 平台名称 (twitter/reddit) - + log_path: Path to the action-log file. + position: Byte offset where the previous read finished. + state: Run-state object to mutate. + platform: Platform name (twitter/reddit). + Returns: - 新的读取位置 + New byte offset after this read. """ - # 检查是否启用了图谱记忆更新 graph_memory_enabled = cls._graph_memory_enabled.get(state.simulation_id, False) graph_updater = None if graph_memory_enabled: @@ -614,12 +596,12 @@ class SimulationRunner: if line: try: action_data = json.loads(line) - - # 处理事件类型的条目 + + # Event records (simulation_start/end, round_end, ...) are routed here. if "event_type" in action_data: event_type = action_data.get("event_type") - - # 检测 simulation_end 事件,标记平台已完成 + + # simulation_end means the platform finished its run. if event_type == "simulation_end": if platform == "twitter": state.twitter_completed = True @@ -630,21 +612,19 @@ class SimulationRunner: state.reddit_running = False logger.info(t("log.simulation_runner.m012", state=state.simulation_id, action_data=action_data.get('total_rounds'), action_data_2=action_data.get('total_actions'))) - # 检查是否所有启用的平台都已完成 - # 如果只运行了一个平台,只检查那个平台 - # 如果运行了两个平台,需要两个都完成 + # Mark the run as completed once every enabled platform has reported + # simulation_end. Single-platform runs only need that one. all_completed = cls._check_all_platforms_completed(state) if all_completed: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() logger.info(t("log.simulation_runner.m013", state=state.simulation_id)) - # 更新轮次信息(从 round_end 事件) + # Round counters come from round_end events. elif event_type == "round_end": round_num = action_data.get("round", 0) simulated_hours = action_data.get("simulated_hours", 0) - - # 更新各平台独立的轮次和时间 + if platform == "twitter": if round_num > state.twitter_current_round: state.twitter_current_round = round_num @@ -653,13 +633,12 @@ class SimulationRunner: if round_num > state.reddit_current_round: state.reddit_current_round = round_num state.reddit_simulated_hours = simulated_hours - - # 总体轮次取两个平台的最大值 + + # Overall counters track the max across enabled platforms. if round_num > state.current_round: state.current_round = round_num - # 总体时间取两个平台的最大值 state.simulated_hours = max(state.twitter_simulated_hours, state.reddit_simulated_hours) - + continue action = AgentAction( @@ -674,12 +653,11 @@ class SimulationRunner: success=action_data.get("success", True), ) state.add_action(action) - - # 更新轮次 + if action.round_num and action.round_num > state.current_round: state.current_round = action.round_num - - # 如果启用了图谱记忆更新,将活动发送到Zep + + # Forward the activity to the Zep graph when the updater is enabled. if graph_updater: graph_updater.add_activity_from_dict(action_data, platform) @@ -693,46 +671,44 @@ class SimulationRunner: @classmethod def _check_all_platforms_completed(cls, state: SimulationRunState) -> bool: """ - 检查所有启用的平台是否都已完成模拟 - - 通过检查对应的 actions.jsonl 文件是否存在来判断平台是否被启用 - + Return whether every enabled platform has completed its simulation. + + A platform counts as enabled when its corresponding actions.jsonl file exists on disk. + Returns: - True 如果所有启用的平台都已完成 + True if all enabled platforms have completed. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) twitter_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_log = os.path.join(sim_dir, "reddit", "actions.jsonl") - - # 检查哪些平台被启用(通过文件是否存在判断) + + # File presence is our enabled-platform signal. twitter_enabled = os.path.exists(twitter_log) reddit_enabled = os.path.exists(reddit_log) - - # 如果平台被启用但未完成,则返回 False + if twitter_enabled and not state.twitter_completed: return False if reddit_enabled and not state.reddit_completed: return False - - # 至少有一个平台被启用且已完成 + + # At least one platform must be enabled (and, by the checks above, completed). return twitter_enabled or reddit_enabled @classmethod def _terminate_process(cls, process: subprocess.Popen, simulation_id: str, timeout: int = 10): """ - 跨平台终止进程及其子进程 - + Terminate a process and its subprocesses in a cross-platform way. + Args: - process: 要终止的进程 - simulation_id: 模拟ID(用于日志) - timeout: 等待进程退出的超时时间(秒) + process: Process to terminate. + simulation_id: Simulation ID (used for log messages). + timeout: Seconds to wait for graceful exit before escalating. """ if IS_WINDOWS: - # Windows: 使用 taskkill 命令终止进程树 - # /F = 强制终止, /T = 终止进程树(包括子进程) + # Windows: taskkill /T tears down the whole process tree, /F escalates to a hard kill. logger.info(t("log.simulation_runner.m015", simulation_id=simulation_id, process=process.pid)) try: - # 先尝试优雅终止 + # Graceful termination first. subprocess.run( ['taskkill', '/PID', str(process.pid), '/T'], capture_output=True, @@ -741,7 +717,7 @@ class SimulationRunner: try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 强制终止 + # Force kill the tree. logger.warning(t("log.simulation_runner.m016", simulation_id=simulation_id)) subprocess.run( ['taskkill', '/F', '/PID', str(process.pid), '/T'], @@ -757,25 +733,25 @@ class SimulationRunner: except subprocess.TimeoutExpired: process.kill() else: - # Unix: 使用进程组终止 - # 由于使用了 start_new_session=True,进程组 ID 等于主进程 PID + # Unix: kill the entire process group. + # Because the subprocess was started with start_new_session=True the pgid equals the PID. pgid = os.getpgid(process.pid) logger.info(t("log.simulation_runner.m018", simulation_id=simulation_id, pgid=pgid)) - - # 先发送 SIGTERM 给整个进程组 + + # SIGTERM first to allow graceful shutdown. os.killpg(pgid, signal.SIGTERM) - + try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 如果超时后还没结束,强制发送 SIGKILL + # Escalate to SIGKILL on timeout. logger.warning(t("log.simulation_runner.m019", simulation_id=simulation_id)) os.killpg(pgid, signal.SIGKILL) process.wait(timeout=5) @classmethod def stop_simulation(cls, simulation_id: str) -> SimulationRunState: - """停止模拟""" + """Stop the simulation subprocess and update its state.""" state = cls.get_run_state(simulation_id) if not state: raise ValueError(f"模拟不存在: {simulation_id}") @@ -786,17 +762,16 @@ class SimulationRunner: state.runner_status = RunnerStatus.STOPPING cls._save_run_state(state) - # 终止进程 process = cls._processes.get(simulation_id) if process and process.poll() is None: try: cls._terminate_process(process, simulation_id) except ProcessLookupError: - # 进程已经不存在 + # The process has already exited. pass except Exception as e: logger.error(t("log.simulation_runner.m020", simulation_id=simulation_id, e=e)) - # 回退到直接终止进程 + # Fall back to direct termination on the Popen handle. try: process.terminate() process.wait(timeout=5) @@ -808,8 +783,8 @@ class SimulationRunner: state.reddit_running = False state.completed_at = datetime.now().isoformat() cls._save_run_state(state) - - # 停止图谱记忆更新器 + + # Tear down the graph-memory updater, if any. if cls._graph_memory_enabled.get(simulation_id, False): try: ZepGraphMemoryManager.stop_updater(simulation_id) @@ -831,14 +806,14 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 从单个动作文件中读取动作 - + Read actions from a single action-log file. + Args: - file_path: 动作日志文件路径 - default_platform: 默认平台(当动作记录中没有 platform 字段时使用) - platform_filter: 过滤平台 - agent_id: 过滤 Agent ID - round_num: 过滤轮次 + file_path: Path to the action-log file. + default_platform: Platform to assume when a record has no `platform` field. + platform_filter: Optional platform filter. + agent_id: Optional agent-id filter. + round_num: Optional round-number filter. """ if not os.path.exists(file_path): return [] @@ -853,19 +828,18 @@ class SimulationRunner: try: data = json.loads(line) - - # 跳过非动作记录(如 simulation_start, round_start, round_end 等事件) + + # Skip event records (simulation_start, round_start, round_end, ...). if "event_type" in data: continue - - # 跳过没有 agent_id 的记录(非 Agent 动作) + + # Skip records without an agent_id (non-agent actions). if "agent_id" not in data: continue - - # 获取平台:优先使用记录中的 platform,否则使用默认平台 + + # Prefer the record's own platform; fall back to the default for legacy entries. record_platform = data.get("platform") or default_platform or "" - - # 过滤 + if platform_filter and record_platform != platform_filter: continue if agent_id is not None and data.get("agent_id") != agent_id: @@ -899,54 +873,54 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取所有平台的完整动作历史(无分页限制) - + Return the complete action history across all platforms (no pagination). + Args: - simulation_id: 模拟ID - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent - round_num: 过滤轮次 - + simulation_id: Simulation ID. + platform: Optional platform filter (twitter/reddit). + agent_id: Optional agent filter. + round_num: Optional round filter. + Returns: - 完整的动作列表(按时间戳排序,新的在前) + Full action list, sorted by timestamp with newest first. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) actions = [] - - # 读取 Twitter 动作文件(根据文件路径自动设置 platform 为 twitter) + + # Twitter action log: derive platform from the file path. twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") if not platform or platform == "twitter": actions.extend(cls._read_actions_from_file( twitter_actions_log, - default_platform="twitter", # 自动填充 platform 字段 + default_platform="twitter", platform_filter=platform, - agent_id=agent_id, + agent_id=agent_id, round_num=round_num )) - - # 读取 Reddit 动作文件(根据文件路径自动设置 platform 为 reddit) + + # Reddit action log: derive platform from the file path. reddit_actions_log = os.path.join(sim_dir, "reddit", "actions.jsonl") if not platform or platform == "reddit": actions.extend(cls._read_actions_from_file( reddit_actions_log, - default_platform="reddit", # 自动填充 platform 字段 + default_platform="reddit", platform_filter=platform, agent_id=agent_id, round_num=round_num )) - - # 如果分平台文件不存在,尝试读取旧的单一文件格式 + + # Fall back to the legacy single-file layout if no per-platform files exist. if not actions: actions_log = os.path.join(sim_dir, "actions.jsonl") actions = cls._read_actions_from_file( actions_log, - default_platform=None, # 旧格式文件中应该有 platform 字段 + default_platform=None, # Legacy files carry their own platform field. platform_filter=platform, agent_id=agent_id, round_num=round_num ) - - # 按时间戳排序(新的在前) + + # Newest-first by timestamp. actions.sort(key=lambda x: x.timestamp, reverse=True) return actions @@ -962,18 +936,18 @@ class SimulationRunner: round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取动作历史(带分页) - + Return action history with pagination. + Args: - simulation_id: 模拟ID - limit: 返回数量限制 - offset: 偏移量 - platform: 过滤平台 - agent_id: 过滤Agent - round_num: 过滤轮次 - + simulation_id: Simulation ID. + limit: Maximum number of actions to return. + offset: Offset into the sorted result list. + platform: Optional platform filter. + agent_id: Optional agent filter. + round_num: Optional round filter. + Returns: - 动作列表 + A page of actions. """ actions = cls.get_all_actions( simulation_id=simulation_id, @@ -981,8 +955,7 @@ class SimulationRunner: agent_id=agent_id, round_num=round_num ) - - # 分页 + return actions[offset:offset + limit] @classmethod @@ -993,19 +966,19 @@ class SimulationRunner: end_round: Optional[int] = None ) -> List[Dict[str, Any]]: """ - 获取模拟时间线(按轮次汇总) - + Return a per-round timeline summary for the simulation. + Args: - simulation_id: 模拟ID - start_round: 起始轮次 - end_round: 结束轮次 - + simulation_id: Simulation ID. + start_round: First round to include (inclusive). + end_round: Last round to include (inclusive); None means no upper bound. + Returns: - 每轮的汇总信息 + One summary entry per round. """ actions = cls.get_actions(simulation_id, limit=10000) - - # 按轮次分组 + + # Group actions by round. rounds: Dict[int, Dict[str, Any]] = {} for action in actions: @@ -1038,7 +1011,7 @@ class SimulationRunner: r["action_types"][action.action_type] = r["action_types"].get(action.action_type, 0) + 1 r["last_action_time"] = action.timestamp - # 转换为列表 + # Materialise into a sorted list. result = [] for round_num in sorted(rounds.keys()): r = rounds[round_num] @@ -1059,10 +1032,10 @@ class SimulationRunner: @classmethod def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]: """ - 获取每个Agent的统计信息 - + Return per-agent statistics for the simulation. + Returns: - Agent统计列表 + Per-agent statistics, sorted by total action count (descending). """ actions = cls.get_actions(simulation_id, limit=10000) @@ -1094,7 +1067,6 @@ class SimulationRunner: stats["action_types"][action.action_type] = stats["action_types"].get(action.action_type, 0) + 1 stats["last_action_time"] = action.timestamp - # 按总动作数排序 result = sorted(agent_stats.values(), key=lambda x: x["total_actions"], reverse=True) return result @@ -1102,25 +1074,25 @@ class SimulationRunner: @classmethod def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: """ - 清理模拟的运行日志(用于强制重新开始模拟) - - 会删除以下文件: + Clean up the simulation's run logs so the simulation can be force-restarted. + + Deletes the following files: - run_state.json - twitter/actions.jsonl - reddit/actions.jsonl - simulation.log - stdout.log / stderr.log - - twitter_simulation.db(模拟数据库) - - reddit_simulation.db(模拟数据库) - - env_status.json(环境状态) - - 注意:不会删除配置文件(simulation_config.json)和 profile 文件 - + - twitter_simulation.db (simulation database) + - reddit_simulation.db (simulation database) + - env_status.json (environment status) + + Note: simulation_config.json and the profile files are preserved. + Args: - simulation_id: 模拟ID - + simulation_id: Simulation ID. + Returns: - 清理结果信息 + Cleanup result info. """ import shutil @@ -1132,21 +1104,20 @@ class SimulationRunner: cleaned_files = [] errors = [] - # 要删除的文件列表(包括数据库文件) + # Files to delete (includes per-platform databases). files_to_delete = [ "run_state.json", "simulation.log", "stdout.log", "stderr.log", - "twitter_simulation.db", # Twitter 平台数据库 - "reddit_simulation.db", # Reddit 平台数据库 - "env_status.json", # 环境状态文件 + "twitter_simulation.db", # Twitter platform database. + "reddit_simulation.db", # Reddit platform database. + "env_status.json", # Environment-status file. ] - - # 要删除的目录列表(包含动作日志) + + # Per-platform directories whose action logs should be cleaned. dirs_to_clean = ["twitter", "reddit"] - - # 删除文件 + for filename in files_to_delete: file_path = os.path.join(sim_dir, filename) if os.path.exists(file_path): @@ -1155,8 +1126,8 @@ class SimulationRunner: cleaned_files.append(filename) except Exception as e: errors.append(f"删除 {filename} 失败: {str(e)}") - - # 清理平台目录中的动作日志 + + # Clean per-platform action logs. for dir_name in dirs_to_clean: dir_path = os.path.join(sim_dir, dir_name) if os.path.exists(dir_path): @@ -1168,7 +1139,7 @@ class SimulationRunner: except Exception as e: errors.append(f"删除 {dir_name}/actions.jsonl 失败: {str(e)}") - # 清理内存中的运行状态 + # Drop the in-memory run state for this simulation. if simulation_id in cls._run_states: del cls._run_states[simulation_id] @@ -1180,57 +1151,55 @@ class SimulationRunner: "errors": errors if errors else None } - # 防止重复清理的标志 + # Guard so cleanup_all_simulations only runs once per process lifetime. _cleanup_done = False - + @classmethod def cleanup_all_simulations(cls): """ - 清理所有运行中的模拟进程 - - 在服务器关闭时调用,确保所有子进程被终止 + Clean up every running simulation subprocess. + + Invoked at server shutdown to guarantee no child processes leak. """ - # 防止重复清理 if cls._cleanup_done: return cls._cleanup_done = True - - # 检查是否有内容需要清理(避免空进程的进程打印无用日志) + + # Skip the "shutting down" log entirely if there's nothing to clean up. has_processes = bool(cls._processes) has_updaters = bool(cls._graph_memory_enabled) - + if not has_processes and not has_updaters: - return # 没有需要清理的内容,静默返回 - + return + logger.info(t("log.simulation_runner.m025")) - - # 首先停止所有图谱记忆更新器(stop_all 内部会打印日志) + + # Stop graph-memory updaters first (stop_all logs internally). try: ZepGraphMemoryManager.stop_all() except Exception as e: logger.error(t("log.simulation_runner.m026", e=e)) cls._graph_memory_enabled.clear() - - # 复制字典以避免在迭代时修改 + + # Snapshot the process map so we can mutate it during iteration. processes = list(cls._processes.items()) - + for simulation_id, process in processes: try: - if process.poll() is None: # 进程仍在运行 + if process.poll() is None: logger.info(t("log.simulation_runner.m027", simulation_id=simulation_id, process=process.pid)) - + try: - # 使用跨平台的进程终止方法 cls._terminate_process(process, simulation_id, timeout=5) except (ProcessLookupError, OSError): - # 进程可能已经不存在,尝试直接终止 + # The process may already be gone; fall back to direct termination. try: process.terminate() process.wait(timeout=3) except Exception: process.kill() - - # 更新 run_state.json + + # Update run_state.json so external readers see the stopped status. state = cls.get_run_state(simulation_id) if state: state.runner_status = RunnerStatus.STOPPED @@ -1240,7 +1209,7 @@ class SimulationRunner: state.error = "服务器关闭,模拟被终止" cls._save_run_state(state) - # 同时更新 state.json,将状态设为 stopped + # Also flip the project-level state.json status to "stopped". try: sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) state_file = os.path.join(sim_dir, "state.json") @@ -1261,7 +1230,7 @@ class SimulationRunner: except Exception as e: logger.error(t("log.simulation_runner.m032", simulation_id=simulation_id, e=e)) - # 清理文件句柄 + # Close any retained log file handles. for simulation_id, file_handle in list(cls._stdout_files.items()): try: if file_handle: @@ -1278,7 +1247,7 @@ class SimulationRunner: pass cls._stderr_files.clear() - # 清理内存中的状态 + # Drop in-memory bookkeeping. cls._processes.clear() cls._action_queues.clear() @@ -1287,99 +1256,98 @@ class SimulationRunner: @classmethod def register_cleanup(cls): """ - 注册清理函数 - - 在 Flask 应用启动时调用,确保服务器关闭时清理所有模拟进程 + Register the shutdown cleanup hook. + + Called at Flask application startup so that all simulation subprocesses are torn down + when the server stops. """ global _cleanup_registered - + if _cleanup_registered: return - - # Flask debug 模式下,只在 reloader 子进程中注册清理(实际运行应用的进程) - # WERKZEUG_RUN_MAIN=true 表示是 reloader 子进程 - # 如果不是 debug 模式,则没有这个环境变量,也需要注册 + + # In Flask debug mode the reloader spawns a child process that actually runs the app + # (signaled by WERKZEUG_RUN_MAIN=true). Outside debug mode that variable is unset and we + # still want to register the cleanup hook. is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' is_debug_mode = os.environ.get('FLASK_DEBUG') == '1' or os.environ.get('WERKZEUG_RUN_MAIN') is not None - - # 在 debug 模式下,只在 reloader 子进程中注册;非 debug 模式下始终注册 + + # Debug mode: only register inside the reloader child. Non-debug: always register. if is_debug_mode and not is_reloader_process: - _cleanup_registered = True # 标记已注册,防止子进程再次尝试 + _cleanup_registered = True # Prevent the parent process from retrying. return - - # 保存原有的信号处理器 + + # Capture the previously installed signal handlers so we can chain to them. original_sigint = signal.getsignal(signal.SIGINT) original_sigterm = signal.getsignal(signal.SIGTERM) - # SIGHUP 只在 Unix 系统存在(macOS/Linux),Windows 没有 + # SIGHUP exists only on Unix (macOS/Linux); Windows does not have it. original_sighup = None has_sighup = hasattr(signal, 'SIGHUP') if has_sighup: original_sighup = signal.getsignal(signal.SIGHUP) - + def cleanup_handler(signum=None, frame=None): - """信号处理器:先清理模拟进程,再调用原处理器""" - # 只有在有进程需要清理时才打印日志 + """Signal handler that cleans up simulations before delegating to the original handler.""" + # Only log when there is actually something to clean up. if cls._processes or cls._graph_memory_enabled: logger.info(t("log.simulation_runner.m034", signum=signum)) cls.cleanup_all_simulations() - - # 调用原有的信号处理器,让 Flask 正常退出 + + # Chain to the original handler so Flask exits normally. if signum == signal.SIGINT and callable(original_sigint): original_sigint(signum, frame) elif signum == signal.SIGTERM and callable(original_sigterm): original_sigterm(signum, frame) elif has_sighup and signum == signal.SIGHUP: - # SIGHUP: 终端关闭时发送 + # SIGHUP is sent when the terminal is closed. if callable(original_sighup): original_sighup(signum, frame) else: - # 默认行为:正常退出 + # Default behavior: exit cleanly. sys.exit(0) else: - # 如果原处理器不可调用(如 SIG_DFL),则使用默认行为 + # If the original handler is not callable (e.g. SIG_DFL), use the default behavior. raise KeyboardInterrupt - - # 注册 atexit 处理器(作为备用) + + # Register the atexit handler as a fallback. atexit.register(cls.cleanup_all_simulations) - - # 注册信号处理器(仅在主线程中) + + # Register signal handlers (only valid from the main thread). try: - # SIGTERM: kill 命令默认信号 + # SIGTERM: default signal sent by `kill`. signal.signal(signal.SIGTERM, cleanup_handler) # SIGINT: Ctrl+C signal.signal(signal.SIGINT, cleanup_handler) - # SIGHUP: 终端关闭(仅 Unix 系统) + # SIGHUP: terminal close (Unix only). if has_sighup: signal.signal(signal.SIGHUP, cleanup_handler) except ValueError: - # 不在主线程中,只能使用 atexit + # Not the main thread — fall back to the atexit hook. logger.warning(t("log.simulation_runner.m035")) - + _cleanup_registered = True @classmethod def get_running_simulations(cls) -> List[str]: - """ - 获取所有正在运行的模拟ID列表 - """ + """Return a list of every simulation ID with a live subprocess.""" running = [] for sim_id, process in cls._processes.items(): if process.poll() is None: running.append(sim_id) return running - # ============== Interview 功能 ============== - + # ============== Interview feature ============== + @classmethod def check_env_alive(cls, simulation_id: str) -> bool: """ - 检查模拟环境是否存活(可以接收Interview命令) + Check whether the simulation environment is alive and able to receive interview commands. Args: - simulation_id: 模拟ID + simulation_id: Simulation ID. Returns: - True 表示环境存活,False 表示环境已关闭 + True if the environment is alive, False if it has shut down. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1391,13 +1359,13 @@ class SimulationRunner: @classmethod def get_env_status_detail(cls, simulation_id: str) -> Dict[str, Any]: """ - 获取模拟环境的详细状态信息 + Return detailed status info for the simulation environment. Args: - simulation_id: 模拟ID + simulation_id: Simulation ID. Returns: - 状态详情字典,包含 status, twitter_available, reddit_available, timestamp + Status dict containing status, twitter_available, reddit_available, timestamp. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) status_file = os.path.join(sim_dir, "env_status.json") @@ -1434,24 +1402,24 @@ class SimulationRunner: timeout: float = 60.0 ) -> Dict[str, Any]: """ - 采访单个Agent + Interview a single agent. Args: - simulation_id: 模拟ID - agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,返回整合结果 - timeout: 超时时间(秒) + simulation_id: Simulation ID. + agent_id: Agent ID. + prompt: Interview question. + platform: Optional platform selector. + - "twitter": only interview the agent on Twitter. + - "reddit": only interview the agent on Reddit. + - None: in dual-platform runs, interview both platforms and return a merged result. + timeout: Timeout in seconds. Returns: - 采访结果字典 + Interview result dict. Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: Simulation does not exist or its environment is not running. + TimeoutError: Timed out waiting for the response. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1497,23 +1465,23 @@ class SimulationRunner: timeout: float = 120.0 ) -> Dict[str, Any]: """ - 批量采访多个Agent + Interview multiple agents in batch. Args: - simulation_id: 模拟ID - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: Simulation ID. + interviews: Interview list; each entry is {"agent_id": int, "prompt": str, "platform": str (optional)}. + platform: Optional default platform (overridden per-interview by an entry's own `platform`). + - "twitter": default to interviewing only Twitter. + - "reddit": default to interviewing only Reddit. + - None: in dual-platform runs, interview every agent on both platforms. + timeout: Timeout in seconds. Returns: - 批量采访结果字典 + Batch interview result dict. Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: Simulation does not exist or its environment is not running. + TimeoutError: Timed out waiting for the response. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1556,27 +1524,27 @@ class SimulationRunner: timeout: float = 180.0 ) -> Dict[str, Any]: """ - 采访所有Agent(全局采访) + Interview every agent in the simulation (global interview). - 使用相同的问题采访模拟中的所有Agent + Sends the same prompt to every agent in the simulation. Args: - simulation_id: 模拟ID - prompt: 采访问题(所有Agent使用相同问题) - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: Simulation ID. + prompt: Interview question used for every agent. + platform: Optional platform selector. + - "twitter": only interview Twitter. + - "reddit": only interview Reddit. + - None: in dual-platform runs, interview every agent on both platforms. + timeout: Timeout in seconds. Returns: - 全局采访结果字典 + Global interview result dict. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): raise ValueError(f"模拟不存在: {simulation_id}") - # 从配置文件获取所有Agent信息 + # Read every agent from the simulation config. config_path = os.path.join(sim_dir, "simulation_config.json") if not os.path.exists(config_path): raise ValueError(f"模拟配置不存在: {simulation_id}") @@ -1588,7 +1556,7 @@ class SimulationRunner: if not agent_configs: raise ValueError(f"模拟配置中没有Agent: {simulation_id}") - # 构建批量采访列表 + # Build the batch-interview payload. interviews = [] for agent_config in agent_configs: agent_id = agent_config.get("agent_id") @@ -1614,16 +1582,17 @@ class SimulationRunner: timeout: float = 30.0 ) -> Dict[str, Any]: """ - 关闭模拟环境(而不是停止模拟进程) - - 向模拟发送关闭环境命令,使其优雅退出等待命令模式 - + Close the simulation environment (does not stop the simulation subprocess). + + Sends a close-environment command to the simulation so it exits its wait-for-command mode + gracefully. + Args: - simulation_id: 模拟ID - timeout: 超时时间(秒) - + simulation_id: Simulation ID. + timeout: Timeout in seconds. + Returns: - 操作结果字典 + Operation-result dict. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1649,7 +1618,7 @@ class SimulationRunner: "timestamp": response.timestamp } except TimeoutError: - # 超时可能是因为环境正在关闭 + # Timing out can simply mean the environment is already shutting down. return { "success": True, "message": "环境关闭命令已发送(等待响应超时,环境可能正在关闭)" @@ -1663,7 +1632,7 @@ class SimulationRunner: agent_id: Optional[int] = None, limit: int = 100 ) -> List[Dict[str, Any]]: - """从单个数据库获取Interview历史""" + """Read the interview history from a single per-platform database.""" import sqlite3 if not os.path.exists(db_path): @@ -1722,29 +1691,29 @@ class SimulationRunner: limit: int = 100 ) -> List[Dict[str, Any]]: """ - 获取Interview历史记录(从数据库读取) - + Return the interview history (read from the per-platform databases). + Args: - simulation_id: 模拟ID - platform: 平台类型(reddit/twitter/None) - - "reddit": 只获取Reddit平台的历史 - - "twitter": 只获取Twitter平台的历史 - - None: 获取两个平台的所有历史 - agent_id: 指定Agent ID(可选,只获取该Agent的历史) - limit: 每个平台返回数量限制 - + simulation_id: Simulation ID. + platform: Platform selector (reddit/twitter/None). + - "reddit": only return Reddit history. + - "twitter": only return Twitter history. + - None: return history from both platforms. + agent_id: Optional agent-id filter; if set, only that agent's history is returned. + limit: Max number of records per platform. + Returns: - Interview历史记录列表 + Interview-history list. """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - + results = [] - - # 确定要查询的平台 + + # Decide which platform databases to query. if platform in ("reddit", "twitter"): platforms = [platform] else: - # 不指定platform时,查询两个平台 + # No platform specified: query both. platforms = ["twitter", "reddit"] for p in platforms: @@ -1757,10 +1726,10 @@ class SimulationRunner: ) results.extend(platform_results) - # 按时间降序排序 + # Newest-first by timestamp. results.sort(key=lambda x: x.get("timestamp", ""), reverse=True) - - # 如果查询了多个平台,限制总数 + + # When multiple platforms were queried, cap the merged result size. if len(platforms) > 1 and len(results) > limit: results = results[:limit] diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32acc..9364cbc2 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -1,68 +1,64 @@ -""" -文本处理服务 -""" +"""Text processing service.""" from typing import List, Optional from ..utils.file_parser import FileParser, split_text_into_chunks class TextProcessor: - """文本处理器""" - + """Facade for the text-extraction and chunking pipeline.""" + @staticmethod def extract_from_files(file_paths: List[str]) -> str: - """从多个文件提取文本""" + """Extract and concatenate text from multiple files.""" return FileParser.extract_from_multiple(file_paths) - + @staticmethod def split_text( text: str, chunk_size: int = 500, overlap: int = 50 ) -> List[str]: - """ - 分割文本 - + """Split text into chunks. + Args: - text: 原始文本 - chunk_size: 块大小 - overlap: 重叠大小 - + text: The source text. + chunk_size: Target characters per chunk. + overlap: Overlap between consecutive chunks. + Returns: - 文本块列表 + A list of chunk strings. """ return split_text_into_chunks(text, chunk_size, overlap) - + @staticmethod def preprocess_text(text: str) -> str: - """ - 预处理文本 - - 移除多余空白 - - 标准化换行 - + """Pre-process text by normalizing whitespace and line endings. + + - Collapse runs of blank lines to at most two newlines. + - Normalize line endings to ``\\n``. + - Strip leading/trailing whitespace from each line. + Args: - text: 原始文本 - + text: The source text. + Returns: - 处理后的文本 + The cleaned text. """ import re - - # 标准化换行 + text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + # Collapse 3+ consecutive newlines down to a blank-line separator. text = re.sub(r'\n{3,}', '\n\n', text) - - # 移除行首行尾空白 + lines = [line.strip() for line in text.split('\n')] text = '\n'.join(lines) - + return text.strip() - + @staticmethod def get_text_stats(text: str) -> dict: - """获取文本统计信息""" + """Return basic text statistics: total chars, lines, and words.""" return { "total_chars": len(text), "total_lines": text.count('\n') + 1, diff --git a/backend/app/services/zep_entity_reader.py b/backend/app/services/zep_entity_reader.py index 905468ac..ca1dd0c5 100644 --- a/backend/app/services/zep_entity_reader.py +++ b/backend/app/services/zep_entity_reader.py @@ -1,6 +1,7 @@ -""" -Zep实体读取与过滤服务 -从Zep图谱中读取节点,筛选出符合预定义实体类型的节点 +"""Zep entity reader and filter service. + +Reads nodes from a Zep graph and filters down to those that match a +predefined ontology of entity types. """ import time @@ -16,23 +17,23 @@ from ..utils.locale import t logger = get_logger('mirofish.zep_entity_reader') -# 用于泛型返回类型 +# Generic return-type variable. T = TypeVar('T') @dataclass class EntityNode: - """实体节点数据结构""" + """In-memory representation of an entity node from the graph.""" uuid: str name: str labels: List[str] summary: str attributes: Dict[str, Any] - # 相关的边信息 + # Edges connected to this entity. related_edges: List[Dict[str, Any]] = field(default_factory=list) - # 相关的其他节点信息 + # Other nodes connected through related edges. related_nodes: List[Dict[str, Any]] = field(default_factory=list) - + def to_dict(self) -> Dict[str, Any]: return { "uuid": self.uuid, @@ -43,9 +44,9 @@ class EntityNode: "related_edges": self.related_edges, "related_nodes": self.related_nodes, } - + def get_entity_type(self) -> Optional[str]: - """获取实体类型(排除默认的Entity标签)""" + """Return the first non-default label, or ``None`` if only defaults are present.""" for label in self.labels: if label not in ["Entity", "Node"]: return label @@ -54,12 +55,12 @@ class EntityNode: @dataclass class FilteredEntities: - """过滤后的实体集合""" + """Result of a filter pass over the graph: matching entities + counts.""" entities: List[EntityNode] entity_types: Set[str] total_count: int filtered_count: int - + def to_dict(self) -> Dict[str, Any]: return { "entities": [e.to_dict() for e in self.entities], @@ -70,40 +71,38 @@ class FilteredEntities: class ZepEntityReader: + """Read entities from a Zep graph and filter to ontology-defined types. + + Capabilities: + 1. Read all nodes from the graph. + 2. Keep nodes whose labels include something other than the default ``Entity``. + 3. Optionally enrich each entity with its connected edges and neighboring nodes. """ - Zep实体读取与过滤服务 - - 主要功能: - 1. 从Zep图谱读取所有节点 - 2. 筛选出符合预定义实体类型的节点(Labels不只是Entity的节点) - 3. 获取每个实体的相关边和关联节点信息 - """ - + def __init__(self, api_key: Optional[str] = None): self.client = GraphitiAdapter() - + def _call_with_retry( - self, - func: Callable[[], T], + self, + func: Callable[[], T], operation_name: str, max_retries: int = 3, initial_delay: float = 2.0 ) -> T: - """ - 带重试机制的Zep API调用 - + """Call a Zep API function with retry on failure. + Args: - func: 要执行的函数(无参数的lambda或callable) - operation_name: 操作名称,用于日志 - max_retries: 最大重试次数(默认3次,即最多尝试3次) - initial_delay: 初始延迟秒数 - + func: A zero-argument callable performing the request. + operation_name: Operation label used in log output. + max_retries: Maximum number of attempts (default 3 — i.e. up to 3 tries total). + initial_delay: Initial delay between retries in seconds. + Returns: - API调用结果 + The return value of ``func``. """ last_exception = None delay = initial_delay - + for attempt in range(max_retries): try: return func() @@ -114,21 +113,20 @@ class ZepEntityReader: t("log.zep_entity_reader.m001", operation_name=operation_name, attempt=attempt + 1, str=str(e)[:100], delay=delay) ) time.sleep(delay) - delay *= 2 # 指数退避 + delay *= 2 # exponential backoff else: logger.error(t("log.zep_entity_reader.m002", operation_name=operation_name, max_retries=max_retries, str=str(e))) - + raise last_exception - + def get_all_nodes(self, graph_id: str) -> List[Dict[str, Any]]: - """ - 获取图谱的所有节点(分页获取) + """Return every node in the graph (paginated under the hood). Args: - graph_id: 图谱ID + graph_id: Graph identifier. Returns: - 节点列表 + A list of node dicts. """ logger.info(t("log.zep_entity_reader.m003", graph_id=graph_id)) @@ -148,14 +146,13 @@ class ZepEntityReader: return nodes_data def get_all_edges(self, graph_id: str) -> List[Dict[str, Any]]: - """ - 获取图谱的所有边(分页获取) + """Return every edge in the graph (paginated under the hood). Args: - graph_id: 图谱ID + graph_id: Graph identifier. Returns: - 边列表 + A list of edge dicts. """ logger.info(t("log.zep_entity_reader.m005", graph_id=graph_id)) @@ -174,24 +171,23 @@ class ZepEntityReader: logger.info(t("log.zep_entity_reader.m006", len=len(edges_data))) return edges_data - + def get_node_edges(self, node_uuid: str) -> List[Dict[str, Any]]: - """ - 获取指定节点的所有相关边(带重试机制) - + """Return every edge connected to the given node (with retry). + Args: - node_uuid: 节点UUID - + node_uuid: Node UUID. + Returns: - 边列表 + A list of edge dicts. """ try: - # 使用重试机制调用Zep API + # Wrap the API call in retry logic. edges = self._call_with_retry( func=lambda: self.client.graph.node.get_entity_edges(node_uuid=node_uuid), operation_name=f"获取节点边(node={node_uuid[:8]}...)" ) - + edges_data = [] for edge in edges: edges_data.append({ @@ -202,32 +198,31 @@ class ZepEntityReader: "target_node_uuid": edge.target_node_uuid, "attributes": edge.attributes or {}, }) - + return edges_data except Exception as e: logger.warning(t("log.zep_entity_reader.m007", node_uuid=node_uuid, str=str(e))) return [] - + def filter_defined_entities( - self, + self, graph_id: str, defined_entity_types: Optional[List[str]] = None, enrich_with_edges: bool = True ) -> FilteredEntities: - """ - 筛选出符合预定义实体类型的节点 - - 筛选逻辑: - - 如果节点的Labels只有一个"Entity",说明这个实体不符合我们预定义的类型,跳过 - - 如果节点的Labels包含除"Entity"和"Node"之外的标签,说明符合预定义类型,保留 - + """Filter nodes down to entities matching the predefined ontology types. + + Filtering rules: + - Skip nodes whose only label is ``Entity`` (uncategorized). + - Keep nodes whose labels include anything other than ``Entity`` and ``Node``. + Args: - graph_id: 图谱ID - defined_entity_types: 预定义的实体类型列表(可选,如果提供则只保留这些类型) - enrich_with_edges: 是否获取每个实体的相关边信息 - + graph_id: Graph identifier. + defined_entity_types: Optional allow-list; when provided, only matching types are kept. + enrich_with_edges: When ``True``, populate related_edges and related_nodes. + Returns: - FilteredEntities: 过滤后的实体集合 + A ``FilteredEntities`` summary. """ logger.info(t("log.zep_entity_reader.m008", graph_id=graph_id)) @@ -243,7 +238,7 @@ class ZepEntityReader: except Exception: pass - # 获取所有节点 + # Read every node from the graph. all_nodes = self.get_all_nodes(graph_id) total_count = len(all_nodes) @@ -259,27 +254,27 @@ class ZepEntityReader: if entity_type != "Entity": node["labels"] = [entity_type] + labels - # 获取所有边(用于后续关联查找) + # Read every edge so we can enrich entities later. all_edges = self.get_all_edges(graph_id) if enrich_with_edges else [] - # 构建节点UUID到节点数据的映射 + # uuid -> node-data map for fast lookup. node_map = {n["uuid"]: n for n in all_nodes} - # 筛选符合条件的实体 + # Filter to entities that match the criteria. filtered_entities = [] entity_types_found = set() for node in all_nodes: labels = node.get("labels", []) - # 筛选逻辑:Labels必须包含除"Entity"和"Node"之外的标签 + # Filtering rule: labels must contain something other than the defaults. custom_labels = [l for l in labels if l not in ["Entity", "Node"]] if not custom_labels: - # 只有默认标签,跳过 + # Only default labels — skip. continue - - # 如果指定了预定义类型,检查是否匹配 + + # When a predefined-type list is supplied, require a match against it. if defined_entity_types: matching_labels = [l for l in custom_labels if l in defined_entity_types] if not matching_labels: @@ -287,10 +282,9 @@ class ZepEntityReader: entity_type = matching_labels[0] else: entity_type = custom_labels[0] - + entity_types_found.add(entity_type) - - # 创建实体节点对象 + entity = EntityNode( uuid=node["uuid"], name=node["name"], @@ -298,12 +292,12 @@ class ZepEntityReader: summary=node["summary"], attributes=node["attributes"], ) - - # 获取相关边和节点 + + # Enrich with related edges and neighboring nodes. if enrich_with_edges: related_edges = [] related_node_uuids = set() - + for edge in all_edges: if edge["source_node_uuid"] == node["uuid"]: related_edges.append({ @@ -321,10 +315,10 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - + entity.related_edges = related_edges - - # 获取关联节点的基本信息 + + # Populate basic info for each neighboring node. related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -335,56 +329,55 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + entity.related_nodes = related_nodes - + filtered_entities.append(entity) - + logger.info(t("log.zep_entity_reader.m009", total_count=total_count, len=len(filtered_entities), entity_types_found=entity_types_found)) - + return FilteredEntities( entities=filtered_entities, entity_types=entity_types_found, total_count=total_count, filtered_count=len(filtered_entities), ) - + def get_entity_with_context( - self, - graph_id: str, + self, + graph_id: str, entity_uuid: str ) -> Optional[EntityNode]: - """ - 获取单个实体及其完整上下文(边和关联节点,带重试机制) - + """Fetch a single entity with its full context (edges + neighbors), with retry. + Args: - graph_id: 图谱ID - entity_uuid: 实体UUID - + graph_id: Graph identifier. + entity_uuid: Entity UUID. + Returns: - EntityNode或None + ``EntityNode`` or ``None`` if not found. """ try: - # 使用重试机制获取节点 + # Fetch the node with retry. node = self._call_with_retry( func=lambda: self.client.graph.node.get(uuid_=entity_uuid), operation_name=f"获取节点详情(uuid={entity_uuid[:8]}...)" ) - + if not node: return None - - # 获取节点的边 + + # Edges connected to this node. edges = self.get_node_edges(entity_uuid) - - # 获取所有节点用于关联查找 + + # All graph nodes, used for neighbor lookup. all_nodes = self.get_all_nodes(graph_id) node_map = {n["uuid"]: n for n in all_nodes} - - # 处理相关边和节点 + + # Collect related edges and neighboring uuids. related_edges = [] related_node_uuids = set() - + for edge in edges: if edge["source_node_uuid"] == entity_uuid: related_edges.append({ @@ -402,8 +395,8 @@ class ZepEntityReader: "source_node_uuid": edge["source_node_uuid"], }) related_node_uuids.add(edge["source_node_uuid"]) - - # 获取关联节点信息 + + # Populate basic info for each neighboring node. related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -414,7 +407,7 @@ class ZepEntityReader: "labels": related_node["labels"], "summary": related_node.get("summary", ""), }) - + return EntityNode( uuid=getattr(node, 'uuid_', None) or getattr(node, 'uuid', ''), name=node.name or "", @@ -424,27 +417,26 @@ class ZepEntityReader: related_edges=related_edges, related_nodes=related_nodes, ) - + except Exception as e: logger.error(t("log.zep_entity_reader.m010", entity_uuid=entity_uuid, str=str(e))) return None - + def get_entities_by_type( - self, - graph_id: str, + self, + graph_id: str, entity_type: str, enrich_with_edges: bool = True ) -> List[EntityNode]: - """ - 获取指定类型的所有实体 - + """Return every entity matching the given type. + Args: - graph_id: 图谱ID - entity_type: 实体类型(如 "Student", "PublicFigure" 等) - enrich_with_edges: 是否获取相关边信息 - + graph_id: Graph identifier. + entity_type: Entity type label (e.g. ``Student``, ``PublicFigure``). + enrich_with_edges: When ``True``, populate related edges/nodes. + Returns: - 实体列表 + A list of matching ``EntityNode`` instances. """ result = self.filter_defined_entities( graph_id=graph_id, diff --git a/backend/app/services/zep_graph_memory_updater.py b/backend/app/services/zep_graph_memory_updater.py index 83a748e5..837da9cd 100644 --- a/backend/app/services/zep_graph_memory_updater.py +++ b/backend/app/services/zep_graph_memory_updater.py @@ -1,6 +1,7 @@ """ -Zep图谱记忆更新服务 -将模拟中的Agent活动动态更新到Zep图谱中 +Zep graph memory update service. + +Streams agent activity from running simulations into the Zep knowledge graph. """ import os @@ -23,7 +24,7 @@ logger = get_logger('mirofish.zep_graph_memory_updater') @dataclass class AgentActivity: - """Agent活动记录""" + """Record of a single agent activity.""" platform: str # twitter / reddit agent_id: int agent_name: str @@ -33,13 +34,12 @@ class AgentActivity: timestamp: str def to_episode_text(self) -> str: + """Render the activity as a natural-language episode for Zep. + + The text uses plain narrative phrasing so Zep can extract entities and + relationships from it. No simulation-specific prefix is prepended, so + the graph update is not biased by framing words. """ - 将活动转换为可以发送给Zep的文本描述 - - 采用自然语言描述格式,让Zep能够从中提取实体和关系 - 不添加模拟相关的前缀,避免误导图谱更新 - """ - # 根据不同的动作类型生成不同的描述 action_descriptions = { "CREATE_POST": self._describe_create_post, "LIKE_POST": self._describe_like_post, @@ -57,8 +57,8 @@ class AgentActivity: describe_func = action_descriptions.get(self.action_type, self._describe_generic) description = describe_func() - - # 直接返回 "agent名称: 活动描述" 格式,不添加模拟前缀 + + # Return ": " with no simulation prefix. return f"{self.agent_name}: {description}" def _describe_create_post(self) -> str: @@ -68,7 +68,7 @@ class AgentActivity: return "发布了一条帖子" def _describe_like_post(self) -> str: - """点赞帖子 - 包含帖子原文和作者信息""" + """Like a post — includes the post text and author when available.""" post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") @@ -81,7 +81,7 @@ class AgentActivity: return "点赞了一条帖子" def _describe_dislike_post(self) -> str: - """踩帖子 - 包含帖子原文和作者信息""" + """Dislike a post — includes the post text and author when available.""" post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") @@ -94,7 +94,7 @@ class AgentActivity: return "踩了一条帖子" def _describe_repost(self) -> str: - """转发帖子 - 包含原帖内容和作者信息""" + """Repost — includes the original post text and author when available.""" original_content = self.action_args.get("original_content", "") original_author = self.action_args.get("original_author_name", "") @@ -107,7 +107,7 @@ class AgentActivity: return "转发了一条帖子" def _describe_quote_post(self) -> str: - """引用帖子 - 包含原帖内容、作者信息和引用评论""" + """Quote-post — includes the original post, author, and the quote comment.""" original_content = self.action_args.get("original_content", "") original_author = self.action_args.get("original_author_name", "") quote_content = self.action_args.get("quote_content", "") or self.action_args.get("content", "") @@ -127,7 +127,7 @@ class AgentActivity: return base def _describe_follow(self) -> str: - """关注用户 - 包含被关注用户的名称""" + """Follow a user — includes the followed user's name.""" target_user_name = self.action_args.get("target_user_name", "") if target_user_name: @@ -135,7 +135,7 @@ class AgentActivity: return "关注了一个用户" def _describe_create_comment(self) -> str: - """发表评论 - 包含评论内容和所评论的帖子信息""" + """Create a comment — includes the comment text and the parent post.""" content = self.action_args.get("content", "") post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") @@ -151,7 +151,7 @@ class AgentActivity: return "发表了评论" def _describe_like_comment(self) -> str: - """点赞评论 - 包含评论内容和作者信息""" + """Like a comment — includes the comment text and author when available.""" comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") @@ -164,7 +164,7 @@ class AgentActivity: return "点赞了一条评论" def _describe_dislike_comment(self) -> str: - """踩评论 - 包含评论内容和作者信息""" + """Dislike a comment — includes the comment text and author when available.""" comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") @@ -177,17 +177,17 @@ class AgentActivity: return "踩了一条评论" def _describe_search(self) -> str: - """搜索帖子 - 包含搜索关键词""" + """Search posts — includes the search query.""" query = self.action_args.get("query", "") or self.action_args.get("keyword", "") return f"搜索了「{query}」" if query else "进行了搜索" def _describe_search_user(self) -> str: - """搜索用户 - 包含搜索关键词""" + """Search users — includes the search query.""" query = self.action_args.get("query", "") or self.action_args.get("username", "") return f"搜索了用户「{query}」" if query else "搜索了用户" def _describe_mute(self) -> str: - """屏蔽用户 - 包含被屏蔽用户的名称""" + """Mute a user — includes the muted user's name.""" target_user_name = self.action_args.get("target_user_name", "") if target_user_name: @@ -195,80 +195,79 @@ class AgentActivity: return "屏蔽了一个用户" def _describe_generic(self) -> str: - # 对于未知的动作类型,生成通用描述 + # Fallback narration for action types not handled explicitly above. return f"执行了{self.action_type}操作" class ZepGraphMemoryUpdater: - """ - Zep图谱记忆更新器 - - 监控模拟的actions日志文件,将新的agent活动实时更新到Zep图谱中。 - 按平台分组,每累积BATCH_SIZE条活动后批量发送到Zep。 - - 所有有意义的行为都会被更新到Zep,action_args中会包含完整的上下文信息: - - 点赞/踩的帖子原文 - - 转发/引用的帖子原文 - - 关注/屏蔽的用户名 - - 点赞/踩的评论原文 + """Zep graph memory updater. + + Watches a simulation's actions log file and streams new agent activity + into the Zep knowledge graph in near real time. Activities are grouped + by platform; each platform sends a batch once it has accumulated + ``BATCH_SIZE`` items. + + Every meaningful action is forwarded to Zep, with full context preserved + in ``action_args``: + + - Original text of liked / disliked posts + - Original text of reposted / quoted posts + - Names of followed / muted users + - Original text of liked / disliked comments """ - # 批量发送大小(每个平台累积多少条后发送) + # Number of activities to accumulate per platform before sending a batch. BATCH_SIZE = 5 - - # 平台名称映射(用于控制台显示) + + # Platform display names used for console / log output. PLATFORM_DISPLAY_NAMES = { 'twitter': '世界1', 'reddit': '世界2', } - - # 发送间隔(秒),避免请求过快 + + # Pause between sends (seconds) to avoid hammering the Zep API. SEND_INTERVAL = 0.5 - - # 重试配置 + MAX_RETRIES = 3 - RETRY_DELAY = 2 # 秒 + RETRY_DELAY = 2 # seconds def __init__(self, graph_id: str, api_key: Optional[str] = None): - """ - 初始化更新器 - + """Initialize the updater. + Args: - graph_id: Zep图谱ID - api_key: Zep API Key(可选,默认从配置读取) + graph_id: Zep graph ID. + api_key: Optional Zep API key; defaults to the value from config. """ self.graph_id = graph_id self.client = GraphitiAdapter() - - # 活动队列 + self._activity_queue: Queue = Queue() - - # 按平台分组的活动缓冲区(每个平台各自累积到BATCH_SIZE后批量发送) + + # Per-platform buffer; each platform flushes once it reaches BATCH_SIZE. self._platform_buffers: Dict[str, List[AgentActivity]] = { 'twitter': [], 'reddit': [], } self._buffer_lock = threading.Lock() - - # 控制标志 + self._running = False self._worker_thread: Optional[threading.Thread] = None - - # 统计 - self._total_activities = 0 # 实际添加到队列的活动数 - self._total_sent = 0 # 成功发送到Zep的批次数 - self._total_items_sent = 0 # 成功发送到Zep的活动条数 - self._failed_count = 0 # 发送失败的批次数 - self._skipped_count = 0 # 被过滤跳过的活动数(DO_NOTHING) + + # Counters + self._total_activities = 0 # activities accepted into the queue + self._total_sent = 0 # batches successfully sent to Zep + self._total_items_sent = 0 # individual activities successfully sent to Zep + self._failed_count = 0 # batches that failed to send + self._skipped_count = 0 # activities filtered out (e.g. DO_NOTHING) logger.info(t("log.zep_graph_memory_updater.m001", graph_id=graph_id, self=self.BATCH_SIZE)) def _get_platform_display_name(self, platform: str) -> str: - """获取平台的显示名称""" + """Return the human-friendly display name for a platform.""" return self.PLATFORM_DISPLAY_NAMES.get(platform.lower(), platform) def start(self): - """启动后台工作线程""" + """Start the background worker thread.""" if self._running: return @@ -286,10 +285,9 @@ class ZepGraphMemoryUpdater: logger.info(t("log.zep_graph_memory_updater.m002", self=self.graph_id)) def stop(self): - """停止后台工作线程""" + """Stop the background worker thread and flush pending activity.""" self._running = False - - # 发送剩余的活动 + self._flush_remaining() if self._worker_thread and self._worker_thread.is_alive(): @@ -298,27 +296,28 @@ class ZepGraphMemoryUpdater: logger.info(t("log.zep_graph_memory_updater.m003", self=self.graph_id, self_2=self._total_activities, self_3=self._total_sent, self_4=self._total_items_sent, self_5=self._failed_count, self_6=self._skipped_count)) def add_activity(self, activity: AgentActivity): - """ - 添加一个agent活动到队列 - - 所有有意义的行为都会被添加到队列,包括: - - CREATE_POST(发帖) - - CREATE_COMMENT(评论) - - QUOTE_POST(引用帖子) - - SEARCH_POSTS(搜索帖子) - - SEARCH_USER(搜索用户) - - LIKE_POST/DISLIKE_POST(点赞/踩帖子) - - REPOST(转发) - - FOLLOW(关注) - - MUTE(屏蔽) - - LIKE_COMMENT/DISLIKE_COMMENT(点赞/踩评论) - - action_args中会包含完整的上下文信息(如帖子原文、用户名等)。 - + """Enqueue a single agent activity for delivery to Zep. + + Every meaningful action is queued, including: + + - CREATE_POST (post) + - CREATE_COMMENT (comment) + - QUOTE_POST (quote a post) + - SEARCH_POSTS (search posts) + - SEARCH_USER (search users) + - LIKE_POST / DISLIKE_POST (like / dislike a post) + - REPOST (repost) + - FOLLOW (follow) + - MUTE (mute) + - LIKE_COMMENT / DISLIKE_COMMENT (like / dislike a comment) + + ``action_args`` carries the full context (e.g. original post text, + user names) so the graph episode is self-contained. + Args: - activity: Agent活动记录 + activity: The agent activity record to enqueue. """ - # 跳过DO_NOTHING类型的活动 + # DO_NOTHING actions carry no information worth indexing. if activity.action_type == "DO_NOTHING": self._skipped_count += 1 return @@ -328,14 +327,13 @@ class ZepGraphMemoryUpdater: logger.debug(t("log.zep_graph_memory_updater.m004", activity=activity.agent_name, activity_2=activity.action_type)) def add_activity_from_dict(self, data: Dict[str, Any], platform: str): - """ - 从字典数据添加活动 - + """Build an ``AgentActivity`` from a parsed JSON record and enqueue it. + Args: - data: 从actions.jsonl解析的字典数据 - platform: 平台名称 (twitter/reddit) + data: A dict parsed from a single ``actions.jsonl`` line. + platform: Source platform name (``twitter`` or ``reddit``). """ - # 跳过事件类型的条目 + # Event-type rows describe simulation lifecycle, not agent activity. if "event_type" in data: return @@ -352,28 +350,26 @@ class ZepGraphMemoryUpdater: self.add_activity(activity) def _worker_loop(self, locale: str = 'zh'): - """后台工作循环 - 按平台批量发送活动到Zep""" + """Background loop that drains the queue and flushes per-platform batches.""" set_locale(locale) while self._running or not self._activity_queue.empty(): try: - # 尝试从队列获取活动(超时1秒) + # Block briefly so the loop can also notice shutdown requests. try: activity = self._activity_queue.get(timeout=1) - - # 将活动添加到对应平台的缓冲区 + platform = activity.platform.lower() with self._buffer_lock: if platform not in self._platform_buffers: self._platform_buffers[platform] = [] self._platform_buffers[platform].append(activity) - - # 检查该平台是否达到批量大小 + if len(self._platform_buffers[platform]) >= self.BATCH_SIZE: batch = self._platform_buffers[platform][:self.BATCH_SIZE] self._platform_buffers[platform] = self._platform_buffers[platform][self.BATCH_SIZE:] - # 释放锁后再发送 + # Release the lock before issuing the network call. self._send_batch_activities(batch, platform) - # 发送间隔,避免请求过快 + # Throttle so we don't hammer the Zep API. time.sleep(self.SEND_INTERVAL) except Empty: @@ -384,21 +380,20 @@ class ZepGraphMemoryUpdater: time.sleep(1) def _send_batch_activities(self, activities: List[AgentActivity], platform: str): - """ - 批量发送活动到Zep图谱(合并为一条文本) - + """Send a batch of activities to the Zep graph as one combined episode. + Args: - activities: Agent活动列表 - platform: 平台名称 + activities: Agent activity records to send. + platform: Source platform name. """ if not activities: return - - # 将多条活动合并为一条文本,用换行分隔 + + # Concatenate the per-activity narrations into a single newline-separated episode. episode_texts = [activity.to_episode_text() for activity in activities] combined_text = "\n".join(episode_texts) - - # 带重试的发送 + + # Retry on failure with linear backoff. for attempt in range(self.MAX_RETRIES): try: self.client.graph.add( @@ -423,8 +418,8 @@ class ZepGraphMemoryUpdater: self._failed_count += 1 def _flush_remaining(self): - """发送队列和缓冲区中剩余的活动""" - # 首先处理队列中剩余的活动,添加到缓冲区 + """Drain the queue and flush every platform buffer, even partial ones.""" + # Move anything still in the queue into the per-platform buffers. while not self._activity_queue.empty(): try: activity = self._activity_queue.get_nowait() @@ -435,61 +430,55 @@ class ZepGraphMemoryUpdater: self._platform_buffers[platform].append(activity) except Empty: break - - # 然后发送各平台缓冲区中剩余的活动(即使不足BATCH_SIZE条) + + # Flush each platform buffer regardless of whether it reached BATCH_SIZE. with self._buffer_lock: for platform, buffer in self._platform_buffers.items(): if buffer: display_name = self._get_platform_display_name(platform) logger.info(t("log.zep_graph_memory_updater.m010", display_name=display_name, len=len(buffer))) self._send_batch_activities(buffer, platform) - # 清空所有缓冲区 for platform in self._platform_buffers: self._platform_buffers[platform] = [] def get_stats(self) -> Dict[str, Any]: - """获取统计信息""" + """Return a snapshot of updater statistics.""" with self._buffer_lock: buffer_sizes = {p: len(b) for p, b in self._platform_buffers.items()} - + return { "graph_id": self.graph_id, "batch_size": self.BATCH_SIZE, - "total_activities": self._total_activities, # 添加到队列的活动总数 - "batches_sent": self._total_sent, # 成功发送的批次数 - "items_sent": self._total_items_sent, # 成功发送的活动条数 - "failed_count": self._failed_count, # 发送失败的批次数 - "skipped_count": self._skipped_count, # 被过滤跳过的活动数(DO_NOTHING) + "total_activities": self._total_activities, # activities accepted into the queue + "batches_sent": self._total_sent, # batches successfully sent + "items_sent": self._total_items_sent, # activities successfully sent + "failed_count": self._failed_count, # batches that failed to send + "skipped_count": self._skipped_count, # activities filtered out (e.g. DO_NOTHING) "queue_size": self._activity_queue.qsize(), - "buffer_sizes": buffer_sizes, # 各平台缓冲区大小 + "buffer_sizes": buffer_sizes, # per-platform buffer depth "running": self._running, } class ZepGraphMemoryManager: - """ - 管理多个模拟的Zep图谱记忆更新器 - - 每个模拟可以有自己的更新器实例 - """ + """Registry that owns one ``ZepGraphMemoryUpdater`` per active simulation.""" _updaters: Dict[str, ZepGraphMemoryUpdater] = {} _lock = threading.Lock() @classmethod def create_updater(cls, simulation_id: str, graph_id: str) -> ZepGraphMemoryUpdater: - """ - 为模拟创建图谱记忆更新器 - + """Create (and start) a graph-memory updater for a simulation. + Args: - simulation_id: 模拟ID - graph_id: Zep图谱ID - + simulation_id: Simulation ID. + graph_id: Zep graph ID. + Returns: - ZepGraphMemoryUpdater实例 + The started ``ZepGraphMemoryUpdater`` instance. """ with cls._lock: - # 如果已存在,先停止旧的 + # An updater already exists for this simulation — stop it first. if simulation_id in cls._updaters: cls._updaters[simulation_id].stop() @@ -502,25 +491,24 @@ class ZepGraphMemoryManager: @classmethod def get_updater(cls, simulation_id: str) -> Optional[ZepGraphMemoryUpdater]: - """获取模拟的更新器""" + """Return the updater for a simulation, or ``None`` if absent.""" return cls._updaters.get(simulation_id) @classmethod def stop_updater(cls, simulation_id: str): - """停止并移除模拟的更新器""" + """Stop and deregister the updater belonging to a simulation.""" with cls._lock: if simulation_id in cls._updaters: cls._updaters[simulation_id].stop() del cls._updaters[simulation_id] logger.info(t("log.zep_graph_memory_updater.m012", simulation_id=simulation_id)) - # 防止 stop_all 重复调用的标志 + # Idempotency guard so ``stop_all`` only runs once per process lifetime. _stop_all_done = False - + @classmethod def stop_all(cls): - """停止所有更新器""" - # 防止重复调用 + """Stop every registered updater (idempotent).""" if cls._stop_all_done: return cls._stop_all_done = True @@ -537,7 +525,7 @@ class ZepGraphMemoryManager: @classmethod def get_all_stats(cls) -> Dict[str, Dict[str, Any]]: - """获取所有更新器的统计信息""" + """Return statistics for every registered updater.""" return { sim_id: updater.get_stats() for sim_id, updater in cls._updaters.items() diff --git a/backend/app/services/zep_tools.py b/backend/app/services/zep_tools.py index ac3059ff..1bcacce6 100644 --- a/backend/app/services/zep_tools.py +++ b/backend/app/services/zep_tools.py @@ -1,11 +1,13 @@ """ -Zep检索工具服务 -封装图谱搜索、节点读取、边查询等工具,供Report Agent使用 +Zep retrieval tool service. -核心检索工具(优化后): -1. InsightForge(深度洞察检索)- 最强大的混合检索,自动生成子问题并多维度检索 -2. PanoramaSearch(广度搜索)- 获取全貌,包括过期内容 -3. QuickSearch(简单搜索)- 快速检索 +Wraps graph search, node reads, and edge queries as tools consumed by the Report Agent. + +Core retrieval tools (optimized): +1. InsightForge (deep insight search) - the most powerful hybrid retrieval; auto-decomposes the + query into sub-questions and searches across multiple dimensions. +2. PanoramaSearch (breadth search) - returns the full picture including expired content. +3. QuickSearch (simple search) - lightweight, fast retrieval. """ import time @@ -26,7 +28,7 @@ logger = get_logger('mirofish.zep_tools') @dataclass class SearchResult: - """搜索结果""" + """Search result.""" facts: List[str] edges: List[Dict[str, Any]] nodes: List[Dict[str, Any]] @@ -43,7 +45,7 @@ class SearchResult: } def to_text(self) -> str: - """转换为文本格式,供LLM理解""" + """Render to text format for LLM consumption.""" text_parts = [f"搜索查询: {self.query}", f"找到 {self.total_count} 条相关信息"] if self.facts: @@ -56,7 +58,7 @@ class SearchResult: @dataclass class NodeInfo: - """节点信息""" + """Node information.""" uuid: str name: str labels: List[str] @@ -73,14 +75,14 @@ class NodeInfo: } def to_text(self) -> str: - """转换为文本格式""" + """Render to text format.""" entity_type = next((l for l in self.labels if l not in ["Entity", "Node"]), "未知类型") return f"实体: {self.name} (类型: {entity_type})\n摘要: {self.summary}" @dataclass class EdgeInfo: - """边信息""" + """Edge information.""" uuid: str name: str fact: str @@ -88,7 +90,6 @@ class EdgeInfo: target_node_uuid: str source_node_name: Optional[str] = None target_node_name: Optional[str] = None - # 时间信息 created_at: Optional[str] = None valid_at: Optional[str] = None invalid_at: Optional[str] = None @@ -110,7 +111,7 @@ class EdgeInfo: } def to_text(self, include_temporal: bool = False) -> str: - """转换为文本格式""" + """Render to text format.""" source = self.source_node_name or self.source_node_uuid[:8] target = self.target_node_name or self.target_node_uuid[:8] base_text = f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}" @@ -126,31 +127,30 @@ class EdgeInfo: @property def is_expired(self) -> bool: - """是否已过期""" + """Whether this edge has expired.""" return self.expired_at is not None - + @property def is_invalid(self) -> bool: - """是否已失效""" + """Whether this edge has been invalidated.""" return self.invalid_at is not None @dataclass class InsightForgeResult: - """ - 深度洞察检索结果 (InsightForge) - 包含多个子问题的检索结果,以及综合分析 + """Deep-insight retrieval result (InsightForge). + + Holds the retrieval results from multiple sub-queries plus the synthesized analysis. """ query: str simulation_requirement: str sub_queries: List[str] - - # 各维度检索结果 - semantic_facts: List[str] = field(default_factory=list) # 语义搜索结果 - entity_insights: List[Dict[str, Any]] = field(default_factory=list) # 实体洞察 - relationship_chains: List[str] = field(default_factory=list) # 关系链 - - # 统计信息 + + # Retrieval results across multiple dimensions. + semantic_facts: List[str] = field(default_factory=list) + entity_insights: List[Dict[str, Any]] = field(default_factory=list) + relationship_chains: List[str] = field(default_factory=list) + total_facts: int = 0 total_entities: int = 0 total_relationships: int = 0 @@ -169,7 +169,7 @@ class InsightForgeResult: } def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解""" + """Render a detailed text representation for the LLM.""" text_parts = [ f"## 未来预测深度分析", f"分析问题: {self.query}", @@ -179,20 +179,17 @@ class InsightForgeResult: f"- 涉及实体: {self.total_entities}个", f"- 关系链: {self.total_relationships}条" ] - - # 子问题 + if self.sub_queries: text_parts.append(f"\n### 分析的子问题") for i, sq in enumerate(self.sub_queries, 1): text_parts.append(f"{i}. {sq}") - - # 语义搜索结果 + if self.semantic_facts: text_parts.append(f"\n### 【关键事实】(请在报告中引用这些原文)") for i, fact in enumerate(self.semantic_facts, 1): text_parts.append(f"{i}. \"{fact}\"") - - # 实体洞察 + if self.entity_insights: text_parts.append(f"\n### 【核心实体】") for entity in self.entity_insights: @@ -201,34 +198,31 @@ class InsightForgeResult: text_parts.append(f" 摘要: \"{entity.get('summary')}\"") if entity.get('related_facts'): text_parts.append(f" 相关事实: {len(entity.get('related_facts', []))}条") - - # 关系链 + if self.relationship_chains: text_parts.append(f"\n### 【关系链】") for chain in self.relationship_chains: text_parts.append(f"- {chain}") - + return "\n".join(text_parts) @dataclass class PanoramaResult: - """ - 广度搜索结果 (Panorama) - 包含所有相关信息,包括过期内容 + """Breadth-search result (Panorama). + + Contains every piece of related information, including expired content. """ query: str - - # 全部节点 + all_nodes: List[NodeInfo] = field(default_factory=list) - # 全部边(包括过期的) + # All edges, including expired ones. all_edges: List[EdgeInfo] = field(default_factory=list) - # 当前有效的事实 + # Currently active facts. active_facts: List[str] = field(default_factory=list) - # 已过期/失效的事实(历史记录) + # Expired or invalidated facts (historical record). historical_facts: List[str] = field(default_factory=list) - - # 统计 + total_nodes: int = 0 total_edges: int = 0 active_count: int = 0 @@ -248,7 +242,7 @@ class PanoramaResult: } def to_text(self) -> str: - """转换为文本格式(完整版本,不截断)""" + """Render the full text format (no truncation).""" text_parts = [ f"## 广度搜索结果(未来全景视图)", f"查询: {self.query}", @@ -258,38 +252,38 @@ class PanoramaResult: f"- 当前有效事实: {self.active_count}条", f"- 历史/过期事实: {self.historical_count}条" ] - - # 当前有效的事实(完整输出,不截断) + + # Currently active facts (emit in full, no truncation). if self.active_facts: text_parts.append(f"\n### 【当前有效事实】(模拟结果原文)") for i, fact in enumerate(self.active_facts, 1): text_parts.append(f"{i}. \"{fact}\"") - - # 历史/过期事实(完整输出,不截断) + + # Historical / expired facts (emit in full, no truncation). if self.historical_facts: text_parts.append(f"\n### 【历史/过期事实】(演变过程记录)") for i, fact in enumerate(self.historical_facts, 1): text_parts.append(f"{i}. \"{fact}\"") - - # 关键实体(完整输出,不截断) + + # Key entities (emit in full, no truncation). if self.all_nodes: text_parts.append(f"\n### 【涉及实体】") for node in self.all_nodes: entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") text_parts.append(f"- **{node.name}** ({entity_type})") - + return "\n".join(text_parts) @dataclass class AgentInterview: - """单个Agent的采访结果""" + """Interview result for a single agent.""" agent_name: str - agent_role: str # 角色类型(如:学生、教师、媒体等) - agent_bio: str # 简介 - question: str # 采访问题 - response: str # 采访回答 - key_quotes: List[str] = field(default_factory=list) # 关键引言 + agent_role: str + agent_bio: str + question: str + response: str + key_quotes: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return { @@ -303,21 +297,21 @@ class AgentInterview: def to_text(self) -> str: text = f"**{self.agent_name}** ({self.agent_role})\n" - # 显示完整的agent_bio,不截断 + # Render the full agent_bio without truncation. text += f"_简介: {self.agent_bio}_\n\n" text += f"**Q:** {self.question}\n\n" text += f"**A:** {self.response}\n" if self.key_quotes: text += "\n**关键引言:**\n" for quote in self.key_quotes: - # 清理各种引号 + # Strip the various quote characters (curly quotes and CJK corner brackets). clean_quote = quote.replace('\u201c', '').replace('\u201d', '').replace('"', '') clean_quote = clean_quote.replace('\u300c', '').replace('\u300d', '') clean_quote = clean_quote.strip() - # 去掉开头的标点 + # Drop any leading punctuation. while clean_quote and clean_quote[0] in ',,;;::、。!?\n\r\t ': clean_quote = clean_quote[1:] - # 过滤包含问题编号的垃圾内容(问题1-9) + # Skip junk content that contains a question-number label (e.g. labels 1-9). skip = False for d in '123456789': if f'\u95ee\u9898{d}' in clean_quote: @@ -325,7 +319,7 @@ class AgentInterview: break if skip: continue - # 截断过长内容(按句号截断,而非硬截断) + # Truncate over-long quotes at the next period rather than a hard cut. if len(clean_quote) > 150: dot_pos = clean_quote.find('\u3002', 80) if dot_pos > 0: @@ -339,24 +333,23 @@ class AgentInterview: @dataclass class InterviewResult: + """Interview result. + + Aggregates the responses from multiple simulated agents. """ - 采访结果 (Interview) - 包含多个模拟Agent的采访回答 - """ - interview_topic: str # 采访主题 - interview_questions: List[str] # 采访问题列表 - - # 采访选择的Agent + interview_topic: str + interview_questions: List[str] + + # Agents chosen for the interview. selected_agents: List[Dict[str, Any]] = field(default_factory=list) - # 各Agent的采访回答 + # Per-agent interview responses. interviews: List[AgentInterview] = field(default_factory=list) - - # 选择Agent的理由 + + # Reasoning for the agent selection. selection_reasoning: str = "" - # 整合后的采访摘要 + # Synthesized interview summary. summary: str = "" - - # 统计 + total_agents: int = 0 interviewed_count: int = 0 @@ -373,7 +366,7 @@ class InterviewResult: } def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解和报告引用""" + """Render a detailed text representation for the LLM and report citations.""" text_parts = [ "## 深度采访报告", f"**采访主题:** {self.interview_topic}", @@ -399,44 +392,45 @@ class InterviewResult: class ZepToolsService: + """Zep retrieval tool service. + + Core retrieval tools (optimized): + 1. insight_forge - deep-insight search (most powerful; auto-generates sub-questions + and searches across multiple dimensions). + 2. panorama_search - breadth search (full picture including expired content). + 3. quick_search - simple, fast retrieval. + 4. interview_agents - deep interview (interviews simulated agents and gathers + perspectives from multiple roles). + + Basic tools: + - search_graph - semantic graph search. + - get_all_nodes - fetch every node in the graph. + - get_all_edges - fetch every edge in the graph (with temporal info). + - get_node_detail - fetch a single node's details. + - get_node_edges - fetch edges incident to a node. + - get_entities_by_type - fetch entities filtered by type. + - get_entity_summary - fetch a relationship summary for an entity. """ - Zep检索工具服务 - - 【核心检索工具 - 优化后】 - 1. insight_forge - 深度洞察检索(最强大,自动生成子问题,多维度检索) - 2. panorama_search - 广度搜索(获取全貌,包括过期内容) - 3. quick_search - 简单搜索(快速检索) - 4. interview_agents - 深度采访(采访模拟Agent,获取多视角观点) - - 【基础工具】 - - search_graph - 图谱语义搜索 - - get_all_nodes - 获取图谱所有节点 - - get_all_edges - 获取图谱所有边(含时间信息) - - get_node_detail - 获取节点详细信息 - - get_node_edges - 获取节点相关的边 - - get_entities_by_type - 按类型获取实体 - - get_entity_summary - 获取实体的关系摘要 - """ - - # 重试配置 + + # Retry configuration. MAX_RETRIES = 3 RETRY_DELAY = 2.0 def __init__(self, api_key: Optional[str] = None, llm_client: Optional[LLMClient] = None): self.client = GraphitiAdapter() - # LLM客户端用于InsightForge生成子问题 + # LLM client used by InsightForge to generate sub-questions. self._llm_client = llm_client logger.info(t("log.zep_tools.m001")) @property def llm(self) -> LLMClient: - """延迟初始化LLM客户端""" + """Lazily initialize the LLM client.""" if self._llm_client is None: self._llm_client = LLMClient() return self._llm_client def _call_with_retry(self, func, operation_name: str, max_retries: int = None): - """带重试机制的API调用(自动处理429限速)""" + """API call with retry (auto-handles HTTP 429 rate limiting).""" max_retries = max_retries or self.MAX_RETRIES last_exception = None delay = self.RETRY_DELAY @@ -447,7 +441,7 @@ class ZepToolsService: except Exception as e: last_exception = e if attempt < max_retries - 1: - # 检测429限速错误,使用retry-after头部的等待时间 + # On HTTP 429 rate-limit errors, honour the retry-after header. wait = delay if hasattr(e, 'status_code') and e.status_code == 429: retry_after = None @@ -475,24 +469,23 @@ class ZepToolsService: limit: int = 10, scope: str = "edges" ) -> SearchResult: - """ - 图谱语义搜索 - - 使用混合搜索(语义+BM25)在图谱中搜索相关信息。 - 如果Zep Cloud的search API不可用,则降级为本地关键词匹配。 - + """Semantic graph search. + + Performs a hybrid search (semantic + BM25) over the graph. If the Zep Cloud search + API is unavailable, falls back to local keyword matching. + Args: - graph_id: 图谱ID (Standalone Graph) - query: 搜索查询 - limit: 返回结果数量 - scope: 搜索范围,"edges" 或 "nodes" - + graph_id: Graph identifier (Standalone Graph). + query: Search query. + limit: Maximum number of results to return. + scope: Search scope, either "edges" or "nodes". + Returns: - SearchResult: 搜索结果 + SearchResult: The search result. """ logger.info(t("log.zep_tools.m005", graph_id=graph_id, query=query[:50])) - - # 尝试使用Zep Cloud Search API + + # Try the Zep Cloud Search API first. try: search_results = self._call_with_retry( func=lambda: self.client.graph.search( @@ -508,7 +501,7 @@ class ZepToolsService: edges = [] nodes = [] - # 解析边搜索结果 + # Parse edge search results. if hasattr(search_results, 'edges') and search_results.edges: for edge in search_results.edges: if hasattr(edge, 'fact') and edge.fact: @@ -521,7 +514,7 @@ class ZepToolsService: "target_node_uuid": getattr(edge, 'target_node_uuid', ''), }) - # 解析节点搜索结果 + # Parse node search results. if hasattr(search_results, 'nodes') and search_results.nodes: for node in search_results.nodes: nodes.append({ @@ -530,7 +523,7 @@ class ZepToolsService: "labels": getattr(node, 'labels', []), "summary": getattr(node, 'summary', ''), }) - # 节点摘要也算作事实 + # Treat node summaries as facts too. if hasattr(node, 'summary') and node.summary: facts.append(f"[{node.name}]: {node.summary}") @@ -546,7 +539,7 @@ class ZepToolsService: except Exception as e: logger.warning(t("log.zep_tools.m007", str=str(e))) - # 降级:使用本地关键词匹配搜索 + # Fallback: local keyword-matching search. return self._local_search(graph_id, query, limit, scope) def _local_search( @@ -556,19 +549,18 @@ class ZepToolsService: limit: int = 10, scope: str = "edges" ) -> SearchResult: - """ - 本地关键词匹配搜索(作为Zep Search API的降级方案) - - 获取所有边/节点,然后在本地进行关键词匹配 - + """Local keyword-matching search (fallback for the Zep Search API). + + Loads all edges/nodes and matches them locally on the query keywords. + Args: - graph_id: 图谱ID - query: 搜索查询 - limit: 返回结果数量 - scope: 搜索范围 - + graph_id: Graph identifier. + query: Search query. + limit: Maximum number of results to return. + scope: Search scope. + Returns: - SearchResult: 搜索结果 + SearchResult: The search result. """ logger.info(t("log.zep_tools.m008", query=query[:30])) @@ -576,19 +568,19 @@ class ZepToolsService: edges_result = [] nodes_result = [] - # 提取查询关键词(简单分词) + # Extract query keywords with naive whitespace tokenization. query_lower = query.lower() keywords = [w.strip() for w in query_lower.replace(',', ' ').replace(',', ' ').split() if len(w.strip()) > 1] def match_score(text: str) -> int: - """计算文本与查询的匹配分数""" + """Compute the match score between the text and the query.""" if not text: return 0 text_lower = text.lower() - # 完全匹配查询 + # Exact match against the full query. if query_lower in text_lower: return 100 - # 关键词匹配 + # Per-keyword match. score = 0 for keyword in keywords: if keyword in text_lower: @@ -597,7 +589,7 @@ class ZepToolsService: try: if scope in ["edges", "both"]: - # 获取所有边并匹配 + # Fetch every edge and score each one. all_edges = self.get_all_edges(graph_id) scored_edges = [] for edge in all_edges: @@ -605,7 +597,7 @@ class ZepToolsService: if score > 0: scored_edges.append((score, edge)) - # 按分数排序 + # Sort by score descending. scored_edges.sort(key=lambda x: x[0], reverse=True) for score, edge in scored_edges[:limit]: @@ -620,7 +612,7 @@ class ZepToolsService: }) if scope in ["nodes", "both"]: - # 获取所有节点并匹配 + # Fetch every node and score each one. all_nodes = self.get_all_nodes(graph_id) scored_nodes = [] for node in all_nodes: @@ -654,14 +646,13 @@ class ZepToolsService: ) def get_all_nodes(self, graph_id: str) -> List[NodeInfo]: - """ - 获取图谱的所有节点(分页获取) + """Fetch every node in the graph (with pagination). Args: - graph_id: 图谱ID + graph_id: Graph identifier. Returns: - 节点列表 + List of nodes. """ logger.info(t("log.zep_tools.m011", graph_id=graph_id)) @@ -682,15 +673,14 @@ class ZepToolsService: return result def get_all_edges(self, graph_id: str, include_temporal: bool = True) -> List[EdgeInfo]: - """ - 获取图谱的所有边(分页获取,包含时间信息) + """Fetch every edge in the graph (with pagination), including temporal info. Args: - graph_id: 图谱ID - include_temporal: 是否包含时间信息(默认True) + graph_id: Graph identifier. + include_temporal: Whether to include temporal fields (default True). Returns: - 边列表(包含created_at, valid_at, invalid_at, expired_at) + List of edges, including created_at, valid_at, invalid_at, and expired_at. """ logger.info(t("log.zep_tools.m013", graph_id=graph_id)) @@ -707,7 +697,7 @@ class ZepToolsService: target_node_uuid=edge.target_node_uuid or "" ) - # 添加时间信息 + # Attach temporal info. if include_temporal: edge_info.created_at = getattr(edge, 'created_at', None) edge_info.valid_at = getattr(edge, 'valid_at', None) @@ -720,14 +710,13 @@ class ZepToolsService: return result def get_node_detail(self, node_uuid: str) -> Optional[NodeInfo]: - """ - 获取单个节点的详细信息 - + """Fetch the details of a single node. + Args: - node_uuid: 节点UUID - + node_uuid: Node UUID. + Returns: - 节点信息或None + Node info, or None if not found. """ logger.info(t("log.zep_tools.m015", node_uuid=node_uuid[:8])) @@ -752,27 +741,26 @@ class ZepToolsService: return None def get_node_edges(self, graph_id: str, node_uuid: str) -> List[EdgeInfo]: - """ - 获取节点相关的所有边 - - 通过获取图谱所有边,然后过滤出与指定节点相关的边 - + """Fetch all edges incident to a node. + + Loads every edge in the graph and filters to those connected to the given node. + Args: - graph_id: 图谱ID - node_uuid: 节点UUID - + graph_id: Graph identifier. + node_uuid: Node UUID. + Returns: - 边列表 + List of edges incident to the node. """ logger.info(t("log.zep_tools.m017", node_uuid=node_uuid[:8])) try: - # 获取图谱所有边,然后过滤 + # Load every edge in the graph, then filter. all_edges = self.get_all_edges(graph_id) - + result = [] for edge in all_edges: - # 检查边是否与指定节点相关(作为源或目标) + # Keep the edge if it is incident to this node (as source or target). if edge.source_node_uuid == node_uuid or edge.target_node_uuid == node_uuid: result.append(edge) @@ -788,15 +776,14 @@ class ZepToolsService: graph_id: str, entity_type: str ) -> List[NodeInfo]: - """ - 按类型获取实体 - + """Fetch entities filtered by type. + Args: - graph_id: 图谱ID - entity_type: 实体类型(如 Student, PublicFigure 等) - + graph_id: Graph identifier. + entity_type: Entity type (e.g. Student, PublicFigure). + Returns: - 符合类型的实体列表 + Entities matching the requested type. """ logger.info(t("log.zep_tools.m020", entity_type=entity_type)) @@ -804,7 +791,7 @@ class ZepToolsService: filtered = [] for node in all_nodes: - # 检查labels是否包含指定类型 + # Keep the node if its labels include the requested type. if entity_type in node.labels: filtered.append(node) @@ -816,28 +803,27 @@ class ZepToolsService: graph_id: str, entity_name: str ) -> Dict[str, Any]: - """ - 获取指定实体的关系摘要 - - 搜索与该实体相关的所有信息,并生成摘要 - + """Fetch the relationship summary for an entity. + + Searches for everything related to the entity and assembles a summary. + Args: - graph_id: 图谱ID - entity_name: 实体名称 - + graph_id: Graph identifier. + entity_name: Entity name. + Returns: - 实体摘要信息 + Entity summary information. """ logger.info(t("log.zep_tools.m022", entity_name=entity_name)) - - # 先搜索该实体相关的信息 + + # First, search for information about this entity. search_result = self.search_graph( graph_id=graph_id, query=entity_name, limit=20 ) - # 尝试在所有节点中找到该实体 + # Try to locate the entity in the full node list. all_nodes = self.get_all_nodes(graph_id) entity_node = None for node in all_nodes: @@ -847,7 +833,7 @@ class ZepToolsService: related_edges = [] if entity_node: - # 传入graph_id参数 + # Pass through the graph_id parameter. related_edges = self.get_node_edges(graph_id, entity_node.uuid) return { @@ -859,28 +845,27 @@ class ZepToolsService: } def get_graph_statistics(self, graph_id: str) -> Dict[str, Any]: - """ - 获取图谱的统计信息 - + """Fetch statistics about the graph. + Args: - graph_id: 图谱ID - + graph_id: Graph identifier. + Returns: - 统计信息 + Statistics dictionary. """ logger.info(t("log.zep_tools.m023", graph_id=graph_id)) nodes = self.get_all_nodes(graph_id) edges = self.get_all_edges(graph_id) - # 统计实体类型分布 + # Tally entity type distribution. entity_types = {} for node in nodes: for label in node.labels: if label not in ["Entity", "Node"]: entity_types[label] = entity_types.get(label, 0) + 1 - # 统计关系类型分布 + # Tally relation type distribution. relation_types = {} for edge in edges: relation_types[edge.name] = relation_types.get(edge.name, 0) + 1 @@ -899,35 +884,34 @@ class ZepToolsService: simulation_requirement: str, limit: int = 30 ) -> Dict[str, Any]: - """ - 获取模拟相关的上下文信息 - - 综合搜索与模拟需求相关的所有信息 - + """Fetch simulation-related context. + + Combines a search over the simulation requirement with graph statistics and entities. + Args: - graph_id: 图谱ID - simulation_requirement: 模拟需求描述 - limit: 每类信息的数量限制 - + graph_id: Graph identifier. + simulation_requirement: Description of the simulation requirement. + limit: Per-category result limit. + Returns: - 模拟上下文信息 + Simulation context information. """ logger.info(t("log.zep_tools.m024", simulation_requirement=simulation_requirement[:50])) - - # 搜索与模拟需求相关的信息 + + # Search for information related to the simulation requirement. search_result = self.search_graph( graph_id=graph_id, query=simulation_requirement, limit=limit ) - - # 获取图谱统计 + + # Pull graph statistics. stats = self.get_graph_statistics(graph_id) - - # 获取所有实体节点 + + # Load every entity node. all_nodes = self.get_all_nodes(graph_id) - - # 筛选有实际类型的实体(非纯Entity节点) + + # Keep entities that have a concrete type (skip plain Entity nodes). entities = [] for node in all_nodes: custom_labels = [l for l in node.labels if l not in ["Entity", "Node"]] @@ -942,11 +926,11 @@ class ZepToolsService: "simulation_requirement": simulation_requirement, "related_facts": search_result.facts, "graph_statistics": stats, - "entities": entities[:limit], # 限制数量 + "entities": entities[:limit], # Cap entity count. "total_entities": len(entities) } - # ========== 核心检索工具(优化后) ========== + # ========== Core retrieval tools (optimized) ========== def insight_forge( self, @@ -956,25 +940,25 @@ class ZepToolsService: report_context: str = "", max_sub_queries: int = 5 ) -> InsightForgeResult: - """ - 【InsightForge - 深度洞察检索】 - - 最强大的混合检索函数,自动分解问题并多维度检索: - 1. 使用LLM将问题分解为多个子问题 - 2. 对每个子问题进行语义搜索 - 3. 提取相关实体并获取其详细信息 - 4. 追踪关系链 - 5. 整合所有结果,生成深度洞察 - + """InsightForge - deep-insight retrieval. + + Most powerful hybrid retrieval. Auto-decomposes the user question and searches across + multiple dimensions: + 1. Uses an LLM to decompose the question into sub-questions. + 2. Runs a semantic search for each sub-question. + 3. Extracts related entities and fetches their details. + 4. Traces relationship chains. + 5. Synthesises everything into a deep-insight payload. + Args: - graph_id: 图谱ID - query: 用户问题 - simulation_requirement: 模拟需求描述 - report_context: 报告上下文(可选,用于更精准的子问题生成) - max_sub_queries: 最大子问题数量 - + graph_id: Graph identifier. + query: The user's question. + simulation_requirement: Description of the simulation requirement. + report_context: Report context (optional; used to ground sub-question generation). + max_sub_queries: Maximum number of sub-questions to generate. + Returns: - InsightForgeResult: 深度洞察检索结果 + InsightForgeResult: The deep-insight retrieval result. """ logger.info(t("log.zep_tools.m025", query=query[:50])) @@ -984,7 +968,7 @@ class ZepToolsService: sub_queries=[] ) - # Step 1: 使用LLM生成子问题 + # Step 1: Use the LLM to generate sub-questions. sub_queries = self._generate_sub_queries( query=query, simulation_requirement=simulation_requirement, @@ -994,7 +978,7 @@ class ZepToolsService: result.sub_queries = sub_queries logger.info(t("log.zep_tools.m026", len=len(sub_queries))) - # Step 2: 对每个子问题进行语义搜索 + # Step 2: Run a semantic search for each sub-question. all_facts = [] all_edges = [] seen_facts = set() @@ -1014,7 +998,7 @@ class ZepToolsService: all_edges.extend(search_result.edges) - # 对原始问题也进行搜索 + # Also search using the original question. main_search = self.search_graph( graph_id=graph_id, query=query, @@ -1029,7 +1013,8 @@ class ZepToolsService: result.semantic_facts = all_facts result.total_facts = len(all_facts) - # Step 3: 从边中提取相关实体UUID,只获取这些实体的信息(不获取全部节点) + # Step 3: Pull related entity UUIDs from the edges and only fetch those nodes + # (rather than every node in the graph). entity_uuids = set() for edge_data in all_edges: if isinstance(edge_data, dict): @@ -1040,32 +1025,32 @@ class ZepToolsService: if target_uuid: entity_uuids.add(target_uuid) - # 获取所有相关实体的详情(不限制数量,完整输出) + # Fetch details for every related entity (no cap, emit in full). entity_insights = [] - node_map = {} # 用于后续关系链构建 - - for uuid in list(entity_uuids): # 处理所有实体,不截断 + node_map = {} # Cached for relationship-chain assembly below. + + for uuid in list(entity_uuids): # Walk every related entity, no truncation. if not uuid: continue try: - # 单独获取每个相关节点的信息 + # Fetch each related node individually. node = self.get_node_detail(uuid) if node: node_map[uuid] = node entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体") - - # 获取该实体相关的所有事实(不截断) + + # Collect every fact related to this entity (no truncation). related_facts = [ - f for f in all_facts + f for f in all_facts if node.name.lower() in f.lower() ] - + entity_insights.append({ "uuid": node.uuid, "name": node.name, "type": entity_type, "summary": node.summary, - "related_facts": related_facts # 完整输出,不截断 + "related_facts": related_facts }) except Exception as e: logger.debug(t("log.zep_tools.m027", uuid=uuid, e=e)) @@ -1074,9 +1059,9 @@ class ZepToolsService: result.entity_insights = entity_insights result.total_entities = len(entity_insights) - # Step 4: 构建所有关系链(不限制数量) + # Step 4: Assemble every relationship chain (no cap). relationship_chains = [] - for edge_data in all_edges: # 处理所有边,不截断 + for edge_data in all_edges: # Walk every edge, no truncation. if isinstance(edge_data, dict): source_uuid = edge_data.get('source_node_uuid', '') target_uuid = edge_data.get('target_node_uuid', '') @@ -1102,10 +1087,10 @@ class ZepToolsService: report_context: str = "", max_queries: int = 5 ) -> List[str]: - """ - 使用LLM生成子问题 - - 将复杂问题分解为多个可以独立检索的子问题 + """Use the LLM to generate sub-questions. + + Decomposes a complex question into multiple sub-questions that can be retrieved + independently. """ system_prompt = """你是一个专业的问题分析专家。你的任务是将一个复杂问题分解为多个可以在模拟世界中独立观察的子问题。 @@ -1135,12 +1120,12 @@ class ZepToolsService: ) sub_queries = response.get("sub_queries", []) - # 确保是字符串列表 + # Coerce to a list of strings. return [str(sq) for sq in sub_queries[:max_queries]] - + except Exception as e: logger.warning(t("log.zep_tools.m029", str=str(e))) - # 降级:返回基于原问题的变体 + # Fallback: return variants of the original question. return [ query, f"{query} 的主要参与者", @@ -1155,41 +1140,41 @@ class ZepToolsService: include_expired: bool = True, limit: int = 50 ) -> PanoramaResult: - """ - 【PanoramaSearch - 广度搜索】 - - 获取全貌视图,包括所有相关内容和历史/过期信息: - 1. 获取所有相关节点 - 2. 获取所有边(包括已过期/失效的) - 3. 分类整理当前有效和历史信息 - - 这个工具适用于需要了解事件全貌、追踪演变过程的场景。 - + """PanoramaSearch - breadth search. + + Returns the full picture, including all related content and historical/expired info: + 1. Fetches every related node. + 2. Fetches every edge (including expired/invalidated ones). + 3. Sorts the facts into currently-active and historical buckets. + + Use this tool when callers need to understand the full event landscape or trace how + something evolved over time. + Args: - graph_id: 图谱ID - query: 搜索查询(用于相关性排序) - include_expired: 是否包含过期内容(默认True) - limit: 返回结果数量限制 - + graph_id: Graph identifier. + query: Search query (used for relevance ranking). + include_expired: Whether to include expired content (default True). + limit: Maximum number of results to return. + Returns: - PanoramaResult: 广度搜索结果 + PanoramaResult: The breadth-search result. """ logger.info(t("log.zep_tools.m030", query=query[:50])) result = PanoramaResult(query=query) - # 获取所有节点 + # Fetch every node. all_nodes = self.get_all_nodes(graph_id) node_map = {n.uuid: n for n in all_nodes} result.all_nodes = all_nodes result.total_nodes = len(all_nodes) - # 获取所有边(包含时间信息) + # Fetch every edge (with temporal info). all_edges = self.get_all_edges(graph_id, include_temporal=True) result.all_edges = all_edges result.total_edges = len(all_edges) - # 分类事实 + # Bucket facts into active vs. historical. active_facts = [] historical_facts = [] @@ -1197,24 +1182,24 @@ class ZepToolsService: if not edge.fact: continue - # 为事实添加实体名称 + # Attach entity names to the fact. source_name = node_map.get(edge.source_node_uuid, NodeInfo('', '', [], '', {})).name or edge.source_node_uuid[:8] target_name = node_map.get(edge.target_node_uuid, NodeInfo('', '', [], '', {})).name or edge.target_node_uuid[:8] - # 判断是否过期/失效 + # Decide whether the edge is historical (expired or invalidated). is_historical = edge.is_expired or edge.is_invalid - + if is_historical: - # 历史/过期事实,添加时间标记 + # Historical/expired fact, prepend a time marker. valid_at = edge.valid_at or "未知" invalid_at = edge.invalid_at or edge.expired_at or "未知" fact_with_time = f"[{valid_at} - {invalid_at}] {edge.fact}" historical_facts.append(fact_with_time) else: - # 当前有效事实 + # Currently active fact. active_facts.append(edge.fact) - # 基于查询进行相关性排序 + # Relevance-rank against the query. query_lower = query.lower() keywords = [w.strip() for w in query_lower.replace(',', ' ').replace(',', ' ').split() if len(w.strip()) > 1] @@ -1228,7 +1213,7 @@ class ZepToolsService: score += 10 return score - # 排序并限制数量 + # Sort and apply the result limit. active_facts.sort(key=relevance_score, reverse=True) historical_facts.sort(key=relevance_score, reverse=True) @@ -1246,25 +1231,22 @@ class ZepToolsService: query: str, limit: int = 10 ) -> SearchResult: - """ - 【QuickSearch - 简单搜索】 - - 快速、轻量级的检索工具: - 1. 直接调用Zep语义搜索 - 2. 返回最相关的结果 - 3. 适用于简单、直接的检索需求 - + """QuickSearch - simple, lightweight retrieval. + + Calls Zep's semantic search directly and returns the most relevant results. Use this + for simple, straightforward retrieval needs. + Args: - graph_id: 图谱ID - query: 搜索查询 - limit: 返回结果数量 - + graph_id: Graph identifier. + query: Search query. + limit: Maximum number of results to return. + Returns: - SearchResult: 搜索结果 + SearchResult: The search result. """ logger.info(t("log.zep_tools.m032", query=query[:50])) - - # 直接调用现有的search_graph方法 + + # Delegate to the existing search_graph implementation. result = self.search_graph( graph_id=graph_id, query=query, @@ -1283,32 +1265,38 @@ class ZepToolsService: max_agents: int = 5, custom_questions: List[str] = None ) -> InterviewResult: - """ - 【InterviewAgents - 深度采访】 - - 调用真实的OASIS采访API,采访模拟中正在运行的Agent: - 1. 自动读取人设文件,了解所有模拟Agent - 2. 使用LLM分析采访需求,智能选择最相关的Agent - 3. 使用LLM生成采访问题 - 4. 调用 /api/simulation/interview/batch 接口进行真实采访(双平台同时采访) - 5. 整合所有采访结果,生成采访报告 - - 【重要】此功能需要模拟环境处于运行状态(OASIS环境未关闭) - - 【使用场景】 - - 需要从不同角色视角了解事件看法 - - 需要收集多方意见和观点 - - 需要获取模拟Agent的真实回答(非LLM模拟) - + """InterviewAgents - deep interview. + + Calls the real OASIS interview API and interviews agents that are currently running + in the simulation: + 1. Reads the agent persona file to learn the available simulated agents. + 2. Uses an LLM to analyse the interview requirement and pick the most relevant + agents. + 3. Uses an LLM to generate interview questions. + 4. Calls /api/simulation/interview/batch to run the real interview (across both + Twitter and Reddit platforms simultaneously). + 5. Aggregates the interview responses into a report. + + Important: this requires the simulation environment to be running (the OASIS + environment must not be torn down). + + Use cases: + - Understanding how different roles view an event. + - Collecting opinions from multiple sides. + - Getting genuine responses from simulated agents (rather than LLM-only + simulation). + Args: - simulation_id: 模拟ID(用于定位人设文件和调用采访API) - interview_requirement: 采访需求描述(非结构化,如"了解学生对事件的看法") - simulation_requirement: 模拟需求背景(可选) - max_agents: 最多采访的Agent数量 - custom_questions: 自定义采访问题(可选,若不提供则自动生成) - + simulation_id: Simulation identifier (used to locate persona files and call the + interview API). + interview_requirement: Free-form interview brief (e.g. "understand how students + view the event"). + simulation_requirement: Background context for the simulation (optional). + max_agents: Maximum number of agents to interview. + custom_questions: Custom interview questions (optional; auto-generated if absent). + Returns: - InterviewResult: 采访结果 + InterviewResult: The interview result. """ from .simulation_runner import SimulationRunner @@ -1319,7 +1307,7 @@ class ZepToolsService: interview_questions=custom_questions or [] ) - # Step 1: 读取人设文件 + # Step 1: Load the persona file. profiles = self._load_agent_profiles(simulation_id) if not profiles: @@ -1330,7 +1318,7 @@ class ZepToolsService: result.total_agents = len(profiles) logger.info(t("log.zep_tools.m036", len=len(profiles))) - # Step 2: 使用LLM选择要采访的Agent(返回agent_id列表) + # Step 2: Use the LLM to pick interview targets (returns a list of agent IDs). selected_agents, selected_indices, selection_reasoning = self._select_agents_for_interview( profiles=profiles, interview_requirement=interview_requirement, @@ -1342,7 +1330,7 @@ class ZepToolsService: result.selection_reasoning = selection_reasoning logger.info(t("log.zep_tools.m037", len=len(selected_agents), selected_indices=selected_indices)) - # Step 3: 生成采访问题(如果没有提供) + # Step 3: Generate interview questions (if none were supplied). if not result.interview_questions: result.interview_questions = self._generate_interview_questions( interview_requirement=interview_requirement, @@ -1351,10 +1339,10 @@ class ZepToolsService: ) logger.info(t("log.zep_tools.m038", len=len(result.interview_questions))) - # 将问题合并为一个采访prompt + # Merge the questions into a single interview prompt. combined_prompt = "\n".join([f"{i+1}. {q}" for i, q in enumerate(result.interview_questions)]) - - # 添加优化前缀,约束Agent回复格式 + + # Prepend an optimised prefix that constrains the agent's reply format. INTERVIEW_PROMPT_PREFIX = ( "你正在接受一次采访。请结合你的人设、所有的过往记忆与行动," "以纯文本方式直接回答以下问题。\n" @@ -1368,38 +1356,39 @@ class ZepToolsService: ) optimized_prompt = f"{INTERVIEW_PROMPT_PREFIX}{combined_prompt}" - # Step 4: 调用真实的采访API(不指定platform,默认双平台同时采访) + # Step 4: Call the real interview API. We omit the platform field so the API + # interviews on both Twitter and Reddit by default. try: - # 构建批量采访列表(不指定platform,双平台采访) + # Build the batch-interview list (no platform => both platforms). interviews_request = [] for agent_idx in selected_indices: interviews_request.append({ "agent_id": agent_idx, - "prompt": optimized_prompt # 使用优化后的prompt - # 不指定platform,API会在twitter和reddit两个平台都采访 + "prompt": optimized_prompt + # Omitting platform asks the API to interview on both Twitter and Reddit. }) logger.info(t("log.zep_tools.m039", len=len(interviews_request))) - # 调用 SimulationRunner 的批量采访方法(不传platform,双平台采访) + # Call SimulationRunner's batch interview helper (no platform => both platforms). api_result = SimulationRunner.interview_agents_batch( simulation_id=simulation_id, interviews=interviews_request, - platform=None, # 不指定platform,双平台采访 - timeout=180.0 # 双平台需要更长超时 + platform=None, # Omitting platform interviews both Twitter and Reddit. + timeout=180.0 # Dual-platform mode needs a longer timeout. ) logger.info(t("log.zep_tools.m040", api_result=api_result.get('interviews_count', 0), api_result_2=api_result.get('success'))) - # 检查API调用是否成功 + # Check whether the API call succeeded. if not api_result.get("success", False): error_msg = api_result.get("error", "未知错误") logger.warning(t("log.zep_tools.m041", error_msg=error_msg)) result.summary = f"采访API调用失败:{error_msg}。请检查OASIS模拟环境状态。" return result - # Step 5: 解析API返回结果,构建AgentInterview对象 - # 双平台模式返回格式: {"twitter_0": {...}, "reddit_0": {...}, "twitter_1": {...}, ...} + # Step 5: Parse the API response and build AgentInterview objects. + # Dual-platform shape: {"twitter_0": {...}, "reddit_0": {...}, "twitter_1": {...}, ...} api_data = api_result.get("result", {}) results_dict = api_data.get("results", {}) if isinstance(api_data, dict) else {} @@ -1409,34 +1398,34 @@ class ZepToolsService: agent_role = agent.get("profession", "未知") agent_bio = agent.get("bio", "") - # 获取该Agent在两个平台的采访结果 + # Fetch this agent's responses from both platforms. twitter_result = results_dict.get(f"twitter_{agent_idx}", {}) reddit_result = results_dict.get(f"reddit_{agent_idx}", {}) twitter_response = twitter_result.get("response", "") reddit_response = reddit_result.get("response", "") - # 清理可能的工具调用 JSON 包裹 + # Strip any tool-call JSON wrapper from the agent's reply. twitter_response = self._clean_tool_call_response(twitter_response) reddit_response = self._clean_tool_call_response(reddit_response) - # 始终输出双平台标记 + # Always emit both platform headers, even when one platform is empty. twitter_text = twitter_response if twitter_response else "(该平台未获得回复)" reddit_text = reddit_response if reddit_response else "(该平台未获得回复)" response_text = f"【Twitter平台回答】\n{twitter_text}\n\n【Reddit平台回答】\n{reddit_text}" - # 提取关键引言(从两个平台的回答中) + # Extract key quotes from the responses on both platforms. import re combined_responses = f"{twitter_response} {reddit_response}" - # 清理响应文本:去掉标记、编号、Markdown 等干扰 + # Clean up the response text: drop markers, numbering, Markdown noise. clean_text = re.sub(r'#{1,6}\s+', '', combined_responses) clean_text = re.sub(r'\{[^}]*tool_name[^}]*\}', '', clean_text) clean_text = re.sub(r'[*_`|>~\-]{2,}', '', clean_text) clean_text = re.sub(r'问题\d+[::]\s*', '', clean_text) clean_text = re.sub(r'【[^】]+】', '', clean_text) - # 策略1(主): 提取完整的有实质内容的句子 + # Primary strategy: extract complete sentences with substantive content. sentences = re.split(r'[。!?]', clean_text) meaningful = [ s.strip() for s in sentences @@ -1447,7 +1436,7 @@ class ZepToolsService: meaningful.sort(key=len, reverse=True) key_quotes = [s + "。" for s in meaningful[:3]] - # 策略2(补充): 正确配对的中文引号「」内长文本 + # Fallback strategy: long text inside properly paired CJK quotation marks「」. if not key_quotes: paired = re.findall(r'\u201c([^\u201c\u201d]{15,100})\u201d', clean_text) paired += re.findall(r'\u300c([^\u300c\u300d]{15,100})\u300d', clean_text) @@ -1456,7 +1445,7 @@ class ZepToolsService: interview = AgentInterview( agent_name=agent_name, agent_role=agent_role, - agent_bio=agent_bio[:1000], # 扩大bio长度限制 + agent_bio=agent_bio[:1000], # Allow a longer bio than the default limit. question=combined_prompt, response=response_text, key_quotes=key_quotes[:5] @@ -1466,7 +1455,7 @@ class ZepToolsService: result.interviewed_count = len(result.interviews) except ValueError as e: - # 模拟环境未运行 + # Simulation environment is not running. logger.warning(t("log.zep_tools.m042", e=e)) result.summary = f"采访失败:{str(e)}。模拟环境可能已关闭,请确保OASIS环境正在运行。" return result @@ -1477,7 +1466,7 @@ class ZepToolsService: result.summary = f"采访过程发生错误:{str(e)}" return result - # Step 6: 生成采访摘要 + # Step 6: Generate the interview summary. if result.interviews: result.summary = self._generate_interview_summary( interviews=result.interviews, @@ -1489,7 +1478,7 @@ class ZepToolsService: @staticmethod def _clean_tool_call_response(response: str) -> str: - """清理 Agent 回复中的 JSON 工具调用包裹,提取实际内容""" + """Strip the JSON tool-call wrapper from an agent reply and return the inner content.""" if not response or not response.strip().startswith('{'): return response text = response.strip() @@ -1509,11 +1498,11 @@ class ZepToolsService: return response def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]: - """加载模拟的Agent人设文件""" + """Load the agent persona file for a simulation.""" import os import csv - - # 构建人设文件路径 + + # Build the persona file path. sim_dir = os.path.join( os.path.dirname(__file__), f'../../uploads/simulations/{simulation_id}' @@ -1521,7 +1510,7 @@ class ZepToolsService: profiles = [] - # 优先尝试读取Reddit JSON格式 + # Prefer the Reddit JSON profile if it exists. reddit_profile_path = os.path.join(sim_dir, "reddit_profiles.json") if os.path.exists(reddit_profile_path): try: @@ -1532,14 +1521,14 @@ class ZepToolsService: except Exception as e: logger.warning(t("log.zep_tools.m046", e=e)) - # 尝试读取Twitter CSV格式 + # Otherwise fall back to the Twitter CSV profile. twitter_profile_path = os.path.join(sim_dir, "twitter_profiles.csv") if os.path.exists(twitter_profile_path): try: with open(twitter_profile_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) for row in reader: - # CSV格式转换为统一格式 + # Convert each CSV row into the unified profile shape. profiles.append({ "realname": row.get("name", ""), "username": row.get("username", ""), @@ -1561,17 +1550,16 @@ class ZepToolsService: simulation_requirement: str, max_agents: int ) -> tuple: - """ - 使用LLM选择要采访的Agent - + """Use the LLM to choose which agents to interview. + Returns: - tuple: (selected_agents, selected_indices, reasoning) - - selected_agents: 选中Agent的完整信息列表 - - selected_indices: 选中Agent的索引列表(用于API调用) - - reasoning: 选择理由 + tuple: ``(selected_agents, selected_indices, reasoning)`` where + - ``selected_agents`` is the full profile list for the chosen agents, + - ``selected_indices`` is the list of indices to pass to the API, + - ``reasoning`` explains why those agents were chosen. """ - - # 构建Agent摘要列表 + + # Build a compact summary list of every candidate agent. agent_summaries = [] for i, profile in enumerate(profiles): summary = { @@ -1620,7 +1608,7 @@ class ZepToolsService: selected_indices = response.get("selected_indices", [])[:max_agents] reasoning = response.get("reasoning", "基于相关性自动选择") - # 获取选中的Agent完整信息 + # Pull the full profile for each chosen agent. selected_agents = [] valid_indices = [] for idx in selected_indices: @@ -1632,7 +1620,7 @@ class ZepToolsService: except Exception as e: logger.warning(t("log.zep_tools.m049", e=e)) - # 降级:选择前N个 + # Fallback: pick the first N profiles. selected = profiles[:max_agents] indices = list(range(min(max_agents, len(profiles)))) return selected, indices, "使用默认选择策略" @@ -1643,8 +1631,8 @@ class ZepToolsService: simulation_requirement: str, selected_agents: List[Dict[str, Any]] ) -> List[str]: - """使用LLM生成采访问题""" - + """Use the LLM to generate interview questions.""" + agent_roles = [a.get("profession", "未知") for a in selected_agents] system_prompt = """你是一个专业的记者/采访者。根据采访需求,生成3-5个深度采访问题。 @@ -1691,12 +1679,12 @@ class ZepToolsService: interviews: List[AgentInterview], interview_requirement: str ) -> str: - """生成采访摘要""" - + """Generate the interview summary.""" + if not interviews: return "未完成任何采访" - - # 收集所有采访内容 + + # Gather every interview excerpt. interview_texts = [] for interview in interviews: interview_texts.append(f"【{interview.agent_name}({interview.agent_role})】\n{interview.response[:500]}") @@ -1737,5 +1725,5 @@ class ZepToolsService: except Exception as e: logger.warning(t("log.zep_tools.m051", e=e)) - # 降级:简单拼接 + # Fallback: simple concatenation of agent names. return f"共采访了{len(interviews)}位受访者,包括:" + "、".join([i.agent_name for i in interviews]) diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py index e70161ac..5f13955e 100644 --- a/backend/app/utils/__init__.py +++ b/backend/app/utils/__init__.py @@ -1,6 +1,4 @@ -""" -工具模块 -""" +"""Backend utilities package.""" from .file_parser import FileParser from .llm_client import LLMClient diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py index 3f1d8ed2..fbe42acf 100644 --- a/backend/app/utils/file_parser.py +++ b/backend/app/utils/file_parser.py @@ -1,6 +1,6 @@ -""" -文件解析工具 -支持PDF、Markdown、TXT文件的文本提取 +"""File parsing utilities. + +Supports text extraction from PDF, Markdown, and plain-text files. """ import os @@ -9,30 +9,27 @@ from typing import List, Optional def _read_text_with_fallback(file_path: str) -> str: - """ - 读取文本文件,UTF-8失败时自动探测编码。 - - 采用多级回退策略: - 1. 首先尝试 UTF-8 解码 - 2. 使用 charset_normalizer 检测编码 - 3. 回退到 chardet 检测编码 - 4. 最终使用 UTF-8 + errors='replace' 兜底 - + """Read a text file, falling back through encoding detectors when UTF-8 fails. + + Multi-stage fallback strategy: + 1. Try UTF-8 first. + 2. Use ``charset_normalizer`` to detect the encoding. + 3. Fall back to ``chardet``. + 4. Last resort: decode with UTF-8 + ``errors='replace'``. + Args: - file_path: 文件路径 - + file_path: Path to the file to read. + Returns: - 解码后的文本内容 + The decoded text content. """ data = Path(file_path).read_bytes() - - # 首先尝试 UTF-8 + try: return data.decode('utf-8') except UnicodeDecodeError: pass - - # 尝试使用 charset_normalizer 检测编码 + encoding = None try: from charset_normalizer import from_bytes @@ -41,8 +38,7 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = best.encoding except Exception: pass - - # 回退到 chardet + if not encoding: try: import chardet @@ -50,89 +46,86 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = result.get('encoding') if result else None except Exception: pass - - # 最终兜底:使用 UTF-8 + replace + if not encoding: encoding = 'utf-8' - + return data.decode(encoding, errors='replace') class FileParser: - """文件解析器""" - + """Parser for the supported document formats.""" + SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} - + @classmethod def extract_text(cls, file_path: str) -> str: - """ - 从文件中提取文本 - + """Extract plain text from a single supported file. + Args: - file_path: 文件路径 - + file_path: Path to the file. + Returns: - 提取的文本内容 + The extracted text content. """ path = Path(file_path) - + if not path.exists(): raise FileNotFoundError(f"文件不存在: {file_path}") - + suffix = path.suffix.lower() - + if suffix not in cls.SUPPORTED_EXTENSIONS: raise ValueError(f"不支持的文件格式: {suffix}") - + if suffix == '.pdf': return cls._extract_from_pdf(file_path) elif suffix in {'.md', '.markdown'}: return cls._extract_from_md(file_path) elif suffix == '.txt': return cls._extract_from_txt(file_path) - + raise ValueError(f"无法处理的文件格式: {suffix}") - + @staticmethod def _extract_from_pdf(file_path: str) -> str: - """从PDF提取文本""" + """Extract text from a PDF file using PyMuPDF.""" try: import fitz # PyMuPDF except ImportError: raise ImportError("需要安装PyMuPDF: pip install PyMuPDF") - + text_parts = [] with fitz.open(file_path) as doc: for page in doc: text = page.get_text() if text.strip(): text_parts.append(text) - + return "\n\n".join(text_parts) - + @staticmethod def _extract_from_md(file_path: str) -> str: - """从Markdown提取文本,支持自动编码检测""" + """Extract text from a Markdown file with automatic encoding detection.""" return _read_text_with_fallback(file_path) - + @staticmethod def _extract_from_txt(file_path: str) -> str: - """从TXT提取文本,支持自动编码检测""" + """Extract text from a plain-text file with automatic encoding detection.""" return _read_text_with_fallback(file_path) - + @classmethod def extract_from_multiple(cls, file_paths: List[str]) -> str: - """ - 从多个文件提取文本并合并 - + """Extract and concatenate text from multiple files. + Args: - file_paths: 文件路径列表 - + file_paths: Paths of files to read. + Returns: - 合并后的文本 + The merged text, with per-file headers separating each section. """ all_texts = [] - + for i, file_path in enumerate(file_paths, 1): try: text = cls.extract_text(file_path) @@ -140,50 +133,48 @@ class FileParser: all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}") except Exception as e: all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===") - + return "\n\n".join(all_texts) def split_text_into_chunks( - text: str, - chunk_size: int = 500, + text: str, + chunk_size: int = 500, overlap: int = 50 ) -> List[str]: - """ - 将文本分割成小块 - + """Split text into overlapping chunks. + Args: - text: 原始文本 - chunk_size: 每块的字符数 - overlap: 重叠字符数 - + text: The source text to split. + chunk_size: Target characters per chunk. + overlap: Number of characters overlapping between consecutive chunks. + Returns: - 文本块列表 + A list of chunk strings. """ if len(text) <= chunk_size: return [text] if text.strip() else [] - + chunks = [] start = 0 - + while start < len(text): end = start + chunk_size - - # 尝试在句子边界处分割 + + # Prefer splitting on a sentence boundary near the chunk end if end < len(text): - # 查找最近的句子结束符 for sep in ['。', '!', '?', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']: last_sep = text[start:end].rfind(sep) if last_sep != -1 and last_sep > chunk_size * 0.3: end = start + last_sep + len(sep) break - + chunk = text[start:end].strip() if chunk: chunks.append(chunk) - - # 下一个块从重叠位置开始 + + # Next chunk starts at the overlap point start = end - overlap if end < len(text) else len(text) - + return chunks diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index ae33afbe..c65b1d12 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -1,6 +1,6 @@ -""" -LLM客户端封装 -统一使用OpenAI格式调用 +"""LLM client wrapper. + +All providers are called through the OpenAI-compatible API surface. """ import json @@ -13,7 +13,7 @@ from ..config import Config class LLMClient: - """LLM客户端""" + """Thin wrapper around the OpenAI-compatible chat completions API.""" def __init__( self, @@ -37,17 +37,16 @@ class LLMClient: max_tokens: int = 4096, response_format: Optional[Dict] = None, ) -> str: - """ - 发送聊天请求 + """Send a chat completion request. Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 - response_format: 响应格式(如JSON模式) + messages: Chat messages in OpenAI format. + temperature: Sampling temperature. + max_tokens: Maximum number of tokens to generate. + response_format: Optional response format hint (e.g. JSON mode). Returns: - 模型响应文本 + The assistant's response text. """ kwargs = { "model": self.model, @@ -61,7 +60,7 @@ class LLMClient: response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content - # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 + # Some reasoning models (e.g. MiniMax M2.5) embed ... blocks; strip them. content = re.sub(r"[\s\S]*?", "", content).strip() return content @@ -79,7 +78,7 @@ class LLMClient: messages=messages, temperature=temperature, max_tokens=max_tokens ) - # 清理markdown代码块标记 + # Strip surrounding markdown code-fence markers if present. cleaned_response = response.strip() cleaned_response = re.sub( r"^```(?:json)?\s*\n?", "", cleaned_response, flags=re.IGNORECASE diff --git a/backend/app/utils/logger.py b/backend/app/utils/logger.py index 1978c0b8..16caebfb 100644 --- a/backend/app/utils/logger.py +++ b/backend/app/utils/logger.py @@ -1,6 +1,7 @@ -""" -日志配置模块 -提供统一的日志管理,同时输出到控制台和文件 +"""Logger configuration module. + +Provides unified logging that writes simultaneously to the console and a +rotating log file. """ import os @@ -11,59 +12,55 @@ from logging.handlers import RotatingFileHandler def _ensure_utf8_stdout(): - """ - 确保 stdout/stderr 使用 UTF-8 编码 - 解决 Windows 控制台中文乱码问题 + """Force stdout/stderr to UTF-8. + + Fixes garbled non-ASCII output on the Windows console. """ if sys.platform == 'win32': - # Windows 下重新配置标准输出为 UTF-8 + # On Windows, reconfigure the standard streams to UTF-8. if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 日志目录 +# Directory that holds rotated log files. LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging.Logger: - """ - 设置日志器 - + """Configure and return a logger. + Args: - name: 日志器名称 - level: 日志级别 - + name: Logger name. + level: Minimum log level for the logger. + Returns: - 配置好的日志器 + The configured logger. """ - # 确保日志目录存在 os.makedirs(LOG_DIR, exist_ok=True) - - # 创建日志器 + logger = logging.getLogger(name) logger.setLevel(level) - - # 阻止日志向上传播到根 logger,避免重复输出 + + # Prevent propagation to the root logger to avoid duplicate output. logger.propagate = False - - # 如果已经有处理器,不重复添加 + + # If handlers are already attached, do not re-add them. if logger.handlers: return logger - - # 日志格式 + detailed_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) - + simple_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) - - # 1. 文件处理器 - 详细日志(按日期命名,带轮转) + + # 1. File handler — detailed log, named by date and rotated by size. log_filename = datetime.now().strftime('%Y-%m-%d') + '.log' file_handler = RotatingFileHandler( os.path.join(LOG_DIR, log_filename), @@ -73,30 +70,28 @@ def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging. ) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(detailed_formatter) - - # 2. 控制台处理器 - 简洁日志(INFO及以上) - # 确保 Windows 下使用 UTF-8 编码,避免中文乱码 + + # 2. Console handler — concise log, INFO and above. + # Ensure UTF-8 on Windows so non-ASCII characters render correctly. _ensure_utf8_stdout() console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) console_handler.setFormatter(simple_formatter) - - # 添加处理器 + logger.addHandler(file_handler) logger.addHandler(console_handler) - + return logger def get_logger(name: str = 'mirofish') -> logging.Logger: - """ - 获取日志器(如果不存在则创建) - + """Return an existing logger by name, creating it lazily if needed. + Args: - name: 日志器名称 - + name: Logger name. + Returns: - 日志器实例 + The logger instance. """ logger = logging.getLogger(name) if not logger.handlers: @@ -104,11 +99,11 @@ def get_logger(name: str = 'mirofish') -> logging.Logger: return logger -# 创建默认日志器 +# Default module-level logger. logger = setup_logger() -# 便捷方法 +# Convenience module-level helpers. def debug(msg, *args, **kwargs): logger.debug(msg, *args, **kwargs) diff --git a/backend/app/utils/retry.py b/backend/app/utils/retry.py index 23ecd45c..1eb2b48f 100644 --- a/backend/app/utils/retry.py +++ b/backend/app/utils/retry.py @@ -1,6 +1,7 @@ -""" -API调用重试机制 -用于处理LLM等外部API调用的重试逻辑 +"""API call retry primitives. + +Helpers for retrying calls to external APIs (LLMs, etc.) with exponential +backoff and jitter. """ import time @@ -22,18 +23,17 @@ def retry_with_backoff( exceptions: Tuple[Type[Exception], ...] = (Exception,), on_retry: Optional[Callable[[Exception, int], None]] = None ): - """ - 带指数退避的重试装饰器 - + """Decorator that retries a callable with exponential backoff. + Args: - max_retries: 最大重试次数 - initial_delay: 初始延迟(秒) - max_delay: 最大延迟(秒) - backoff_factor: 退避因子 - jitter: 是否添加随机抖动 - exceptions: 需要重试的异常类型 - on_retry: 重试时的回调函数 (exception, retry_count) - + max_retries: Maximum number of retries before giving up. + initial_delay: Initial delay in seconds before the first retry. + max_delay: Cap on the delay between retries (seconds). + backoff_factor: Multiplicative factor applied to the delay each retry. + jitter: When ``True``, randomize the delay to avoid thundering herd. + exceptions: Exception types that should trigger a retry. + on_retry: Optional callback invoked on each retry as ``(exception, retry_count)``. + Usage: @retry_with_backoff(max_retries=3) def call_llm_api(): @@ -61,7 +61,7 @@ def retry_with_backoff( )) raise - # 计算延迟 + # Compute the next delay, capped at ``max_delay``. current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) @@ -92,9 +92,7 @@ def retry_with_backoff_async( exceptions: Tuple[Type[Exception], ...] = (Exception,), on_retry: Optional[Callable[[Exception, int], None]] = None ): - """ - 异步版本的重试装饰器 - """ + """Async variant of :func:`retry_with_backoff`.""" import asyncio def decorator(func: Callable) -> Callable: @@ -141,9 +139,7 @@ def retry_with_backoff_async( class RetryableAPIClient: - """ - 可重试的API客户端封装 - """ + """Class-based wrapper around the retry helpers.""" def __init__( self, @@ -164,17 +160,16 @@ class RetryableAPIClient: exceptions: Tuple[Type[Exception], ...] = (Exception,), **kwargs ) -> Any: - """ - 执行函数调用并在失败时重试 - + """Invoke ``func`` with retry on failure. + Args: - func: 要调用的函数 - *args: 函数参数 - exceptions: 需要重试的异常类型 - **kwargs: 函数关键字参数 - + func: Callable to invoke. + *args: Positional arguments forwarded to ``func``. + exceptions: Exception types that should trigger a retry. + **kwargs: Keyword arguments forwarded to ``func``. + Returns: - 函数返回值 + The value returned by ``func``. """ last_exception = None delay = self.initial_delay @@ -214,17 +209,17 @@ class RetryableAPIClient: exceptions: Tuple[Type[Exception], ...] = (Exception,), continue_on_failure: bool = True ) -> Tuple[list, list]: - """ - 批量调用并对每个失败项单独重试 - + """Process ``items`` in sequence, retrying each independently on failure. + Args: - items: 要处理的项目列表 - process_func: 处理函数,接收单个item作为参数 - exceptions: 需要重试的异常类型 - continue_on_failure: 单项失败后是否继续处理其他项 - + items: Items to process. + process_func: Callable invoked once per item. + exceptions: Exception types that should trigger a retry. + continue_on_failure: When ``True``, keep processing remaining items after a failure. + Returns: - (成功结果列表, 失败项列表) + ``(successes, failures)`` — a list of successful results and a list + of failure descriptors ``{"index", "item", "error"}``. """ results = [] failures = [] diff --git a/backend/app/utils/zep_paging.py b/backend/app/utils/zep_paging.py index eb68d4eb..cc149046 100644 --- a/backend/app/utils/zep_paging.py +++ b/backend/app/utils/zep_paging.py @@ -1,7 +1,8 @@ -"""Zep Graph 分页读取工具。 +"""Zep Graph paging helpers. -Zep 的 node/edge 列表接口使用 UUID cursor 分页, -本模块封装自动翻页逻辑(含单页重试),对调用方透明地返回完整列表。 +Zep's node/edge list APIs paginate with a UUID cursor. This module wraps the +auto-paging loop (including per-page retry) so callers see the full list +transparently. """ from __future__ import annotations @@ -30,7 +31,7 @@ def _fetch_page_with_retry( page_description: str = "page", **kwargs: Any, ) -> list[Any]: - """单页请求,失败时指数退避重试。自动处理429限速。""" + """Fetch one page, retrying with exponential backoff. Handles 429 rate limits.""" if max_retries < 1: raise ValueError("max_retries must be >= 1") @@ -43,7 +44,7 @@ def _fetch_page_with_retry( except Exception as e: last_exception = e if attempt < max_retries - 1: - # 检测429限速,使用retry-after头部指定的等待时间 + # If a 429 rate limit is detected, prefer the retry-after header for the wait. wait = delay logger.warning( f"Zep {page_description} attempt {attempt + 1} failed: {str(e)[:100]}, retrying in {wait:.1f}s..." @@ -65,7 +66,7 @@ def fetch_all_nodes( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱节点,最多返回 max_items 条(默认 2000)。每页请求自带重试。""" + """Page through graph nodes; return at most ``max_items`` (default 2000). Each page is retried internally.""" all_nodes: list[Any] = [] cursor: str | None = None page_num = 0 @@ -110,7 +111,7 @@ def fetch_all_edges( max_retries: int = _DEFAULT_MAX_RETRIES, retry_delay: float = _DEFAULT_RETRY_DELAY, ) -> list[Any]: - """分页获取图谱所有边,返回完整列表。每页请求自带重试。""" + """Page through every graph edge and return the full list. Each page is retried internally.""" all_edges: list[Any] = [] cursor: str | None = None page_num = 0 diff --git a/backend/run.py b/backend/run.py index 4e3b04fa..2d2e7cd4 100644 --- a/backend/run.py +++ b/backend/run.py @@ -1,21 +1,20 @@ -""" -MiroFish Backend 启动入口 -""" +"""MiroFish backend entry point.""" import os import sys -# 解决 Windows 控制台中文乱码问题:在所有导入之前设置 UTF-8 编码 +# Force UTF-8 on Windows console before importing anything that might write to +# stdout/stderr; otherwise non-ASCII characters render as mojibake. if sys.platform == 'win32': - # 设置环境变量确保 Python 使用 UTF-8 + # Make sure Python itself uses UTF-8. os.environ.setdefault('PYTHONIOENCODING', 'utf-8') - # 重新配置标准输出流为 UTF-8 + # Reconfigure the standard streams to UTF-8. if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 添加项目根目录到路径 +# Add the project root to sys.path so the ``app`` package resolves. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from app import create_app @@ -23,8 +22,7 @@ from app.config import Config def main(): - """主函数""" - # 验证配置 + """Validate configuration and start the Flask development server.""" errors = Config.validate() if errors: print("配置错误:") @@ -32,19 +30,16 @@ def main(): print(f" - {err}") print("\n请检查 .env 文件中的配置") sys.exit(1) - - # 创建应用 + app = create_app() - - # 获取运行配置 + + # Resolve runtime host/port from the environment. host = os.environ.get('FLASK_HOST', '0.0.0.0') port = int(os.environ.get('FLASK_PORT', 5001)) debug = Config.DEBUG - - # 启动服务 + app.run(host=host, port=port, debug=debug, threaded=True) if __name__ == '__main__': main() - diff --git a/backend/scripts/action_logger.py b/backend/scripts/action_logger.py index 38d025a6..bea32e20 100644 --- a/backend/scripts/action_logger.py +++ b/backend/scripts/action_logger.py @@ -1,15 +1,17 @@ -""" -动作日志记录器 -用于记录OASIS模拟中每个Agent的动作,供后端监控使用 +"""Action logger. + +Records each agent action during an OASIS simulation so the backend can +monitor progress. + +Log layout:: -日志结构: sim_xxx/ ├── twitter/ - │ └── actions.jsonl # Twitter 平台动作日志 + │ └── actions.jsonl # Twitter action log ├── reddit/ - │ └── actions.jsonl # Reddit 平台动作日志 - ├── simulation.log # 主模拟进程日志 - └── run_state.json # 运行状态(API 查询用) + │ └── actions.jsonl # Reddit action log + ├── simulation.log # main simulation process log + └── run_state.json # run state (queried by the API) """ import json @@ -20,26 +22,25 @@ from typing import Dict, Any, Optional class PlatformActionLogger: - """单平台动作日志记录器""" - + """Per-platform action logger.""" + def __init__(self, platform: str, base_dir: str): - """ - 初始化日志记录器 - + """Initialize the logger. + Args: - platform: 平台名称 (twitter/reddit) - base_dir: 模拟目录的基础路径 + platform: Platform name (``twitter`` or ``reddit``). + base_dir: Base path of the simulation directory. """ self.platform = platform self.base_dir = base_dir self.log_dir = os.path.join(base_dir, platform) self.log_path = os.path.join(self.log_dir, "actions.jsonl") self._ensure_dir() - + def _ensure_dir(self): - """确保目录存在""" + """Ensure the log directory exists.""" os.makedirs(self.log_dir, exist_ok=True) - + def log_action( self, round_num: int, @@ -50,7 +51,7 @@ class PlatformActionLogger: result: Optional[str] = None, success: bool = True ): - """记录一个动作""" + """Append a single action record.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), @@ -61,36 +62,36 @@ class PlatformActionLogger: "result": result, "success": success, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_start(self, round_num: int, simulated_hour: int): - """记录轮次开始""" + """Append a round-start marker.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), "event_type": "round_start", "simulated_hour": simulated_hour, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_end(self, round_num: int, actions_count: int): - """记录轮次结束""" + """Append a round-end marker.""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), "event_type": "round_end", "actions_count": actions_count, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_start(self, config: Dict[str, Any]): - """记录模拟开始""" + """Append a simulation-start marker.""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_start", @@ -98,12 +99,12 @@ class PlatformActionLogger: "total_rounds": config.get("time_config", {}).get("total_simulation_hours", 72) * 2, "agents_count": len(config.get("agent_configs", [])), } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_end(self, total_rounds: int, total_actions: int): - """记录模拟结束""" + """Append a simulation-end marker.""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_end", @@ -111,42 +112,42 @@ class PlatformActionLogger: "total_rounds": total_rounds, "total_actions": total_actions, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') class SimulationLogManager: + """Top-level log manager. + + Owns and dispatches to the per-platform action loggers, and exposes a + main process logger for non-action messages. """ - 模拟日志管理器 - 统一管理所有日志文件,按平台分离 - """ - + def __init__(self, simulation_dir: str): - """ - 初始化日志管理器 - + """Initialize the log manager. + Args: - simulation_dir: 模拟目录路径 + simulation_dir: Path to the simulation directory. """ self.simulation_dir = simulation_dir self.twitter_logger: Optional[PlatformActionLogger] = None self.reddit_logger: Optional[PlatformActionLogger] = None self._main_logger: Optional[logging.Logger] = None - - # 设置主日志 + + # Configure the main process logger. self._setup_main_logger() - + def _setup_main_logger(self): - """设置主模拟日志""" + """Configure the main simulation log.""" log_path = os.path.join(self.simulation_dir, "simulation.log") - - # 创建 logger + + # Build the logger. self._main_logger = logging.getLogger(f"simulation.{os.path.basename(self.simulation_dir)}") self._main_logger.setLevel(logging.INFO) self._main_logger.handlers.clear() - - # 文件处理器 + + # File handler. file_handler = logging.FileHandler(log_path, encoding='utf-8', mode='w') file_handler.setLevel(logging.INFO) file_handler.setFormatter(logging.Formatter( @@ -154,8 +155,8 @@ class SimulationLogManager: datefmt='%Y-%m-%d %H:%M:%S' )) self._main_logger.addHandler(file_handler) - - # 控制台处理器 + + # Console handler. console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter( @@ -163,56 +164,56 @@ class SimulationLogManager: datefmt='%H:%M:%S' )) self._main_logger.addHandler(console_handler) - + self._main_logger.propagate = False - + def get_twitter_logger(self) -> PlatformActionLogger: - """获取 Twitter 平台日志记录器""" + """Lazily construct and return the Twitter platform logger.""" if self.twitter_logger is None: self.twitter_logger = PlatformActionLogger("twitter", self.simulation_dir) return self.twitter_logger - + def get_reddit_logger(self) -> PlatformActionLogger: - """获取 Reddit 平台日志记录器""" + """Lazily construct and return the Reddit platform logger.""" if self.reddit_logger is None: self.reddit_logger = PlatformActionLogger("reddit", self.simulation_dir) return self.reddit_logger - + def log(self, message: str, level: str = "info"): - """记录主日志""" + """Forward a message to the main logger at the given level.""" if self._main_logger: getattr(self._main_logger, level.lower(), self._main_logger.info)(message) - + def info(self, message: str): self.log(message, "info") - + def warning(self, message: str): self.log(message, "warning") - + def error(self, message: str): self.log(message, "error") - + def debug(self, message: str): self.log(message, "debug") -# ============ 兼容旧接口 ============ +# ============ Legacy interface ============ class ActionLogger: + """Legacy single-platform action logger. + + Prefer :class:`SimulationLogManager` for new code. """ - 动作日志记录器(兼容旧接口) - 建议使用 SimulationLogManager 代替 - """ - + def __init__(self, log_path: str): self.log_path = log_path self._ensure_dir() - + def _ensure_dir(self): log_dir = os.path.dirname(self.log_path) if log_dir: os.makedirs(log_dir, exist_ok=True) - + def log_action( self, round_num: int, @@ -235,10 +236,10 @@ class ActionLogger: "result": result, "success": success, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_start(self, round_num: int, simulated_hour: int, platform: str): entry = { "round": round_num, @@ -247,10 +248,10 @@ class ActionLogger: "event_type": "round_start", "simulated_hour": simulated_hour, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_round_end(self, round_num: int, actions_count: int, platform: str): entry = { "round": round_num, @@ -259,10 +260,10 @@ class ActionLogger: "event_type": "round_end", "actions_count": actions_count, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_start(self, platform: str, config: Dict[str, Any]): entry = { "timestamp": datetime.now().isoformat(), @@ -271,10 +272,10 @@ class ActionLogger: "total_rounds": config.get("time_config", {}).get("total_simulation_hours", 72) * 2, "agents_count": len(config.get("agent_configs", [])), } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') - + def log_simulation_end(self, platform: str, total_rounds: int, total_actions: int): entry = { "timestamp": datetime.now().isoformat(), @@ -283,23 +284,23 @@ class ActionLogger: "total_rounds": total_rounds, "total_actions": total_actions, } - + with open(self.log_path, 'a', encoding='utf-8') as f: f.write(json.dumps(entry, ensure_ascii=False) + '\n') -# 全局日志实例(兼容旧接口) +# Process-wide logger instance, used by the legacy interface. _global_logger: Optional[ActionLogger] = None def get_logger(log_path: Optional[str] = None) -> ActionLogger: - """获取全局日志实例(兼容旧接口)""" + """Return the process-wide :class:`ActionLogger` (legacy interface).""" global _global_logger - + if log_path: _global_logger = ActionLogger(log_path) - + if _global_logger is None: _global_logger = ActionLogger("actions.jsonl") - + return _global_logger diff --git a/backend/scripts/run_parallel_simulation.py b/backend/scripts/run_parallel_simulation.py index 2a627ffd..9dd3d8b9 100644 --- a/backend/scripts/run_parallel_simulation.py +++ b/backend/scripts/run_parallel_simulation.py @@ -1,67 +1,70 @@ -""" -OASIS 双平台并行模拟预设脚本 -同时运行Twitter和Reddit模拟,读取相同的配置文件 +"""OASIS dual-platform parallel simulation preset script. -功能特性: -- 双平台(Twitter + Reddit)并行模拟 -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +Runs Twitter and Reddit simulations simultaneously, reading the same config file. -使用方式: +Features: +- Dual-platform (Twitter + Reddit) parallel simulation +- Keeps environments alive after the simulation finishes and enters wait-for-command mode +- Receives Interview commands via IPC +- Supports single-agent and batch interviews +- Supports a remote close-environment command + +Usage: python run_parallel_simulation.py --config simulation_config.json - python run_parallel_simulation.py --config simulation_config.json --no-wait # 完成后立即关闭 + python run_parallel_simulation.py --config simulation_config.json --no-wait # close immediately when done python run_parallel_simulation.py --config simulation_config.json --twitter-only python run_parallel_simulation.py --config simulation_config.json --reddit-only -日志结构: +Log layout: sim_xxx/ ├── twitter/ - │ └── actions.jsonl # Twitter 平台动作日志 + │ └── actions.jsonl # Twitter platform action log ├── reddit/ - │ └── actions.jsonl # Reddit 平台动作日志 - ├── simulation.log # 主模拟进程日志 - └── run_state.json # 运行状态(API 查询用) + │ └── actions.jsonl # Reddit platform action log + ├── simulation.log # main simulation process log + └── run_state.json # run state (used by API queries) """ # ============================================================ -# 解决 Windows 编码问题:在所有 import 之前设置 UTF-8 编码 -# 这是为了修复 OASIS 第三方库读取文件时未指定编码的问题 +# Fix the Windows encoding issue by forcing UTF-8 before any import. +# This works around the OASIS third-party library opening files without +# specifying an encoding. # ============================================================ import sys import os if sys.platform == 'win32': - # 设置 Python 默认 I/O 编码为 UTF-8 - # 这会影响所有未指定编码的 open() 调用 + # Set Python's default I/O encoding to UTF-8 so every open() call without + # an explicit encoding picks it up. os.environ.setdefault('PYTHONUTF8', '1') os.environ.setdefault('PYTHONIOENCODING', 'utf-8') - - # 重新配置标准输出流为 UTF-8(解决控制台中文乱码) + + # Reconfigure stdout/stderr to UTF-8 to avoid mojibake in the console. if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') - - # 强制设置默认编码(影响 open() 函数的默认编码) - # 注意:这需要在 Python 启动时就设置,运行时设置可能不生效 - # 所以我们还需要 monkey-patch 内置的 open 函数 + + # Force the default encoding used by open(). The env-var approach above + # only works when set at interpreter startup, so we additionally + # monkey-patch the built-in open(). import builtins _original_open = builtins.open - - def _utf8_open(file, mode='r', buffering=-1, encoding=None, errors=None, + + def _utf8_open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None): + """Wrap open() so text-mode calls default to UTF-8. + + Fixes third-party libraries (such as OASIS) that open files without + specifying an encoding. """ - 包装 open() 函数,对于文本模式默认使用 UTF-8 编码 - 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 - """ - # 只对文本模式(非二进制)且未指定编码的情况设置默认编码 + # Only override when the caller is using text mode and did not request + # an explicit encoding. if encoding is None and 'b' not in mode: encoding = 'utf-8' - return _original_open(file, mode, buffering, encoding, errors, + return _original_open(file, mode, buffering, encoding, errors, newline, closefd, opener) - + builtins.open = _utf8_open import argparse @@ -77,26 +80,26 @@ from datetime import datetime from typing import Dict, Any, List, Optional, Tuple -# 全局变量:用于信号处理 +# Globals used by the signal handlers. _shutdown_event = None _cleanup_done = False -# 添加 backend 目录到路径 -# 脚本固定位于 backend/scripts/ 目录 +# Add the backend directory to sys.path. The script always lives in +# backend/scripts/. _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load the .env from the project root (contains LLM_API_KEY etc.). from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): load_dotenv(_env_file) print(f"已加载环境配置: {_env_file}") else: - # 尝试加载 backend/.env + # Fall back to backend/.env. _backend_env = os.path.join(_backend_dir, '.env') if os.path.exists(_backend_env): load_dotenv(_backend_env) @@ -104,51 +107,51 @@ else: class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" - + """Suppress camel-ai max_tokens warnings. + + We intentionally leave max_tokens unset so the model decides; the warning is noise. + """ + def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Install the filter at import time so it is active before any camel code runs. logging.getLogger().addFilter(MaxTokensWarningFilter()) def disable_oasis_logging(): + """Disable verbose OASIS library logging. + + OASIS logs every agent observation and action which is extremely noisy; we + rely on our own action_logger instead. """ - 禁用 OASIS 库的详细日志输出 - OASIS 的日志太冗余(记录每个 agent 的观察和动作),我们使用自己的 action_logger - """ - # 禁用 OASIS 的所有日志器 oasis_loggers = [ "social.agent", - "social.twitter", + "social.twitter", "social.rec", "oasis.env", "table", ] - + for logger_name in oasis_loggers: logger = logging.getLogger(logger_name) - logger.setLevel(logging.CRITICAL) # 只记录严重错误 + logger.setLevel(logging.CRITICAL) # only keep severe errors logger.handlers.clear() logger.propagate = False def init_logging_for_simulation(simulation_dir: str): - """ - 初始化模拟的日志配置 - + """Initialize logging for a simulation run. + Args: - simulation_dir: 模拟目录路径 + simulation_dir: path to the simulation directory. """ - # 禁用 OASIS 的详细日志 disable_oasis_logging() - - # 清理旧的 log 目录(如果存在) + + # Clean up any pre-existing log directory. old_log_dir = os.path.join(simulation_dir, "log") if os.path.exists(old_log_dir): import shutil @@ -174,7 +177,8 @@ except ImportError as e: sys.exit(1) -# Twitter可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) +# Twitter actions available to agents. INTERVIEW is excluded because it can only +# be triggered manually via ManualAction. TWITTER_ACTIONS = [ ActionType.CREATE_POST, ActionType.LIKE_POST, @@ -184,7 +188,8 @@ TWITTER_ACTIONS = [ ActionType.QUOTE_POST, ] -# Reddit可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) +# Reddit actions available to agents. INTERVIEW is excluded because it can only +# be triggered manually via ManualAction. REDDIT_ACTIONS = [ ActionType.LIKE_POST, ActionType.DISLIKE_POST, @@ -202,23 +207,22 @@ REDDIT_ACTIONS = [ ] -# IPC相关常量 +# IPC-related constants. IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants.""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" class ParallelIPCHandler: - """ - 双平台IPC命令处理器 - - 管理两个平台的环境,处理Interview命令 + """Dual-platform IPC command handler. + + Manages both platform environments and processes Interview commands. """ def __init__( @@ -238,13 +242,12 @@ class ParallelIPCHandler: self.commands_dir = os.path.join(simulation_dir, IPC_COMMANDS_DIR) self.responses_dir = os.path.join(simulation_dir, IPC_RESPONSES_DIR) self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) - - # 确保目录存在 + os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - + def update_status(self, status: str): - """更新环境状态""" + """Update the recorded environment status.""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -254,11 +257,11 @@ class ParallelIPCHandler: }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for the next pending command.""" if not os.path.exists(self.commands_dir): return None - - # 获取命令文件(按时间排序) + + # Collect command files sorted by mtime so older commands run first. command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -277,7 +280,7 @@ class ParallelIPCHandler: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send a response for a previously dispatched command.""" response = { "command_id": command_id, "status": status, @@ -289,8 +292,8 @@ class ParallelIPCHandler: response_file = os.path.join(self.responses_dir, f"{command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Remove the original command file once a response is recorded. command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -298,14 +301,14 @@ class ParallelIPCHandler: pass def _get_env_and_graph(self, platform: str): - """ - 获取指定平台的环境和agent_graph - + """Return the environment and agent graph for the given platform. + Args: - platform: 平台名称 ("twitter" 或 "reddit") - + platform: platform name ("twitter" or "reddit"). + Returns: - (env, agent_graph, platform_name) 或 (None, None, None) + Tuple ``(env, agent_graph, platform_name)`` or ``(None, None, None)`` + when the platform is unavailable. """ if platform == "twitter" and self.twitter_env: return self.twitter_env, self.twitter_agent_graph, "twitter" @@ -315,11 +318,10 @@ class ParallelIPCHandler: return None, None, None async def _interview_single_platform(self, agent_id: int, prompt: str, platform: str) -> Dict[str, Any]: - """ - 在单个平台上执行Interview - + """Run an Interview on a single platform. + Returns: - 包含结果的字典,或包含error的字典 + A dict with the interview result, or a dict containing an ``error`` key. """ env, agent_graph, actual_platform = self._get_env_and_graph(platform) @@ -343,22 +345,21 @@ class ParallelIPCHandler: return {"platform": platform, "error": str(e)} async def handle_interview(self, command_id: str, agent_id: int, prompt: str, platform: str = None) -> bool: - """ - 处理单个Agent采访命令 - + """Handle a single-agent interview command. + Args: - command_id: 命令ID - agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None/不指定: 同时采访两个平台,返回整合结果 - + command_id: command identifier. + agent_id: agent identifier. + prompt: interview prompt. + platform: optional platform selector. + - "twitter": interview on Twitter only. + - "reddit": interview on Reddit only. + - ``None``: interview on both platforms and return a merged result. + Returns: - True 表示成功,False 表示失败 + ``True`` on success, ``False`` on failure. """ - # 如果指定了平台,只采访该平台 + # If a specific platform was requested, only interview on that platform. if platform in ("twitter", "reddit"): result = await self._interview_single_platform(agent_id, prompt, platform) @@ -371,7 +372,7 @@ class ParallelIPCHandler: print(f" Interview完成: agent_id={agent_id}, platform={platform}") return True - # 未指定平台:同时采访两个平台 + # No platform specified: interview on both platforms simultaneously. if not self.twitter_env and not self.reddit_env: self.send_response(command_id, "failed", error="没有可用的模拟环境") return False @@ -383,7 +384,7 @@ class ParallelIPCHandler: } success_count = 0 - # 并行采访两个平台 + # Run the two platform interviews in parallel. tasks = [] platforms_to_interview = [] @@ -394,8 +395,7 @@ class ParallelIPCHandler: if self.reddit_env: tasks.append(self._interview_single_platform(agent_id, prompt, "reddit")) platforms_to_interview.append("reddit") - - # 并行执行 + platform_results = await asyncio.gather(*tasks) for platform_name, platform_result in zip(platforms_to_interview, platform_results): @@ -414,22 +414,21 @@ class ParallelIPCHandler: return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict], platform: str = None) -> bool: - """ - 处理批量采访命令 - + """Handle a batch-interview command. + Args: - command_id: 命令ID - interviews: [{"agent_id": int, "prompt": str, "platform": str(optional)}, ...] - platform: 默认平台(可被每个interview项覆盖) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None/不指定: 每个Agent同时采访两个平台 + command_id: command identifier. + interviews: ``[{"agent_id": int, "prompt": str, "platform": str(optional)}, ...]``. + platform: default platform (can be overridden per interview entry). + - "twitter": interview on Twitter only. + - "reddit": interview on Reddit only. + - ``None``: interview every agent on both platforms. """ - # 按平台分组 + # Bucket interviews by target platform. twitter_interviews = [] reddit_interviews = [] - both_platforms_interviews = [] # 需要同时采访两个平台的 - + both_platforms_interviews = [] # entries that need both platforms + for interview in interviews: item_platform = interview.get("platform", platform) if item_platform == "twitter": @@ -437,10 +436,10 @@ class ParallelIPCHandler: elif item_platform == "reddit": reddit_interviews.append(interview) else: - # 未指定平台:两个平台都采访 + # No platform specified: interview on both. both_platforms_interviews.append(interview) - - # 把 both_platforms_interviews 拆分到两个平台 + + # Fan the both-platform entries out into the per-platform buckets. if both_platforms_interviews: if self.twitter_env: twitter_interviews.extend(both_platforms_interviews) @@ -448,8 +447,8 @@ class ParallelIPCHandler: reddit_interviews.extend(both_platforms_interviews) results = {} - - # 处理Twitter平台的采访 + + # Run the Twitter-side interviews. if twitter_interviews and self.twitter_env: try: twitter_actions = {} @@ -476,7 +475,7 @@ class ParallelIPCHandler: except Exception as e: print(f" Twitter批量Interview失败: {e}") - # 处理Reddit平台的采访 + # Run the Reddit-side interviews. if reddit_interviews and self.reddit_env: try: reddit_actions = {} @@ -515,7 +514,7 @@ class ParallelIPCHandler: return False def _get_interview_result(self, agent_id: int, platform: str) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Read the latest Interview result for an agent from the database.""" db_path = os.path.join(self.simulation_dir, f"{platform}_simulation.db") result = { @@ -530,8 +529,8 @@ class ParallelIPCHandler: try: conn = sqlite3.connect(db_path) cursor = conn.cursor() - - # 查询最新的Interview记录 + + # Look up the most recent Interview row for this agent. cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -558,11 +557,10 @@ class ParallelIPCHandler: return result async def process_commands(self) -> bool: - """ - 处理所有待处理命令 - + """Process all pending commands. + Returns: - True 表示继续运行,False 表示应该退出 + ``True`` to keep running, ``False`` if the process should exit. """ command = self.poll_command() if not command: @@ -602,15 +600,15 @@ class ParallelIPCHandler: def load_config(config_path: str) -> Dict[str, Any]: - """加载配置文件""" + """Load a JSON config file from disk.""" with open(config_path, 'r', encoding='utf-8') as f: return json.load(f) -# 需要过滤掉的非核心动作类型(这些动作对分析价值较低) +# Non-core action types to filter out: they provide little analytical value. FILTERED_ACTIONS = {'refresh', 'sign_up'} -# 动作类型映射表(数据库中的名称 -> 标准名称) +# Action-type mapping (database name -> canonical name). ACTION_TYPE_MAP = { 'create_post': 'CREATE_POST', 'like_post': 'LIKE_POST', @@ -631,16 +629,16 @@ ACTION_TYPE_MAP = { def get_agent_names_from_config(config: Dict[str, Any]) -> Dict[int, str]: - """ - 从 simulation_config 中获取 agent_id -> entity_name 的映射 - - 这样可以在 actions.jsonl 中显示真实的实体名称,而不是 "Agent_0" 这样的代号 - + """Build an ``agent_id -> entity_name`` map from the simulation config. + + Using the entity name lets actions.jsonl display the real entity rather + than placeholder labels like ``Agent_0``. + Args: - config: simulation_config.json 的内容 - + config: contents of ``simulation_config.json``. + Returns: - agent_id -> entity_name 的映射字典 + Mapping from agent id to entity name. """ agent_names = {} agent_configs = config.get("agent_configs", []) @@ -659,18 +657,20 @@ def fetch_new_actions_from_db( last_rowid: int, agent_names: Dict[int, str] ) -> Tuple[List[Dict[str, Any]], int]: - """ - 从数据库中获取新的动作记录,并补充完整的上下文信息 - + """Fetch new action rows from the database and enrich them with context. + Args: - db_path: 数据库文件路径 - last_rowid: 上次读取的最大 rowid 值(使用 rowid 而不是 created_at,因为不同平台的 created_at 格式不同) - agent_names: agent_id -> agent_name 映射 - + db_path: path to the database file. + last_rowid: highest rowid processed previously. We track ``rowid`` + rather than ``created_at`` because the two platforms use different + ``created_at`` formats. + agent_names: ``agent_id -> agent_name`` mapping. + Returns: - (actions_list, new_last_rowid) - - actions_list: 动作列表,每个元素包含 agent_id, agent_name, action_type, action_args(含上下文信息) - - new_last_rowid: 新的最大 rowid 值 + Tuple ``(actions_list, new_last_rowid)``. + - ``actions_list``: action records, each containing ``agent_id``, + ``agent_name``, ``action_type``, and ``action_args`` (with context). + - ``new_last_rowid``: the new highest rowid seen. """ actions = [] new_last_rowid = last_rowid @@ -681,9 +681,10 @@ def fetch_new_actions_from_db( try: conn = sqlite3.connect(db_path) cursor = conn.cursor() - - # 使用 rowid 来追踪已处理的记录(rowid 是 SQLite 的内置自增字段) - # 这样可以避免 created_at 格式差异问题(Twitter 用整数,Reddit 用日期时间字符串) + + # Use ``rowid`` to track processed rows. ``rowid`` is SQLite's built-in + # auto-increment column and avoids the cross-platform ``created_at`` + # format mismatch (Twitter stores integers, Reddit stores datetime strings). cursor.execute(""" SELECT rowid, user_id, action, info FROM trace @@ -692,20 +693,17 @@ def fetch_new_actions_from_db( """, (last_rowid,)) for rowid, user_id, action, info_json in cursor.fetchall(): - # 更新最大 rowid new_last_rowid = rowid - - # 过滤非核心动作 + if action in FILTERED_ACTIONS: continue - - # 解析动作参数 + try: action_args = json.loads(info_json) if info_json else {} except json.JSONDecodeError: action_args = {} - - # 精简 action_args,只保留关键字段(保留完整内容,不截断) + + # Slim ``action_args`` down to the key fields. Content is kept in full (no truncation). simplified_args = {} if 'content' in action_args: simplified_args['content'] = action_args['content'] @@ -726,10 +724,9 @@ def fetch_new_actions_from_db( if 'dislike_id' in action_args: simplified_args['dislike_id'] = action_args['dislike_id'] - # 转换动作类型名称 action_type = ACTION_TYPE_MAP.get(action, action.upper()) - - # 补充上下文信息(帖子内容、用户名等) + + # Enrich with context such as post content and author name. _enrich_action_context(cursor, action_type, simplified_args, agent_names) actions.append({ @@ -752,17 +749,16 @@ def _enrich_action_context( action_args: Dict[str, Any], agent_names: Dict[int, str] ) -> None: - """ - 为动作补充上下文信息(帖子内容、用户名等) - + """Enrich an action's args with context such as post content and author name. + Args: - cursor: 数据库游标 - action_type: 动作类型 - action_args: 动作参数(会被修改) - agent_names: agent_id -> agent_name 映射 + cursor: database cursor. + action_type: action type. + action_args: action args (mutated in place). + agent_names: ``agent_id -> agent_name`` mapping. """ try: - # 点赞/踩帖子:补充帖子内容和作者 + # Like/dislike post: include the post content and author name. if action_type in ('LIKE_POST', 'DISLIKE_POST'): post_id = action_args.get('post_id') if post_id: @@ -771,11 +767,11 @@ def _enrich_action_context( action_args['post_content'] = post_info.get('content', '') action_args['post_author_name'] = post_info.get('author_name', '') - # 转发帖子:补充原帖内容和作者 + # Repost: include the original post content and author name. elif action_type == 'REPOST': new_post_id = action_args.get('new_post_id') if new_post_id: - # 转发帖子的 original_post_id 指向原帖 + # On a repost row, ``original_post_id`` points at the original post. cursor.execute(""" SELECT original_post_id FROM post WHERE post_id = ? """, (new_post_id,)) @@ -787,18 +783,18 @@ def _enrich_action_context( action_args['original_content'] = original_info.get('content', '') action_args['original_author_name'] = original_info.get('author_name', '') - # 引用帖子:补充原帖内容、作者和引用评论 + # Quote post: include the original post content, author name, and quote comment. elif action_type == 'QUOTE_POST': quoted_id = action_args.get('quoted_id') new_post_id = action_args.get('new_post_id') - + if quoted_id: original_info = _get_post_info(cursor, quoted_id, agent_names) if original_info: action_args['original_content'] = original_info.get('content', '') action_args['original_author_name'] = original_info.get('author_name', '') - - # 获取引用帖子的评论内容(quote_content) + + # Read the quote comment (``quote_content``). if new_post_id: cursor.execute(""" SELECT quote_content FROM post WHERE post_id = ? @@ -807,11 +803,11 @@ def _enrich_action_context( if row and row[0]: action_args['quote_content'] = row[0] - # 关注用户:补充被关注用户的名称 + # Follow: include the followee's display name. elif action_type == 'FOLLOW': follow_id = action_args.get('follow_id') if follow_id: - # 从 follow 表获取 followee_id + # Look up ``followee_id`` from the ``follow`` table. cursor.execute(""" SELECT followee_id FROM follow WHERE follow_id = ? """, (follow_id,)) @@ -822,16 +818,16 @@ def _enrich_action_context( if target_name: action_args['target_user_name'] = target_name - # 屏蔽用户:补充被屏蔽用户的名称 + # Mute: include the muted user's display name. elif action_type == 'MUTE': - # 从 action_args 中获取 user_id 或 target_id + # Read ``user_id`` or ``target_id`` from action_args. target_id = action_args.get('user_id') or action_args.get('target_id') if target_id: target_name = _get_user_name(cursor, target_id, agent_names) if target_name: action_args['target_user_name'] = target_name - # 点赞/踩评论:补充评论内容和作者 + # Like/dislike comment: include the comment content and author name. elif action_type in ('LIKE_COMMENT', 'DISLIKE_COMMENT'): comment_id = action_args.get('comment_id') if comment_id: @@ -840,7 +836,7 @@ def _enrich_action_context( action_args['comment_content'] = comment_info.get('content', '') action_args['comment_author_name'] = comment_info.get('author_name', '') - # 发表评论:补充所评论的帖子信息 + # Create comment: include the parent post's content and author name. elif action_type == 'CREATE_COMMENT': post_id = action_args.get('post_id') if post_id: @@ -850,7 +846,7 @@ def _enrich_action_context( action_args['post_author_name'] = post_info.get('author_name', '') except Exception as e: - # 补充上下文失败不影响主流程 + # Failing to enrich context must not break the main flow. print(f"补充动作上下文失败: {e}") @@ -859,16 +855,15 @@ def _get_post_info( post_id: int, agent_names: Dict[int, str] ) -> Optional[Dict[str, str]]: - """ - 获取帖子信息 - + """Look up post info. + Args: - cursor: 数据库游标 - post_id: 帖子ID - agent_names: agent_id -> agent_name 映射 - + cursor: database cursor. + post_id: post identifier. + agent_names: ``agent_id -> agent_name`` mapping. + Returns: - 包含 content 和 author_name 的字典,或 None + Dict with ``content`` and ``author_name``, or ``None`` when not found. """ try: cursor.execute(""" @@ -882,18 +877,18 @@ def _get_post_info( content = row[0] or '' user_id = row[1] agent_id = row[2] - - # 优先使用 agent_names 中的名称 + + # Prefer the entity_name supplied via agent_names. author_name = '' if agent_id is not None and agent_id in agent_names: author_name = agent_names[agent_id] elif user_id: - # 从 user 表获取名称 + # Fall back to the user table. cursor.execute("SELECT name, user_name FROM user WHERE user_id = ?", (user_id,)) user_row = cursor.fetchone() if user_row: author_name = user_row[0] or user_row[1] or '' - + return {'content': content, 'author_name': author_name} except Exception: pass @@ -905,16 +900,15 @@ def _get_user_name( user_id: int, agent_names: Dict[int, str] ) -> Optional[str]: - """ - 获取用户名称 - + """Look up a user's display name. + Args: - cursor: 数据库游标 - user_id: 用户ID - agent_names: agent_id -> agent_name 映射 - + cursor: database cursor. + user_id: user identifier. + agent_names: ``agent_id -> agent_name`` mapping. + Returns: - 用户名称,或 None + Display name, or ``None`` when the user cannot be found. """ try: cursor.execute(""" @@ -925,8 +919,8 @@ def _get_user_name( agent_id = row[0] name = row[1] user_name = row[2] - - # 优先使用 agent_names 中的名称 + + # Prefer the entity_name supplied via agent_names. if agent_id is not None and agent_id in agent_names: return agent_names[agent_id] return name or user_name or '' @@ -940,16 +934,15 @@ def _get_comment_info( comment_id: int, agent_names: Dict[int, str] ) -> Optional[Dict[str, str]]: - """ - 获取评论信息 - + """Look up comment info. + Args: - cursor: 数据库游标 - comment_id: 评论ID - agent_names: agent_id -> agent_name 映射 - + cursor: database cursor. + comment_id: comment identifier. + agent_names: ``agent_id -> agent_name`` mapping. + Returns: - 包含 content 和 author_name 的字典,或 None + Dict with ``content`` and ``author_name``, or ``None`` when not found. """ try: cursor.execute(""" @@ -963,18 +956,18 @@ def _get_comment_info( content = row[0] or '' user_id = row[1] agent_id = row[2] - - # 优先使用 agent_names 中的名称 + + # Prefer the entity_name supplied via agent_names. author_name = '' if agent_id is not None and agent_id in agent_names: author_name = agent_names[agent_id] elif user_id: - # 从 user 表获取名称 + # Fall back to the user table. cursor.execute("SELECT name, user_name FROM user WHERE user_id = ?", (user_id,)) user_row = cursor.fetchone() if user_row: author_name = user_row[0] or user_row[1] or '' - + return {'content': content, 'author_name': author_name} except Exception: pass @@ -982,44 +975,44 @@ def _get_comment_info( def create_model(config: Dict[str, Any], use_boost: bool = False): - """ - 创建LLM模型 - - 支持双 LLM 配置,用于并行模拟时提速: - - 通用配置:LLM_API_KEY, LLM_BASE_URL, LLM_MODEL_NAME - - 加速配置(可选):LLM_BOOST_API_KEY, LLM_BOOST_BASE_URL, LLM_BOOST_MODEL_NAME - - 如果配置了加速 LLM,并行模拟时可以让不同平台使用不同的 API 服务商,提高并发能力。 - + """Create the LLM model used by the simulation. + + Two LLM configurations are supported, which lets parallel simulations run faster: + - default: ``LLM_API_KEY``, ``LLM_BASE_URL``, ``LLM_MODEL_NAME``. + - boost (optional): ``LLM_BOOST_API_KEY``, ``LLM_BOOST_BASE_URL``, ``LLM_BOOST_MODEL_NAME``. + + When a boost LLM is configured, the two platforms can target different API + providers, increasing overall concurrency. + Args: - config: 模拟配置字典 - use_boost: 是否使用加速 LLM 配置(如果可用) + config: simulation config dict. + use_boost: whether to use the boost LLM config when available. """ - # 检查是否有加速配置 + # Inspect the boost configuration. boost_api_key = os.environ.get("LLM_BOOST_API_KEY", "") boost_base_url = os.environ.get("LLM_BOOST_BASE_URL", "") boost_model = os.environ.get("LLM_BOOST_MODEL_NAME", "") has_boost_config = bool(boost_api_key) - - # 根据参数和配置情况选择使用哪个 LLM + + # Choose which LLM to use based on the request and what is configured. if use_boost and has_boost_config: - # 使用加速配置 + # Use the boost configuration. llm_api_key = boost_api_key llm_base_url = boost_base_url llm_model = boost_model or os.environ.get("LLM_MODEL_NAME", "") config_label = "[加速LLM]" else: - # 使用通用配置 + # Use the default configuration. llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") config_label = "[通用LLM]" - - # 如果 .env 中没有模型名,则使用 config 作为备用 + + # Fall back to the model name in the config when .env does not provide one. if not llm_model: llm_model = config.get("llm_model", "gpt-4o-mini") - - # 设置 camel-ai 所需的环境变量 + + # Populate the env vars camel-ai expects. if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key @@ -1043,7 +1036,7 @@ def get_active_agents_for_round( current_hour: int, round_num: int ) -> List: - """根据时间和配置决定本轮激活哪些Agent""" + """Decide which agents are active in this round based on time and config.""" time_config = config.get("time_config", {}) agent_configs = config.get("agent_configs", []) @@ -1091,7 +1084,7 @@ def get_active_agents_for_round( class PlatformSimulation: - """平台模拟结果容器""" + """Container for the result of a platform simulation.""" def __init__(self): self.env = None self.agent_graph = None @@ -1105,17 +1098,17 @@ async def run_twitter_simulation( main_logger: Optional[SimulationLogManager] = None, max_rounds: Optional[int] = None ) -> PlatformSimulation: - """运行Twitter模拟 - + """Run the Twitter simulation. + Args: - config: 模拟配置 - simulation_dir: 模拟目录 - action_logger: 动作日志记录器 - main_logger: 主日志管理器 - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) - + config: simulation config. + simulation_dir: simulation directory. + action_logger: action logger. + main_logger: main log manager. + max_rounds: optional cap on the number of rounds, used to truncate long runs. + Returns: - PlatformSimulation: 包含env和agent_graph的结果对象 + PlatformSimulation containing the env and agent_graph. """ result = PlatformSimulation() @@ -1125,11 +1118,11 @@ async def run_twitter_simulation( print(f"[Twitter] {msg}") log_info("初始化...") - - # Twitter 使用通用 LLM 配置 + + # Twitter uses the default LLM config. model = create_model(config, use_boost=False) - - # OASIS Twitter使用CSV格式 + + # OASIS Twitter expects a CSV profile file. profile_path = os.path.join(simulation_dir, "twitter_profiles.csv") if not os.path.exists(profile_path): log_info(f"错误: Profile文件不存在: {profile_path}") @@ -1141,13 +1134,13 @@ async def run_twitter_simulation( available_actions=TWITTER_ACTIONS, ) - # 从配置文件获取 Agent 真实名称映射(使用 entity_name 而非默认的 Agent_X) + # Pull real agent names from the config (use entity_name rather than the default Agent_X). agent_names = get_agent_names_from_config(config) - # 如果配置中没有某个 agent,则使用 OASIS 的默认名称 + # If the config does not list a particular agent, fall back to OASIS's default name. for agent_id, agent in result.agent_graph.get_agents(): if agent_id not in agent_names: agent_names[agent_id] = getattr(agent, 'name', f'Agent_{agent_id}') - + db_path = os.path.join(simulation_dir, "twitter_simulation.db") if os.path.exists(db_path): os.remove(db_path) @@ -1156,7 +1149,7 @@ async def run_twitter_simulation( agent_graph=result.agent_graph, platform=oasis.DefaultPlatformType.TWITTER, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # cap concurrent LLM requests to avoid overloading the API ) await result.env.reset() @@ -1166,13 +1159,13 @@ async def run_twitter_simulation( action_logger.log_simulation_start(config) total_actions = 0 - last_rowid = 0 # 跟踪数据库中最后处理的行号(使用 rowid 避免 created_at 格式差异) - - # 执行初始事件 + last_rowid = 0 # last processed db row; using rowid avoids created_at format differences + + # Run the initial events. event_config = config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) - - # 记录 round 0 开始(初始事件阶段) + + # Mark the start of round 0 (the initial-events phase). if action_logger: action_logger.log_round_start(0, 0) # round 0, simulated_hour 0 @@ -1206,17 +1199,17 @@ async def run_twitter_simulation( await result.env.step(initial_actions) log_info(f"已发布 {len(initial_actions)} 条初始帖子") - # 记录 round 0 结束 + # Mark the end of round 0. if action_logger: action_logger.log_round_end(0, initial_action_count) - - # 主模拟循环 + + # Main simulation loop. time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // minutes_per_round - - # 如果指定了最大轮数,则截断 + + # Truncate when a max round count was supplied. if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) @@ -1226,7 +1219,7 @@ async def run_twitter_simulation( start_time = datetime.now() for round_num in range(total_rounds): - # 检查是否收到退出信号 + # Bail out if a shutdown signal was received. if _shutdown_event and _shutdown_event.is_set(): if main_logger: main_logger.info(f"收到退出信号,在第 {round_num + 1} 轮停止模拟") @@ -1240,12 +1233,12 @@ async def run_twitter_simulation( result.env, config, simulated_hour, round_num ) - # 无论是否有活跃agent,都记录round开始 + # Always log round-start, even when no agents are active. if action_logger: action_logger.log_round_start(round_num + 1, simulated_hour) - + if not active_agents: - # 没有活跃agent时也记录round结束(actions_count=0) + # Still emit round-end (with actions_count=0) so the log stays consistent. if action_logger: action_logger.log_round_end(round_num + 1, 0) continue @@ -1253,7 +1246,7 @@ async def run_twitter_simulation( actions = {agent: LLMAction() for _, agent in active_agents} await result.env.step(actions) - # 从数据库获取实际执行的动作并记录 + # Pull the actually-executed actions from the database and log them. actual_actions, last_rowid = fetch_new_actions_from_db( db_path, last_rowid, agent_names ) @@ -1278,7 +1271,7 @@ async def run_twitter_simulation( progress = (round_num + 1) / total_rounds * 100 log_info(f"Day {simulated_day}, {simulated_hour:02d}:00 - Round {round_num + 1}/{total_rounds} ({progress:.1f}%)") - # 注意:不关闭环境,保留给Interview使用 + # Note: do NOT close the env here; we keep it alive for Interview commands. if action_logger: action_logger.log_simulation_end(total_rounds, total_actions) @@ -1297,17 +1290,17 @@ async def run_reddit_simulation( main_logger: Optional[SimulationLogManager] = None, max_rounds: Optional[int] = None ) -> PlatformSimulation: - """运行Reddit模拟 - + """Run the Reddit simulation. + Args: - config: 模拟配置 - simulation_dir: 模拟目录 - action_logger: 动作日志记录器 - main_logger: 主日志管理器 - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) - + config: simulation config. + simulation_dir: simulation directory. + action_logger: action logger. + main_logger: main log manager. + max_rounds: optional cap on the number of rounds, used to truncate long runs. + Returns: - PlatformSimulation: 包含env和agent_graph的结果对象 + PlatformSimulation containing the env and agent_graph. """ result = PlatformSimulation() @@ -1318,7 +1311,7 @@ async def run_reddit_simulation( log_info("初始化...") - # Reddit 使用加速 LLM 配置(如果有的话,否则回退到通用配置) + # Reddit uses the boost LLM config when available, falling back to the default. model = create_model(config, use_boost=True) profile_path = os.path.join(simulation_dir, "reddit_profiles.json") @@ -1332,13 +1325,13 @@ async def run_reddit_simulation( available_actions=REDDIT_ACTIONS, ) - # 从配置文件获取 Agent 真实名称映射(使用 entity_name 而非默认的 Agent_X) + # Pull real agent names from the config (use entity_name rather than the default Agent_X). agent_names = get_agent_names_from_config(config) - # 如果配置中没有某个 agent,则使用 OASIS 的默认名称 + # If the config does not list a particular agent, fall back to OASIS's default name. for agent_id, agent in result.agent_graph.get_agents(): if agent_id not in agent_names: agent_names[agent_id] = getattr(agent, 'name', f'Agent_{agent_id}') - + db_path = os.path.join(simulation_dir, "reddit_simulation.db") if os.path.exists(db_path): os.remove(db_path) @@ -1347,7 +1340,7 @@ async def run_reddit_simulation( agent_graph=result.agent_graph, platform=oasis.DefaultPlatformType.REDDIT, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # cap concurrent LLM requests to avoid overloading the API ) await result.env.reset() @@ -1357,13 +1350,13 @@ async def run_reddit_simulation( action_logger.log_simulation_start(config) total_actions = 0 - last_rowid = 0 # 跟踪数据库中最后处理的行号(使用 rowid 避免 created_at 格式差异) - - # 执行初始事件 + last_rowid = 0 # last processed db row; using rowid avoids created_at format differences + + # Run the initial events. event_config = config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) - - # 记录 round 0 开始(初始事件阶段) + + # Mark the start of round 0 (the initial-events phase). if action_logger: action_logger.log_round_start(0, 0) # round 0, simulated_hour 0 @@ -1405,17 +1398,17 @@ async def run_reddit_simulation( await result.env.step(initial_actions) log_info(f"已发布 {len(initial_actions)} 条初始帖子") - # 记录 round 0 结束 + # Mark the end of round 0. if action_logger: action_logger.log_round_end(0, initial_action_count) - - # 主模拟循环 + + # Main simulation loop. time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // minutes_per_round - - # 如果指定了最大轮数,则截断 + + # Truncate when a max round count was supplied. if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) @@ -1425,7 +1418,7 @@ async def run_reddit_simulation( start_time = datetime.now() for round_num in range(total_rounds): - # 检查是否收到退出信号 + # Bail out if a shutdown signal was received. if _shutdown_event and _shutdown_event.is_set(): if main_logger: main_logger.info(f"收到退出信号,在第 {round_num + 1} 轮停止模拟") @@ -1439,12 +1432,12 @@ async def run_reddit_simulation( result.env, config, simulated_hour, round_num ) - # 无论是否有活跃agent,都记录round开始 + # Always log round-start, even when no agents are active. if action_logger: action_logger.log_round_start(round_num + 1, simulated_hour) - + if not active_agents: - # 没有活跃agent时也记录round结束(actions_count=0) + # Still emit round-end (with actions_count=0) so the log stays consistent. if action_logger: action_logger.log_round_end(round_num + 1, 0) continue @@ -1452,7 +1445,7 @@ async def run_reddit_simulation( actions = {agent: LLMAction() for _, agent in active_agents} await result.env.step(actions) - # 从数据库获取实际执行的动作并记录 + # Pull the actually-executed actions from the database and log them. actual_actions, last_rowid = fetch_new_actions_from_db( db_path, last_rowid, agent_names ) @@ -1477,7 +1470,7 @@ async def run_reddit_simulation( progress = (round_num + 1) / total_rounds * 100 log_info(f"Day {simulated_day}, {simulated_hour:02d}:00 - Round {round_num + 1}/{total_rounds} ({progress:.1f}%)") - # 注意:不关闭环境,保留给Interview使用 + # Note: do NOT close the env here; we keep it alive for Interview commands. if action_logger: action_logger.log_simulation_end(total_rounds, total_actions) @@ -1522,7 +1515,8 @@ async def main(): args = parser.parse_args() - # 在 main 函数开始时创建 shutdown 事件,确保整个程序都能响应退出信号 + # Create the shutdown event at the start of main() so the whole program + # can respond to exit signals. global _shutdown_event _shutdown_event = asyncio.Event() @@ -1534,10 +1528,10 @@ async def main(): simulation_dir = os.path.dirname(args.config) or "." wait_for_commands = not args.no_wait - # 初始化日志配置(禁用 OASIS 日志,清理旧文件) + # Initialize logging (disable OASIS logs, clean up stale files). init_logging_for_simulation(simulation_dir) - - # 创建日志管理器 + + # Create the log manager. log_manager = SimulationLogManager(simulation_dir) twitter_logger = log_manager.get_twitter_logger() reddit_logger = log_manager.get_reddit_logger() @@ -1572,7 +1566,7 @@ async def main(): start_time = datetime.now() - # 存储两个平台的模拟结果 + # Holds the result for each platform simulation. twitter_result: Optional[PlatformSimulation] = None reddit_result: Optional[PlatformSimulation] = None @@ -1581,7 +1575,7 @@ async def main(): elif args.reddit_only: reddit_result = await run_reddit_simulation(config, simulation_dir, reddit_logger, log_manager, args.max_rounds) else: - # 并行运行(每个平台使用独立的日志记录器) + # Run both platforms in parallel; each platform uses its own logger. results = await asyncio.gather( run_twitter_simulation(config, simulation_dir, twitter_logger, log_manager, args.max_rounds), run_reddit_simulation(config, simulation_dir, reddit_logger, log_manager, args.max_rounds), @@ -1592,7 +1586,7 @@ async def main(): log_manager.info("=" * 60) log_manager.info(f"模拟循环完成! 总耗时: {total_elapsed:.1f}秒") - # 是否进入等待命令模式 + # Enter wait-for-command mode if requested. if wait_for_commands: log_manager.info("") log_manager.info("=" * 60) @@ -1600,7 +1594,7 @@ async def main(): log_manager.info("支持的命令: interview, batch_interview, close_env") log_manager.info("=" * 60) - # 创建IPC处理器 + # Create the IPC handler. ipc_handler = ParallelIPCHandler( simulation_dir=simulation_dir, twitter_env=twitter_result.env if twitter_result else None, @@ -1610,18 +1604,18 @@ async def main(): ) ipc_handler.update_status("alive") - # 等待命令循环(使用全局 _shutdown_event) + # Command-wait loop (driven by the global ``_shutdown_event``). try: while not _shutdown_event.is_set(): should_continue = await ipc_handler.process_commands() if not should_continue: break - # 使用 wait_for 替代 sleep,这样可以响应 shutdown_event + # Use ``wait_for`` instead of ``sleep`` so the loop reacts to shutdown_event. try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # shutdown signal received except asyncio.TimeoutError: - pass # 超时继续循环 + pass # timed out, continue looping except KeyboardInterrupt: print("\n收到中断信号") except asyncio.CancelledError: @@ -1632,7 +1626,7 @@ async def main(): log_manager.info("\n关闭环境...") ipc_handler.update_status("stopped") - # 关闭环境 + # Close the environments. if twitter_result and twitter_result.env: await twitter_result.env.close() log_manager.info("[Twitter] 环境已关闭") @@ -1651,14 +1645,13 @@ async def main(): def setup_signal_handlers(loop=None): - """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 - - 持久化模拟场景:模拟完成后不退出,等待 interview 命令 - 当收到终止信号时,需要: - 1. 通知 asyncio 循环退出等待 - 2. 让程序有机会正常清理资源(关闭数据库、环境等) - 3. 然后才退出 + """Install signal handlers that exit cleanly on SIGTERM/SIGINT. + + Persistent-simulation flow: the process keeps running after the simulation + finishes so it can serve interview commands. On a termination signal we: + 1. Tell the asyncio loop to stop waiting. + 2. Give the program a chance to clean up (close databases, envs, ...). + 3. Then exit. """ def signal_handler(signum, frame): global _cleanup_done @@ -1667,12 +1660,12 @@ def setup_signal_handlers(loop=None): if not _cleanup_done: _cleanup_done = True - # 设置事件通知 asyncio 循环退出(让循环有机会清理资源) + # Notify the asyncio loop to exit so it can clean up resources. if _shutdown_event: _shutdown_event.set() - - # 不要直接 sys.exit(),让 asyncio 循环正常退出并清理资源 - # 如果是重复收到信号,才强制退出 + + # Avoid sys.exit() on the first signal: let the asyncio loop exit cleanly. + # Only force-exit if a second signal comes in. else: print("强制退出...") sys.exit(1) @@ -1690,7 +1683,7 @@ if __name__ == "__main__": except SystemExit: pass finally: - # 清理 multiprocessing 资源跟踪器(防止退出时的警告) + # Clean up the multiprocessing resource tracker to avoid exit warnings. try: from multiprocessing import resource_tracker resource_tracker._resource_tracker._stop() diff --git a/backend/scripts/run_reddit_simulation.py b/backend/scripts/run_reddit_simulation.py index 14907cbd..d3adc560 100644 --- a/backend/scripts/run_reddit_simulation.py +++ b/backend/scripts/run_reddit_simulation.py @@ -1,16 +1,16 @@ -""" -OASIS Reddit模拟预设脚本 -此脚本读取配置文件中的参数来执行模拟,实现全程自动化 +"""OASIS Reddit simulation preset script. -功能特性: -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +This script reads parameters from a config file and runs the simulation end-to-end automatically. -使用方式: +Features: +- After the simulation finishes, the environment stays alive and enters a command-wait mode. +- Accepts Interview commands over IPC. +- Supports single-agent and batch interviews. +- Supports a remote close-environment command. + +Usage: python run_reddit_simulation.py --config /path/to/simulation_config.json - python run_reddit_simulation.py --config /path/to/simulation_config.json --no-wait # 完成后立即关闭 + python run_reddit_simulation.py --config /path/to/simulation_config.json --no-wait # close immediately when done """ import argparse @@ -25,18 +25,18 @@ import sqlite3 from datetime import datetime from typing import Dict, Any, List, Optional -# 全局变量:用于信号处理 +# Globals used by the signal handler. _shutdown_event = None _cleanup_done = False -# 添加项目路径 +# Add project paths to sys.path so sibling modules import correctly. _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load the .env file from the project root (contains LLM_API_KEY and related settings). from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): @@ -51,7 +51,7 @@ import re class UnicodeFormatter(logging.Formatter): - """自定义格式化器,将 Unicode 转义序列转换为可读字符""" + """Custom log formatter that converts Unicode escape sequences into readable characters.""" UNICODE_ESCAPE_PATTERN = re.compile(r'\\u([0-9a-fA-F]{4})') @@ -68,24 +68,23 @@ class UnicodeFormatter(logging.Formatter): class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" - + """Suppress camel-ai's max_tokens warning (we intentionally leave max_tokens unset and let the model decide).""" + def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Install the filter at module import time so it takes effect before any camel code runs. logging.getLogger().addFilter(MaxTokensWarningFilter()) def setup_oasis_logging(log_dir: str): - """配置 OASIS 的日志,使用固定名称的日志文件""" + """Configure OASIS logging with fixed log file names.""" os.makedirs(log_dir, exist_ok=True) - - # 清理旧的日志文件 + + # Remove stale log files from previous runs so the new run starts clean. for f in os.listdir(log_dir): old_log = os.path.join(log_dir, f) if os.path.isfile(old_log) and f.endswith('.log'): @@ -131,20 +130,20 @@ except ImportError as e: sys.exit(1) -# IPC相关常量 +# IPC-related constants. IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants.""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" class IPCHandler: - """IPC命令处理器""" + """IPC command handler.""" def __init__(self, simulation_dir: str, env, agent_graph): self.simulation_dir = simulation_dir @@ -154,13 +153,12 @@ class IPCHandler: self.responses_dir = os.path.join(simulation_dir, IPC_RESPONSES_DIR) self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) self._running = True - - # 确保目录存在 + os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) def update_status(self, status: str): - """更新环境状态""" + """Update the environment status file.""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -168,11 +166,11 @@ class IPCHandler: }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for pending IPC commands.""" if not os.path.exists(self.commands_dir): return None - - # 获取命令文件(按时间排序) + + # Collect command files sorted by modification time so older commands are handled first. command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -191,7 +189,7 @@ class IPCHandler: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send an IPC response for a command.""" response = { "command_id": command_id, "status": status, @@ -203,8 +201,8 @@ class IPCHandler: response_file = os.path.join(self.responses_dir, f"{command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Remove the command file once a response has been written so it isn't re-processed. command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -212,29 +210,25 @@ class IPCHandler: pass async def handle_interview(self, command_id: str, agent_id: int, prompt: str) -> bool: - """ - 处理单个Agent采访命令 - + """Handle a single-agent interview command. + Returns: - True 表示成功,False 表示失败 + True on success, False on failure. """ try: - # 获取Agent agent = self.agent_graph.get_agent(agent_id) - - # 创建Interview动作 + interview_action = ManualAction( action_type=ActionType.INTERVIEW, action_args={"prompt": prompt} ) - - # 执行Interview + actions = {agent: interview_action} await self.env.step(actions) - - # 从数据库获取结果 + + # Read the interview answer back from the simulation database. result = self._get_interview_result(agent_id) - + self.send_response(command_id, "completed", result=result) print(f" Interview完成: agent_id={agent_id}") return True @@ -246,17 +240,15 @@ class IPCHandler: return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) -> bool: - """ - 处理批量采访命令 - + """Handle a batch interview command. + Args: interviews: [{"agent_id": int, "prompt": str}, ...] """ try: - # 构建动作字典 actions = {} - agent_prompts = {} # 记录每个agent的prompt - + agent_prompts = {} # Track which prompt was sent to each agent so results can be paired back. + for interview in interviews: agent_id = interview.get("agent_id") prompt = interview.get("prompt", "") @@ -274,11 +266,9 @@ class IPCHandler: if not actions: self.send_response(command_id, "failed", error="没有有效的Agent") return False - - # 执行批量Interview + await self.env.step(actions) - - # 获取所有结果 + results = {} for agent_id in agent_prompts.keys(): result = self._get_interview_result(agent_id) @@ -298,7 +288,7 @@ class IPCHandler: return False def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Fetch the most recent interview result for an agent from the database.""" db_path = os.path.join(self.simulation_dir, "reddit_simulation.db") result = { @@ -313,8 +303,8 @@ class IPCHandler: try: conn = sqlite3.connect(db_path) cursor = conn.cursor() - - # 查询最新的Interview记录 + + # Query the most recent interview row for this agent. cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -341,11 +331,10 @@ class IPCHandler: return result async def process_commands(self) -> bool: - """ - 处理所有待处理命令 - + """Process all pending IPC commands. + Returns: - True 表示继续运行,False 表示应该退出 + True to keep running, False if the loop should exit. """ command = self.poll_command() if not command: @@ -383,9 +372,9 @@ class IPCHandler: class RedditSimulationRunner: - """Reddit模拟运行器""" - - # Reddit可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) + """Reddit simulation runner.""" + + # Available Reddit actions (INTERVIEW is excluded because it can only be triggered via ManualAction). AVAILABLE_ACTIONS = [ ActionType.LIKE_POST, ActionType.DISLIKE_POST, @@ -403,12 +392,11 @@ class RedditSimulationRunner: ] def __init__(self, config_path: str, wait_for_commands: bool = True): - """ - 初始化模拟运行器 - + """Initialize the simulation runner. + Args: - config_path: 配置文件路径 (simulation_config.json) - wait_for_commands: 模拟完成后是否等待命令(默认True) + config_path: Path to the configuration file (simulation_config.json). + wait_for_commands: Whether to wait for commands after the simulation finishes (default True). """ self.config_path = config_path self.config = self._load_config() @@ -419,37 +407,36 @@ class RedditSimulationRunner: self.ipc_handler = None def _load_config(self) -> Dict[str, Any]: - """加载配置文件""" + """Load the configuration file.""" with open(self.config_path, 'r', encoding='utf-8') as f: return json.load(f) - + def _get_profile_path(self) -> str: - """获取Profile文件路径""" + """Return the path to the agent profiles file.""" return os.path.join(self.simulation_dir, "reddit_profiles.json") - + def _get_db_path(self) -> str: - """获取数据库路径""" + """Return the path to the simulation database.""" return os.path.join(self.simulation_dir, "reddit_simulation.db") - + def _create_model(self): + """Create the LLM model. + + Configuration is sourced from the project-root ``.env`` file (highest priority): + - LLM_API_KEY: API key. + - LLM_BASE_URL: API base URL. + - LLM_MODEL_NAME: Model name. """ - 创建LLM模型 - - 统一使用项目根目录 .env 文件中的配置(优先级最高): - - LLM_API_KEY: API密钥 - - LLM_BASE_URL: API基础URL - - LLM_MODEL_NAME: 模型名称 - """ - # 优先从 .env 读取配置 + # Prefer values from .env over the per-simulation config. llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") - - # 如果 .env 中没有,则使用 config 作为备用 + + # Fall back to the simulation config file if .env did not specify a model. if not llm_model: llm_model = self.config.get("llm_model", "gpt-4o-mini") - - # 设置 camel-ai 所需的环境变量 + + # Export the env vars camel-ai expects. if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key @@ -472,9 +459,7 @@ class RedditSimulationRunner: current_hour: int, round_num: int ) -> List: - """ - 根据时间和配置决定本轮激活哪些Agent - """ + """Decide which agents are active for the current round, based on time of day and config.""" time_config = self.config.get("time_config", {}) agent_configs = self.config.get("agent_configs", []) @@ -521,10 +506,10 @@ class RedditSimulationRunner: return active_agents async def run(self, max_rounds: int = None): - """运行Reddit模拟 - + """Run the Reddit simulation. + Args: - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + max_rounds: Optional cap on the number of simulation rounds (used to truncate overly long runs). """ print("=" * 60) print("OASIS Reddit模拟") @@ -538,7 +523,7 @@ class RedditSimulationRunner: minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // minutes_per_round - # 如果指定了最大轮数,则截断 + # Truncate if a max_rounds cap was supplied. if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) @@ -578,17 +563,16 @@ class RedditSimulationRunner: agent_graph=self.agent_graph, platform=oasis.DefaultPlatformType.REDDIT, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Cap concurrent LLM requests to avoid overloading the API. ) await self.env.reset() print("环境初始化完成\n") - # 初始化IPC处理器 self.ipc_handler = IPCHandler(self.simulation_dir, self.env, self.agent_graph) self.ipc_handler.update_status("running") - - # 执行初始事件 + + # Apply the configured initial events (seed posts) before starting the main loop. event_config = self.config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) @@ -619,7 +603,7 @@ class RedditSimulationRunner: await self.env.step(initial_actions) print(f" 已发布 {len(initial_actions)} 条初始帖子") - # 主模拟循环 + # Main simulation loop. print("\n开始模拟循环...") start_time = datetime.now() @@ -655,7 +639,7 @@ class RedditSimulationRunner: print(f" - 总耗时: {total_elapsed:.1f}秒") print(f" - 数据库: {db_path}") - # 是否进入等待命令模式 + # Optionally enter command-wait mode. if self.wait_for_commands: print("\n" + "=" * 60) print("进入等待命令模式 - 环境保持运行") @@ -664,7 +648,7 @@ class RedditSimulationRunner: self.ipc_handler.update_status("alive") - # 等待命令循环(使用全局 _shutdown_event) + # Command-wait loop driven by the global _shutdown_event. try: while not _shutdown_event.is_set(): should_continue = await self.ipc_handler.process_commands() @@ -672,7 +656,7 @@ class RedditSimulationRunner: break try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # Shutdown signal received. except asyncio.TimeoutError: pass except KeyboardInterrupt: @@ -683,8 +667,7 @@ class RedditSimulationRunner: print(f"\n命令处理出错: {e}") print("\n关闭环境...") - - # 关闭环境 + self.ipc_handler.update_status("stopped") await self.env.close() @@ -715,7 +698,7 @@ async def main(): args = parser.parse_args() - # 在 main 函数开始时创建 shutdown 事件 + # Create the shutdown event lazily here so it is bound to the running asyncio loop. global _shutdown_event _shutdown_event = asyncio.Event() @@ -723,7 +706,7 @@ async def main(): print(f"错误: 配置文件不存在: {args.config}") sys.exit(1) - # 初始化日志配置(使用固定文件名,清理旧日志) + # Initialize log config with fixed filenames; old logs are cleared inside setup_oasis_logging. simulation_dir = os.path.dirname(args.config) or "." setup_oasis_logging(os.path.join(simulation_dir, "log")) @@ -735,9 +718,9 @@ async def main(): def setup_signal_handlers(): - """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 - 让程序有机会正常清理资源(关闭数据库、环境等) + """Install signal handlers so SIGTERM/SIGINT trigger a graceful exit. + + This gives the program a chance to clean up resources (close the database, the OASIS environment, etc.). """ def signal_handler(signum, frame): global _cleanup_done @@ -748,7 +731,7 @@ def setup_signal_handlers(): if _shutdown_event: _shutdown_event.set() else: - # 重复收到信号才强制退出 + # Force exit only on a repeat signal so the user can still hard-kill if cleanup hangs. print("强制退出...") sys.exit(1) diff --git a/backend/scripts/run_twitter_simulation.py b/backend/scripts/run_twitter_simulation.py index caab9e9d..4e96e06b 100644 --- a/backend/scripts/run_twitter_simulation.py +++ b/backend/scripts/run_twitter_simulation.py @@ -1,16 +1,18 @@ """ -OASIS Twitter模拟预设脚本 -此脚本读取配置文件中的参数来执行模拟,实现全程自动化 +OASIS Twitter simulation preset script. -功能特性: -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +This script reads parameters from a config file to run a fully automated simulation. -使用方式: +Features: +- Does not close the environment immediately when the simulation finishes; enters + command-wait mode instead. +- Receives Interview commands over IPC. +- Supports both single-agent and batch interviews. +- Supports a remote close-environment command. + +Usage: python run_twitter_simulation.py --config /path/to/simulation_config.json - python run_twitter_simulation.py --config /path/to/simulation_config.json --no-wait # 完成后立即关闭 + python run_twitter_simulation.py --config /path/to/simulation_config.json --no-wait # close immediately when done """ import argparse @@ -25,18 +27,17 @@ import sqlite3 from datetime import datetime from typing import Dict, Any, List, Optional -# 全局变量:用于信号处理 +# Globals used by the signal handler. _shutdown_event = None _cleanup_done = False -# 添加项目路径 _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load the project-root .env (it carries LLM_API_KEY and friends). from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): @@ -51,7 +52,7 @@ import re class UnicodeFormatter(logging.Formatter): - """自定义格式化器,将 Unicode 转义序列转换为可读字符""" + """Custom formatter that turns Unicode escape sequences into readable characters.""" UNICODE_ESCAPE_PATTERN = re.compile(r'\\u([0-9a-fA-F]{4})') @@ -68,24 +69,23 @@ class UnicodeFormatter(logging.Formatter): class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" - + """Suppress camel-ai's max_tokens warning — we intentionally leave it unset and let the model decide.""" + def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Install the filter at import time so it is active before any camel code runs. logging.getLogger().addFilter(MaxTokensWarningFilter()) def setup_oasis_logging(log_dir: str): - """配置 OASIS 的日志,使用固定名称的日志文件""" + """Configure OASIS logging with fixed log filenames.""" os.makedirs(log_dir, exist_ok=True) - - # 清理旧的日志文件 + + # Wipe stale log files from previous runs. for f in os.listdir(log_dir): old_log = os.path.join(log_dir, f) if os.path.isfile(old_log) and f.endswith('.log'): @@ -131,21 +131,21 @@ except ImportError as e: sys.exit(1) -# IPC相关常量 +# IPC-related constants. IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants.""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" class IPCHandler: - """IPC命令处理器""" - + """Handles IPC commands directed at the running simulation.""" + def __init__(self, simulation_dir: str, env, agent_graph): self.simulation_dir = simulation_dir self.env = env @@ -154,13 +154,12 @@ class IPCHandler: self.responses_dir = os.path.join(simulation_dir, IPC_RESPONSES_DIR) self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) self._running = True - - # 确保目录存在 + os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - + def update_status(self, status: str): - """更新环境状态""" + """Write the current environment status to the status file.""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -168,11 +167,11 @@ class IPCHandler: }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for the next pending command.""" if not os.path.exists(self.commands_dir): return None - - # 获取命令文件(按时间排序) + + # Collect command files ordered by mtime. command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -191,7 +190,7 @@ class IPCHandler: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send a response for a processed command.""" response = { "command_id": command_id, "status": status, @@ -203,8 +202,8 @@ class IPCHandler: response_file = os.path.join(self.responses_dir, f"{command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - - # 删除命令文件 + + # Remove the command file once a response has been written. command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -212,27 +211,23 @@ class IPCHandler: pass async def handle_interview(self, command_id: str, agent_id: int, prompt: str) -> bool: - """ - 处理单个Agent采访命令 - + """Handle a single-agent interview command. + Returns: - True 表示成功,False 表示失败 + True on success, False on failure. """ try: - # 获取Agent agent = self.agent_graph.get_agent(agent_id) - - # 创建Interview动作 + interview_action = ManualAction( action_type=ActionType.INTERVIEW, action_args={"prompt": prompt} ) - - # 执行Interview + actions = {agent: interview_action} await self.env.step(actions) - - # 从数据库获取结果 + + # Pull the resulting transcript from the simulation database. result = self._get_interview_result(agent_id) self.send_response(command_id, "completed", result=result) @@ -246,17 +241,15 @@ class IPCHandler: return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) -> bool: - """ - 处理批量采访命令 - + """Handle a batch interview command. + Args: interviews: [{"agent_id": int, "prompt": str}, ...] """ try: - # 构建动作字典 actions = {} - agent_prompts = {} # 记录每个agent的prompt - + agent_prompts = {} # Track the prompt issued to each agent for later result lookup. + for interview in interviews: agent_id = interview.get("agent_id") prompt = interview.get("prompt", "") @@ -274,11 +267,10 @@ class IPCHandler: if not actions: self.send_response(command_id, "failed", error="没有有效的Agent") return False - - # 执行批量Interview + await self.env.step(actions) - - # 获取所有结果 + + # Collect the per-agent interview results. results = {} for agent_id in agent_prompts.keys(): result = self._get_interview_result(agent_id) @@ -298,7 +290,7 @@ class IPCHandler: return False def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Fetch the most recent interview result for an agent from the database.""" db_path = os.path.join(self.simulation_dir, "twitter_simulation.db") result = { @@ -313,8 +305,8 @@ class IPCHandler: try: conn = sqlite3.connect(db_path) cursor = conn.cursor() - - # 查询最新的Interview记录 + + # Pull the most recent INTERVIEW trace row for this agent. cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -341,11 +333,10 @@ class IPCHandler: return result async def process_commands(self) -> bool: - """ - 处理所有待处理命令 - + """Process pending commands. + Returns: - True 表示继续运行,False 表示应该退出 + True if the run loop should continue, False if it should exit. """ command = self.poll_command() if not command: @@ -383,9 +374,9 @@ class IPCHandler: class TwitterSimulationRunner: - """Twitter模拟运行器""" - - # Twitter可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) + """Drives a single Twitter simulation run.""" + + # Available Twitter actions. INTERVIEW is intentionally excluded — it can only be triggered via ManualAction. AVAILABLE_ACTIONS = [ ActionType.CREATE_POST, ActionType.LIKE_POST, @@ -396,12 +387,11 @@ class TwitterSimulationRunner: ] def __init__(self, config_path: str, wait_for_commands: bool = True): - """ - 初始化模拟运行器 - + """Initialize the simulation runner. + Args: - config_path: 配置文件路径 (simulation_config.json) - wait_for_commands: 模拟完成后是否等待命令(默认True) + config_path: Path to the config file (simulation_config.json). + wait_for_commands: Whether to wait for IPC commands after the simulation completes (default True). """ self.config_path = config_path self.config = self._load_config() @@ -412,37 +402,36 @@ class TwitterSimulationRunner: self.ipc_handler = None def _load_config(self) -> Dict[str, Any]: - """加载配置文件""" + """Load the simulation config file.""" with open(self.config_path, 'r', encoding='utf-8') as f: return json.load(f) - + def _get_profile_path(self) -> str: - """获取Profile文件路径(OASIS Twitter使用CSV格式)""" + """Return the agent profile path (OASIS Twitter expects CSV).""" return os.path.join(self.simulation_dir, "twitter_profiles.csv") - + def _get_db_path(self) -> str: - """获取数据库路径""" + """Return the simulation SQLite database path.""" return os.path.join(self.simulation_dir, "twitter_simulation.db") - + def _create_model(self): + """Create the LLM model. + + Uses the project-root .env file (highest precedence): + - LLM_API_KEY: API key + - LLM_BASE_URL: API base URL + - LLM_MODEL_NAME: model name """ - 创建LLM模型 - - 统一使用项目根目录 .env 文件中的配置(优先级最高): - - LLM_API_KEY: API密钥 - - LLM_BASE_URL: API基础URL - - LLM_MODEL_NAME: 模型名称 - """ - # 优先从 .env 读取配置 + # Prefer values from .env. llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") - - # 如果 .env 中没有,则使用 config 作为备用 + + # Fall back to the simulation config if .env did not provide a model name. if not llm_model: llm_model = self.config.get("llm_model", "gpt-4o-mini") - - # 设置 camel-ai 所需的环境变量 + + # camel-ai reads OPENAI_API_KEY from the environment. if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key @@ -465,25 +454,24 @@ class TwitterSimulationRunner: current_hour: int, round_num: int ) -> List: - """ - 根据时间和配置决定本轮激活哪些Agent - + """Decide which agents activate this round, based on time and config. + Args: - env: OASIS环境 - current_hour: 当前模拟小时(0-23) - round_num: 当前轮数 - + env: The OASIS environment. + current_hour: Current simulated hour (0-23). + round_num: Current round number. + Returns: - 激活的Agent列表 + The list of agents activated this round. """ time_config = self.config.get("time_config", {}) agent_configs = self.config.get("agent_configs", []) - - # 基础激活数量 + + # Base activation count per round. base_min = time_config.get("agents_per_hour_min", 5) base_max = time_config.get("agents_per_hour_max", 20) - - # 根据时段调整 + + # Adjust by time-of-day (peak vs. off-peak hours). peak_hours = time_config.get("peak_hours", [9, 10, 11, 14, 15, 20, 21, 22]) off_peak_hours = time_config.get("off_peak_hours", [0, 1, 2, 3, 4, 5]) @@ -495,29 +483,27 @@ class TwitterSimulationRunner: multiplier = 1.0 target_count = int(random.uniform(base_min, base_max) * multiplier) - - # 根据每个Agent的配置计算激活概率 + + # Compute activation probability for each configured agent. candidates = [] for cfg in agent_configs: agent_id = cfg.get("agent_id", 0) active_hours = cfg.get("active_hours", list(range(8, 23))) activity_level = cfg.get("activity_level", 0.5) - - # 检查是否在活跃时间 + if current_hour not in active_hours: continue - - # 根据活跃度计算概率 + if random.random() < activity_level: candidates.append(agent_id) - - # 随机选择 + + # Pick a random subset of the eligible candidates. selected_ids = random.sample( - candidates, + candidates, min(target_count, len(candidates)) ) if candidates else [] - - # 转换为Agent对象 + + # Resolve IDs to Agent objects. active_agents = [] for agent_id in selected_ids: try: @@ -529,10 +515,10 @@ class TwitterSimulationRunner: return active_agents async def run(self, max_rounds: int = None): - """运行Twitter模拟 - + """Run the Twitter simulation. + Args: - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + max_rounds: Optional cap on the number of rounds, used to truncate overly long simulations. """ print("=" * 60) print("OASIS Twitter模拟") @@ -540,16 +526,14 @@ class TwitterSimulationRunner: print(f"模拟ID: {self.config.get('simulation_id', 'unknown')}") print(f"等待命令模式: {'启用' if self.wait_for_commands else '禁用'}") print("=" * 60) - - # 加载时间配置 + time_config = self.config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) - - # 计算总轮数 + total_rounds = (total_hours * 60) // minutes_per_round - - # 如果指定了最大轮数,则截断 + + # Truncate to max_rounds when one was supplied. if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) @@ -563,12 +547,11 @@ class TwitterSimulationRunner: if max_rounds: print(f" - 最大轮数限制: {max_rounds}") print(f" - Agent数量: {len(self.config.get('agent_configs', []))}") - - # 创建模型 + print("\n初始化LLM模型...") model = self._create_model() - - # 加载Agent图 + + # Load the agent graph from the profile CSV. print("加载Agent Profile...") profile_path = self._get_profile_path() if not os.path.exists(profile_path): @@ -581,29 +564,27 @@ class TwitterSimulationRunner: available_actions=self.AVAILABLE_ACTIONS, ) - # 数据库路径 + # Reset the simulation database for a clean run. db_path = self._get_db_path() if os.path.exists(db_path): os.remove(db_path) print(f"已删除旧数据库: {db_path}") - - # 创建环境 + print("创建OASIS环境...") self.env = oasis.make( agent_graph=self.agent_graph, platform=oasis.DefaultPlatformType.TWITTER, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Cap concurrent LLM requests to avoid API overload. ) await self.env.reset() print("环境初始化完成\n") - - # 初始化IPC处理器 + self.ipc_handler = IPCHandler(self.simulation_dir, self.env, self.agent_graph) self.ipc_handler.update_status("running") - - # 执行初始事件 + + # Run the initial seeded events (kickoff posts). event_config = self.config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) @@ -625,35 +606,32 @@ class TwitterSimulationRunner: if initial_actions: await self.env.step(initial_actions) print(f" 已发布 {len(initial_actions)} 条初始帖子") - - # 主模拟循环 + + # Main simulation loop. print("\n开始模拟循环...") start_time = datetime.now() - + for round_num in range(total_rounds): - # 计算当前模拟时间 + # Map round number to simulated wall-clock time. simulated_minutes = round_num * minutes_per_round simulated_hour = (simulated_minutes // 60) % 24 simulated_day = simulated_minutes // (60 * 24) + 1 - - # 获取本轮激活的Agent + active_agents = self._get_active_agents_for_round( self.env, simulated_hour, round_num ) - + if not active_agents: continue - - # 构建动作 + actions = { agent: LLMAction() for _, agent in active_agents } - - # 执行动作 + await self.env.step(actions) - - # 打印进度 + + # Periodic progress log. if (round_num + 1) % 10 == 0 or round_num == 0: elapsed = (datetime.now() - start_time).total_seconds() progress = (round_num + 1) / total_rounds * 100 @@ -667,7 +645,7 @@ class TwitterSimulationRunner: print(f" - 总耗时: {total_elapsed:.1f}秒") print(f" - 数据库: {db_path}") - # 是否进入等待命令模式 + # Optionally enter command-wait mode. if self.wait_for_commands: print("\n" + "=" * 60) print("进入等待命令模式 - 环境保持运行") @@ -675,8 +653,8 @@ class TwitterSimulationRunner: print("=" * 60) self.ipc_handler.update_status("alive") - - # 等待命令循环(使用全局 _shutdown_event) + + # Command-wait loop, driven by the global _shutdown_event. try: while not _shutdown_event.is_set(): should_continue = await self.ipc_handler.process_commands() @@ -684,7 +662,7 @@ class TwitterSimulationRunner: break try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # Shutdown signal received. except asyncio.TimeoutError: pass except KeyboardInterrupt: @@ -695,8 +673,7 @@ class TwitterSimulationRunner: print(f"\n命令处理出错: {e}") print("\n关闭环境...") - - # 关闭环境 + self.ipc_handler.update_status("stopped") await self.env.close() @@ -726,16 +703,16 @@ async def main(): ) args = parser.parse_args() - - # 在 main 函数开始时创建 shutdown 事件 + + # Create the shutdown event inside the running event loop. global _shutdown_event _shutdown_event = asyncio.Event() - + if not os.path.exists(args.config): print(f"错误: 配置文件不存在: {args.config}") sys.exit(1) - - # 初始化日志配置(使用固定文件名,清理旧日志) + + # Initialize logging with fixed filenames; old logs are wiped. simulation_dir = os.path.dirname(args.config) or "." setup_oasis_logging(os.path.join(simulation_dir, "log")) @@ -747,9 +724,11 @@ async def main(): def setup_signal_handlers(): - """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 - 让程序有机会正常清理资源(关闭数据库、环境等) + """Install signal handlers so SIGTERM/SIGINT trigger an orderly shutdown. + + The handler gives the program a chance to clean up resources properly + (closing the database, the OASIS environment, etc.) on the first signal, + and only force-exits on a repeated signal. """ def signal_handler(signum, frame): global _cleanup_done @@ -760,7 +739,7 @@ def setup_signal_handlers(): if _shutdown_event: _shutdown_event.set() else: - # 重复收到信号才强制退出 + # Force exit only on a repeat signal. print("强制退出...") sys.exit(1) diff --git a/backend/scripts/test_profile_format.py b/backend/scripts/test_profile_format.py index 354e8b5c..5e312e60 100644 --- a/backend/scripts/test_profile_format.py +++ b/backend/scripts/test_profile_format.py @@ -1,8 +1,8 @@ -""" -测试Profile格式生成是否符合OASIS要求 -验证: -1. Twitter Profile生成CSV格式 -2. Reddit Profile生成JSON详细格式 +"""Profile-format generation tests for OASIS compatibility. + +Verifies that: +1. Twitter profiles serialize to CSV format. +2. Reddit profiles serialize to detailed JSON format. """ import os @@ -11,19 +11,19 @@ import json import csv import tempfile -# 添加项目路径 +# Add the project root to sys.path so the ``app`` package resolves. sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app.services.oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile def test_profile_formats(): - """测试Profile格式""" + """Exercise both profile-format outputs end-to-end.""" print("=" * 60) print("OASIS Profile格式测试") print("=" * 60) - - # 创建测试Profile数据 + + # Build a small set of test profiles. test_profiles = [ OasisAgentProfile( user_id=0, @@ -62,18 +62,18 @@ def test_profile_formats(): ] generator = OasisProfileGenerator.__new__(OasisProfileGenerator) - - # 使用临时目录 + + # Use a temp directory for the test fixtures. with tempfile.TemporaryDirectory() as temp_dir: twitter_path = os.path.join(temp_dir, "twitter_profiles.csv") reddit_path = os.path.join(temp_dir, "reddit_profiles.json") - - # 测试Twitter CSV格式 + + # Twitter CSV format. print("\n1. 测试Twitter Profile (CSV格式)") print("-" * 40) generator._save_twitter_csv(test_profiles, twitter_path) - - # 读取并验证CSV + + # Read back and verify the CSV. with open(twitter_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) rows = list(reader) @@ -85,8 +85,8 @@ def test_profile_formats(): for key, value in rows[0].items(): print(f" {key}: {value}") - # 验证必需字段 - required_twitter_fields = ['user_id', 'user_name', 'name', 'bio', + # Verify the required fields are present. + required_twitter_fields = ['user_id', 'user_name', 'name', 'bio', 'friend_count', 'follower_count', 'statuses_count', 'created_at'] missing = set(required_twitter_fields) - set(rows[0].keys()) if missing: @@ -94,12 +94,12 @@ def test_profile_formats(): else: print(f"\n [通过] 所有必需字段都存在") - # 测试Reddit JSON格式 + # Reddit JSON format. print("\n2. 测试Reddit Profile (JSON详细格式)") print("-" * 40) generator._save_reddit_json(test_profiles, reddit_path) - - # 读取并验证JSON + + # Read back and verify the JSON. with open(reddit_path, 'r', encoding='utf-8') as f: reddit_data = json.load(f) @@ -109,7 +109,7 @@ def test_profile_formats(): print(f"\n 示例数据 (第1条):") print(json.dumps(reddit_data[0], ensure_ascii=False, indent=4)) - # 验证详细格式字段 + # Verify the detailed Reddit format fields. required_reddit_fields = ['realname', 'username', 'bio', 'persona'] optional_reddit_fields = ['age', 'gender', 'mbti', 'country', 'profession', 'interested_topics'] @@ -128,7 +128,7 @@ def test_profile_formats(): def show_expected_formats(): - """显示OASIS期望的格式""" + """Print the canonical OASIS-expected profile formats for reference.""" print("\n" + "=" * 60) print("OASIS 期望的Profile格式参考") print("=" * 60)