From 7f74e0a3f8301117488c9e9fdcbcb77062f729dc Mon Sep 17 00:00:00 2001 From: Dominik Seemann Date: Thu, 7 May 2026 23:36:42 +0000 Subject: [PATCH] fix(i18n): translate oasis profile generator prompts to english MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The oasis_profile_generator.py system prompt, both user-message templates (individual + group personas), the context-builder section labels embedded into the prompt context, the fallback persona templates, and the per-batch console output banners were all written in Chinese. Even when Accept-Language was en, the Chinese base prompt and embedded section labels biased the LLM toward Chinese persona output. Translate every owned prompt-assembly literal to English while preserving all functional contracts: f-string interpolations, the required JSON output keys, the gender/age literal-token rules, the get_language_instruction() postfix call sites, the _normalize_gender mapping (which still accepts Chinese gender keys from upstream), and the rule-based country: "中国" data default. Logger calls, docstrings, and inline comments are out of scope (issues #6 / #7) and were not touched. 
Closes #25 --- .../design.md | 317 ++++++++++++++++++ .../requirements.md | 168 ++++++++++ .../spec.json | 27 ++ .../tasks.md | 92 +++++ .../app/services/oasis_profile_generator.py | 180 +++++----- 5 files changed, 694 insertions(+), 90 deletions(-) create mode 100644 .kiro/specs/i18n-oasis-profile-generator-prompts/design.md create mode 100644 .kiro/specs/i18n-oasis-profile-generator-prompts/requirements.md create mode 100644 .kiro/specs/i18n-oasis-profile-generator-prompts/spec.json create mode 100644 .kiro/specs/i18n-oasis-profile-generator-prompts/tasks.md diff --git a/.kiro/specs/i18n-oasis-profile-generator-prompts/design.md b/.kiro/specs/i18n-oasis-profile-generator-prompts/design.md new file mode 100644 index 00000000..e15a6a60 --- /dev/null +++ b/.kiro/specs/i18n-oasis-profile-generator-prompts/design.md @@ -0,0 +1,317 @@ +# Design Document — i18n-oasis-profile-generator-prompts + +## Overview + +**Purpose**: Translate the Chinese prompt strings, context-builder section labels, fallback persona templates, and console-output formatting in `backend/app/services/oasis_profile_generator.py` to English while preserving every functional contract — LLM JSON output schema, the `_normalize_gender` mapping that must continue to accept Chinese gender values, the `_generate_profile_rule_based` default `country: "中国"` data value, all f-string interpolations, and the `get_language_instruction()` locale-postfix mechanism. The goal is to remove the Chinese-language base-prompt and context-label bias that currently leaks Chinese structure and word choice into OASIS profile output even when `Accept-Language: en`. + +**Users**: MiroFish operators running the Step 2 OASIS profile generation under any locale; downstream OASIS / CAMEL-OASIS consumers of the agent JSON / CSV produced by `OasisProfileGenerator`. 
+ +**Impact**: Replaces approximately one base-prompt string, two large user-message templates, nine context-builder section labels, three fallback persona templates, and ten console-output strings with English equivalents inside one file. No API surface change. No new dependencies. No new files. Callers (`backend/app/api/simulation.py`, etc.) and OASIS consumers are unaffected. + +### Goals + +- Zero CJK characters in any prompt string literal contributed by `oasis_profile_generator.py` to the system prompt, the user message, or the context block. +- Zero CJK characters in any console-output literal in `_print_generated_profile` and the surrounding banners. +- English `bio` / `persona` output under `Accept-Language: en`. +- Continued Chinese `bio` / `persona` output under `Accept-Language: zh`, of equivalent quality to the pre-change behaviour. +- No diff to public signatures, dataclass schema, LLM-call parameters, or call sites. + +### Non-Goals + +- Externalizing prompts to `/locales/*.json` (out of scope per ticket and consistent with `i18n-ontology-generator-prompts`). +- Translating logger calls in this file (covered by issue #6). +- Translating module/class/method docstrings or inline comments in this file (covered by issue #7). +- Refactoring the OASIS profile JSON schema, the OASIS adapter, or the simulation flow. +- Modifying the `_normalize_gender` mapping table (it must keep accepting Chinese gender keys). +- Modifying the `_generate_profile_rule_based` default `"中国"` country value (data, not prompt). +- Modifying the `ValueError("LLM_API_KEY 未配置")` raise (covered by issue #6). +- Modifying `backend/app/utils/locale.py`, the locale registries, or any non-target file. + +## Boundary Commitments + +### This Spec Owns + +- The English content of the `base_prompt` string in `OasisProfileGenerator._get_system_prompt` (line 664). +- The English content of every string literal in `OasisProfileGenerator._build_individual_persona_prompt` (lines 677–714). 
+- The English content of every string literal in `OasisProfileGenerator._build_group_persona_prompt` (lines 726–762). +- The English content of the section-label literals embedded in `OasisProfileGenerator._search_zep_for_entity` (lines 384, 390, 392) and `OasisProfileGenerator._build_entity_context` (lines 422, 438, 440, 443, 463, 472, 475). +- The English content of the fallback persona templates in `OasisProfileGenerator._generate_profile_with_llm` (line 547) and `OasisProfileGenerator._try_fix_json` (lines 644, 659). +- The English content of the no-attributes / no-context placeholder literals (`"无"`, `"无额外上下文"`) at lines 677, 678, 726, 727. +- The English content of every string literal in `OasisProfileGenerator._print_generated_profile` (lines 1011, 1017, 1019, 1022, 1025, 1026, 1027, 1028) and the surrounding banners in `OasisProfileGenerator.generate_profiles_from_entities` (lines 945, 1001). + +### Out of Boundary + +- Locale resolution machinery (`backend/app/utils/locale.py`). +- Per-locale `llmInstruction` definitions (`/locales/languages.json`). +- Reasoning-model output stripping (`backend/app/utils/llm_client.py`). +- All `logger.*` calls (already keyed via `t("log.profile_generator.*")`; covered by issue #6). +- Module / class / method docstrings and inline comments (covered by issue #7), including the inline comments at lines 65, 93, 641, 804–807, 816–819. +- The `_normalize_gender` mapping table (lines 1123–1132) — must continue to accept Chinese gender keys from upstream. +- The hard-coded `country: "中国"` default in `_generate_profile_rule_based` (lines 807, 819) — this is a data value, not a prompt. +- The `ValueError("LLM_API_KEY 未配置")` raise (line 194) — covered by issue #6. +- All callers of `OasisProfileGenerator`, including `backend/app/api/simulation.py`. +- Tests, scripts, and frontend code. 
+ +### Allowed Dependencies + +- Existing `get_language_instruction`, `get_locale`, `set_locale`, `t` imports from `..utils.locale` (already imported; unchanged). +- Existing `OpenAI` SDK invocation (unchanged). +- No new imports. + +### Revalidation Triggers + +The following changes elsewhere would invalidate this design and require revisiting the prompt: + +- A change to the JSON contract emitted by the LLM (`bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`). +- A change to `OasisAgentProfile` field semantics. +- A change to `get_language_instruction()` semantics or the per-locale `llmInstruction` strings. +- A change to OASIS / CAMEL-OASIS profile field expectations (e.g. if `gender` accepts more than `male` / `female` / `other`). + +## Architecture + +### Existing Architecture Analysis + +`OasisProfileGenerator` lives in `backend/app/services/`, follows the in-process service pattern with bounded thread-pool fan-out for batched profile generation, and is invoked from `backend/app/api/simulation.py` inside a background `Task`. It depends on: + +- `OpenAI` SDK for the LLM call. +- `GraphitiAdapter` (legacy `zep_client` field name) for the Zep / Graphiti graph search. +- `get_language_instruction()` for locale steering. +- `t()` for already-keyed log strings. + +The relevant flow is: + +1. The Flask handler resolves the request locale via `Accept-Language`; the locale is propagated to thread-pool workers via the `set_locale(current_locale)` capture in `generate_profiles_from_entities` (line 914). +2. For each entity, `_build_entity_context()` is called: it composes a context block by concatenating headed sub-sections (entity attributes, related facts/edges, related node summaries, Graphiti-search facts, Graphiti-search nodes). Some of these labels are currently in Chinese. +3. The context string is interpolated into the user-message template by either `_build_individual_persona_prompt` or `_build_group_persona_prompt`. 
Both templates are currently in Chinese, with English `gender` token directives interleaved. +4. The system prompt is built by `_get_system_prompt`: a Chinese base prompt followed by the locale-appropriate `get_language_instruction()`. +5. The two messages are sent to `chat.completions.create` with `response_format={"type": "json_object"}`. The result flows through `json.loads` → `_try_fix_json` → `_fix_truncated_json` fallback chain. Synthesized fallback personas use the Chinese template `f"{entity_name}是一个{entity_type}。"` if the LLM result is unusable. +6. After per-profile completion, `_print_generated_profile` writes a Chinese-headed banner to stdout, and `generate_profiles_from_entities` writes Chinese batch banners. + +This design preserves all of the above structurally. The change is purely lexical inside the seven regions of one file. + +### Architecture Pattern & Boundary Map + +```mermaid +graph TB + Caller[simulation.py handler] + Generator[OasisProfileGenerator] + Locale[locale.get_language_instruction] + Graph[GraphitiAdapter graph.search] + LLM[OpenAI chat.completions] + + Caller -->|generate_profiles_from_entities| Generator + Generator -->|build context block| Generator + Generator -->|read locale postfix| Locale + Generator -->|search facts/nodes| Graph + Generator -->|JSON request| LLM + LLM -->|raw JSON| Generator + Generator -->|OasisAgentProfile| Caller +``` + +**Architecture Integration**: + +- Selected pattern: **In-place lexical translation** of seven regions of an existing service. No structural change. +- Domain/feature boundaries: locale machinery vs. prompt assembly vs. LLM transport remain cleanly separated. +- Existing patterns preserved: prompt-as-f-string user-message construction; Chinese-keyed `_normalize_gender` mapping; `t(...)` for log strings; `get_language_instruction()` postfix concatenation. +- New components rationale: none — no new components. 
+- Steering compliance: matches the established `i18n-*-prompts` family pattern (issues #2, #3, #4, #5) of in-place translation rather than `t()` keying for prompt bodies. Respects the steering note that "existing files mix English and Chinese in comments/docstrings — preserve both; do not translate one into the other unless asked." This ticket is the explicit ask for prompt strings, scoped to exclude comments/docstrings. + +### Technology Stack + +| Layer | Choice / Version | Role in Feature | Notes | +|-------|------------------|-----------------|-------| +| Backend / Services | Python 3.11+ | Hosts `OasisProfileGenerator` | Existing — unchanged. | +| Backend / Services | `openai` SDK | Issues the prompt; returns JSON | Existing — unchanged. | +| Backend / Services | `backend/app/utils/locale.py` | Resolves `Accept-Language` → `llmInstruction` postfix | Existing — unchanged. | +| Backend / Services | `GraphitiAdapter` | Provides Graphiti graph search facts/nodes | Existing — unchanged. | + +No new dependencies. No version changes. + +## File Structure Plan + +### Modified Files + +- `backend/app/services/oasis_profile_generator.py` — Replace the body of `_get_system_prompt` `base_prompt`; replace every Chinese string literal in `_build_individual_persona_prompt` and `_build_group_persona_prompt` with English equivalents; replace the three section labels in `_search_zep_for_entity` and the six section labels in `_build_entity_context`; replace the three fallback persona templates; replace the two `"无"` / `"无额外上下文"` placeholders; replace the console-output literals in `_print_generated_profile` and the two `print(...)` banners in `generate_profiles_from_entities`. Preserve every other character of the file. + +No new files. No deletions. No moves. + +## System Flows + +The control-flow diagram in *Architecture Pattern & Boundary Map* covers the relevant flow; no additional diagrams are needed for this string-literal change. 
+ +## Requirements Traceability + +| Requirement | Summary | Components | Interfaces | Flows | +|-------------|---------|------------|------------|-------| +| 1.1–1.4 | English `_get_system_prompt` `base_prompt`; preserve `get_language_instruction()` site | OasisProfileGenerator → `_get_system_prompt` | None changed | Architecture diagram | +| 2.1–2.9 | English `_build_individual_persona_prompt`; preserve interpolations and JSON keys | OasisProfileGenerator → `_build_individual_persona_prompt` | f-string interpolation | n/a | +| 3.1–3.9 | English `_build_group_persona_prompt`; preserve fixed-value rules and interpolations | OasisProfileGenerator → `_build_group_persona_prompt` | f-string interpolation | n/a | +| 4.1–4.10 | English context-builder section labels | OasisProfileGenerator → `_search_zep_for_entity`, `_build_entity_context` | Prompt-only | n/a | +| 5.1–5.3 | English fallback persona templates | OasisProfileGenerator → `_generate_profile_with_llm`, `_try_fix_json` | None changed | n/a | +| 6.1–6.7 | English console-output formatting | OasisProfileGenerator → `_print_generated_profile`, `generate_profiles_from_entities` | None changed | n/a | +| 7.1–7.4 | Locale switching preserved via `get_language_instruction()` | OasisProfileGenerator + Locale | `get_language_instruction()` | Architecture diagram | +| 8.1–8.6 | Public API and call-site stability; preserve `_normalize_gender` and `country: "中国"` data default | OasisProfileGenerator (signatures, dataclass) | Public surface | n/a | +| 9.1–9.3 | Reasoning-model compatibility | OasisProfileGenerator → `chat.completions.create` + `_try_fix_json` | OpenAI SDK | Architecture diagram | +| 10.1–10.7 | Out-of-scope surfaces untouched | OasisProfileGenerator (boundary commitment) | n/a | n/a | + +## Components and Interfaces + +| Component | Domain/Layer | Intent | Req Coverage | Key Dependencies (P0/P1) | Contracts | +|-----------|--------------|--------|--------------|--------------------------|-----------| +| 
OasisProfileGenerator (modified) | Backend / Service | Render English profile-generation prompts and context labels; preserve all behaviour | 1.1–10.7 | `OpenAI.chat.completions.create` (P0), `get_language_instruction` (P0), `GraphitiAdapter.graph.search` (P1), `_normalize_gender` (P0) | Service | + +### Backend / Service + +#### OasisProfileGenerator (modified) + +| Field | Detail | +|-------|--------| +| Intent | Translate prompt strings, context labels, fallback persona templates, and console output to English while preserving every functional contract. | +| Requirements | 1.1, 1.2, 1.3, 1.4, 2.1–2.9, 3.1–3.9, 4.1–4.10, 5.1–5.3, 6.1–6.7, 7.1–7.4, 8.1–8.6, 9.1–9.3, 10.1–10.7 | + +**Responsibilities & Constraints** + +- Owns: the English wording of the system prompt body, the two user-message templates, the context-builder section labels, the fallback persona templates, the no-attributes / no-context placeholders, and the console-output formatting. +- Domain boundary: prompt content and proximate console output only. Does not own locale resolution, transport, validation, or data values like the OASIS `country` default. +- Invariants: + - All seven owned regions after translation MUST contain zero CJK characters. + - The translated user-message templates MUST present the same eight required JSON keys: `bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`. + - The translated individual-persona template MUST require `gender ∈ {"male", "female"}` and `age` to be a valid integer. + - The translated group-persona template MUST require `age == 30` and `gender == "other"`. + - The translated user-message templates MUST preserve the f-string interpolations: `{entity_name}`, `{entity_type}`, `{entity_summary}`, `{attrs_str}`, `{context_str}`, `{get_language_instruction()}`. + - The translated context-builder labels MUST preserve the section structure (heading + bulleted body). 
+ - The translated fallback persona templates MUST preserve the `entity_summary or template` priority order. + - The call to `get_language_instruction()` MUST remain at its current locations. + - The call to `self.client.chat.completions.create(...)` MUST remain unchanged. + - All public signatures, dataclass schema, and the private helper signatures MUST remain unchanged. + - All `logger.*` calls (already keyed) and inline comments and docstrings in this file MUST remain unchanged (out of scope per #6 and #7). + - The `_normalize_gender` mapping table MUST remain unchanged. + - The rule-based `country: "中国"` default MUST remain unchanged. + +**Dependencies** + +- Inbound: `backend/app/api/simulation.py` — production caller (P0). +- Outbound: `backend/app/utils/locale.get_language_instruction` — locale postfix (P0); `backend/app/utils/locale.t` — already-keyed log strings (P0); `backend/app/services/graphiti_adapter.GraphitiAdapter.graph.search` — facts/nodes retrieval (P1); `OpenAI.chat.completions.create` — JSON LLM transport (P0). +- External: none. + +**Contracts**: Service [x] / API [ ] / Event [ ] / Batch [ ] / State [ ] + +##### Service Interface + +The public Python interface is unchanged. Representative signatures: + +```python +class OasisProfileGenerator: + def __init__( + self, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + model_name: Optional[str] = None, + zep_api_key: Optional[str] = None, + graph_id: Optional[str] = None, + ) -> None: ... + + def generate_profile_from_entity( + self, + entity: EntityNode, + user_id: int, + use_llm: bool = True, + ) -> OasisAgentProfile: ... + + def generate_profiles_from_entities( + self, + entities: List[EntityNode], + use_llm: bool = True, + progress_callback: Optional[callable] = None, + graph_id: Optional[str] = None, + parallel_count: int = 5, + realtime_output_path: Optional[str] = None, + output_platform: str = "reddit", + ) -> List[OasisAgentProfile]: ... 
+ + def save_profiles( + self, + profiles: List[OasisAgentProfile], + file_path: str, + platform: str = "reddit", + ) -> None: ... +``` + +- Preconditions: a configured LLM provider; a configured Graphiti / Neo4j graph; a non-empty `entities` list when batching. +- Postconditions: `OasisAgentProfile` instances with English `bio` and `persona` under locale `en`, Chinese under locale `zh`, and structurally equivalent across locales. +- Invariants: see *Responsibilities & Constraints*. + +**Implementation Notes** + +- **Integration**: No new imports. No call-site changes. The diff is confined to seven regions of one file. +- **Validation**: After implementation, run a targeted regex check (`[\u4e00-\u9fff]`, the CJK Unified Ideographs block) over the seven owned regions to confirm zero CJK; smoke-test `_build_individual_persona_prompt(...)` and `_build_group_persona_prompt(...)` with representative inputs to confirm interpolations still work; round-trip a single profile end-to-end under both `en` and `zh` locales. +- **Risks**: English-base bias on Chinese-locale output (mitigated by the `llmInstruction` postfix already present in both system and user messages). Reduced LLM compliance with `gender ∈ {male, female}` for individual entities (mitigated by retaining the explicit English-token directive verbatim in the rules block). + +## Data Models + +No data-model changes. The `OasisAgentProfile` dataclass is preserved verbatim. + +## Error Handling + +### Error Strategy + +Error handling is unchanged from the existing implementation: + +- LLM transport errors propagate from `chat.completions.create`. +- Truncation (`finish_reason == "length"`) is repaired by `_fix_truncated_json`. +- Invalid JSON falls through to `_try_fix_json`, then to a synthesized fallback profile (now with English persona text). +- Per-entity exceptions are caught and a fallback `OasisAgentProfile` is constructed with English fallback strings. 
+ +### Error Categories and Responses + +- **User errors (4xx)**: not applicable at this layer; surfaced by the API handler. +- **System errors (5xx)**: LLM/network failures propagate to the API handler, which converts them to JSON error responses. +- **Business logic errors**: malformed JSON is auto-repaired or replaced with a fallback profile. + +### Monitoring + +Existing `logger.*` calls (keyed via `t("log.profile_generator.*")`) cover progress and warnings; no new monitoring is added. + +## Testing Strategy + +### Unit Tests + +Given the project's intentionally minimal test harness (`backend/scripts/test_profile_format.py` only), the change is verified via: + +- **Static check**: a one-shot regex assertion against the patched module ensuring zero CJK characters in the seven owned regions. This can be a quick `python -c` invocation during PR review. +- **Round-trip smoke test**: instantiate `OasisProfileGenerator()`, call `_build_individual_persona_prompt(...)` and `_build_group_persona_prompt(...)` with representative inputs, and verify all required interpolations appear in the output and no CJK characters remain. +- **Fallback rendering**: simulate a JSON parse failure and verify the English fallback persona template is produced. + +### Integration Tests + +- **Step 2 profile generation under EN locale**: run a small batched profile generation against a real Graphiti graph with locale `en`. Verify produced profiles have English `bio` / `persona` and pass the existing OASIS profile-format check. + +### E2E/UI Tests + +Not applicable — change does not affect frontend. + +### Performance/Load + +Not applicable — token counts may differ slightly between Chinese and English renderings, but the LLM call has no `max_tokens` cap and remains within provider-acceptable limits. + +## Optional Sections + +### Security Considerations + +Not applicable. Translation does not introduce new authentication, authorization, data-handling, or input-validation paths. 
+ +### Performance & Scalability + +Not applicable. + +### Migration Strategy + +Not applicable. The change is a single in-place edit; no data migration. Rollback is `git revert`. + +## Supporting References + +- `backend/app/services/oasis_profile_generator.py` — current Chinese prompt content (the source of translation). +- `backend/app/utils/locale.py` — locale resolver. +- `backend/app/api/simulation.py` — call site. +- `.kiro/specs/i18n-ontology-generator-prompts/design.md` — adjacent reference design for in-place prompt translation. +- `.ticket/25.md` — ticket snapshot. diff --git a/.kiro/specs/i18n-oasis-profile-generator-prompts/requirements.md b/.kiro/specs/i18n-oasis-profile-generator-prompts/requirements.md new file mode 100644 index 00000000..cb2443d3 --- /dev/null +++ b/.kiro/specs/i18n-oasis-profile-generator-prompts/requirements.md @@ -0,0 +1,168 @@ +# Requirements Document + +## Introduction + +This specification covers the English translation of the LLM-prompt assembly strings in `backend/app/services/oasis_profile_generator.py`. The file generates OASIS Agent profiles (bio, persona, demographics) from Graphiti/Zep entities during pipeline Step 2. Today, the system prompt and the two user-message builders (`_build_individual_persona_prompt`, `_build_group_persona_prompt`) are written in Chinese, and the runtime context-builders (`_search_zep_for_entity`, `_build_entity_context`) embed Chinese section labels (`事实信息:`, `相关实体:`, `### 实体属性`, `### 关联实体信息`, etc.) into the prompt context that is later interpolated into the user message. Locale is steered at runtime by appending `get_language_instruction()` to the system message and the user-message rules block, but the base-prompt language and the embedded context labels bias the LLM toward Chinese output even when `Accept-Language: en`. Translating the prompt body and the context labels removes that bias while preserving the existing locale-switching mechanism for non-English locales. 
+ +This work tracks GitHub issue [#25](https://github.com/salestech-group/MiroFish/issues/25). + +## Boundary Context + +- **In scope**: + - Translating the system-prompt base string in `_get_system_prompt` (`base_prompt = "你是社交媒体用户画像生成专家..."`). + - Translating the user-message body in `_build_individual_persona_prompt` (header line, field labels, JSON-field descriptions, "重要" rules block). + - Translating the user-message body in `_build_group_persona_prompt` (header line, field labels, JSON-field descriptions, "重要" rules block). + - Translating the placeholder values used inside those builders: `"无"` and `"无额外上下文"` (substituted when an entity has no attributes or no context). + - Translating the section-heading labels prepended to context fragments by `_search_zep_for_entity` (`"相关实体: "` prefix on node-name labels; `"事实信息:"`, `"相关实体:"` block headings). + - Translating the section-heading labels prepended to context fragments by `_build_entity_context` (`"### 实体属性"`, `"### 相关事实和关系"`, `"### 关联实体信息"`, `"### Zep检索到的事实信息"`, `"### Zep检索到的相关节点"`, plus the inline `(相关实体)` placeholder in edge-direction fragments). + - Translating the fallback persona templates (`f"{entity_name}是一个{entity_type}。"`) used when LLM JSON parsing fails or fields are missing. + - Translating the console-output formatting in `_print_generated_profile` (the `【简介】`, `【详细人设】`, `【基本属性】` headings and the `用户名:`, `年龄:`, `性别:`, `MBTI:`, `职业:`, `国家:`, `兴趣话题:` row labels) and the surrounding `print` banners in `generate_profiles_from_entities` (`开始生成Agent人设...`, `人设生成完成!...`). + - Translating the `'无'` sentinel emitted when `interested_topics` is empty in `_print_generated_profile`. 
+ - Preserving all functional contracts: f-string interpolations, JSON output schema, `get_language_instruction()` postfix call sites, `_normalize_gender` mappings (Chinese `男`/`女`/`机构`/`其他` keys remain — input data may still arrive in those forms), the `country: "中国"` rule-based default in `_generate_profile_rule_based`, the `OASIS 库要求字段名为 username(无下划线)` inline comments at lines 65 and 93 (these are code-level documentation, owned by issue #7), and the `# 可能被截断` / `# 机构虚拟年龄` etc. inline comments (owned by issue #7). +- **Out of scope**: + - Logger calls in this file (covered by issue #6 and the in-flight #24/#25 backend-log work — the logger calls already use `t("log.profile_generator.*")` keys). + - Module/class/method docstrings and inline code comments (covered by issue #7 — including the `# OASIS 库要求字段名为 username` and `# 机构虚拟年龄` style comments). + - The `_normalize_gender` mapping table (it must continue to accept Chinese gender inputs that may still arrive from upstream LLM output or user-supplied data). + - The hard-coded `"中国"` rule-based country default (this is a data value that downstream OASIS expects in a free-form `country` field; changing the default is a data migration, not a translation). + - The Chinese identifier in the `ValueError("LLM_API_KEY 未配置")` raise — that is an exception message, not a prompt fragment, and will be translated under issue #6 (already partially in progress under #24). + - Externalising prompt strings to `/locales/*.json` (out of scope per the `i18n-*-prompts` family of tickets — same pattern as issues #2/#3/#4/#5). + - Editing call sites of `OasisProfileGenerator` (`api/simulation.py`, etc.). + - Editing `backend/app/utils/locale.py`, the locale registries, or `/locales/`. +- **Adjacent expectations**: + - The OASIS / CAMEL-OASIS simulation layer must continue to consume profile JSON unchanged. No coupling to prompt language exists in the OASIS adapter. 
+ - The locale resolution chain (`Accept-Language` header → `get_locale()` → `get_language_instruction()`) is owned by `backend/app/utils/locale.py` and is unchanged by this work. Translating the base prompt does not modify locale resolution semantics. + - Companion i18n issues (#3, #4, #5, #6, #7, #9, #10, #23, #24, #26) operate on different files or scopes and should not be touched here. + +## Requirements + +### Requirement 1: English Translation of the Profile-Generation System Prompt + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: en`, I want the profile-generation system prompt to be authored in English, so that the LLM's persona output is not biased toward Chinese structure or word choice. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall define `base_prompt` (in `_get_system_prompt`) containing zero CJK characters in any string-literal content. +2. The OASIS Profile Generator shall preserve the system-prompt requirement that the model returns valid JSON whose string values do not contain unescaped newline characters. +3. The OASIS Profile Generator shall preserve the call to `get_language_instruction()` appended to `base_prompt`, exactly at the existing concatenation site, so locale steering continues to work for non-English locales. +4. The OASIS Profile Generator shall preserve the `is_individual` parameter of `_get_system_prompt` and continue to return a single concatenated system-prompt string of the form `"{base_prompt}\n\n{language_instruction}"`. 
+ +### Requirement 2: English Translation of the Individual-Persona User-Message Template + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: en`, I want the individual-persona user-message template constructed by `_build_individual_persona_prompt` to be authored in English, so that the rendered prompt does not interleave English instructions with Chinese section headings, and the LLM is not biased toward Chinese output. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall render the individual-persona user message with English field labels in place of `实体名称`, `实体类型`, `实体摘要`, `实体属性`, and `上下文信息`. +2. The OASIS Profile Generator shall render the JSON-field descriptions (the `请生成JSON,包含以下字段` enumeration) in English while preserving the eight required output keys verbatim by name (`bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`). +3. The OASIS Profile Generator shall preserve the requirement language that `gender` MUST be the literal English token `"male"` or `"female"` for individual entities, and that `age` MUST be a valid integer. +4. The OASIS Profile Generator shall preserve the trailing rules block (the `重要:` enumeration) in English, conveying the same constraints: all field values must be strings or numbers, no embedded newlines; persona must be a coherent single text block; the `gender` field uses English `male`/`female`; content must remain consistent with the entity information; `age` must be a valid integer. +5. The OASIS Profile Generator shall preserve the call to `get_language_instruction()` interpolated into the rules block. +6. The OASIS Profile Generator shall preserve all f-string interpolations verbatim by name and position: `{entity_name}`, `{entity_type}`, `{entity_summary}`, `{attrs_str}`, `{context_str}`, `{get_language_instruction()}`. +7. 
The OASIS Profile Generator shall replace the no-attributes placeholder `"无"` with the English `"None"` when `entity_attributes` is empty / falsy, and the no-context placeholder `"无额外上下文"` with an English equivalent (e.g. `"No additional context"`) when `context` is empty / falsy. +8. The OASIS Profile Generator shall return zero CJK characters across all string literals contributed by `_build_individual_persona_prompt`. +9. The OASIS Profile Generator shall preserve the existing `country` field instruction semantics (a free-form country name is requested) but replace the example `"中国"` with a locale-neutral English phrasing that does not bias the model toward any single country (e.g. `Free-form country name`). + +### Requirement 3: English Translation of the Group/Institution-Persona User-Message Template + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: en`, I want the group-persona user-message template constructed by `_build_group_persona_prompt` to be authored in English, with the same scope and contract as Requirement 2 but for institutional entities. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall render the group-persona user message with English field labels in place of `实体名称`, `实体类型`, `实体摘要`, `实体属性`, and `上下文信息`. +2. The OASIS Profile Generator shall render the JSON-field descriptions (the `请生成JSON,包含以下字段` enumeration) in English while preserving the eight required output keys verbatim by name (`bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`). +3. The OASIS Profile Generator shall preserve the fixed-value requirements: `age` MUST be the integer literal `30`; `gender` MUST be the literal English token `"other"`. +4. 
The OASIS Profile Generator shall preserve the trailing rules block (the `重要:` enumeration) in English, conveying the same constraints: all field values must be strings or numbers (no nulls); persona must be a coherent single text block (no embedded newlines); the `gender` field uses English `"other"`; `age` must be the integer `30`; the institutional account's voice must match its identity. +5. The OASIS Profile Generator shall preserve the call to `get_language_instruction()` interpolated into the rules block. +6. The OASIS Profile Generator shall preserve all f-string interpolations verbatim by name and position: `{entity_name}`, `{entity_type}`, `{entity_summary}`, `{attrs_str}`, `{context_str}`, `{get_language_instruction()}`. +7. The OASIS Profile Generator shall use the same English placeholders as Requirement 2 for the no-attributes and no-context cases. +8. The OASIS Profile Generator shall return zero CJK characters across all string literals contributed by `_build_group_persona_prompt`. +9. The OASIS Profile Generator shall preserve the existing `country` field instruction with a locale-neutral English phrasing (matching Requirement 2.9). + +### Requirement 4: English Translation of the Context-Builder Section Labels + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: en`, I want the section labels embedded in the context string by `_search_zep_for_entity` and `_build_entity_context` to be in English, so that the prompt context block interpolated into the user message is fully English and the LLM is not biased toward Chinese output by the context labels. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall render the related-node prefix (currently `"相关实体: "`) in English (e.g. `"Related entity: "`) in `_search_zep_for_entity`. +2. The OASIS Profile Generator shall render the facts block heading (currently `"事实信息:"`) in English (e.g. `"Facts:"`) in `_search_zep_for_entity`. +3. 
The OASIS Profile Generator shall render the related-entities block heading (currently `"相关实体:"`) in English (e.g. `"Related entities:"`) in `_search_zep_for_entity`. +4. The OASIS Profile Generator shall render the entity-attributes section heading (currently `"### 实体属性"`) in English (e.g. `"### Entity attributes"`) in `_build_entity_context`. +5. The OASIS Profile Generator shall render the related-facts/relationships section heading (currently `"### 相关事实和关系"`) in English (e.g. `"### Related facts and relationships"`) in `_build_entity_context`. +6. The OASIS Profile Generator shall render the related-entity-information section heading (currently `"### 关联实体信息"`) in English (e.g. `"### Related entity information"`) in `_build_entity_context`. +7. The OASIS Profile Generator shall render the Zep-retrieved facts section heading (currently `"### Zep检索到的事实信息"`) in English (e.g. `"### Facts retrieved from the graph"`) in `_build_entity_context`. +8. The OASIS Profile Generator shall render the Zep-retrieved related-nodes section heading (currently `"### Zep检索到的相关节点"`) in English (e.g. `"### Related nodes retrieved from the graph"`) in `_build_entity_context`. +9. The OASIS Profile Generator shall render the inline edge-direction placeholder (currently `(相关实体)`) in English (e.g. `(related entity)`) in both outgoing and incoming branches of `_build_entity_context`. +10. The OASIS Profile Generator shall return zero CJK characters across all section-label string literals contributed by `_search_zep_for_entity` and `_build_entity_context`. + +### Requirement 5: English Translation of the Fallback Persona Templates + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: en`, when the LLM JSON parse fails or returns missing fields and the code falls back to a synthesized persona template, I want the fallback persona to be in English so that the resulting profile JSON does not contain unintended Chinese strings. + +#### Acceptance Criteria + +1. 
The OASIS Profile Generator shall replace the fallback persona template `f"{entity_name}是一个{entity_type}。"` at every occurrence (currently at the persona-validation branch in `_generate_profile_with_llm` line 547, the regex-extraction branch in `_try_fix_json` line 644, and the catastrophic-failure branch line 659) with an English equivalent (e.g. `f"{entity_name} is a {entity_type}."`). +2. The OASIS Profile Generator shall preserve the priority order of the fallback chain (`entity_summary or template`). +3. The OASIS Profile Generator shall return zero CJK characters across all fallback persona literals. + +### Requirement 6: English Translation of the Console-Output Formatting + +**Objective:** As a MiroFish operator monitoring profile generation in the console under `Accept-Language: en`, I want the per-profile diagnostic banner and the start/end batch banners to be in English so that the entire console stream is consistent with the requested locale. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall render the per-profile section headings in English in `_print_generated_profile`: `【简介】` → `[Bio]`, `【详细人设】` → `[Persona]`, `【基本属性】` → `[Basic attributes]` (or equivalent English markers). +2. The OASIS Profile Generator shall render the per-profile row labels in English in `_print_generated_profile`: `用户名:` → `Username:`, `年龄:` → `Age:`, `性别:` → `Gender:`, `职业:` → `Profession:`, `国家:` → `Country:`, `兴趣话题:` → `Interested topics:`. +3. The OASIS Profile Generator shall replace the empty-topics sentinel `'无'` in `_print_generated_profile` with an English equivalent (e.g. `'None'`). +4. 
The OASIS Profile Generator shall render the start-of-batch and end-of-batch banners in `generate_profiles_from_entities` in English: `开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}` → `Generating agent profiles — {total} entities, parallel: {parallel_count}` (or equivalent); `人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent` → `Profile generation complete — produced {len([p for p in profiles if p])} agents` (or equivalent). +5. The OASIS Profile Generator shall preserve all f-string interpolations in the banners verbatim (`{total}`, `{parallel_count}`, the count expression). +6. The OASIS Profile Generator shall return zero CJK characters across all string literals contributed by `_print_generated_profile` and the surrounding `print(...)` banners in `generate_profiles_from_entities`. +7. The OASIS Profile Generator shall continue to use the existing `t('progress.profileGenerated', ...)` key for the per-profile heading row, since that key is already locale-keyed via the `t()` helper. + +### Requirement 7: Locale Switching Continues to Work via `get_language_instruction()` + +**Objective:** As a MiroFish operator running the pipeline under `Accept-Language: zh` (or any other configured non-English locale), I want the profile output to remain in the requested locale of equivalent quality, so that translating the base prompt does not regress non-English support. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall preserve the call to `get_language_instruction()` exactly at its existing locations (currently inside `_get_system_prompt` and inside both `_build_individual_persona_prompt` and `_build_group_persona_prompt` rules blocks), continuing to read locale via the existing thread-local / request-header resolution chain. +2. When the locale is `zh`, the OASIS Profile Generator shall produce profile JSON whose `bio` and `persona` fields are in Chinese, equivalent in quality to the pre-change behaviour. +3. 
When the locale is `en`, the OASIS Profile Generator shall produce profile JSON whose `bio` and `persona` fields are in English. +4. The OASIS Profile Generator shall not alter `backend/app/utils/locale.py`, the `_languages` registry, the `_translations` registries, or the locales under `/locales/`. + +### Requirement 8: Public API and Call-Site Stability + +**Objective:** As a developer maintaining the rest of the MiroFish backend pipeline, I want the public surface of `OasisProfileGenerator` to remain unchanged, so that the simulation pipeline and existing callers continue to work without modification. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall preserve the signatures of `OasisProfileGenerator.__init__`, `generate_profile_from_entity`, `generate_profiles_from_entities`, `set_graph_id`, `save_profiles`, and `save_profiles_to_json`. +2. The OASIS Profile Generator shall preserve the signatures of all private helpers, including `_generate_profile_with_llm`, `_build_individual_persona_prompt`, `_build_group_persona_prompt`, `_get_system_prompt`, `_build_entity_context`, `_search_zep_for_entity`, `_print_generated_profile`, `_normalize_gender`, `_save_twitter_csv`, `_save_reddit_json`, `_try_fix_json`, `_fix_truncated_json`, `_is_individual_entity`, `_is_group_entity`, `_generate_profile_rule_based`, `_generate_username`. +3. The OASIS Profile Generator shall preserve the return shape of `generate_profile_from_entity` (a populated `OasisAgentProfile` dataclass instance) and `generate_profiles_from_entities` (a `List[OasisAgentProfile]`). +4. The OASIS Profile Generator shall preserve the LLM invocation parameters (`response_format={"type": "json_object"}`, the `temperature=0.7 - (attempt * 0.1)` schedule, the absence of `max_tokens`) and the call to `self.client.chat.completions.create(...)`. +5. 
The OASIS Profile Generator shall preserve the `_normalize_gender` mapping table verbatim (the Chinese keys `男`, `女`, `机构`, `其他` continue to accept upstream Chinese input). +6. The OASIS Profile Generator shall preserve the rule-based `country: "中国"` default in `_generate_profile_rule_based` (this is a data value, not a prompt; changing it is out of scope per the boundary commitments). + +### Requirement 9: Reasoning-Model Output Compatibility + +**Objective:** As a MiroFish operator using a reasoning-model provider (e.g. MiniMax, GLM with `<think>` tags or markdown code fences), I want JSON parsing of the profile response to continue working, so that translating the base prompt does not regress provider compatibility. + +#### Acceptance Criteria + +1. The OASIS Profile Generator shall continue to call `self.client.chat.completions.create(...)` with `response_format={"type": "json_object"}` and parse the response via the existing `json.loads` / `_try_fix_json` / `_fix_truncated_json` chain unchanged. +2. The OASIS Profile Generator shall not introduce any new pre-processing of the LLM response that depends on prompt language. +3. The fallback persona templates from Requirement 5 shall be safe to embed in JSON (no embedded raw newlines, balanced quotes). + +### Requirement 10: Out-of-Scope Surfaces Remain Untouched + +**Objective:** As a reviewer of this PR, I want the change to remain narrowly scoped to prompt strings and the immediately-adjacent context labels and console output, so that translation responsibilities for adjacent surfaces (issues #6 and #7) are not absorbed into this change. + +#### Acceptance Criteria + +1. The change shall not modify any `logger.warning(...)`, `logger.info(...)`, `logger.error(...)`, or `logger.debug(...)` call in `oasis_profile_generator.py` (covered by issues #6 / #24 / #25-style backend-log work — the calls already use `t("log.profile_generator.*")`). +2. 
The change shall not modify the module docstring, class docstrings, method docstrings, or inline comments in `oasis_profile_generator.py` (covered by issue #7) — including the inline comments at lines 65, 93, 641, 804–807, 816–819, etc. +3. The change shall not modify the `_normalize_gender` mapping table (Chinese gender keys must remain to handle upstream input). +4. The change shall not modify the rule-based `country: "中国"` default in `_generate_profile_rule_based`. +5. The change shall not modify the `ValueError("LLM_API_KEY 未配置")` raise (covered by issue #6). +6. The change shall not edit any file outside `backend/app/services/oasis_profile_generator.py` for production code, except for adding test fixtures or scripts under a clearly-isolated directory if a verification harness is needed. +7. The change shall not introduce a new dependency or modify `backend/pyproject.toml` / `backend/uv.lock`. diff --git a/.kiro/specs/i18n-oasis-profile-generator-prompts/spec.json b/.kiro/specs/i18n-oasis-profile-generator-prompts/spec.json new file mode 100644 index 00000000..cf2d0cbf --- /dev/null +++ b/.kiro/specs/i18n-oasis-profile-generator-prompts/spec.json @@ -0,0 +1,27 @@ +{ + "feature_name": "i18n-oasis-profile-generator-prompts", + "created_at": "2026-05-07T22:50:00Z", + "updated_at": "2026-05-07T22:50:00Z", + "language": "en", + "phase": "tasks-generated", + "approvals": { + "requirements": { + "generated": true, + "approved": true + }, + "design": { + "generated": true, + "approved": true + }, + "tasks": { + "generated": true, + "approved": true + } + }, + "ready_for_implementation": true, + "ticket": { + "number": 25, + "url": "https://github.com/salestech-group/MiroFish/issues/25", + "snapshot": ".ticket/25.md" + } +} diff --git a/.kiro/specs/i18n-oasis-profile-generator-prompts/tasks.md b/.kiro/specs/i18n-oasis-profile-generator-prompts/tasks.md new file mode 100644 index 00000000..04f865d5 --- /dev/null +++ 
b/.kiro/specs/i18n-oasis-profile-generator-prompts/tasks.md @@ -0,0 +1,92 @@ +# Implementation Plan + +- [ ] 1. Translate the system-prompt base string in `_get_system_prompt` + - Replace the body of `base_prompt` (currently `"你是社交媒体用户画像生成专家。生成详细、真实的人设用于舆论模拟,最大程度还原已有现实情况。必须返回有效的JSON格式,所有字符串值不能包含未转义的换行符。"`) with an English equivalent that preserves the same intent: define the LLM as an expert social-media-persona generator; require detailed, realistic personas grounded in supplied context; require valid JSON output; forbid unescaped newlines in string values + - Preserve the trailing `f"{base_prompt}\n\n{get_language_instruction()}"` concatenation site exactly + - Preserve the `is_individual` parameter (still accepted, still unused — no signature change) + - Observable completion: `_get_system_prompt(...)` returns an English-only base prompt followed by the locale-appropriate `get_language_instruction()` postfix + - _Requirements: 1.1, 1.2, 1.3, 1.4_ + +- [ ] 2. Translate the individual-persona user-message template in `_build_individual_persona_prompt` + - Replace the introductory line (`"为实体生成详细的社交媒体用户人设,..."`) with an English equivalent + - Replace the field-label rows (`实体名称`, `实体类型`, `实体摘要`, `实体属性`, `上下文信息`) with English equivalents + - Replace the `请生成JSON,包含以下字段:` enumeration block with an English equivalent that preserves the eight required output keys verbatim by name (`bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`) + - Translate the per-field guidance: `bio` is a 200-character social-media bio; `persona` is a coherent ~2000-character text containing basic info, background, personality (with MBTI), social-media behavior, stance, distinctive traits, and event-specific memories; `age` must be an integer; `gender` must be the literal English token `"male"` or `"female"`; `mbti` is an MBTI four-letter code; `country` is a free-form country name; `profession` is a free-form occupation; `interested_topics` is a list of 
topics + - Replace the trailing `重要:` rules block with an English equivalent: all field values must be strings or numbers, no embedded newlines; persona must be a coherent single text block; `gender` must use English `male`/`female`; content must remain consistent with the entity information; `age` must be a valid integer + - Preserve the call to `get_language_instruction()` interpolated into the rules block + - Replace the `attrs_str` no-attributes placeholder `"无"` with `"None"` (or English equivalent) at line 677 + - Replace the `context_str` no-context placeholder `"无额外上下文"` with `"No additional context"` (or English equivalent) at line 678 + - Preserve every f-string interpolation by name and position: `{entity_name}`, `{entity_type}`, `{entity_summary}`, `{attrs_str}`, `{context_str}`, `{get_language_instruction()}` + - Observable completion: `_build_individual_persona_prompt(...)` produces an English-only message body for any input combination, with zero CJK characters in any string literal it contributes; under the same inputs as before, all interpolated values still appear in the rendered output + - _Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9_ + +- [ ] 3. 
Translate the group-persona user-message template in `_build_group_persona_prompt` + - Replace the introductory line (`"为机构/群体实体生成详细的社交媒体账号设定,..."`) with an English equivalent + - Replace the field-label rows (`实体名称`, `实体类型`, `实体摘要`, `实体属性`, `上下文信息`) with English equivalents (matching task 2) + - Replace the `请生成JSON,包含以下字段:` enumeration block with an English equivalent that preserves the eight required output keys verbatim by name (`bio`, `persona`, `age`, `gender`, `mbti`, `country`, `profession`, `interested_topics`) + - Translate the per-field guidance: `bio` is a polished ~200-character official-account bio; `persona` is a coherent ~2000-character text covering institutional background, account positioning, voice, content patterns, official stance, distinctive traits, and event-specific memories; `age` must be the integer literal `30`; `gender` must be the literal English token `"other"`; `mbti` describes account voice; `country` is a free-form country name; `profession` is the institution's role; `interested_topics` is a list of focus areas + - Replace the trailing `重要:` rules block with an English equivalent: all field values must be strings or numbers (no nulls); persona must be a coherent single text block (no embedded newlines); `gender` must use English `"other"`; `age` must be the integer `30`; the institutional account's voice must match its identity + - Preserve the call to `get_language_instruction()` interpolated into the rules block + - Replace the `attrs_str` and `context_str` placeholders the same way as in task 2 (lines 726, 727) + - Preserve every f-string interpolation by name and position + - Observable completion: `_build_group_persona_prompt(...)` produces an English-only message body for any input combination, with zero CJK characters; under the same inputs as before, all interpolated values still appear in the rendered output + - _Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9_ + +- [ ] 4. 
Translate the section labels in `_search_zep_for_entity` and `_build_entity_context` + - Replace the related-node prefix `f"相关实体: {node.name}"` with an English equivalent (e.g. `f"Related entity: {node.name}"`) at line 384 + - Replace the facts block heading `"事实信息:\n"` with `"Facts:\n"` (or equivalent) at line 390 + - Replace the related-entities block heading `"相关实体:\n"` with `"Related entities:\n"` (or equivalent) at line 392 + - Replace the entity-attributes section heading `"### 实体属性\n"` with `"### Entity attributes\n"` (or equivalent) at line 422 + - Replace the inline edge-direction placeholder `(相关实体)` with `(related entity)` (or equivalent) at lines 438 and 440 (both outgoing and incoming branches) + - Replace the related-facts/relationships section heading `"### 相关事实和关系\n"` with `"### Related facts and relationships\n"` (or equivalent) at line 443 + - Replace the related-entity-information section heading `"### 关联实体信息\n"` with `"### Related entity information\n"` (or equivalent) at line 463 + - Replace the Zep-retrieved facts section heading `"### Zep检索到的事实信息\n"` with `"### Facts retrieved from the graph\n"` (or equivalent) at line 472 + - Replace the Zep-retrieved related-nodes section heading `"### Zep检索到的相关节点\n"` with `"### Related nodes retrieved from the graph\n"` (or equivalent) at line 475 + - Preserve the structure (heading + bulleted body, joined by `"\n".join(...)`) + - Observable completion: the context string returned by `_build_entity_context(...)` contains zero CJK characters in section labels for any input + - _Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 4.10_ + +- [ ] 5. 
Translate the fallback persona templates + - Replace `f"{entity_name}是一个{entity_type}。"` with `f"{entity_name} is a {entity_type}."` (or equivalent) at line 547 (`_generate_profile_with_llm`, missing-persona branch) + - Replace the same template at line 644 (`_try_fix_json`, regex-extraction branch) + - Replace the same template at line 659 (`_try_fix_json`, catastrophic-failure branch) + - Preserve the `entity_summary or template` priority order at every site + - Observable completion: when the LLM fails JSON parse and the fallback template is invoked, the resulting `persona` value is English + - _Requirements: 5.1, 5.2, 5.3_ + +- [ ] 6. Translate the console-output formatting in `_print_generated_profile` and the surrounding banners + - Replace the section headings in `_print_generated_profile`: `f"【简介】"` → English equivalent (e.g. `"[Bio]"`), `f"【详细人设】"` → English equivalent (e.g. `"[Persona]"`), `f"【基本属性】"` → English equivalent (e.g. `"[Basic attributes]"`) + - Replace the row labels in `_print_generated_profile`: `f"用户名: {profile.user_name}"` → `f"Username: {profile.user_name}"`, `f"年龄: {profile.age} | 性别: {profile.gender} | MBTI: {profile.mbti}"` → `f"Age: {profile.age} | Gender: {profile.gender} | MBTI: {profile.mbti}"`, `f"职业: {profile.profession} | 国家: {profile.country}"` → `f"Profession: {profile.profession} | Country: {profile.country}"`, `f"兴趣话题: {topics_str}"` → `f"Interested topics: {topics_str}"` + - Replace the empty-topics sentinel `'无'` with `'None'` (or equivalent) at line 1011 + - Replace the start-of-batch banner in `generate_profiles_from_entities` (currently `f"开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}"` at line 945) with an English equivalent (e.g. `f"Generating agent profiles — {total} entities, parallel: {parallel_count}"`) + - Replace the end-of-batch banner (currently `f"人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent"` at line 1001) with an English equivalent (e.g. 
`f"Profile generation complete — produced {len([p for p in profiles if p])} agents"`) + - Preserve all f-string interpolations + - Preserve the existing `t('progress.profileGenerated', name=entity_name, type=entity_type)` call (already locale-keyed) + - Observable completion: the console output stream contains zero CJK characters in literals contributed by `_print_generated_profile` and the two batch banners (the entity name itself may still contain CJK because it is data, not a literal) + - _Requirements: 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7_ + +- [ ] 7. Confirm boundary commitments around the translation + - Confirm `logger.warning(...)`, `logger.info(...)`, `logger.error(...)`, `logger.debug(...)` calls and their `t("log.profile_generator.*")` keys in this file are unchanged + - Confirm the module/class/method docstrings and inline comments are unchanged (including lines 65, 93, 641, 804–807, 816–819) + - Confirm `_normalize_gender` mapping table (Chinese keys `男`/`女`/`机构`/`其他`) is unchanged + - Confirm the rule-based `country: "中国"` default at lines 807, 819 is unchanged + - Confirm the `ValueError("LLM_API_KEY 未配置")` raise at line 194 is unchanged + - Confirm public signatures (`__init__`, `generate_profile_from_entity`, `generate_profiles_from_entities`, `set_graph_id`, `save_profiles`, `save_profiles_to_json`) and private helper signatures are unchanged + - Confirm the `OasisAgentProfile` dataclass schema is unchanged + - Confirm the LLM call (`response_format={"type": "json_object"}`, `temperature=0.7 - (attempt * 0.1)`, no `max_tokens`) is unchanged + - Confirm `backend/app/utils/locale.py`, `/locales/languages.json`, `/locales/en.json`, `/locales/zh.json` are not modified + - Confirm `backend/pyproject.toml`, `backend/uv.lock`, and any file outside `backend/app/services/oasis_profile_generator.py` are not modified + - Observable completion: a `git diff` review against `main` shows changes only inside `backend/app/services/oasis_profile_generator.py`, only 
inside the seven owned regions + - _Requirements: 7.1, 7.4, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7_ + +- [ ] 8. Verify CJK-free invariant in the seven owned regions + - Run a one-shot script that imports `OasisProfileGenerator`, calls `_build_individual_persona_prompt(...)`, `_build_group_persona_prompt(...)`, `_get_system_prompt(...)`, and `_build_entity_context(...)` with representative inputs that contain no CJK in the inputs themselves, and asserts the rendered output contains zero matches against the regex `[一-鿿]` + - Manually inspect the seven owned regions in the patched file with a CJK regex (`grep -nP '[\x{4e00}-\x{9fff}]'`) and confirm there are no remaining matches inside the owned regions + - Observable completion: the inspection passes; if it fails, fix the offending region and re-run before completing this task + - _Requirements: 1.1, 2.8, 3.8, 4.10, 5.3, 6.6_ + +- [ ] 9. Verify locale-driven output language under both `en` and `zh` + - Set the thread-local locale to `en` via `set_locale("en")`, run `OasisProfileGenerator().generate_profile_from_entity(...)` against the configured LLM with a small representative entity, and confirm the returned `bio` and `persona` are in English + - Set the thread-local locale to `zh` via `set_locale("zh")`, run the same round-trip, and confirm the returned `bio` and `persona` are in Chinese, equivalent in quality to the pre-change baseline + - Observable completion: both runs succeed; the `en` run is CJK-free in `bio` and `persona`; the `zh` run continues to produce Chinese; results recorded in the PR description + - _Requirements: 7.2, 7.3_ diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 1cf9158a..d5e53868 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -381,15 +381,15 @@ class OasisProfileGenerator: if hasattr(node, 'summary') and node.summary: 
all_summaries.add(node.summary) if hasattr(node, 'name') and node.name and node.name != entity_name: - all_summaries.add(f"相关实体: {node.name}") + all_summaries.add(f"Related entity: {node.name}") results["node_summaries"] = list(all_summaries) # 构建综合上下文 context_parts = [] if results["facts"]: - context_parts.append("事实信息:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) + context_parts.append("Facts:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) if results["node_summaries"]: - context_parts.append("相关实体:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) + context_parts.append("Related entities:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) results["context"] = "\n\n".join(context_parts) logger.info(t("log.profile_generator.m006", entity_name=entity_name, len=len(results['facts']), len_2=len(results['node_summaries']))) @@ -419,7 +419,7 @@ class OasisProfileGenerator: if value and str(value).strip(): attrs.append(f"- {key}: {value}") if attrs: - context_parts.append("### 实体属性\n" + "\n".join(attrs)) + context_parts.append("### Entity attributes\n" + "\n".join(attrs)) # 2. 添加相关边信息(事实/关系) existing_facts = set() @@ -435,12 +435,12 @@ class OasisProfileGenerator: existing_facts.add(fact) elif edge_name: if direction == "outgoing": - relationships.append(f"- {entity.name} --[{edge_name}]--> (相关实体)") + relationships.append(f"- {entity.name} --[{edge_name}]--> (related entity)") else: - relationships.append(f"- (相关实体) --[{edge_name}]--> {entity.name}") - + relationships.append(f"- (related entity) --[{edge_name}]--> {entity.name}") + if relationships: - context_parts.append("### 相关事实和关系\n" + "\n".join(relationships)) + context_parts.append("### Related facts and relationships\n" + "\n".join(relationships)) # 3. 
添加关联节点的详细信息 if entity.related_nodes: @@ -460,7 +460,7 @@ class OasisProfileGenerator: related_info.append(f"- **{node_name}**{label_str}") if related_info: - context_parts.append("### 关联实体信息\n" + "\n".join(related_info)) + context_parts.append("### Related entity information\n" + "\n".join(related_info)) # 4. 使用Zep混合检索获取更丰富的信息 zep_results = self._search_zep_for_entity(entity) @@ -469,10 +469,10 @@ class OasisProfileGenerator: # 去重:排除已存在的事实 new_facts = [f for f in zep_results["facts"] if f not in existing_facts] if new_facts: - context_parts.append("### Zep检索到的事实信息\n" + "\n".join(f"- {f}" for f in new_facts[:15])) - + context_parts.append("### Facts retrieved from the graph\n" + "\n".join(f"- {f}" for f in new_facts[:15])) + if zep_results.get("node_summaries"): - context_parts.append("### Zep检索到的相关节点\n" + "\n".join(f"- {s}" for s in zep_results["node_summaries"][:10])) + context_parts.append("### Related nodes retrieved from the graph\n" + "\n".join(f"- {s}" for s in zep_results["node_summaries"][:10])) return "\n\n".join(context_parts) @@ -544,7 +544,7 @@ class OasisProfileGenerator: if "bio" not in result or not result["bio"]: result["bio"] = entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}" if "persona" not in result or not result["persona"]: - result["persona"] = entity_summary or f"{entity_name}是一个{entity_type}。" + result["persona"] = entity_summary or f"{entity_name} is a {entity_type}." 
return result @@ -641,7 +641,7 @@ class OasisProfileGenerator: persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # 可能被截断 bio = bio_match.group(1) if bio_match else (entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}") - persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name}是一个{entity_type}。") + persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name} is a {entity_type}.") # 如果提取到了有意义的内容,标记为已修复 if bio_match or persona_match: @@ -656,12 +656,12 @@ class OasisProfileGenerator: logger.warning(t("log.profile_generator.m014")) return { "bio": entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}", - "persona": entity_summary or f"{entity_name}是一个{entity_type}。" + "persona": entity_summary or f"{entity_name} is a {entity_type}." } def _get_system_prompt(self, is_individual: bool) -> str: """获取系统提示词""" - base_prompt = "你是社交媒体用户画像生成专家。生成详细、真实的人设用于舆论模拟,最大程度还原已有现实情况。必须返回有效的JSON格式,所有字符串值不能包含未转义的换行符。" + base_prompt = "You are an expert at generating social-media user personas. Produce detailed, realistic personas for opinion-simulation, faithfully grounded in the supplied real-world context. You MUST return valid JSON; no string value may contain unescaped newline characters." 
return f"{base_prompt}\n\n{get_language_instruction()}" def _build_individual_persona_prompt( @@ -674,43 +674,43 @@ class OasisProfileGenerator: ) -> str: """构建个人实体的详细人设提示词""" - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" - - return f"""为实体生成详细的社交媒体用户人设,最大程度还原已有现实情况。 + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" + context_str = context[:3000] if context else "No additional context" -实体名称: {entity_name} -实体类型: {entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} + return f"""Generate a detailed social-media user persona for an entity, faithfully grounded in the supplied real-world context. -上下文信息: +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} + +Context: {context_str} -请生成JSON,包含以下字段: +Produce a JSON object with the following fields: -1. bio: 社交媒体简介,200字 -2. persona: 详细人设描述(2000字的纯文本),需包含: - - 基本信息(年龄、职业、教育背景、所在地) - - 人物背景(重要经历、与事件的关联、社会关系) - - 性格特征(MBTI类型、核心性格、情绪表达方式) - - 社交媒体行为(发帖频率、内容偏好、互动风格、语言特点) - - 立场观点(对话题的态度、可能被激怒/感动的内容) - - 独特特征(口头禅、特殊经历、个人爱好) - - 个人记忆(人设的重要部分,要介绍这个个体与事件的关联,以及这个个体在事件中的已有动作与反应) -3. age: 年龄数字(必须是整数) -4. gender: 性别,必须是英文: "male" 或 "female" -5. mbti: MBTI类型(如INTJ、ENFP等) -6. country: 国家(使用中文,如"中国") -7. profession: 职业 -8. interested_topics: 感兴趣话题数组 +1. bio: ~200-character social-media bio. +2. 
persona: detailed persona description as a single coherent ~2000-character plain-text passage covering: + - basic info (age, profession, educational background, location) + - background (notable experiences, link to the focal event, social relationships) + - personality (MBTI type, core traits, emotional expression style) + - social-media behaviour (posting frequency, content preferences, interaction style, voice) + - stance and opinions (attitude toward the topic, content likely to provoke or move them) + - distinctive traits (catchphrases, unusual experiences, hobbies) + - personal memories (a key part of the persona; describe this individual's link to the focal event and any actions / reactions they have already taken in connection with it) +3. age: an integer. +4. gender: must be the literal English token "male" or "female". +5. mbti: MBTI type (e.g. INTJ, ENFP). +6. country: free-form country name. +7. profession: free-form occupation. +8. interested_topics: array of topic strings. -重要: -- 所有字段值必须是字符串或数字,不要使用换行符 -- persona必须是一段连贯的文字描述 -- {get_language_instruction()} (gender字段必须用英文male/female) -- 内容要与实体信息保持一致 -- age必须是有效的整数,gender必须是"male"或"female" +Important: +- All field values must be strings or numbers; do not include newline characters in any string value. +- persona must be a single coherent prose passage. +- {get_language_instruction()} (the gender field must remain English: male/female.) +- The content must remain consistent with the supplied entity information. +- age must be a valid integer; gender must be exactly "male" or "female". 
""" def _build_group_persona_prompt( @@ -723,43 +723,43 @@ class OasisProfileGenerator: ) -> str: """构建群体/机构实体的详细人设提示词""" - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" - - return f"""为机构/群体实体生成详细的社交媒体账号设定,最大程度还原已有现实情况。 + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" + context_str = context[:3000] if context else "No additional context" -实体名称: {entity_name} -实体类型: {entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} + return f"""Generate a detailed social-media account profile for an institutional or group entity, faithfully grounded in the supplied real-world context. -上下文信息: +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} + +Context: {context_str} -请生成JSON,包含以下字段: +Produce a JSON object with the following fields: -1. bio: 官方账号简介,200字,专业得体 -2. persona: 详细账号设定描述(2000字的纯文本),需包含: - - 机构基本信息(正式名称、机构性质、成立背景、主要职能) - - 账号定位(账号类型、目标受众、核心功能) - - 发言风格(语言特点、常用表达、禁忌话题) - - 发布内容特点(内容类型、发布频率、活跃时间段) - - 立场态度(对核心话题的官方立场、面对争议的处理方式) - - 特殊说明(代表的群体画像、运营习惯) - - 机构记忆(机构人设的重要部分,要介绍这个机构与事件的关联,以及这个机构在事件中的已有动作与反应) -3. age: 固定填30(机构账号的虚拟年龄) -4. gender: 固定填"other"(机构账号使用other表示非个人) -5. mbti: MBTI类型,用于描述账号风格,如ISTJ代表严谨保守 -6. country: 国家(使用中文,如"中国") -7. profession: 机构职能描述 -8. interested_topics: 关注领域数组 +1. bio: ~200-character official-account bio, polished and professional. +2. 
persona: detailed account profile as a single coherent ~2000-character plain-text passage covering: + - institution basics (formal name, type of institution, founding background, primary functions) + - account positioning (account type, target audience, core purpose) + - voice (linguistic style, common expressions, taboo topics) + - content patterns (content types, posting frequency, active hours) + - stance (official position on the focal topic, how disputes are handled) + - special notes (the group profile it represents, operational habits) + - institutional memory (a key part of the persona; describe this institution's link to the focal event and any actions / reactions it has already taken in connection with it) +3. age: must be the integer 30 (a virtual age used for institutional accounts). +4. gender: must be the literal English token "other" (institutional accounts use "other" to indicate non-individual). +5. mbti: MBTI type used to describe the account's voice (e.g. ISTJ for a rigorous, conservative tone). +6. country: free-form country name. +7. profession: free-form description of the institution's role. +8. interested_topics: array of focus areas. -重要: -- 所有字段值必须是字符串或数字,不允许null值 -- persona必须是一段连贯的文字描述,不要使用换行符 -- {get_language_instruction()} (gender字段必须用英文"other") -- age必须是整数30,gender必须是字符串"other" -- 机构账号发言要符合其身份定位""" +Important: +- All field values must be strings or numbers; null values are not allowed. +- persona must be a single coherent prose passage; do not include newline characters in any string value. +- {get_language_instruction()} (the gender field must remain English: "other".) +- age must be the integer 30; gender must be exactly the string "other". 
+- The institutional account's voice must match its identity.""" def _generate_profile_rule_based( self, @@ -942,7 +942,7 @@ class OasisProfileGenerator: logger.info(t("log.profile_generator.m017", total=total, parallel_count=parallel_count)) print(f"\n{'='*60}") - print(f"开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}") + print(f"Generating agent profiles - {total} entities, parallel: {parallel_count}") print(f"{'='*60}\n") # 使用线程池并行执行 @@ -973,7 +973,7 @@ class OasisProfileGenerator: progress_callback( current, total, - f"已完成 {current}/{total}: {entity.name}({entity_type})" + f"Completed {current}/{total}: {entity.name} ({entity_type})" ) if error: @@ -998,7 +998,7 @@ class OasisProfileGenerator: save_profiles_realtime() print(f"\n{'='*60}") - print(f"人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent") + print(f"Profile generation complete - produced {len([p for p in profiles if p])} agents") print(f"{'='*60}\n") return profiles @@ -1008,24 +1008,24 @@ class OasisProfileGenerator: separator = "-" * 70 # 构建完整输出内容(不截断) - topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else '无' - + topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else 'None' + output_lines = [ f"\n{separator}", t('progress.profileGenerated', name=entity_name, type=entity_type), f"{separator}", - f"用户名: {profile.user_name}", + f"Username: {profile.user_name}", f"", - f"【简介】", + f"[Bio]", f"{profile.bio}", f"", - f"【详细人设】", + f"[Persona]", f"{profile.persona}", f"", - f"【基本属性】", - f"年龄: {profile.age} | 性别: {profile.gender} | MBTI: {profile.mbti}", - f"职业: {profile.profession} | 国家: {profile.country}", - f"兴趣话题: {topics_str}", + f"[Basic attributes]", + f"Age: {profile.age} | Gender: {profile.gender} | MBTI: {profile.mbti}", + f"Profession: {profile.profession} | Country: {profile.country}", + f"Interested topics: {topics_str}", separator ]