Merge f0e936ae43 into 96096ea0ff
This commit is contained in:
commit
b8c5ddc6a3
|
|
@ -9,6 +9,7 @@ import re
|
|||
from typing import Dict, Any, List, Optional
|
||||
from ..utils.llm_client import LLMClient
|
||||
from ..utils.locale import get_language_instruction
|
||||
from ..utils.file_parser import split_text_into_chunks
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -227,6 +228,10 @@ class OntologyGenerator:
|
|||
|
||||
# 传给 LLM 的文本最大长度(5万字)
|
||||
MAX_TEXT_LENGTH_FOR_LLM = 50000
|
||||
LONG_TEXT_CHUNK_SIZE = 8000
|
||||
LONG_TEXT_CHUNK_OVERLAP = 200
|
||||
MAX_LONG_TEXT_CHUNKS = 60
|
||||
MIN_LONG_TEXT_EXCERPT = 400
|
||||
|
||||
def _build_user_message(
|
||||
self,
|
||||
|
|
@ -236,14 +241,7 @@ class OntologyGenerator:
|
|||
) -> str:
|
||||
"""构建用户消息"""
|
||||
|
||||
# 合并文本
|
||||
combined_text = "\n\n---\n\n".join(document_texts)
|
||||
original_length = len(combined_text)
|
||||
|
||||
# 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建)
|
||||
if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM:
|
||||
combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
|
||||
combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."
|
||||
combined_text = self._build_document_context(document_texts)
|
||||
|
||||
message = f"""## 模拟需求
|
||||
|
||||
|
|
@ -274,6 +272,142 @@ class OntologyGenerator:
|
|||
|
||||
return message
|
||||
|
||||
def _build_document_context(self, document_texts: List[str]) -> str:
|
||||
"""构建用于本体分析的文档上下文,长文本按全局分块抽样而不是只截取开头。"""
|
||||
|
||||
combined_text = "\n\n---\n\n".join(document_texts)
|
||||
original_length = len(combined_text)
|
||||
|
||||
if original_length <= self.MAX_TEXT_LENGTH_FOR_LLM:
|
||||
return combined_text
|
||||
|
||||
chunks = self._collect_document_chunks(document_texts)
|
||||
if not chunks:
|
||||
return ""
|
||||
|
||||
selected_chunks = self._select_representative_chunks(chunks)
|
||||
excerpt_budget = self._calculate_excerpt_budget(len(selected_chunks))
|
||||
context = self._render_chunked_context(
|
||||
selected_chunks=selected_chunks,
|
||||
original_length=original_length,
|
||||
total_chunks=len(chunks),
|
||||
excerpt_limit=excerpt_budget,
|
||||
)
|
||||
|
||||
while len(context) > self.MAX_TEXT_LENGTH_FOR_LLM and excerpt_budget > self.MIN_LONG_TEXT_EXCERPT:
|
||||
excerpt_budget = max(self.MIN_LONG_TEXT_EXCERPT, int(excerpt_budget * 0.85))
|
||||
context = self._render_chunked_context(
|
||||
selected_chunks=selected_chunks,
|
||||
original_length=original_length,
|
||||
total_chunks=len(chunks),
|
||||
excerpt_limit=excerpt_budget,
|
||||
)
|
||||
|
||||
if len(context) > self.MAX_TEXT_LENGTH_FOR_LLM:
|
||||
marker = "\n\n...(分块上下文已压缩到本体分析长度限制内)..."
|
||||
context = context[:self.MAX_TEXT_LENGTH_FOR_LLM - len(marker)] + marker
|
||||
|
||||
return context
|
||||
|
||||
def _collect_document_chunks(self, document_texts: List[str]) -> List[Dict[str, Any]]:
|
||||
"""按文档收集分块,保留文档和分块编号方便提示词定位。"""
|
||||
|
||||
all_chunks: List[Dict[str, Any]] = []
|
||||
for doc_index, text in enumerate(document_texts, 1):
|
||||
doc_chunks = split_text_into_chunks(
|
||||
text,
|
||||
chunk_size=self.LONG_TEXT_CHUNK_SIZE,
|
||||
overlap=self.LONG_TEXT_CHUNK_OVERLAP,
|
||||
)
|
||||
total_doc_chunks = len(doc_chunks)
|
||||
for chunk_index, chunk in enumerate(doc_chunks, 1):
|
||||
all_chunks.append({
|
||||
"document_index": doc_index,
|
||||
"chunk_index": chunk_index,
|
||||
"total_document_chunks": total_doc_chunks,
|
||||
"text": chunk,
|
||||
})
|
||||
|
||||
return all_chunks
|
||||
|
||||
def _select_representative_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""从全部分块中等距抽样,覆盖长文开头、中段和结尾。"""
|
||||
|
||||
if len(chunks) <= self.MAX_LONG_TEXT_CHUNKS:
|
||||
return chunks
|
||||
|
||||
if self.MAX_LONG_TEXT_CHUNKS <= 1:
|
||||
return [chunks[0]]
|
||||
|
||||
last_index = len(chunks) - 1
|
||||
selected_indexes = {
|
||||
round(i * last_index / (self.MAX_LONG_TEXT_CHUNKS - 1))
|
||||
for i in range(self.MAX_LONG_TEXT_CHUNKS)
|
||||
}
|
||||
return [chunks[i] for i in sorted(selected_indexes)]
|
||||
|
||||
def _calculate_excerpt_budget(self, selected_count: int) -> int:
|
||||
"""根据选中的分块数量为每块分配字符预算。"""
|
||||
|
||||
header_budget = 600
|
||||
chunk_header_budget = 120 * selected_count
|
||||
available = max(
|
||||
self.MIN_LONG_TEXT_EXCERPT * selected_count,
|
||||
self.MAX_TEXT_LENGTH_FOR_LLM - header_budget - chunk_header_budget,
|
||||
)
|
||||
return max(self.MIN_LONG_TEXT_EXCERPT, available // max(selected_count, 1))
|
||||
|
||||
def _render_chunked_context(
|
||||
self,
|
||||
selected_chunks: List[Dict[str, Any]],
|
||||
original_length: int,
|
||||
total_chunks: int,
|
||||
excerpt_limit: int,
|
||||
) -> str:
|
||||
"""渲染长文本分块上下文。"""
|
||||
|
||||
lines = [
|
||||
(
|
||||
f"【长文本自动分块摘要】原文共{original_length}字,"
|
||||
f"已分为{total_chunks}个文本块用于全局覆盖分析。"
|
||||
),
|
||||
(
|
||||
f"以下展示其中{len(selected_chunks)}个代表性文本块的摘录,"
|
||||
"覆盖开头、中段和结尾;请基于这些跨全文线索设计本体,不要只依赖第一段内容。"
|
||||
),
|
||||
]
|
||||
|
||||
for chunk in selected_chunks:
|
||||
excerpt = self._excerpt_text(chunk["text"], excerpt_limit)
|
||||
lines.append(
|
||||
"\n".join([
|
||||
(
|
||||
f"--- 文档 {chunk['document_index']} / "
|
||||
f"分块 {chunk['chunk_index']}/{chunk['total_document_chunks']} ---"
|
||||
),
|
||||
excerpt,
|
||||
])
|
||||
)
|
||||
|
||||
return "\n\n".join(lines)
|
||||
|
||||
@staticmethod
|
||||
def _excerpt_text(text: str, char_limit: int) -> str:
|
||||
"""长分块保留首尾,避免每个分块内部再次变成只看开头。"""
|
||||
|
||||
text = text.strip()
|
||||
if len(text) <= char_limit:
|
||||
return text
|
||||
|
||||
marker = "\n...(本分块中间内容省略)...\n"
|
||||
if char_limit <= len(marker) + 20:
|
||||
return text[:char_limit]
|
||||
|
||||
remaining = char_limit - len(marker)
|
||||
head_len = remaining // 2
|
||||
tail_len = remaining - head_len
|
||||
return f"{text[:head_len].rstrip()}{marker}{text[-tail_len:].lstrip()}"
|
||||
|
||||
def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""验证和后处理结果"""
|
||||
|
||||
|
|
@ -503,4 +637,3 @@ class OntologyGenerator:
|
|||
code_lines.append('}')
|
||||
|
||||
return '\n'.join(code_lines)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,52 @@
|
|||
from app.services.ontology_generator import OntologyGenerator
|
||||
|
||||
|
||||
def _generator_for_test() -> OntologyGenerator:
|
||||
generator = OntologyGenerator(llm_client=object())
|
||||
generator.MAX_TEXT_LENGTH_FOR_LLM = 2000
|
||||
generator.LONG_TEXT_CHUNK_SIZE = 500
|
||||
generator.LONG_TEXT_CHUNK_OVERLAP = 0
|
||||
generator.MAX_LONG_TEXT_CHUNKS = 3
|
||||
generator.MIN_LONG_TEXT_EXCERPT = 120
|
||||
return generator
|
||||
|
||||
|
||||
def test_short_ontology_context_keeps_original_text():
|
||||
generator = _generator_for_test()
|
||||
|
||||
context = generator._build_document_context(["short document body"])
|
||||
|
||||
assert context == "short document body"
|
||||
assert "长文本自动分块摘要" not in context
|
||||
|
||||
|
||||
def test_long_ontology_context_samples_across_document():
|
||||
generator = _generator_for_test()
|
||||
long_text = "BEGIN" + ("a" * 1050) + "MIDDLE" + ("b" * 1050) + "END"
|
||||
|
||||
context = generator._build_document_context([long_text])
|
||||
|
||||
assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM
|
||||
assert "长文本自动分块摘要" in context
|
||||
assert "BEGIN" in context
|
||||
assert "MIDDLE" in context
|
||||
assert "END" in context
|
||||
assert "分块 1/" in context
|
||||
assert "分块 3/" in context
|
||||
assert "分块 5/" in context
|
||||
|
||||
|
||||
def test_very_long_ontology_context_selects_representative_chunks():
|
||||
generator = _generator_for_test()
|
||||
chunks = ["BEGIN"] + [
|
||||
f"CHUNK{i:02d}-" + (str(i) * 490)
|
||||
for i in range(12)
|
||||
] + ["FINALEND"]
|
||||
long_text = "".join(chunks)
|
||||
|
||||
context = generator._build_document_context([long_text])
|
||||
|
||||
assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM
|
||||
assert "BEGIN" in context
|
||||
assert "FINALEND" in context
|
||||
assert context.count("--- 文档 1 / 分块") == generator.MAX_LONG_TEXT_CHUNKS
|
||||
Loading…
Reference in New Issue