53 lines
1.7 KiB
Python
53 lines
1.7 KiB
Python
from app.services.ontology_generator import OntologyGenerator
|
|
|
|
|
|
def _generator_for_test() -> OntologyGenerator:
    """Return an ``OntologyGenerator`` whose long-text limits are scaled down.

    A bare ``object()`` is passed as ``llm_client``. The long-text tuning
    attributes are then overridden on the instance so the tests in this
    module can work with inputs of only a few thousand characters.
    """
    # Applied in this order; values are characters (or a chunk count).
    overrides = {
        "MAX_TEXT_LENGTH_FOR_LLM": 2000,
        "LONG_TEXT_CHUNK_SIZE": 500,
        "LONG_TEXT_CHUNK_OVERLAP": 0,
        "MAX_LONG_TEXT_CHUNKS": 3,
        "MIN_LONG_TEXT_EXCERPT": 120,
    }

    generator = OntologyGenerator(llm_client=object())
    for attribute, value in overrides.items():
        setattr(generator, attribute, value)
    return generator
|
|
|
|
|
|
def test_short_ontology_context_keeps_original_text():
    """A short single document is returned verbatim, without the chunk marker."""
    source_text = "short document body"

    result = _generator_for_test()._build_document_context([source_text])

    # The context must equal the input exactly and must not carry the
    # marker string that the long-text tests expect to find.
    assert result == source_text
    assert "长文本自动分块摘要" not in result
|
|
|
|
|
|
def test_long_ontology_context_samples_across_document():
    """A document over the length limit is chunked and sampled end to end.

    The context must stay within ``MAX_TEXT_LENGTH_FOR_LLM``, carry the
    chunk-summary marker, and still contain the sentinel words placed at the
    start, middle and end of the input, along with the labels for chunks
    1, 3 and 5.
    """
    generator = _generator_for_test()
    filler_length = 1050
    # 5 + 1050 + 6 + 1050 + 3 = 2114 characters, above the 2000 limit set
    # by the helper.
    document = "".join(
        ["BEGIN", "a" * filler_length, "MIDDLE", "b" * filler_length, "END"]
    )

    context = generator._build_document_context([document])

    assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM

    # Checked in the same order as the original one-per-line assertions.
    expected_fragments = (
        "长文本自动分块摘要",
        "BEGIN",
        "MIDDLE",
        "END",
        "分块 1/",
        "分块 3/",
        "分块 5/",
    )
    for fragment in expected_fragments:
        assert fragment in context
|
|
|
|
|
|
def test_very_long_ontology_context_selects_representative_chunks():
    """A very long document yields exactly ``MAX_LONG_TEXT_CHUNKS`` chunk headers.

    The context must also stay within ``MAX_TEXT_LENGTH_FOR_LLM`` and keep
    both the leading ``BEGIN`` and trailing ``FINALEND`` sentinels.
    """
    generator = _generator_for_test()

    # Twelve labelled filler segments sandwiched between the two sentinels.
    numbered_segments = [
        f"CHUNK{i:02d}-" + (str(i) * 490)
        for i in range(12)
    ]
    long_text = "".join(["BEGIN", *numbered_segments, "FINALEND"])

    context = generator._build_document_context([long_text])

    assert len(context) <= generator.MAX_TEXT_LENGTH_FOR_LLM
    assert "BEGIN" in context
    assert "FINALEND" in context

    chunk_header_count = context.count("--- 文档 1 / 分块")
    assert chunk_header_count == generator.MAX_LONG_TEXT_CHUNKS
|