Merge 47e749c36a into fa0f6519b1
This commit is contained in:
commit
c0c5b64ebe
|
|
@ -38,7 +38,18 @@ class Config:
|
|||
# 文件上传配置
|
||||
MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB
|
||||
UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads')
|
||||
ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'}
|
||||
ALLOWED_EXTENSIONS = {
|
||||
'pdf', 'md', 'txt', 'markdown',
|
||||
# Audio (Whisper-native)
|
||||
'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga',
|
||||
# Video/container (Whisper-native)
|
||||
'mp4', 'mpeg', 'webm',
|
||||
# Video (audio track is extracted with PyAV before transcription; no system ffmpeg needed)
|
||||
'mkv', 'avi', 'mov', 'wmv',
|
||||
}
|
||||
|
||||
# Whisper transcription model
|
||||
WHISPER_MODEL = os.environ.get('WHISPER_MODEL', 'whisper-1')
|
||||
|
||||
# 文本处理配置
|
||||
DEFAULT_CHUNK_SIZE = 500 # 默认切块大小
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
"""
|
||||
文件解析工具
|
||||
支持PDF、Markdown、TXT文件的文本提取
|
||||
支持PDF、Markdown、TXT、音频、视频文件的文本提取
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
|
|
@ -60,37 +61,46 @@ def _read_text_with_fallback(file_path: str) -> str:
|
|||
|
||||
class FileParser:
|
||||
"""文件解析器"""
|
||||
|
||||
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'}
|
||||
|
||||
|
||||
# Formats sent directly to the transcription endpoint (or multimodal chat after PyAV conversion)
|
||||
_AUDIO_NATIVE = {'.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.ogg', '.wav', '.webm', '.flac'}
|
||||
# Video container formats — audio extracted via PyAV before transcription
|
||||
_VIDEO_CONTAINERS = {'.mkv', '.avi', '.mov', '.wmv'}
|
||||
|
||||
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} | _AUDIO_NATIVE | _VIDEO_CONTAINERS
|
||||
|
||||
@classmethod
|
||||
def extract_text(cls, file_path: str) -> str:
    """
    Extract the text content of a single file.

    The handler is selected by the lower-cased file suffix:
    ``.pdf`` -> ``_extract_from_pdf``, ``.md``/``.markdown`` ->
    ``_extract_from_md``, ``.txt`` -> ``_extract_from_txt``, any suffix in
    ``_AUDIO_NATIVE`` -> ``_transcribe_audio`` and any suffix in
    ``_VIDEO_CONTAINERS`` -> ``_transcribe_video``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the suffix is not in ``SUPPORTED_EXTENSIONS``, or is
            listed there but matches none of the handlers below.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    suffix = source.suffix.lower()
    if suffix not in cls.SUPPORTED_EXTENSIONS:
        raise ValueError(f"不支持的文件格式: {suffix}")

    # Checked top to bottom; the first matching handler wins.
    if suffix == '.pdf':
        return cls._extract_from_pdf(file_path)
    if suffix in {'.md', '.markdown'}:
        return cls._extract_from_md(file_path)
    if suffix == '.txt':
        return cls._extract_from_txt(file_path)
    if suffix in cls._AUDIO_NATIVE:
        return cls._transcribe_audio(file_path)
    if suffix in cls._VIDEO_CONTAINERS:
        return cls._transcribe_video(file_path)

    # Only reachable if SUPPORTED_EXTENSIONS lists a suffix with no handler.
    raise ValueError(f"无法处理的文件格式: {suffix}")
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -120,6 +130,109 @@ class FileParser:
|
|||
"""从TXT提取文本,支持自动编码检测"""
|
||||
return _read_text_with_fallback(file_path)
|
||||
|
||||
@staticmethod
|
||||
def _to_wav_bytes(file_path: str) -> bytes:
    """Decode the first audio stream of a media file into WAV bytes using PyAV.

    The output is encoded with the ``pcm_s16le`` codec at 16000 Hz, mono,
    and is assembled entirely in memory.

    Args:
        file_path: Path to an audio or video file that PyAV can open.

    Returns:
        The complete contents of the generated WAV file.

    Raises:
        ImportError: If the ``av`` package is not installed.
        ValueError: If the file contains no audio stream.
    """
    import io

    try:
        import av
    except ImportError as exc:
        # Chain the original failure so the root cause stays in the traceback.
        raise ImportError(
            "PyAV is required to convert audio/video files. "
            "Install it with: pip install av"
        ) from exc

    buf = io.BytesIO()
    with av.open(file_path) as inp:
        # Use the first audio stream; other streams are ignored.
        audio = next((s for s in inp.streams if s.type == 'audio'), None)
        if audio is None:
            raise ValueError(f"No audio track found in: {file_path}")

        with av.open(buf, 'w', format='wav') as out:
            out_stream = out.add_stream('pcm_s16le', rate=16000, layout='mono')
            for packet in inp.demux(audio):
                for frame in packet.decode():
                    # Discard the source timestamp before re-encoding
                    # (presumably so the encoder assigns its own; confirm).
                    frame.pts = None
                    for pkt in out_stream.encode(frame):
                        out.mux(pkt)
            # Passing None flushes whatever the encoder still has buffered.
            for pkt in out_stream.encode(None):
                out.mux(pkt)

    # getvalue() returns the whole buffer regardless of the stream position.
    return buf.getvalue()
|
||||
|
||||
@staticmethod
|
||||
def _transcribe_audio(file_path: str) -> str:
    """Transcribe an audio file to text.

    Tries the Whisper-style ``/audio/transcriptions`` endpoint first
    (OpenAI, Groq, etc.) using ``Config.WHISPER_MODEL``. If the provider
    answers with a 404 (``NotFoundError``), falls back to a multimodal chat
    completion with ``Config.LLM_MODEL_NAME`` (e.g. Gemini), sending the
    audio base64-encoded as an ``input_audio`` content part.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcription text. In the chat fallback, ``<think>...</think>``
        sections are removed and the result is stripped; a response without
        text content yields an empty string.

    Raises:
        ImportError: From ``_to_wav_bytes`` if conversion is needed and PyAV
            is not installed.
        ValueError: From ``_to_wav_bytes`` if conversion is needed and the
            file has no audio stream.
    """
    import base64
    import re

    from openai import OpenAI, NotFoundError

    from ..config import Config

    client = OpenAI(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL)

    # --- attempt 1: Whisper-compatible endpoint ---
    try:
        with open(file_path, 'rb') as f:
            transcript = client.audio.transcriptions.create(
                model=Config.WHISPER_MODEL,
                file=f,
            )
        return transcript.text
    except NotFoundError:
        pass  # Provider doesn't expose Whisper; fall through to multimodal chat

    # --- attempt 2: multimodal chat (e.g. Gemini) ---
    # Gemini only accepts wav/mp3; convert anything else using PyAV (no system ffmpeg needed)
    fmt = Path(file_path).suffix.lower().lstrip('.')
    if fmt in ('wav', 'mp3'):
        with open(file_path, 'rb') as f:
            audio_bytes = f.read()
    else:
        audio_bytes = FileParser._to_wav_bytes(file_path)
        fmt = 'wav'

    audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')

    response = client.chat.completions.create(
        model=Config.LLM_MODEL_NAME,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {"data": audio_b64, "format": fmt},
                },
                {
                    "type": "text",
                    "text": "Transcribe this audio accurately. Return only the transcription text, no commentary.",
                },
            ],
        }],
    )

    # message.content may be None (e.g. the model returned no text part);
    # treat that as an empty transcription instead of letting re.sub raise
    # a TypeError.
    content = response.choices[0].message.content or ""

    # Strip <think> tags that some models inject
    return re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
|
||||
|
||||
@classmethod
|
||||
def _transcribe_video(cls, file_path: str) -> str:
    """Extract the audio of a video container and transcribe it.

    The audio is converted to WAV bytes with ``_to_wav_bytes`` (PyAV, no
    system ffmpeg needed), written to a temporary ``.wav`` file, and passed
    to ``_transcribe_audio``. The temporary file is removed afterwards,
    whether or not transcription succeeds.

    Args:
        file_path: Path of the video file.

    Returns:
        The transcription text returned by ``_transcribe_audio``.
    """
    wav_bytes = cls._to_wav_bytes(file_path)

    # delete=False: the file has to outlive this `with` block so that
    # _transcribe_audio can reopen it by path; cleanup happens in `finally`.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        tmp.write(wav_bytes)
        tmp_path = tmp.name

    try:
        return cls._transcribe_audio(tmp_path)
    finally:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
@classmethod
|
||||
def extract_from_multiple(cls, file_paths: List[str]) -> str:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -12,23 +12,20 @@ dependencies = [
|
|||
# 核心框架
|
||||
"flask>=3.0.0",
|
||||
"flask-cors>=6.0.0",
|
||||
|
||||
# LLM 相关
|
||||
"openai>=1.0.0",
|
||||
|
||||
# Zep Cloud
|
||||
"zep-cloud==3.13.0",
|
||||
|
||||
# OASIS 社交媒体模拟
|
||||
"camel-oasis==0.2.5",
|
||||
"camel-ai==0.2.78",
|
||||
|
||||
# 文件处理
|
||||
"PyMuPDF>=1.24.0",
|
||||
# Audio/video transcription (bundles FFmpeg libs, no system install needed)
|
||||
"av>=12.0.0",
|
||||
# 编码检测(支持非UTF-8编码的文本文件)
|
||||
"charset-normalizer>=3.0.0",
|
||||
"chardet>=5.0.0",
|
||||
|
||||
# 工具库
|
||||
"python-dotenv>=1.0.0",
|
||||
"pydantic>=2.0.0",
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
version = 1
|
||||
revision = 3
|
||||
revision = 2
|
||||
requires-python = ">=3.11"
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.12'",
|
||||
|
|
@ -73,6 +73,30 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "av"
|
||||
version = "17.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b2/eb/abca886df3a091bc406feb5ff71b4c4f426beaae6b71b9697264ce8c7211/av-17.0.0.tar.gz", hash = "sha256:c53685df73775a8763c375c7b2d62a6cb149d992a26a4b098204da42ade8c3df", size = 4410769, upload-time = "2026-03-14T14:38:45.868Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/fb/55e3b5b5d1fc61466292f26fbcbabafa2642f378dc48875f8f554591e1a4/av-17.0.0-cp311-abi3-macosx_11_0_x86_64.whl", hash = "sha256:ed4013fac77c309a4a68141dcf6148f1821bb1073a36d4289379762a6372f711", size = 23238424, upload-time = "2026-03-14T14:38:05.856Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/52/03/9ace1acc08bc9ae38c14bf3a4b1360e995e4d999d1d33c2cbd7c9e77582a/av-17.0.0-cp311-abi3-macosx_14_0_arm64.whl", hash = "sha256:e44b6c83e9f3be9f79ee87d0b77a27cea9a9cd67bd630362c86b7e56a748dfbb", size = 18709043, upload-time = "2026-03-14T14:38:08.288Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/c0/637721f3cd5bb8bd16105a1a08efd781fc12f449931bdb3a4d0cfd63fa55/av-17.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b440da6ac47da0629d509316f24bcd858f33158dbdd0f1b7293d71e99beb26de", size = 34018780, upload-time = "2026-03-14T14:38:10.45Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d2/59/d19bc3257dd985d55337d7f0414c019414b97e16cd3690ebf9941a847543/av-17.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1060cba85f97f4a337311169d92c0b5e143452cfa5ca0e65fa499d7955e8592e", size = 36358757, upload-time = "2026-03-14T14:38:13.092Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/52/6c/a1f4f2677bae6f2ade7a8a18e90ebdcf70690c9b1c4e40e118aa30fa313f/av-17.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:deda202e6021cfc7ba3e816897760ec5431309d59a4da1f75df3c0e9413d71e7", size = 35195281, upload-time = "2026-03-14T14:38:15.789Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/90/ea/52b0fc6f69432c7bf3f5fbe6f707113650aa40a1a05b9096ffc2bba4f77d/av-17.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ffaf266a1a9c2148072de0a4b5ae98061465178d2cfaa69ee089761149342974", size = 37444817, upload-time = "2026-03-14T14:38:18.563Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/34/ad/d2172966282cb8f146c13b6be7416efefde74186460c5e1708ddfc13dba6/av-17.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:45a35a40b2875bf2f98de7c952d74d960f92f319734e6d28e03b4c62a49e6f49", size = 28888553, upload-time = "2026-03-14T14:38:21.223Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b0/bb/c5a4c4172c514d631fb506e6366b503576b8c7f29809cf42aca73e28ff01/av-17.0.0-cp311-abi3-win_arm64.whl", hash = "sha256:3d32e9b5c5bbcb872a0b6917b352a1db8a42142237826c9b49a36d5dbd9e9c26", size = 21916910, upload-time = "2026-03-14T14:38:23.706Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7f/8e/c40ac08e63f79387c59f6ecc38f47d4c942b549130eee579ec1a91f6a291/av-17.0.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:d13250fb4b4522e9a6bec32da082556d5f257110ea223758151375748d9bbe25", size = 23483029, upload-time = "2026-03-14T14:38:25.758Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/fb/b4419494bfc249163ec393c613966d66db7e95c76da3345711cd115a79df/av-17.0.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:dbb56aa3b7ae72451d1bf6e9d37c7d83d39b97af712f73583ff419fbf08fc237", size = 18920446, upload-time = "2026-03-14T14:38:27.905Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/30/62/c2306d91602ddad2c56106f21dcb334fd51d5ea2e952f7fa025bb8aa39fc/av-17.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a213ac9e83b7ab12c2e9f277a09cac8e9d85cf0883efdab7a87a60e2e4e48879", size = 37477266, upload-time = "2026-03-14T14:38:30.404Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/cd/c8510a9607886785c0b3ca019d503e888c3757529be42a7287fe2bfa92d5/av-17.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e15c88bb0921f9435bcc5a27a0863dba571a80ad5e1389c4fcf2073833bb4a74", size = 39572988, upload-time = "2026-03-14T14:38:32.984Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7d/2d/207d9361e25b5abec9be335bbab4df6b6b838e2214be4b374f4cfb285427/av-17.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:096cfd1e9fc896506726c7c42aaf9b370e78c2f257cde4d6ddb6c889bfcc49ec", size = 38399591, upload-time = "2026-03-14T14:38:35.465Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/73/ca/307740c6aa2980966bf11383ffcb04bacc5b13f3d268ab4cfb274ad6f793/av-17.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3649ab3d2c7f58049ded1a36e100c0d8fd529cf258f41dd88678ba824034d8c9", size = 40590681, upload-time = "2026-03-14T14:38:38.269Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/35/f2/6fdb26d0651adf409864cb2a0d60da107e467d3d1aabc94b234ead54324a/av-17.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e5002271ab2135b551d980c2db8f3299d452e3b9d3633f24f6bb57fffe91cd10", size = 29216337, upload-time = "2026-03-14T14:38:40.83Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/0a/0896b829a39b5669a2d811e1a79598de661693685cd62b31f11d0c18e65b/av-17.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dba98603fc4665b4f750de86fbaf6c0cfaece970671a9b529e0e3d1711e8367e", size = 22071058, upload-time = "2026-03-14T14:38:43.663Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "backcall"
|
||||
version = "0.2.0"
|
||||
|
|
@ -1242,6 +1266,7 @@ name = "mirofish-backend"
|
|||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "av" },
|
||||
{ name = "camel-ai" },
|
||||
{ name = "camel-oasis" },
|
||||
{ name = "chardet" },
|
||||
|
|
@ -1270,6 +1295,7 @@ dev = [
|
|||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "av", specifier = ">=12.0.0" },
|
||||
{ name = "camel-ai", specifier = "==0.2.78" },
|
||||
{ name = "camel-oasis", specifier = "==0.2.5" },
|
||||
{ name = "chardet", specifier = ">=5.0.0" },
|
||||
|
|
|
|||
|
|
@ -1435,7 +1435,6 @@
|
|||
"resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz",
|
||||
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
|
||||
"license": "ISC",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
}
|
||||
|
|
@ -1913,7 +1912,6 @@
|
|||
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
|
|
@ -2053,7 +2051,6 @@
|
|||
"integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
"fdir": "^6.5.0",
|
||||
|
|
@ -2128,7 +2125,6 @@
|
|||
"resolved": "https://registry.npmjs.org/vue/-/vue-3.5.25.tgz",
|
||||
"integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@vue/compiler-dom": "3.5.25",
|
||||
"@vue/compiler-sfc": "3.5.25",
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@
|
|||
ref="fileInput"
|
||||
type="file"
|
||||
multiple
|
||||
accept=".pdf,.md,.txt"
|
||||
accept=".pdf,.md,.txt,.mp3,.wav,.m4a,.ogg,.flac,.mp4,.mpeg,.webm,.mkv,.avi,.mov,.wmv"
|
||||
@change="handleFileSelect"
|
||||
style="display: none"
|
||||
:disabled="loading"
|
||||
|
|
@ -276,7 +276,7 @@ const handleDrop = (e) => {
|
|||
// Append the files whose extension is on the upload whitelist to `files`.
// Files with any other extension are silently dropped.
const addFiles = (newFiles) => {
  // Documents plus audio/video formats. Presumably meant to mirror the
  // backend's ALLOWED_EXTENSIONS; keep the two lists in sync.
  const allowedExtensions = new Set([
    'pdf', 'md', 'txt', 'markdown',
    'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga',
    'mp4', 'mpeg', 'webm',
    'mkv', 'avi', 'mov', 'wmv',
  ])

  const validFiles = newFiles.filter((file) => {
    // Text after the last dot, lower-cased (the whole name if there is no dot).
    const ext = file.name.split('.').pop().toLowerCase()
    return allowedExtensions.has(ext)
  })

  files.value.push(...validFiles)
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue