This commit is contained in:
AlessioGalluccio 2026-05-16 23:52:53 +00:00 committed by GitHub
commit c0c5b64ebe
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 167 additions and 24 deletions

View File

@ -38,7 +38,18 @@ class Config:
# 文件上传配置
MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB
UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads')
ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'}
ALLOWED_EXTENSIONS = {
'pdf', 'md', 'txt', 'markdown',
# Audio (Whisper-native)
'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga',
# Video/container (Whisper-native)
'mp4', 'mpeg', 'webm',
# Video (requires ffmpeg extraction)
'mkv', 'avi', 'mov', 'wmv',
}
# Whisper transcription model
WHISPER_MODEL = os.environ.get('WHISPER_MODEL', 'whisper-1')
# 文本处理配置
DEFAULT_CHUNK_SIZE = 500 # 默认切块大小

View File

@ -1,9 +1,10 @@
"""
文件解析工具
支持PDF、Markdown、TXT文件的文本提取
支持PDF、Markdown、TXT、音频、视频文件的文本提取
"""
import os
import tempfile
from pathlib import Path
from typing import List, Optional
@ -60,37 +61,46 @@ def _read_text_with_fallback(file_path: str) -> str:
class FileParser:
"""文件解析器"""
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'}
# Formats sent directly to the transcription endpoint (or multimodal chat after PyAV conversion)
_AUDIO_NATIVE = {'.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.ogg', '.wav', '.webm', '.flac'}
# Video container formats — audio extracted via PyAV before transcription
_VIDEO_CONTAINERS = {'.mkv', '.avi', '.mov', '.wmv'}
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} | _AUDIO_NATIVE | _VIDEO_CONTAINERS
@classmethod
def extract_text(cls, file_path: str) -> str:
    """
    Extract text from a single file, choosing the handler by file extension.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text produced by the handler selected for the file's extension.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the extension is not in ``SUPPORTED_EXTENSIONS``, or
            is listed there but no handler matches it.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    ext = source.suffix.lower()
    if ext not in cls.SUPPORTED_EXTENSIONS:
        raise ValueError(f"不支持的文件格式: {ext}")

    # Document formats map one-to-one onto a handler; they are checked
    # before the audio/video sets, in the same order as a plain if-chain.
    document_handlers = {
        '.pdf': '_extract_from_pdf',
        '.md': '_extract_from_md',
        '.markdown': '_extract_from_md',
        '.txt': '_extract_from_txt',
    }
    handler_name = document_handlers.get(ext)
    if handler_name is None:
        if ext in cls._AUDIO_NATIVE:
            handler_name = '_transcribe_audio'
        elif ext in cls._VIDEO_CONTAINERS:
            handler_name = '_transcribe_video'

    if handler_name is None:
        # Listed as supported but no branch handles it.
        raise ValueError(f"无法处理的文件格式: {ext}")

    return getattr(cls, handler_name)(file_path)
@staticmethod
@ -120,6 +130,109 @@ class FileParser:
"""从TXT提取文本支持自动编码检测"""
return _read_text_with_fallback(file_path)
@staticmethod
def _to_wav_bytes(file_path: str) -> bytes:
    """Decode the first audio stream of a media file and return it as WAV bytes.

    The output is written as 16-bit PCM (``pcm_s16le``), mono, 16 kHz into an
    in-memory WAV container using PyAV.

    Args:
        file_path: Path of the audio/video file to read.

    Returns:
        The complete WAV file content as bytes.

    Raises:
        ImportError: If PyAV (``av``) is not installed.
        ValueError: If the input has no audio stream.
    """
    try:
        import av
    except ImportError as exc:
        # Chain explicitly so the original import failure is kept as the cause.
        raise ImportError(
            "PyAV is required to convert audio/video files. "
            "Install it with: pip install av"
        ) from exc
    import io

    buf = io.BytesIO()
    with av.open(file_path) as inp:
        # Only the first audio stream is used; any others are ignored.
        audio = next((s for s in inp.streams if s.type == 'audio'), None)
        if audio is None:
            raise ValueError(f"No audio track found in: {file_path}")
        with av.open(buf, 'w', format='wav') as out:
            out_stream = out.add_stream('pcm_s16le', rate=16000, layout='mono')
            for packet in inp.demux(audio):
                for frame in packet.decode():
                    # Drop the source timestamp before re-encoding
                    # NOTE(review): presumably so the encoder assigns fresh
                    # timestamps for the new rate - confirm against PyAV docs.
                    frame.pts = None
                    for pkt in out_stream.encode(frame):
                        out.mux(pkt)
            # Flush whatever the encoder is still buffering.
            for pkt in out_stream.encode(None):
                out.mux(pkt)

    # Read only after the output container is closed, so the WAV is complete.
    return buf.getvalue()
@staticmethod
def _transcribe_audio(file_path: str) -> str:
    """Transcribe an audio file to text.

    Tries the Whisper-style ``/audio/transcriptions`` endpoint first
    (OpenAI, Groq, etc.). If the provider answers with ``NotFoundError``
    (404), falls back to a multimodal chat completion (e.g. Gemini) that
    receives the audio as base64 ``input_audio`` content.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcription text. For the chat fallback, ``<think>...</think>``
        spans are removed and the result is stripped; an empty string is
        returned when the model response carries no text content.
    """
    import base64
    import re

    from openai import OpenAI, NotFoundError
    from ..config import Config

    client = OpenAI(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL)

    # --- attempt 1: Whisper-compatible endpoint ---
    try:
        with open(file_path, 'rb') as f:
            transcript = client.audio.transcriptions.create(
                model=Config.WHISPER_MODEL,
                file=f,
            )
        return transcript.text
    except NotFoundError:
        pass  # Provider doesn't expose Whisper; fall through to multimodal chat

    # --- attempt 2: multimodal chat (e.g. Gemini) ---
    # Gemini only accepts wav/mp3; convert anything else using PyAV (no system ffmpeg needed)
    fmt = Path(file_path).suffix.lower().lstrip('.')
    if fmt in ('wav', 'mp3'):
        with open(file_path, 'rb') as f:
            audio_bytes = f.read()
    else:
        audio_bytes = FileParser._to_wav_bytes(file_path)
        fmt = 'wav'

    audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
    response = client.chat.completions.create(
        model=Config.LLM_MODEL_NAME,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {"data": audio_b64, "format": fmt},
                },
                {
                    "type": "text",
                    "text": "Transcribe this audio accurately. Return only the transcription text, no commentary.",
                },
            ],
        }],
    )

    # The message content is optional in the API response; treat a missing
    # value as an empty transcription instead of crashing in re.sub().
    content = response.choices[0].message.content or ""
    # Strip <think> tags that some models inject
    return re.sub(r'<think>[\s\S]*?</think>', '', content).strip()
@classmethod
def _transcribe_video(cls, file_path: str) -> str:
    """Extract the audio of a video container and transcribe it.

    The audio is converted to WAV in memory via ``_to_wav_bytes`` (PyAV, no
    system ffmpeg needed), written to a temporary ``.wav`` file, and passed
    to ``_transcribe_audio`` by path. The temporary file is always removed.

    Args:
        file_path: Path of the video file.

    Returns:
        The transcription text returned by ``_transcribe_audio``.
    """
    wav_bytes = cls._to_wav_bytes(file_path)

    tmp_path = None
    try:
        # delete=False: the file is closed here and reopened by path in
        # _transcribe_audio, so it must outlive this ``with`` block.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
            # Record the name before writing so a failed write is cleaned up too.
            tmp_path = tmp.name
            tmp.write(wav_bytes)
        return cls._transcribe_audio(tmp_path)
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass  # Already gone; nothing left to clean up.
@classmethod
def extract_from_multiple(cls, file_paths: List[str]) -> str:
"""

View File

@ -12,23 +12,20 @@ dependencies = [
# 核心框架
"flask>=3.0.0",
"flask-cors>=6.0.0",
# LLM 相关
"openai>=1.0.0",
# Zep Cloud
"zep-cloud==3.13.0",
# OASIS 社交媒体模拟
"camel-oasis==0.2.5",
"camel-ai==0.2.78",
# 文件处理
"PyMuPDF>=1.24.0",
# Audio/video transcription (bundles FFmpeg libs, no system install needed)
"av>=12.0.0",
# 编码检测支持非UTF-8编码的文本文件
"charset-normalizer>=3.0.0",
"chardet>=5.0.0",
# 工具库
"python-dotenv>=1.0.0",
"pydantic>=2.0.0",

View File

@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.11"
resolution-markers = [
"python_full_version >= '3.12'",
@ -73,6 +73,30 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" },
]
[[package]]
name = "av"
version = "17.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b2/eb/abca886df3a091bc406feb5ff71b4c4f426beaae6b71b9697264ce8c7211/av-17.0.0.tar.gz", hash = "sha256:c53685df73775a8763c375c7b2d62a6cb149d992a26a4b098204da42ade8c3df", size = 4410769, upload-time = "2026-03-14T14:38:45.868Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b1/fb/55e3b5b5d1fc61466292f26fbcbabafa2642f378dc48875f8f554591e1a4/av-17.0.0-cp311-abi3-macosx_11_0_x86_64.whl", hash = "sha256:ed4013fac77c309a4a68141dcf6148f1821bb1073a36d4289379762a6372f711", size = 23238424, upload-time = "2026-03-14T14:38:05.856Z" },
{ url = "https://files.pythonhosted.org/packages/52/03/9ace1acc08bc9ae38c14bf3a4b1360e995e4d999d1d33c2cbd7c9e77582a/av-17.0.0-cp311-abi3-macosx_14_0_arm64.whl", hash = "sha256:e44b6c83e9f3be9f79ee87d0b77a27cea9a9cd67bd630362c86b7e56a748dfbb", size = 18709043, upload-time = "2026-03-14T14:38:08.288Z" },
{ url = "https://files.pythonhosted.org/packages/00/c0/637721f3cd5bb8bd16105a1a08efd781fc12f449931bdb3a4d0cfd63fa55/av-17.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b440da6ac47da0629d509316f24bcd858f33158dbdd0f1b7293d71e99beb26de", size = 34018780, upload-time = "2026-03-14T14:38:10.45Z" },
{ url = "https://files.pythonhosted.org/packages/d2/59/d19bc3257dd985d55337d7f0414c019414b97e16cd3690ebf9941a847543/av-17.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1060cba85f97f4a337311169d92c0b5e143452cfa5ca0e65fa499d7955e8592e", size = 36358757, upload-time = "2026-03-14T14:38:13.092Z" },
{ url = "https://files.pythonhosted.org/packages/52/6c/a1f4f2677bae6f2ade7a8a18e90ebdcf70690c9b1c4e40e118aa30fa313f/av-17.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:deda202e6021cfc7ba3e816897760ec5431309d59a4da1f75df3c0e9413d71e7", size = 35195281, upload-time = "2026-03-14T14:38:15.789Z" },
{ url = "https://files.pythonhosted.org/packages/90/ea/52b0fc6f69432c7bf3f5fbe6f707113650aa40a1a05b9096ffc2bba4f77d/av-17.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ffaf266a1a9c2148072de0a4b5ae98061465178d2cfaa69ee089761149342974", size = 37444817, upload-time = "2026-03-14T14:38:18.563Z" },
{ url = "https://files.pythonhosted.org/packages/34/ad/d2172966282cb8f146c13b6be7416efefde74186460c5e1708ddfc13dba6/av-17.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:45a35a40b2875bf2f98de7c952d74d960f92f319734e6d28e03b4c62a49e6f49", size = 28888553, upload-time = "2026-03-14T14:38:21.223Z" },
{ url = "https://files.pythonhosted.org/packages/b0/bb/c5a4c4172c514d631fb506e6366b503576b8c7f29809cf42aca73e28ff01/av-17.0.0-cp311-abi3-win_arm64.whl", hash = "sha256:3d32e9b5c5bbcb872a0b6917b352a1db8a42142237826c9b49a36d5dbd9e9c26", size = 21916910, upload-time = "2026-03-14T14:38:23.706Z" },
{ url = "https://files.pythonhosted.org/packages/7f/8e/c40ac08e63f79387c59f6ecc38f47d4c942b549130eee579ec1a91f6a291/av-17.0.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:d13250fb4b4522e9a6bec32da082556d5f257110ea223758151375748d9bbe25", size = 23483029, upload-time = "2026-03-14T14:38:25.758Z" },
{ url = "https://files.pythonhosted.org/packages/a9/fb/b4419494bfc249163ec393c613966d66db7e95c76da3345711cd115a79df/av-17.0.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:dbb56aa3b7ae72451d1bf6e9d37c7d83d39b97af712f73583ff419fbf08fc237", size = 18920446, upload-time = "2026-03-14T14:38:27.905Z" },
{ url = "https://files.pythonhosted.org/packages/30/62/c2306d91602ddad2c56106f21dcb334fd51d5ea2e952f7fa025bb8aa39fc/av-17.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a213ac9e83b7ab12c2e9f277a09cac8e9d85cf0883efdab7a87a60e2e4e48879", size = 37477266, upload-time = "2026-03-14T14:38:30.404Z" },
{ url = "https://files.pythonhosted.org/packages/28/cd/c8510a9607886785c0b3ca019d503e888c3757529be42a7287fe2bfa92d5/av-17.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e15c88bb0921f9435bcc5a27a0863dba571a80ad5e1389c4fcf2073833bb4a74", size = 39572988, upload-time = "2026-03-14T14:38:32.984Z" },
{ url = "https://files.pythonhosted.org/packages/7d/2d/207d9361e25b5abec9be335bbab4df6b6b838e2214be4b374f4cfb285427/av-17.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:096cfd1e9fc896506726c7c42aaf9b370e78c2f257cde4d6ddb6c889bfcc49ec", size = 38399591, upload-time = "2026-03-14T14:38:35.465Z" },
{ url = "https://files.pythonhosted.org/packages/73/ca/307740c6aa2980966bf11383ffcb04bacc5b13f3d268ab4cfb274ad6f793/av-17.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3649ab3d2c7f58049ded1a36e100c0d8fd529cf258f41dd88678ba824034d8c9", size = 40590681, upload-time = "2026-03-14T14:38:38.269Z" },
{ url = "https://files.pythonhosted.org/packages/35/f2/6fdb26d0651adf409864cb2a0d60da107e467d3d1aabc94b234ead54324a/av-17.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e5002271ab2135b551d980c2db8f3299d452e3b9d3633f24f6bb57fffe91cd10", size = 29216337, upload-time = "2026-03-14T14:38:40.83Z" },
{ url = "https://files.pythonhosted.org/packages/41/0a/0896b829a39b5669a2d811e1a79598de661693685cd62b31f11d0c18e65b/av-17.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dba98603fc4665b4f750de86fbaf6c0cfaece970671a9b529e0e3d1711e8367e", size = 22071058, upload-time = "2026-03-14T14:38:43.663Z" },
]
[[package]]
name = "backcall"
version = "0.2.0"
@ -1242,6 +1266,7 @@ name = "mirofish-backend"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "av" },
{ name = "camel-ai" },
{ name = "camel-oasis" },
{ name = "chardet" },
@ -1270,6 +1295,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "av", specifier = ">=12.0.0" },
{ name = "camel-ai", specifier = "==0.2.78" },
{ name = "camel-oasis", specifier = "==0.2.5" },
{ name = "chardet", specifier = ">=5.0.0" },

View File

@ -1435,7 +1435,6 @@
"resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz",
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
"license": "ISC",
"peer": true,
"engines": {
"node": ">=12"
}
@ -1913,7 +1912,6 @@
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@ -2053,7 +2051,6 @@
"integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.5.0",
@ -2128,7 +2125,6 @@
"resolved": "https://registry.npmjs.org/vue/-/vue-3.5.25.tgz",
"integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==",
"license": "MIT",
"peer": true,
"dependencies": {
"@vue/compiler-dom": "3.5.25",
"@vue/compiler-sfc": "3.5.25",

View File

@ -145,7 +145,7 @@
ref="fileInput"
type="file"
multiple
accept=".pdf,.md,.txt"
accept=".pdf,.md,.txt,.mp3,.wav,.m4a,.ogg,.flac,.mp4,.mpeg,.webm,.mkv,.avi,.mov,.wmv"
@change="handleFileSelect"
style="display: none"
:disabled="loading"
@ -276,7 +276,7 @@ const handleDrop = (e) => {
// Append the files whose extension is accepted for upload to `files`.
// Files with an unsupported extension, or with no extension at all, are dropped.
const addFiles = (newFiles) => {
  // Accepted extensions (lower-case, without the dot): documents, audio, video.
  const allowedExtensions = new Set([
    'pdf', 'md', 'txt', 'markdown',
    'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga',
    'mp4', 'mpeg', 'webm',
    'mkv', 'avi', 'mov', 'wmv'
  ])

  const validFiles = newFiles.filter(file => {
    const dotIndex = file.name.lastIndexOf('.')
    // No dot means no extension; otherwise a file literally named "mp3"
    // would be treated as having the extension "mp3".
    if (dotIndex === -1) return false
    const ext = file.name.slice(dotIndex + 1).toLowerCase()
    return allowedExtensions.has(ext)
  })

  files.value.push(...validFiles)
}