diff --git a/backend/app/config.py b/backend/app/config.py index 953dfa50..0fedc769 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -38,7 +38,18 @@ class Config: # 文件上传配置 MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') - ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} + ALLOWED_EXTENSIONS = { + 'pdf', 'md', 'txt', 'markdown', + # Audio (Whisper-native) + 'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga', + # Video/container (Whisper-native) + 'mp4', 'mpeg', 'webm', + # Video (requires ffmpeg extraction) + 'mkv', 'avi', 'mov', 'wmv', + } + + # Whisper transcription model + WHISPER_MODEL = os.environ.get('WHISPER_MODEL', 'whisper-1') # 文本处理配置 DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py index 3f1d8ed2..328eb02b 100644 --- a/backend/app/utils/file_parser.py +++ b/backend/app/utils/file_parser.py @@ -1,9 +1,10 @@ """ 文件解析工具 -支持PDF、Markdown、TXT文件的文本提取 +支持PDF、Markdown、TXT、音频、视频文件的文本提取 """ import os +import tempfile from pathlib import Path from typing import List, Optional @@ -60,37 +61,46 @@ def _read_text_with_fallback(file_path: str) -> str: class FileParser: """文件解析器""" - - SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} - + + # Formats sent directly to the transcription endpoint (or multimodal chat after PyAV conversion) + _AUDIO_NATIVE = {'.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.ogg', '.wav', '.webm', '.flac'} + # Video container formats — audio extracted via PyAV before transcription + _VIDEO_CONTAINERS = {'.mkv', '.avi', '.mov', '.wmv'} + + SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} | _AUDIO_NATIVE | _VIDEO_CONTAINERS + @classmethod def extract_text(cls, file_path: str) -> str: """ 从文件中提取文本 - + Args: file_path: 文件路径 - + Returns: 提取的文本内容 """ path = Path(file_path) - + if not path.exists(): raise FileNotFoundError(f"文件不存在: {file_path}") - + suffix = path.suffix.lower() - + if suffix 
not in cls.SUPPORTED_EXTENSIONS: raise ValueError(f"不支持的文件格式: {suffix}") - + if suffix == '.pdf': return cls._extract_from_pdf(file_path) elif suffix in {'.md', '.markdown'}: return cls._extract_from_md(file_path) elif suffix == '.txt': return cls._extract_from_txt(file_path) - + elif suffix in cls._AUDIO_NATIVE: + return cls._transcribe_audio(file_path) + elif suffix in cls._VIDEO_CONTAINERS: + return cls._transcribe_video(file_path) + raise ValueError(f"无法处理的文件格式: {suffix}") @staticmethod @@ -120,6 +130,108 @@ class FileParser: """从TXT提取文本,支持自动编码检测""" return _read_text_with_fallback(file_path) + @staticmethod + def _to_wav_bytes(file_path: str) -> bytes: + """Decode the first audio stream of file_path with PyAV and return it as 16 kHz mono 16-bit PCM WAV bytes.""" + try: + import av + except ImportError as exc: + raise ImportError( + "PyAV is required to convert audio/video files. " + "Install it with: pip install av" + ) from exc + import io + + buf = io.BytesIO() + with av.open(file_path) as inp: + audio = next((s for s in inp.streams if s.type == 'audio'), None) + if audio is None: + raise ValueError(f"No audio track found in: {file_path}") + with av.open(buf, 'w', format='wav') as out: + out_stream = out.add_stream('pcm_s16le', rate=16000, layout='mono') + for packet in inp.demux(audio): + for frame in packet.decode(): + frame.pts = None + for pkt in out_stream.encode(frame): + out.mux(pkt) + for pkt in out_stream.encode(None): + out.mux(pkt) + buf.seek(0) + return buf.read() + + @staticmethod + def _transcribe_audio(file_path: str) -> str: + """Transcribe audio file. + + Tries the Whisper-style /audio/transcriptions endpoint first (OpenAI, Groq, etc.). + Falls back to multimodal chat (Gemini and other vision/audio LLMs) if the + provider doesn't support that endpoint (404). 
+ """ + import base64 + from openai import OpenAI, NotFoundError + from ..config import Config + + client = OpenAI(api_key=Config.LLM_API_KEY, base_url=Config.LLM_BASE_URL) + + # --- attempt 1: Whisper-compatible endpoint --- + try: + with open(file_path, 'rb') as f: + transcript = client.audio.transcriptions.create( + model=Config.WHISPER_MODEL, + file=f, + ) + return transcript.text + except NotFoundError: + pass # Provider doesn't expose Whisper; fall through to multimodal chat + + # --- attempt 2: multimodal chat (e.g. Gemini) --- + # Gemini only accepts wav/mp3; convert anything else using PyAV (no system ffmpeg needed) + fmt = Path(file_path).suffix.lower().lstrip('.') + if fmt in ('wav', 'mp3'): + with open(file_path, 'rb') as f: + audio_bytes = f.read() + else: + audio_bytes = FileParser._to_wav_bytes(file_path) + fmt = 'wav' + + audio_b64 = base64.b64encode(audio_bytes).decode('utf-8') + + response = client.chat.completions.create( + model=Config.LLM_MODEL_NAME, + messages=[{ + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": {"data": audio_b64, "format": fmt}, + }, + { + "type": "text", + "text": "Transcribe this audio accurately. 
Return only the transcription text, no commentary.", + }, + ], + }], + ) + content = response.choices[0].message.content or "" # content may be None; keep re.sub from raising TypeError + # Strip <think>...</think> reasoning blocks that some models inject before the answer + import re + content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip() + return content + + @classmethod + def _transcribe_video(cls, file_path: str) -> str: + """Extract audio from video container and transcribe, using PyAV (no system ffmpeg needed).""" + # Convert to WAV in memory, write to a temp file so _transcribe_audio can open it + wav_bytes = cls._to_wav_bytes(file_path) + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp: + tmp.write(wav_bytes) + tmp_path = tmp.name + try: + return cls._transcribe_audio(tmp_path) + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + @classmethod def extract_from_multiple(cls, file_paths: List[str]) -> str: """ diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 4f5361d5..a0cf6abd 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -12,23 +12,20 @@ dependencies = [ # 核心框架 "flask>=3.0.0", "flask-cors>=6.0.0", - # LLM 相关 "openai>=1.0.0", - # Zep Cloud "zep-cloud==3.13.0", - # OASIS 社交媒体模拟 "camel-oasis==0.2.5", "camel-ai==0.2.78", - # 文件处理 "PyMuPDF>=1.24.0", + # Audio/video transcription (bundles FFmpeg libs, no system install needed) + "av>=12.0.0", # 编码检测(支持非UTF-8编码的文本文件) "charset-normalizer>=3.0.0", "chardet>=5.0.0", - # 工具库 "python-dotenv>=1.0.0", "pydantic>=2.0.0", diff --git a/backend/uv.lock b/backend/uv.lock index f1ce4b60..40fa80a1 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.12'", @@ -73,6 +73,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, 
upload-time = "2025-10-06T13:54:43.17Z" }, ] +[[package]] +name = "av" +version = "17.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/eb/abca886df3a091bc406feb5ff71b4c4f426beaae6b71b9697264ce8c7211/av-17.0.0.tar.gz", hash = "sha256:c53685df73775a8763c375c7b2d62a6cb149d992a26a4b098204da42ade8c3df", size = 4410769, upload-time = "2026-03-14T14:38:45.868Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/fb/55e3b5b5d1fc61466292f26fbcbabafa2642f378dc48875f8f554591e1a4/av-17.0.0-cp311-abi3-macosx_11_0_x86_64.whl", hash = "sha256:ed4013fac77c309a4a68141dcf6148f1821bb1073a36d4289379762a6372f711", size = 23238424, upload-time = "2026-03-14T14:38:05.856Z" }, + { url = "https://files.pythonhosted.org/packages/52/03/9ace1acc08bc9ae38c14bf3a4b1360e995e4d999d1d33c2cbd7c9e77582a/av-17.0.0-cp311-abi3-macosx_14_0_arm64.whl", hash = "sha256:e44b6c83e9f3be9f79ee87d0b77a27cea9a9cd67bd630362c86b7e56a748dfbb", size = 18709043, upload-time = "2026-03-14T14:38:08.288Z" }, + { url = "https://files.pythonhosted.org/packages/00/c0/637721f3cd5bb8bd16105a1a08efd781fc12f449931bdb3a4d0cfd63fa55/av-17.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b440da6ac47da0629d509316f24bcd858f33158dbdd0f1b7293d71e99beb26de", size = 34018780, upload-time = "2026-03-14T14:38:10.45Z" }, + { url = "https://files.pythonhosted.org/packages/d2/59/d19bc3257dd985d55337d7f0414c019414b97e16cd3690ebf9941a847543/av-17.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1060cba85f97f4a337311169d92c0b5e143452cfa5ca0e65fa499d7955e8592e", size = 36358757, upload-time = "2026-03-14T14:38:13.092Z" }, + { url = "https://files.pythonhosted.org/packages/52/6c/a1f4f2677bae6f2ade7a8a18e90ebdcf70690c9b1c4e40e118aa30fa313f/av-17.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:deda202e6021cfc7ba3e816897760ec5431309d59a4da1f75df3c0e9413d71e7", size = 35195281, upload-time = "2026-03-14T14:38:15.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/ea/52b0fc6f69432c7bf3f5fbe6f707113650aa40a1a05b9096ffc2bba4f77d/av-17.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ffaf266a1a9c2148072de0a4b5ae98061465178d2cfaa69ee089761149342974", size = 37444817, upload-time = "2026-03-14T14:38:18.563Z" }, + { url = "https://files.pythonhosted.org/packages/34/ad/d2172966282cb8f146c13b6be7416efefde74186460c5e1708ddfc13dba6/av-17.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:45a35a40b2875bf2f98de7c952d74d960f92f319734e6d28e03b4c62a49e6f49", size = 28888553, upload-time = "2026-03-14T14:38:21.223Z" }, + { url = "https://files.pythonhosted.org/packages/b0/bb/c5a4c4172c514d631fb506e6366b503576b8c7f29809cf42aca73e28ff01/av-17.0.0-cp311-abi3-win_arm64.whl", hash = "sha256:3d32e9b5c5bbcb872a0b6917b352a1db8a42142237826c9b49a36d5dbd9e9c26", size = 21916910, upload-time = "2026-03-14T14:38:23.706Z" }, + { url = "https://files.pythonhosted.org/packages/7f/8e/c40ac08e63f79387c59f6ecc38f47d4c942b549130eee579ec1a91f6a291/av-17.0.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:d13250fb4b4522e9a6bec32da082556d5f257110ea223758151375748d9bbe25", size = 23483029, upload-time = "2026-03-14T14:38:25.758Z" }, + { url = "https://files.pythonhosted.org/packages/a9/fb/b4419494bfc249163ec393c613966d66db7e95c76da3345711cd115a79df/av-17.0.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:dbb56aa3b7ae72451d1bf6e9d37c7d83d39b97af712f73583ff419fbf08fc237", size = 18920446, upload-time = "2026-03-14T14:38:27.905Z" }, + { url = "https://files.pythonhosted.org/packages/30/62/c2306d91602ddad2c56106f21dcb334fd51d5ea2e952f7fa025bb8aa39fc/av-17.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a213ac9e83b7ab12c2e9f277a09cac8e9d85cf0883efdab7a87a60e2e4e48879", size = 37477266, upload-time = "2026-03-14T14:38:30.404Z" }, + { url = "https://files.pythonhosted.org/packages/28/cd/c8510a9607886785c0b3ca019d503e888c3757529be42a7287fe2bfa92d5/av-17.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash 
= "sha256:e15c88bb0921f9435bcc5a27a0863dba571a80ad5e1389c4fcf2073833bb4a74", size = 39572988, upload-time = "2026-03-14T14:38:32.984Z" }, + { url = "https://files.pythonhosted.org/packages/7d/2d/207d9361e25b5abec9be335bbab4df6b6b838e2214be4b374f4cfb285427/av-17.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:096cfd1e9fc896506726c7c42aaf9b370e78c2f257cde4d6ddb6c889bfcc49ec", size = 38399591, upload-time = "2026-03-14T14:38:35.465Z" }, + { url = "https://files.pythonhosted.org/packages/73/ca/307740c6aa2980966bf11383ffcb04bacc5b13f3d268ab4cfb274ad6f793/av-17.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3649ab3d2c7f58049ded1a36e100c0d8fd529cf258f41dd88678ba824034d8c9", size = 40590681, upload-time = "2026-03-14T14:38:38.269Z" }, + { url = "https://files.pythonhosted.org/packages/35/f2/6fdb26d0651adf409864cb2a0d60da107e467d3d1aabc94b234ead54324a/av-17.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e5002271ab2135b551d980c2db8f3299d452e3b9d3633f24f6bb57fffe91cd10", size = 29216337, upload-time = "2026-03-14T14:38:40.83Z" }, + { url = "https://files.pythonhosted.org/packages/41/0a/0896b829a39b5669a2d811e1a79598de661693685cd62b31f11d0c18e65b/av-17.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dba98603fc4665b4f750de86fbaf6c0cfaece970671a9b529e0e3d1711e8367e", size = 22071058, upload-time = "2026-03-14T14:38:43.663Z" }, +] + [[package]] name = "backcall" version = "0.2.0" @@ -1242,6 +1266,7 @@ name = "mirofish-backend" version = "0.1.0" source = { editable = "." 
} dependencies = [ + { name = "av" }, { name = "camel-ai" }, { name = "camel-oasis" }, { name = "chardet" }, @@ -1270,6 +1295,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "av", specifier = ">=12.0.0" }, { name = "camel-ai", specifier = "==0.2.78" }, { name = "camel-oasis", specifier = "==0.2.5" }, { name = "chardet", specifier = ">=5.0.0" }, diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 3e56d752..fdab7ac4 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -1435,7 +1435,6 @@ "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "license": "ISC", - "peer": true, "engines": { "node": ">=12" } @@ -1913,7 +1912,6 @@ "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -2053,7 +2051,6 @@ "integrity": "sha512-ITcnkFeR3+fI8P1wMgItjGrR10170d8auB4EpMLPqmx6uxElH3a/hHGQabSHKdqd4FXWO1nFIp9rRn7JQ34ACQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.5.0", @@ -2128,7 +2125,6 @@ "resolved": "https://registry.npmjs.org/vue/-/vue-3.5.25.tgz", "integrity": "sha512-YLVdgv2K13WJ6n+kD5owehKtEXwdwXuj2TTyJMsO7pSeKw2bfRNZGjhB7YzrpbMYj5b5QsUebHpOqR3R3ziy/g==", "license": "MIT", - "peer": true, "dependencies": { "@vue/compiler-dom": "3.5.25", "@vue/compiler-sfc": "3.5.25", diff --git a/frontend/src/views/Home.vue b/frontend/src/views/Home.vue index ca7ef6ff..894d7371 100644 --- a/frontend/src/views/Home.vue +++ b/frontend/src/views/Home.vue @@ -145,7 +145,7 @@ ref="fileInput" type="file" multiple - accept=".pdf,.md,.txt" + accept=".pdf,.md,.markdown,.txt,.mp3,.wav,.m4a,.ogg,.flac,.mpga,.mp4,.mpeg,.webm,.mkv,.avi,.mov,.wmv" @change="handleFileSelect" style="display: none" :disabled="loading" @@ -276,7 +276,7 @@ 
const handleDrop = (e) => { const addFiles = (newFiles) => { const validFiles = newFiles.filter(file => { const ext = file.name.split('.').pop().toLowerCase() - return ['pdf', 'md', 'txt'].includes(ext) + return ['pdf', 'md', 'txt', 'markdown', 'mp3', 'wav', 'm4a', 'ogg', 'flac', 'mpga', 'mp4', 'mpeg', 'webm', 'mkv', 'avi', 'mov', 'wmv'].includes(ext) }) files.value.push(...validFiles) }