"""
|
|
Voice input — transcribes raw audio bytes using OpenAI Whisper (local).
|
|
Model size set via WHISPER_MODEL env var (default: base).
|
|
"""
|
|
import asyncio
import os
import tempfile

import whisper
_model = None
|
|
|
|
def _get_model():
|
|
global _model
|
|
if _model is None:
|
|
model_size = os.getenv("WHISPER_MODEL", "base")
|
|
print(f"[JARVIS] Loading Whisper model: {model_size}")
|
|
_model = whisper.load_model(model_size)
|
|
return _model
|
|
|
|
async def transcribe_audio(audio_bytes: bytes) -> str:
|
|
"""Transcribe raw WAV audio bytes to text using Whisper."""
|
|
model = _get_model()
|
|
loop = asyncio.get_event_loop()
|
|
|
|
def _transcribe():
|
|
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
|
f.write(audio_bytes)
|
|
tmp_path = f.name
|
|
try:
|
|
result = model.transcribe(tmp_path, fp16=False)
|
|
return result["text"].strip()
|
|
finally:
|
|
os.unlink(tmp_path)
|
|
|
|
return await loop.run_in_executor(None, _transcribe)
|