Spaces:
Running
Running
import os | |
import uuid | |
from flask import jsonify, send_file, request | |
from main import * | |
import torch | |
import torchaudio | |
def speech_to_text_func(audio_path):
    """Transcribe the audio file at ``audio_path`` using the shared STT model.

    Relies on ``stt_model`` and ``device`` being available at module level
    (presumably supplied by ``from main import *`` -- confirm against main).

    Args:
        audio_path: Path of an audio file that ``torchaudio.load`` can read.

    Returns:
        ``{"text": <transcription>}`` on success, or
        ``{"error": "STT model not initialized."}`` when ``stt_model`` is None.
    """
    if stt_model is None:
        return {"error": "STT model not initialized."}

    # The sample rate returned by the loader is not used by this function.
    audio, _ = torchaudio.load(audio_path)

    # Average over the first axis, keeping it as a size-1 dimension
    # (assumes axis 0 is the channel axis, i.e. a mono down-mix -- TODO confirm).
    if audio.ndim > 1:
        audio = audio.mean(dim=0, keepdim=True)
    audio = audio.to(device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        scores = stt_model(audio)
        # Pick the highest-scoring id along the last axis.
        token_ids = torch.argmax(scores, dim=-1)
        # Only the first entry of the result is decoded.
        first_sequence = token_ids[0].cpu().tolist()
        text = stt_model.tokenizer.decode(first_sequence)

    return {"text": text}
def stt_api():
    """Handle a speech-to-text request and return the result as JSON.

    Expects the current Flask request to carry an uploaded file under the
    ``audio`` form field. The upload is written to a uniquely named temporary
    ``.wav`` file in the working directory, passed to
    ``speech_to_text_func``, and the temporary file is always deleted
    afterwards -- including when saving or transcription raises.

    Returns:
        * ``({"error": "Audio file is required"}, 400)`` when no ``audio``
          file is part of the request.
        * ``({"error": <message>}, 500)`` when ``speech_to_text_func``
          reports an error.
        * The JSON-encoded transcription result otherwise.
    """
    if 'audio' not in request.files:
        return jsonify({"error": "Audio file is required"}), 400

    audio_file = request.files['audio']
    # A random UUID keeps concurrent requests from clobbering each other's file.
    temp_audio_path = f"temp_audio_{uuid.uuid4()}.wav"
    try:
        audio_file.save(temp_audio_path)
        output = speech_to_text_func(temp_audio_path)
    finally:
        # Clean up even if saving or transcription failed, so failed
        # requests do not leave stray temp files behind. The existence check
        # covers the case where save() failed before creating the file.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

    if "error" in output:
        return jsonify({"error": output["error"]}), 500
    return jsonify(output)