# MyAlexa / app.py
# camanalo1's picture
# Update app.py
# 841bb81
# raw
# history blame
# 905 Bytes
import gradio as gr
import torchaudio
import torch
import transformers
# Both the acoustic model and its processor are loaded from the same
# pretrained checkpoint, so the identifier is spelled out only once.
_CHECKPOINT = "facebook/wav2vec2-base-960h"

# CTC model that maps processed audio to per-frame token logits.
transformer = transformers.Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
# Processor used to prepare the audio input and decode predicted token ids.
processor = transformers.Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
def speech_to_text(audio):
    """Transcribe a recorded audio clip with the module-level Wav2Vec2 model.

    Args:
        audio: The recording handed over by the audio input component. Either
            a file-like object exposing a ``name`` attribute (path on disk) or
            a plain path string. ``None`` (nothing recorded) is tolerated.

    Returns:
        The decoded transcription string, or an empty string when there is
        no audio to transcribe.
    """
    if audio is None:
        return ""

    # Accept both a temp-file wrapper (``.name``) and a bare file path.
    path = getattr(audio, "name", audio)

    # torchaudio yields a (channels, frames) tensor plus the file's own
    # sample rate; the rate must not be thrown away, see resampling below.
    waveform, sample_rate = torchaudio.load(path)
    if waveform.numel() == 0:
        return ""

    # The processor expects one 1-D signal per utterance, so average the
    # channels down to mono instead of passing the 2-D tensor through.
    waveform = waveform.mean(dim=0)

    # Resample to the rate the feature extractor is configured for;
    # microphone recordings commonly arrive at a different rate.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    input_values = processor(
        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = transformer(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
# Microphone recorder; with type="file" the callback receives a file object
# whose ``name`` attribute is the path of the recording.
audio_input = gr.inputs.Audio(source="microphone", type="file", label="Record your voice:")

# The legacy ``gr.outputs`` namespace provides ``Textbox`` for plain-text
# results, so use that class name for the transcription field.
text_output = gr.outputs.Textbox(label="Transcription")

# Wire the transcription function to the components and start the app.
gr.Interface(
    fn=speech_to_text,
    inputs=audio_input,
    outputs=text_output,
    title="Speech-to-Text",
).launch(inline=True)