|
import gradio as gr |
|
import torchaudio |
|
import torch |
|
import transformers |
|
|
|
# Single source of truth for the pretrained checkpoint, so the acoustic model
# and its processor are always loaded from the same weights/vocabulary.
_CHECKPOINT = "facebook/wav2vec2-base-960h"

# CTC acoustic model that maps raw audio samples to per-frame token logits.
transformer = transformers.Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

# Paired processor: prepares raw audio for the model and decodes token ids
# back into text.
processor = transformers.Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
|
|
|
def speech_to_text(audio):
    """Transcribe a recorded audio clip to text with the Wav2Vec2 CTC model.

    Args:
        audio: The recording handed over by the Gradio audio input: either a
            file-like object exposing its path as ``.name`` or a plain path
            string. ``None`` (nothing was recorded) is accepted.

    Returns:
        The decoded transcription, or an empty string when no audio was given.
    """
    if audio is None:
        # Nothing was recorded, so there is nothing to transcribe.
        return ""

    path = audio if isinstance(audio, str) else audio.name
    waveform, sample_rate = torchaudio.load(path)

    # torchaudio returns (channels, samples). Mix down to a single mono signal
    # so the channels are not mistaken for separate items of a batch.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # The model only understands audio at the feature extractor's sampling
    # rate (16 kHz for this checkpoint); microphone recordings usually use a
    # higher rate, so resample before feature extraction.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    input_values = processor(
        waveform.numpy(), sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = transformer(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]
|
|
|
# Microphone recorder; type="file" hands the callback a temporary file object
# whose path is available as `.name`.
audio_input = gr.inputs.Audio(source="microphone", type="file", label="Record your voice:")

# The legacy `gr.outputs` namespace exposes the text component as `Textbox`;
# there is no `gr.outputs.Text`, so the previous name failed at start-up.
text_output = gr.outputs.Textbox(label="Transcription")

# Wire the recorder to the transcription function and start the app
# (inline=True renders the interface inside a notebook cell).
gr.Interface(
    fn=speech_to_text,
    inputs=audio_input,
    outputs=text_output,
    title="Speech-to-Text",
).launch(inline=True)
|
|