import os

import gradio as gr
import librosa
import pytubefix as pt
import spaces
import torch
from pyannote.audio import Pipeline
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


def second_to_timecode(x: float) -> str:
    """Convert seconds (float) to an SRT timecode, HH:MM:SS,mmm."""
    hour, x = divmod(x, 3600)
    minute, x = divmod(x, 60)
    second, x = divmod(x, 1)
    millisecond = int(x * 1000.)
    return '%.2d:%.2d:%.2d,%.3d' % (hour, minute, second, millisecond)


def download_from_youtube(youtube_link: str) -> str:
    """Download the first audio-only stream of a YouTube video and return its local path."""
    yt = pt.YouTube(youtube_link)
    available_streams = yt.streams.filter(only_audio=True)
    print('available streams:')
    print(available_streams)
    stream = available_streams.first()  # , audio_codec='wav'
    stream.download(filename="audio.wav")
    return "audio.wav"


MODEL_NAME = 'Dorjzodovsuren/whisper-large-v2-mn'
# MODEL_NAME = 'Dorjzodovsuren/whisper-large-v3-turbo-mn-2'
lang = 'mn'

chunk_length_s = 9
vad_activation_min_duration = 9  # sec
device = 'cuda' if torch.cuda.is_available() else 'cpu'
SAMPLE_RATE = 16_000

######## LOAD MODELS FROM HUB ########
dia_model = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ['TOKEN'])
vad_model = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=os.environ['TOKEN'])
dia_model = dia_model.to(torch.device(device))
vad_model = vad_model.to(torch.device(device))

# Load the matching processor; the turbo fine-tune reuses the base openai/whisper-large-v3-turbo processor.
if MODEL_NAME == 'Dorjzodovsuren/whisper-large-v2-mn':
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
else:
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=chunk_length_s,
    device_map="auto",
)
asr_pipeline.model.config.forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

print("----------> Loaded models <-----------")

gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))


@spaces.GPU(duration=gpu_timeout)
def generator(file_upload, num_speakers, max_duration, history):
    history = history or ""
    path = file_upload

    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)
    print(waveform.shape, sampling_rate)
    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)

    # Speaker diarization over the whole clip.
    dia_result = dia_model({
        "waveform": waveform_tensor,
        "sample_rate": sampling_rate,
    }, num_speakers=num_speakers)

    counter = 1
    for speech_turn, track, speaker in dia_result.itertracks(yield_label=True):
        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
        _start = int(sampling_rate * speech_turn.start)
        _end = int(sampling_rate * speech_turn.end)
        data = waveform[_start:_end]

        if speech_turn.end - speech_turn.start > vad_activation_min_duration:
            # Long turns are split further with VAD before transcription.
            print(f'audio duration {speech_turn.end - speech_turn.start} sec ----> activating VAD')
            vad_output = vad_model({
                'waveform': waveform_tensor[:, _start:_end],
                'sample_rate': sampling_rate})
            for vad_turn in vad_output.get_timeline().support():
                vad_start = _start + int(sampling_rate * vad_turn.start)
                vad_end = _start + int(sampling_rate * vad_turn.end)
                prediction = asr_pipeline(waveform[vad_start:vad_end])['text']
                history += f"{counter}\n" + \
                           f"{second_to_timecode(speech_turn.start + vad_turn.start)} --> {second_to_timecode(speech_turn.start + vad_turn.end)}\n" + \
                           f"{prediction}\n\n"  # f">> {speaker}: {prediction}\n\n"

                yield history, history, None
                counter += 1
        else:
            prediction = asr_pipeline(data)['text']
            history += f"{counter}\n" + \
                       f"{second_to_timecode(speech_turn.start)} --> {second_to_timecode(speech_turn.end)}\n" + \
                       f"{prediction}\n\n"  # f">> {speaker}: {prediction}\n\n"
            counter += 1
            yield history, history, None

    # SRT format reference:
    # https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats%2Csubrip-srt-example%2Csubviewer-sbv-example
    file_name = 'transcript.srt'
    with open(file_name, 'w') as fp:
        fp.write(history)
    yield history, history, file_name


# demo = gr.Interface(
#     generator,
#     inputs=[
#         gr.Audio(type="filepath"),
#         gr.Number(value=1, label="Number of Speakers"),
#         gr.Number(value=120, label="Maximum Duration (Seconds)"),
#         'state',
#     ],
#     outputs=['text', 'state', 'file'],
#     title="Mongolian Whisper 🇲🇳",
#     description=(
#         "Transcribe microphone or uploaded audio with the Mongolian Whisper model."
#     )
# )

with gr.Blocks(title="Mongolian Whisper 🇲🇳") as demo:
    with gr.Column():  # everything in one vertical stack
        # ---- inputs ----
        audio_input = gr.Audio(type="filepath", label="Дуу")  # "Audio"

        with gr.Accordion("Нэмэлт тохиргоо", open=False):  # "Additional settings"
            speakers_input = gr.Number(value=1, label="Яриж буй нийт хүний тоо хэд вэ?")  # "How many speakers in total?"
            duration_input = gr.Slider(
                minimum=0, maximum=300, step=1, value=120,
                label="Дууны хамгийн урт хэмжээ (Seconds) хэд вэ?"  # "Maximum audio duration (seconds)?"
            )

        state_input = gr.State()  # hidden, but part of the workflow

        # ---- outputs ----
        text_output = gr.Textbox(label="Текст хөрвүүлгэ")  # "Transcription"
        state_output = gr.State()  # hidden, passed through
        file_output = gr.File(label="Үр дүнгийн файл")  # "Result file"

    # hook it all up
    audio_input.change(
        fn=generator,
        inputs=[audio_input, speakers_input, duration_input, state_input],
        outputs=[text_output, state_output, file_output],
    )

# define queue - required for generators
demo.queue()
demo.launch(debug=True)