Dorjzodovsuren committed
Commit 2a5aa2b · verified · 1 Parent(s): 4e6066c

Create app.py

Files changed (1): app.py (+133, -0)
app.py ADDED
@@ -0,0 +1,133 @@
import gradio as gr
import numpy as np
import pytubefix as pt
import os, time, librosa, torch
from pyannote.audio import Pipeline
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq


def second_to_timecode(x: float) -> str:
    """Convert seconds (float) to an SRT timecode, HH:MM:SS,mmm."""
    hour, x = divmod(x, 3600)
    minute, x = divmod(x, 60)
    second, x = divmod(x, 1)
    millisecond = int(x * 1000.)

    return '%.2d:%.2d:%.2d,%.3d' % (hour, minute, second, millisecond)


def download_from_youtube(youtube_link: str) -> str:
    """Download the first audio-only stream of a YouTube video and return its local path."""
    yt = pt.YouTube(youtube_link)
    available_streams = yt.streams.filter(only_audio=True)
    print('available streams:')
    print(available_streams)
    stream = available_streams.first()

    # The stream keeps its native codec; librosa decodes it regardless of the .wav filename.
    stream.download(filename="audio.wav")
    return "audio.wav"


# MODEL_NAME = 'bayartsogt/whisper-large-v2-mn-13'
MODEL_NAME = 'Dorjzodovsuren/whisper-large-v3-turbo-mn-2'
lang = 'mn'

chunk_length_s = 9
vad_activation_min_duration = 9  # sec; diarized turns longer than this are re-segmented with VAD
device = 0 if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 16_000

######## LOAD MODELS FROM HUB ########
dia_model = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ['TOKEN'])
vad_model = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=os.environ['TOKEN'])

if MODEL_NAME == 'bayartsogt/whisper-large-v2-mn-13':
    processor = AutoProcessor.from_pretrained('bayartsogt/whisper-large-v2-mn-13')
else:
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=chunk_length_s,
    device=device,
)

# Force Mongolian transcription regardless of the language Whisper detects.
asr_pipeline.model.config.forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

print("----------> Loaded models <-----------")


def generator(youtube_link, microphone, file_upload, num_speakers, max_duration, history):
    """Diarize, transcribe and stream SRT-formatted subtitles for a single audio source."""
    if int(youtube_link != '') + int(microphone is not None) + int(file_upload is not None) != 1:
        raise Exception(f"Exactly one audio source must be provided, got youtube_link={youtube_link}, microphone={microphone}, file_upload={file_upload}")

    history = history or ""

    if microphone:
        path = microphone
    elif file_upload:
        path = file_upload
    elif youtube_link:
        path = download_from_youtube(youtube_link)

    # Whisper expects 16 kHz mono audio.
    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)

    print(waveform.shape, sampling_rate)
    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)

    # Speaker diarization over the whole recording.
    dia_result = dia_model({
        "waveform": waveform_tensor,
        "sample_rate": sampling_rate,
    }, num_speakers=num_speakers)

    counter = 1  # SRT cue index

    for speech_turn, track, speaker in dia_result.itertracks(yield_label=True):
        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
        _start = int(sampling_rate * speech_turn.start)
        _end = int(sampling_rate * speech_turn.end)
        data = waveform[_start:_end]

        if speech_turn.end - speech_turn.start > vad_activation_min_duration:
            # Long turn: split it into shorter voiced regions before transcribing.
            print(f'audio duration {speech_turn.end - speech_turn.start} sec ----> activating VAD')
            vad_output = vad_model({
                'waveform': waveform_tensor[:, _start:_end],
                'sample_rate': sampling_rate})
            for vad_turn in vad_output.get_timeline().support():
                vad_start = _start + int(sampling_rate * vad_turn.start)
                vad_end = _start + int(sampling_rate * vad_turn.end)
                prediction = asr_pipeline(waveform[vad_start:vad_end])['text']
                history += f"{counter}\n" + \
                    f"{second_to_timecode(speech_turn.start + vad_turn.start)} --> {second_to_timecode(speech_turn.start + vad_turn.end)}\n" + \
                    f"{prediction}\n\n"
                    # f">> {speaker}: {prediction}\n\n"
                yield history, history, None
                counter += 1

        else:
            prediction = asr_pipeline(data)['text']
            history += f"{counter}\n" + \
                f"{second_to_timecode(speech_turn.start)} --> {second_to_timecode(speech_turn.end)}\n" + \
                f"{prediction}\n\n"
                # f">> {speaker}: {prediction}\n\n"
            counter += 1
            yield history, history, None

    # SRT format reference:
    # https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats%2Csubrip-srt-example%2Csubviewer-sbv-example
    file_name = 'transcript.srt'
    with open(file_name, 'w') as fp:
        fp.write(history)

    yield history, history, file_name
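
Note that app.py imports gradio as gr but, as committed, never builds an interface, so there is no UI entry point for the generator. Below is a minimal sketch of how it could be wired up; the component names, labels, and slider ranges are illustrative assumptions (Gradio 4.x API), not part of this commit:

# Hypothetical UI wiring -- not part of the committed file.
# Maps the three yielded values of `generator` to (live transcript, session state, SRT download).
with gr.Blocks() as demo:
    with gr.Row():
        youtube_link = gr.Textbox(label="YouTube link")
        microphone = gr.Audio(sources=["microphone"], type="filepath", label="Microphone")
        file_upload = gr.Audio(sources=["upload"], type="filepath", label="Audio file")
    num_speakers = gr.Slider(1, 10, value=2, step=1, label="Number of speakers")
    max_duration = gr.Slider(60, 3600, value=600, step=60, label="Max duration (seconds)")
    history = gr.State("")

    transcript = gr.Textbox(label="Transcript (SRT)", lines=20)
    srt_file = gr.File(label="Download SRT")

    gr.Button("Transcribe").click(
        generator,
        inputs=[youtube_link, microphone, file_upload, num_speakers, max_duration, history],
        outputs=[transcript, history, srt_file],
    )

demo.launch()

Because generator is a Python generator, Gradio streams each yielded (history, history, None) tuple to the outputs as segments are transcribed, and the final yield attaches the written transcript.srt file.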