import gradio as gr
from demo.infer import LiveCCDemoInfer

class GradioBackend:
    waiting_video_response = 'Waiting for video input...'
    not_found_video_response = 'Video does not exist...'
    mode2api = {
        'Real-Time Commentary': 'live_cc',
        'Conversation': 'video_qa'
    }
    def __init__(self, model_path: str = 'chenjoya/LiveCC-7B-Instruct'):
        self.infer = LiveCCDemoInfer(model_path)
        from kokoro import KPipeline
        self.audio_pipeline = KPipeline(lang_code='a')

    def __call__(self, query: str = None, state: dict = {}, mode: str = 'Real-Time Commentary', **kwargs):
        # dispatch to LiveCCDemoInfer.live_cc or LiveCCDemoInfer.video_qa depending on the selected mode
        return getattr(self.infer, self.mode2api[mode])(query=query, state=state, **kwargs)

gradio_backend = GradioBackend()

with gr.Blocks() as demo:
    gr.Markdown("## LiveCC Real-Time Commentary and Conversation - Gradio Demo")
    gr.Markdown("### [LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale (CVPR 2025)](https://showlab.github.io/livecc/)")
    gr.Markdown("1️⃣ Select a mode: Real-Time Commentary (LiveCC) or Conversation (common QA)")
    gr.Markdown("2️⃣ (Optional) Enter a query. If no query is given, the default is 'Please describe the video.'")
    gr.Markdown("3️⃣ Click the example video, or upload your own video.")
    gr.Markdown("*The web Gradio demo adds extra latency (3s~5s). For a truly real-time experience, please try our CLI demo: https://github.com/showlab/livecc*")
    gr_state = gr.State({}, render=False) # holds all useful state, including the KV cache
    gr_video_state = gr.JSON({}, visible=False) # records only the video state; logically part of gr_state, but kept lightweight
    gr_static_trigger = gr.Number(value=0, visible=False) # controls whether streaming starts or stops
    gr_dynamic_trigger = gr.Number(value=0, visible=False) # toggled for continuous refresh
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(
                label="video",
                elem_id="gr_video",
                visible=True,
                sources=['upload'],
                autoplay=True,
                include_audio=False,
                width=720,
                height=480
            )
            gr_examples = gr.Examples(
                examples=[
                    'demo/sources/howto_fix_laptop_mute_1080p.mp4',
                ],
                inputs=[gr_video],
            )
            gr_clean_button = gr.Button("Clean (press me before changing the video)", elem_id="gr_button")
        with gr.Column():
            with gr.Row():
                gr_radio_mode = gr.Radio(label="Select Mode", choices=["Real-Time Commentary", "Conversation"], elem_id="gr_radio_mode", value='Real-Time Commentary', interactive=True)

            def gr_chatinterface_fn(message, history, state, video_path, mode):
                state['video_path'] = video_path
                response, state = gradio_backend(query=message, state=state, mode=mode)
                return response, state

            def gr_chatinterface_chatbot_clear_fn():
                return {}, {}, 0, 0

            gr_chatinterface = gr.ChatInterface(
                fn=gr_chatinterface_fn,
                type="messages",
                additional_inputs=[gr_state, gr_video, gr_radio_mode],
                additional_outputs=[gr_state],
            )
            gr_chatinterface.chatbot.clear(fn=gr_chatinterface_chatbot_clear_fn, outputs=[gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])
            # the leading [] clears the chatbot; the remaining values reset the video state, shared state, and both triggers
            gr_clean_button.click(fn=lambda: [[], *gr_chatinterface_chatbot_clear_fn()], outputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_static_trigger, gr_dynamic_trigger])

            def gr_for_streaming(history: list[gr.ChatMessage], video_state: dict, state: dict, mode: str, static_trigger: int, dynamic_trigger: int):
                # if static_trigger == 0:
                #     return gr_chatinterface_chatbot_clear_fn()
                # if video_state['video_path'] != state.get('video_path', None):
                #     return gr_chatinterface_chatbot_clear_fn()
                state.update(video_state)
                # find the latest unanswered user query and the placeholder assistant message, if any
                query, assistant_waiting_message = None, None
                for message in history[::-1]:
                    if message['role'] == 'user':
                        if message['metadata'] is None or message['metadata'].get('status', '') == '':
                            query = message['content']
                            if message['metadata'] is None:
                                message['metadata'] = {}
                            message['metadata']['status'] = 'pending'
                            continue
                        if query is not None: # put others as done
                            message['metadata']['status'] = 'done'
                    elif message['content'] == GradioBackend.waiting_video_response:
                        assistant_waiting_message = message

                # stream timestamped responses from the backend, updating the chat history as they arrive
                for (start_timestamp, stop_timestamp), response, state in gradio_backend(query=query, state=state, mode=mode):
                    if start_timestamp >= 0:
                        response_with_timestamp = f'{start_timestamp:.1f}s-{stop_timestamp:.1f}s: {response}'
                        if assistant_waiting_message is None:
                            history.append(gr.ChatMessage(role="assistant", content=response_with_timestamp))
                        else:
                            assistant_waiting_message['content'] = response_with_timestamp
                            assistant_waiting_message = None
                        yield history, state, dynamic_trigger
                # flip the dynamic trigger so its .change event fires again (continuous refresh)
                yield history, state, 1 - dynamic_trigger

            # browser-side helper: reads the current source and playback position of the <video> element
            js_video_timestamp_fetcher = """
                (state, video_state) => {
                    const videoEl = document.querySelector("#gr_video video");
                    return { video_path: videoEl.currentSrc, video_timestamp: videoEl.currentTime };
                }
            """

            def gr_get_video_state(video_state):
                # strip the Gradio 'file=' prefix to recover the local file path
                if 'file=' in video_state['video_path']:
                    video_state['video_path'] = video_state['video_path'].split('file=')[1]
                return video_state

            def gr_video_change_fn(mode):
                return [1, 1] if mode == "Real-Time Commentary" else [0, 0]

            gr_video.change(
                fn=gr_video_change_fn,
                inputs=[gr_radio_mode],
                outputs=[gr_static_trigger, gr_dynamic_trigger]
            )
            gr_dynamic_trigger.change(
                fn=gr_get_video_state,
                inputs=[gr_video_state],
                outputs=[gr_video_state],
                js=js_video_timestamp_fetcher
            ).then(
                fn=gr_for_streaming,
                inputs=[gr_chatinterface.chatbot, gr_video_state, gr_state, gr_radio_mode, gr_static_trigger, gr_dynamic_trigger],
                outputs=[gr_chatinterface.chatbot, gr_state, gr_dynamic_trigger],
            )

demo.queue(max_size=5, default_concurrency_limit=5)
demo.launch(share=True)

# --- for streaming TTS (currently disabled) ---
# Note: enabling this block requires `import numpy as np` at the top of the file, and assumes
# GradioBackend exposes a `contents` queue that the inference loop fills with generated text.
# gr_tts = gr.Audio(visible=False, elem_id="gr_tts", streaming=True, autoplay=True)
# def tts():
#     while True:
#         contents = ''
#         while not gradio_backend.contents.empty():
#             content = gradio_backend.contents.get()
#             contents += ' ' + content.rstrip(' ...')
#         contents = contents.strip()
#         if contents:
#             generator = gradio_backend.audio_pipeline(contents, voice='af_heart', speed=1.2)
#             for _, _, audio_torch in generator:
#                 audio_np = audio_torch.cpu().numpy()
#                 max_val = np.max(np.abs(audio_np))
#                 if max_val > 0:
#                     audio_np = audio_np / max_val
#                 audio_int16 = (audio_np * 32767).astype(np.int16)
#                 yield (24000, audio_int16)
# gr_video.change(fn=tts, outputs=[gr_tts])