import gradio as gr
import os
from moviepy.editor import VideoFileClip
from transformers import pipeline

# Load models: distil-whisper for speech recognition, BART for summarization,
# and DistilBERT (fine-tuned on SQuAD) for extractive question answering
asr = pipeline(task="automatic-speech-recognition", model="distil-whisper/distil-small.en")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
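# With return_timestamps=True the ASR pipeline yields a dict shaped roughly like
# {"text": "...", "chunks": [{"timestamp": (0.0, 5.0), "text": "..."}, ...]},
# which is why the transcribe functions below join the chunk["text"] values.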

# Latest transcript, kept in a module-level global so the Q&A tab can reuse it
stored_transcript = ""

def summarize_text(text):
    """Summarize a transcript, scaling the summary length to the input size."""
    word_count = len(text.split())
    if word_count < 50:
        return "Text too short to summarize."

    # Target roughly 15-30% of the original word count, with sensible floors
    max_summary_length = max(50, int(word_count * 0.3))
    min_summary_length = max(20, int(word_count * 0.15))

    summary = summarizer(
        text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False
    )
    return summary[0]["summary_text"]

def transcribe_from_video(video_file):
    global stored_transcript
    if video_file is None:
        return "Error: No video file provided.", ""

    audio_path = "temp_audio.wav"
    try:
        # Extract the audio track to a temporary WAV file for the ASR model
        video = VideoFileClip(video_file)
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
        video.close()

        transcription_result = asr(audio_path, return_timestamps=True)
        transcribed_text = " ".join(chunk["text"] for chunk in transcription_result["chunks"])
        stored_transcript = transcribed_text

        return transcribed_text, summarize_text(transcribed_text)

    except Exception as e:
        return f"Error: {str(e)}", ""
    finally:
        # Clean up the temporary audio file
        if os.path.exists(audio_path):
            os.remove(audio_path)

def transcribe_from_audio(audio_file):
    global stored_transcript
    if audio_file is None:
        return "Error: No audio recorded.", ""

    try:
        transcription_result = asr(audio_file, return_timestamps=True)
        transcribed_text = " ".join(chunk["text"] for chunk in transcription_result["chunks"])
        stored_transcript = transcribed_text

        return transcribed_text, summarize_text(transcribed_text)

    except Exception as e:
        return f"Error: {str(e)}", ""

def answer_question(question):
    global stored_transcript
    if not stored_transcript:
        return "Please transcribe a video or record audio first."
    result = qa_pipeline(question=question, context=stored_transcript)
    return result["answer"]

# UI: three tabs (video upload, speech recording, Q&A) with a dark custom theme
with gr.Blocks(css="""
body { background-color: black !important; }
.gradio-container { color: #FFFF33 !important; }
button { background-color: #FFFF33 !important; color: black !important; border: none !important; }
input, textarea, .gr-textbox, .gr-video, .gr-audio { background-color: #111 !important; color: #FFFF33 !important; border-color: #FFFF33 !important; }
""") as iface:
    gr.HTML("<h1 style='color:#FFFF33'>🎀 Video & Voice Transcriber, Summarizer & Q&A</h1>")
    gr.HTML("<p style='color:#CCCC33'>Upload a video or record speech to get transcript, summary, and ask questions.</p>")

    with gr.Tab("🎥 Video Upload"):
        video_input = gr.Video(label="Upload Video (.mp4)", interactive=True)
        transcribe_btn = gr.Button("🚀 Transcribe from Video")
        transcribed_text_v = gr.Textbox(label="Transcribed Text", lines=8, interactive=False)
        summarized_text_v = gr.Textbox(label="Summarized Text", lines=8, interactive=False)

        transcribe_btn.click(fn=transcribe_from_video, inputs=video_input, outputs=[transcribed_text_v, summarized_text_v])

    with gr.Tab("🎙️ Record Speech"):
        audio_input = gr.Audio(type="filepath", label="Record Audio")
        record_btn = gr.Button("🎧 Transcribe from Audio")
        transcribed_text_a = gr.Textbox(label="Transcribed Text", lines=8, interactive=False)
        summarized_text_a = gr.Textbox(label="Summarized Text", lines=8, interactive=False)

        record_btn.click(fn=transcribe_from_audio, inputs=audio_input, outputs=[transcribed_text_a, summarized_text_a])

    with gr.Tab("❓ Ask Questions"):
        question_input = gr.Textbox(label="Ask a question about the transcript", placeholder="E.g., What was the main topic?")
        ask_btn = gr.Button("🔍 Get Answer")
        answer_output = gr.Textbox(label="Answer", interactive=False)

        ask_btn.click(fn=answer_question, inputs=question_input, outputs=answer_output)

# Launch
port = int(os.environ.get('PORT1', 7860))
iface.launch(share=True, server_port=port)