import gradio as gr from transformers import pipeline import numpy as np import pandas as pd import re from pydub import AudioSegment from pydub.generators import Sine import io MODEL_NAME = "openai/whisper-large-v3" BATCH_SIZE = 8 # device = 0 if torch.cuda.is_available() else "cpu" pipe = pipeline( task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, # device=device, ) arabic_bad_Words = pd.read_csv("arabic_bad_words_dataset.csv") english_bad_Words = pd.read_csv("english_bad_words_dataset.csv") def clean_text(text): # Use regex to remove special characters, punctuation, and spaces around words cleaned_text = re.sub(r'^[\s\W_]+|[\s\W_]+$', '', text) return cleaned_text def classifier(word_list_with_timestamp, language): if language == "English": list_to_search = set(english_bad_Words["words"]) else: list_to_search = set(english_bad_Words["words"]) foul_words = [] negative_timestamps = [] for item in word_list_with_timestamp: word = clean_text(item['text']) if word in list_to_search: if word not in foul_words: foul_words.append(word) negative_timestamps.append(item['timestamp']) return [foul_words, negative_timestamps] def generate_bleep(duration_ms, frequency=1000): sine_wave = Sine(frequency) bleep = sine_wave.to_audio_segment(duration=duration_ms) return bleep def mute_audio_range(audio_filepath, ranges, bleep_frequency=800): audio = AudioSegment.from_file(audio_filepath) for range in ranges: start_time = range[0] - 0.1 end_time = range[-1] + 0.1 start_ms = start_time * 1000 # pydub works with milliseconds end_ms = end_time * 1000 duration_ms = end_ms - start_ms # Generate the bleep sound bleep_sound = generate_bleep(duration_ms, bleep_frequency) # Combine the original audio with the bleep sound audio = audio[:start_ms] + bleep_sound + audio[end_ms:] return audio def format_output_to_list(data): formatted_list = "\n".join([f"{item['timestamp'][0]}s - {item['timestamp'][1]}s \t : {item['text']}" for item in data]) return formatted_list def transcribe(input_audio, audio_language, task, timestamp_type): if input_audio is None: raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.") if timestamp_type == "sentence": timestamp_type = True else: timestamp_type = "word" output = pipe(input_audio, batch_size=BATCH_SIZE, return_timestamps=timestamp_type, generate_kwargs={"task": task}) text = output['text'] timestamps = format_output_to_list(output['chunks']) foul_words_list, negative_timestamps = classifier(output['chunks'], audio_language) foul_words_list = ", ".join(foul_words_list) audio_output = mute_audio_range(input_audio, negative_timestamps) output_path = "output_audio.wav" audio_output.export(output_path, format="wav") return [text, timestamps, foul_words_list, output_path] examples = [ ["arabic_english_audios/audios/arabic_audio_1.wav", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_audio_2.wav", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_audio_3.wav", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_audio_4.wav", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_hate_audio_1.mp3", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_hate_audio_2.mp3", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/arabic_hate_audio_3.mp3", 'Arabic', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_1.wav", 'English', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_2.mp3", 'English', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_3.mp3", 'English', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_4.mp3", 'English', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_5.mp3", 'English', 'transcribe', 'word'], ["arabic_english_audios/audios/english_audio_6.wav", 'English', 'transcribe', 'word'] ] with gr.Blocks(theme=gr.themes.Default()) as demo: gr.HTML("