Update app.py
Updates for o4, tts via gpt-4o
app.py CHANGED
@@ -496,7 +496,7 @@ class DialogueItem(BaseModel):
 class Dialogue(BaseModel):
     scratchpad: str
     dialogue: List[DialogueItem]
-
+'''
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
     client = OpenAI(
         api_key=api_key or os.getenv("OPENAI_API_KEY"),
@@ -511,6 +511,25 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
             for chunk in response.iter_bytes():
                 file.write(chunk)
             return file.getvalue()
+'''
+def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
+            speaker_instructions: str = 'Speak in an emotive and friendly tone.') -> bytes:
+    client = OpenAI(
+        api_key=api_key or os.getenv("OPENAI_API_KEY"),
+    )
+
+
+    with client.audio.speech.with_streaming_response.create(
+        model=audio_model,
+        voice=voice,
+        input=text,
+        instructions=speaker_instructions,
+    ) as response:
+        with io.BytesIO() as file:
+            for chunk in response.iter_bytes():
+                file.write(chunk)
+            return file.getvalue()
+
 
 
 from functools import wraps
@@ -531,10 +550,12 @@ def conditional_llm(model, api_base=None, api_key=None):
 def generate_audio(
     files: list,
     openai_api_key: str = None,
-    text_model: str = "o1-2024-12-17", #"o1-preview-2024-09-12",
+    text_model: str = "o4-mini", #"o1-2024-12-17", #"o1-preview-2024-09-12",
     audio_model: str = "tts-1",
     speaker_1_voice: str = "alloy",
     speaker_2_voice: str = "echo",
+    speaker_1_instructions: str = '',
+    speaker_2_instructions: str = '',
     api_base: str = None,
     intro_instructions: str = '',
     text_instructions: str = '',
@@ -578,8 +599,6 @@ def generate_audio(
         with file_path.open("r", encoding="utf-8") as f:
             text = f.read()
             combined_text += text + "\n\n"
-
-
     # Configure the LLM based on selected model and api_base
     @retry(retry=retry_if_exception_type(ValidationError))
     @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
@@ -642,7 +661,8 @@ def generate_audio(
         for line in llm_output.dialogue:
             transcript_line = f"{line.speaker}: {line.text}"
             voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
-            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key)
+            speaker_instructions = speaker_1_instructions if line.speaker == "speaker-1" else speaker_2_instructions
+            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key, speaker_instructions)
             futures.append((future, transcript_line))
             characters += len(line.text)
 
@@ -675,7 +695,7 @@ def generate_audio(
 def validate_and_generate_audio(*args):
     files = args[0]
     if not files:
-        return None, None, None, "Please upload at least one PDF file before generating audio."
+        return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
     try:
         audio_file, transcript, original_text = generate_audio(*args)
         return audio_file, transcript, original_text, None  # Return None as the error when successful
@@ -741,7 +761,6 @@ with gr.Blocks(title="PDF to Audio", css="""
 
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
-            #files = gr.Files(label="PDFs", file_types=["pdf"], )
             files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
 
             openai_api_key = gr.Textbox(
@@ -753,7 +772,7 @@ with gr.Blocks(title="PDF to Audio", css="""
             text_model = gr.Dropdown(
                 label="Text Generation Model",
                 choices=STANDARD_TEXT_MODELS,
-                value="o1-preview-2024-09-12", #"gpt-4o-mini",
+                value="o4-mini", #"o3-mini", #"o1-preview-2024-09-12", #"gpt-4o-mini",
                 info="Select the model to generate the dialogue text.",
             )
             audio_model = gr.Dropdown(
@@ -774,6 +793,20 @@ with gr.Blocks(title="PDF to Audio", css="""
                 value="echo",
                 info="Select the voice for Speaker 2.",
             )
+            speaker_1_instructions = gr.Textbox(
+                label="Speaker 1 instructions",
+                value="Speak in an emotive and friendly tone.",
+                info="Speaker 1 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
+
+            speaker_2_instructions = gr.Textbox(
+                label="Speaker 2 instructions",
+                value="Speak in a friendly, but serious tone.",
+                info="Speaker 2 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
+
             api_base = gr.Textbox(
                 label="Custom API Base",
                 placeholder="Enter custom API base URL if using a custom/local model...",
@@ -852,7 +885,8 @@ with gr.Blocks(title="PDF to Audio", css="""
         fn=validate_and_generate_audio,
         inputs=[
             files, openai_api_key, text_model, audio_model,
-            speaker_1_voice, speaker_2_voice,
+            speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+            api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             edited_transcript,  # placeholder for edited_transcript
@@ -881,7 +915,8 @@ with gr.Blocks(title="PDF to Audio", css="""
         inputs=[
             use_edited_transcript, edited_transcript,
             files, openai_api_key, text_model, audio_model,
-            speaker_1_voice, speaker_2_voice,
+            speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+            api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             user_feedback, original_text_output
@@ -908,7 +943,7 @@ with gr.Blocks(title="PDF to Audio", css="""
 #demo.queue(max_size=20, default_concurrency_limit=32)
 
 # Launch the Gradio app
-if __name__ == "__main__":
-    demo.launch(share=True)
+#if __name__ == "__main__":
+#    demo.launch(share=True)
 
-
+demo.launch()
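For reference, here is a minimal usage sketch of the instructions-aware TTS call this commit wires in. It assumes a recent OpenAI Python SDK (one where audio.speech accepts an instructions parameter), an OPENAI_API_KEY in the environment, and the gpt-4o-mini-tts audio model; tts-1 and tts-1-hd ignore instructions. The voice and instruction strings mirror the defaults in the diff, and the output path is illustrative only.

import io

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Stream the synthesized speech and collect it into MP3 bytes, as get_mp3 does above.
with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",  # the instructions parameter is honored by this model
    voice="alloy",            # speaker 1 default from the diff
    input="Welcome to the show!",
    instructions="Speak in an emotive and friendly tone.",
) as response:
    with io.BytesIO() as buf:
        for chunk in response.iter_bytes():
            buf.write(chunk)
        mp3_bytes = buf.getvalue()

with open("sample.mp3", "wb") as f:  # illustrative output path
    f.write(mp3_bytes)

Routing a different instructions string per speaker, as generate_audio now does when submitting each get_mp3 call, is what lets the two podcast voices carry distinct deliveries from the same audio model.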