Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
import gradio as gr
|
2 |
import whisper
|
|
|
3 |
from main import conversation_with_voice
|
4 |
|
5 |
# Load Whisper model
|
@@ -7,19 +8,17 @@ model = whisper.load_model("base")
|
|
7 |
|
8 |
# Description displayed at the top of the UI
|
9 |
description = """
|
10 |
-
Proof Of Concept
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
-
|
16 |
-
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
|
17 |
-
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
|
18 |
-
|
19 |
-
Example questions you can ask:
|
20 |
-
- What are the healthiest oils to cook with?
|
21 |
-
- How much water should I drink daily?
|
22 |
-
- What are good snacks for weight loss?
|
23 |
|
24 |
Created by Kara Granados
|
25 |
"""
|
@@ -27,9 +26,12 @@ Created by Kara Granados
|
|
27 |
def voice_to_voice(audio_file):
    """Run the full voice pipeline: speech -> text -> LLM -> synthesized speech.

    Args:
        audio_file: Path to the clip recorded by the Gradio microphone
            component, or None when nothing was recorded.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    # Guard clause: Gradio hands us None when the user submitted no audio.
    if audio_file is None:
        return "No audio received", None
    transcription = model.transcribe(audio_file)
    question = transcription["text"]
    reply = conversation_with_voice(question)
    # The backend signals failure via an "error" key rather than raising.
    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
|
@@ -37,7 +39,10 @@ def voice_to_voice(audio_file):
|
|
37 |
def text_to_voice(text_input):
    """Answer a typed question and synthesize the reply as speech.

    Args:
        text_input: The user's question as plain text.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    # Reject blank / whitespace-only submissions up front.
    if not text_input.strip():
        return "Please enter a question.", None
    reply = conversation_with_voice(text_input)
    # The backend signals failure via an "error" key rather than raising.
    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
|
@@ -60,4 +65,11 @@ with gr.Blocks(title="Sesame AI POC") as demo:
|
|
60 |
text_button = gr.Button("Submit Text")
|
61 |
text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
demo.launch()
|
|
|
|
1 |
+
import gradio as gr
|
2 |
import whisper
|
3 |
+
import time
|
4 |
from main import conversation_with_voice
|
5 |
|
6 |
# Load Whisper model
|
|
|
8 |
|
9 |
# Description displayed at the top of the UI
|
10 |
description = """
|
11 |
+
Proof Of Concept
|
12 |
+
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
|
13 |
+
**Why each tool was added:**
|
14 |
+
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
|
15 |
+
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
|
16 |
+
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
|
17 |
|
18 |
+
**Example questions you can ask:**
|
19 |
+
- What are the healthiest oils to cook with?
|
20 |
+
- How much water should I drink daily?
|
21 |
+
- What are good snacks for weight loss?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
Created by Kara Granados
|
24 |
"""
|
|
|
26 |
def voice_to_voice(audio_file):
    """Transcribe a recorded question and return the AI's spoken answer.

    Pipeline: Whisper transcribes the clip, the text goes to
    conversation_with_voice(), and the total latency is logged.

    Args:
        audio_file: Path to the clip recorded by the Gradio microphone
            component, or None when nothing was recorded.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    if audio_file is None:
        return "No audio received", None
    # perf_counter() is monotonic, so the elapsed-time measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = model.transcribe(audio_file)
    user_input = result["text"]
    response = conversation_with_voice(user_input)
    elapsed = time.perf_counter() - start_time
    print(f"Total processing time (voice input): {elapsed:.2f} seconds")
    # The backend signals failure via an "error" key rather than raising.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
|
|
|
39 |
def text_to_voice(text_input):
    """Answer a typed question, synthesize the reply, and log the latency.

    Args:
        text_input: The user's question as plain text.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    if not text_input.strip():
        return "Please enter a question.", None
    # perf_counter() is monotonic, so the elapsed-time measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = conversation_with_voice(text_input)
    elapsed = time.perf_counter() - start_time
    print(f"Total processing time (text input): {elapsed:.2f} seconds")
    # The backend signals failure via an "error" key rather than raising.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
|
|
|
65 |
text_button = gr.Button("Submit Text")
|
66 |
text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
|
67 |
|
68 |
+
gr.Markdown("""
|
69 |
+
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
|
70 |
+
|
71 |
+
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
|
72 |
+
""")
|
73 |
+
|
74 |
demo.launch()
|
75 |
+
|