Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
-
import gradio as gr
|
2 |
import whisper
|
|
|
3 |
from main import conversation_with_voice
|
4 |
|
5 |
# Load Whisper model
|
@@ -7,19 +8,17 @@ model = whisper.load_model("base")
|
|
7 |
|
8 |
# Description displayed at the top of the UI
|
9 |
description = """
|
10 |
-
Proof Of Concept
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
-
|
16 |
-
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
|
17 |
-
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
|
18 |
-
|
19 |
-
Example questions you can ask:
|
20 |
-
- What are the healthiest oils to cook with?
|
21 |
-
- How much water should I drink daily?
|
22 |
-
- What are good snacks for weight loss?
|
23 |
|
24 |
Created by Kara Granados
|
25 |
"""
|
@@ -27,9 +26,12 @@ Created by Kara Granados
|
|
27 |
def voice_to_voice(audio_file):
    """Run the full voice pipeline: speech -> text -> LLM -> synthesized speech.

    Args:
        audio_file: Path to the clip recorded by the Gradio microphone
            component, or None when nothing was recorded.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    # Guard clause: Gradio hands us None when the user submitted no audio.
    if audio_file is None:
        return "No audio received", None
    transcription = model.transcribe(audio_file)
    question = transcription["text"]
    reply = conversation_with_voice(question)
    # The backend signals failure via an "error" key rather than raising.
    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
|
@@ -37,7 +39,10 @@ def voice_to_voice(audio_file):
|
|
37 |
def text_to_voice(text_input):
    """Answer a typed question and synthesize the reply as speech.

    Args:
        text_input: The user's question as plain text.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    # Reject blank / whitespace-only submissions up front.
    if not text_input.strip():
        return "Please enter a question.", None
    reply = conversation_with_voice(text_input)
    # The backend signals failure via an "error" key rather than raising.
    if "error" in reply:
        return reply.get("error"), None
    return reply["text_response"], reply["audio_path"]
|
@@ -60,4 +65,11 @@ with gr.Blocks(title="Sesame AI POC") as demo:
|
|
60 |
text_button = gr.Button("Submit Text")
|
61 |
text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
demo.launch()
|
|
|
|
1 |
+
import gradio as gr
|
2 |
import whisper
|
3 |
+
import time
|
4 |
from main import conversation_with_voice
|
5 |
|
6 |
# Load Whisper model
|
|
|
8 |
|
9 |
# Description displayed at the top of the UI
|
10 |
description = """
|
11 |
+
Proof Of Concept
|
12 |
+
This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
|
13 |
+
**Why each tool was added:**
|
14 |
+
- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
|
15 |
+
- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
|
16 |
+
- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
|
17 |
|
18 |
+
**Example questions you can ask:**
|
19 |
+
- What are the healthiest oils to cook with?
|
20 |
+
- How much water should I drink daily?
|
21 |
+
- What are good snacks for weight loss?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
Created by Kara Granados
|
24 |
"""
|
|
|
26 |
def voice_to_voice(audio_file):
    """Transcribe a recorded question and return the AI's spoken answer.

    Pipeline: Whisper transcribes the clip, the text goes to
    conversation_with_voice(), and the total latency is logged.

    Args:
        audio_file: Path to the clip recorded by the Gradio microphone
            component, or None when nothing was recorded.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    if audio_file is None:
        return "No audio received", None
    # perf_counter() is monotonic, so the elapsed-time measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    result = model.transcribe(audio_file)
    user_input = result["text"]
    response = conversation_with_voice(user_input)
    elapsed = time.perf_counter() - start_time
    print(f"Total processing time (voice input): {elapsed:.2f} seconds")
    # The backend signals failure via an "error" key rather than raising.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
|
|
|
39 |
def text_to_voice(text_input):
    """Answer a typed question, synthesize the reply, and log the latency.

    Args:
        text_input: The user's question as plain text.

    Returns:
        A (text, audio_path) pair — the AI's text reply and the path of the
        generated speech file — or an error message and None.
    """
    if not text_input.strip():
        return "Please enter a question.", None
    # perf_counter() is monotonic, so the elapsed-time measurement cannot be
    # skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = conversation_with_voice(text_input)
    elapsed = time.perf_counter() - start_time
    print(f"Total processing time (text input): {elapsed:.2f} seconds")
    # The backend signals failure via an "error" key rather than raising.
    if "error" in response:
        return response.get("error"), None
    return response["text_response"], response["audio_path"]
|
|
|
65 |
text_button = gr.Button("Submit Text")
|
66 |
text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
|
67 |
|
68 |
+
gr.Markdown("""
|
69 |
+
**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
|
70 |
+
|
71 |
+
**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
|
72 |
+
""")
|
73 |
+
|
74 |
demo.launch()
|
75 |
+
|