karagmercola committed on
Commit 5f13ae1 · verified · 1 Parent(s): f67c9fa

Update app.py

Files changed (1)
app.py +25 -13
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import whisper
+import time
 from main import conversation_with_voice
 
 # Load Whisper model
@@ -7,19 +8,17 @@ model = whisper.load_model("base")
 
 # Description displayed at the top of the UI
 description = """
-Proof Of Concept
-
-This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
-
-Why each tool was added:
-- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
-- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
-- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
-
-Example questions you can ask:
-- What are the healthiest oils to cook with?
-- How much water should I drink daily?
-- What are good snacks for weight loss?
+Proof Of Concept
+This demo allows you to interact with an AI using both voice-to-voice and text-to-speech capabilities.
+**Why each tool was added:**
+- Whisper (OpenAI): Used for converting spoken input to text because Sesame AI currently only supports text-to-speech.
+- LLaMA 3 (AWS): Acts as the brain that generates intelligent responses from your questions.
+- Sesame AI (Hugging Face): Converts the AI's response back to expressive speech.
+
+**Example questions you can ask:**
+- What are the healthiest oils to cook with?
+- How much water should I drink daily?
+- What are good snacks for weight loss?
 
 Created by Kara Granados
 """
@@ -27,9 +26,12 @@ Created by Kara Granados
 def voice_to_voice(audio_file):
     if audio_file is None:
         return "No audio received", None
+    start_time = time.time()
     result = model.transcribe(audio_file)
     user_input = result["text"]
     response = conversation_with_voice(user_input)
+    end_time = time.time()
+    print(f"Total processing time (voice input): {end_time - start_time:.2f} seconds")
     if "error" in response:
         return response.get("error"), None
     return response["text_response"], response["audio_path"]
@@ -37,7 +39,10 @@ def voice_to_voice(audio_file):
 def text_to_voice(text_input):
     if not text_input.strip():
         return "Please enter a question.", None
+    start_time = time.time()
     response = conversation_with_voice(text_input)
+    end_time = time.time()
+    print(f"Total processing time (text input): {end_time - start_time:.2f} seconds")
     if "error" in response:
         return response.get("error"), None
     return response["text_response"], response["audio_path"]
@@ -60,4 +65,11 @@ with gr.Blocks(title="Sesame AI POC") as demo:
     text_button = gr.Button("Submit Text")
     text_button.click(fn=text_to_voice, inputs=text_input, outputs=[text_output_text, text_output_audio])
 
+    gr.Markdown("""
+**NOTE:** This demo is intended for testing purposes. The longer response time is due to using free-tier resources on Hugging Face. In a production environment, dedicated infrastructure will be used to ensure real-time performance.
+
+**Additional Info:** The CSM (Conversational Speech Model) used for voice output is a large model and may take additional time to load and generate audio responses, especially during the first use or after inactivity.
+""")
+
 demo.launch()
+
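Review note: this commit adds identical start_time / end_time bookkeeping to both handlers. If that pattern grows, a decorator could centralize it; the following is a minimal sketch, assuming the handlers remain plain one-argument functions. The timed helper and its label argument are hypothetical names for illustration, not part of this repo.

import time
from functools import wraps

def timed(label):
    # Hypothetical helper: prints the wall-clock duration of the wrapped
    # handler, matching the print format this commit adds inline.
    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = fn(*args, **kwargs)
            print(f"Total processing time ({label}): {time.time() - start_time:.2f} seconds")
            return result
        return wrapper
    return decorator

# Usage sketch mirroring the two handlers in app.py:
# @timed("voice input")
# def voice_to_voice(audio_file): ...
#
# @timed("text input")
# def text_to_voice(text_input): ...

With the decorator applied, the four inline timing lines in voice_to_voice and text_to_voice could be dropped without changing the printed output.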