Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -29,26 +29,27 @@ def load_model():
|
|
29 |
return model, processor
|
30 |
|
31 |
def chunk_audio(audio_file_path, chunk_length=30):
|
32 |
-
"""Split audio into
|
33 |
try:
|
|
|
|
|
|
|
|
|
34 |
target_rate = 16000
|
35 |
chunk_length_ms = chunk_length * 1000
|
36 |
|
37 |
-
# Load and resample
|
38 |
-
audio = AudioSegment.from_file(audio_file_path)
|
39 |
-
audio = audio.set_channels(1).set_frame_rate(target_rate)
|
40 |
-
|
41 |
-
chunks = []
|
42 |
for i in range(0, len(audio), chunk_length_ms):
|
43 |
chunk = audio[i:i + chunk_length_ms]
|
|
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
return chunks
|
51 |
-
|
52 |
except Exception as e:
|
53 |
raise gr.Error(f"Error processing audio file: {str(e)}. Make sure ffmpeg is installed.")
|
54 |
|
@@ -95,7 +96,6 @@ def search_audio(query, embeddings, audios, top_k=5):
|
|
95 |
|
96 |
# Score against all embeddings
|
97 |
scores = processor.score_multi_vector(query_embeddings, embeddings)
|
98 |
-
top_k = min(top_k, scores[0].shape[0])
|
99 |
top_indices = scores[0].topk(top_k).indices.tolist()
|
100 |
|
101 |
# Move model back to CPU
|
@@ -115,10 +115,10 @@ def audio_to_base64(data, rate=16000):
|
|
115 |
def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False, openai_key=None):
|
116 |
"""Main processing function"""
|
117 |
if not audio_file_path:
|
118 |
-
return "Please upload an audio file", None
|
119 |
|
120 |
if not query:
|
121 |
-
return "Please enter a search query", None
|
122 |
|
123 |
try:
|
124 |
# Chunk audio
|
@@ -132,8 +132,7 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
|
|
132 |
|
133 |
# Prepare results
|
134 |
result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
|
135 |
-
result_text += f"Chunk indices: {top_indices}\n"
|
136 |
-
result_text += f"Total chunks in audio: {len(audios)}\n\n"
|
137 |
|
138 |
# Save first result as audio file
|
139 |
first_chunk_path = "result_chunk.wav"
|
@@ -141,7 +140,6 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
|
|
141 |
|
142 |
# Optional: Use OpenAI for answer generation
|
143 |
if use_openai and openai_key:
|
144 |
-
result_text += "Generating textual answer from retrieved audio chunks...\n\n"
|
145 |
from openai import OpenAI
|
146 |
client = OpenAI(api_key=openai_key)
|
147 |
|
@@ -164,14 +162,23 @@ def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False,
|
|
164 |
model="gpt-4o-audio-preview",
|
165 |
messages=[{"role": "user", "content": content}]
|
166 |
)
|
167 |
-
result_text += f"
|
168 |
except Exception as e:
|
169 |
-
result_text += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
-
return result_text, first_chunk_path
|
172 |
|
173 |
except Exception as e:
|
174 |
-
return f"Error: {str(e)}", None
|
175 |
|
176 |
# Create Gradio interface
|
177 |
with gr.Blocks(title="AudioRAG Demo") as demo:
|
|
|
29 |
return model, processor
|
30 |
|
31 |
def chunk_audio(audio_file_path, chunk_length=30):
    """Split an audio file into fixed-length mono 16 kHz chunks.

    Args:
        audio_file_path: Path of the audio file, passed unchanged to
            ``AudioSegment.from_file``.
        chunk_length: Length of each chunk in seconds. Must be positive;
            fractional values are truncated to whole milliseconds.

    Returns:
        A list with one entry per chunk, in file order. Each entry is the
        sample data returned by ``wavfile.read`` for that chunk after it
        has been converted to one channel at 16000 Hz. The final chunk may
        be shorter than ``chunk_length``.

    Raises:
        gr.Error: If ``chunk_length`` does not amount to at least one
            millisecond, or if loading/converting the audio fails.
    """
    target_rate = 16000

    # range() below needs a positive integer step. Validate up front so a bad
    # chunk length is reported as such, instead of surfacing through the
    # generic handler as a misleading "install ffmpeg" message.
    try:
        chunk_length_ms = int(chunk_length * 1000)
    except (TypeError, ValueError) as e:
        raise gr.Error(
            f"Invalid chunk length {chunk_length!r}: expected a number of seconds."
        ) from e
    if chunk_length_ms <= 0:
        raise gr.Error(
            f"Invalid chunk length {chunk_length!r}: must be greater than zero."
        )

    try:
        # audio_file_path is already a string path when type="filepath"
        audio = AudioSegment.from_file(audio_file_path)

        audios = []
        # Slicing an AudioSegment is done in milliseconds.
        for start_ms in range(0, len(audio), chunk_length_ms):
            chunk = audio[start_ms:start_ms + chunk_length_ms]
            chunk = chunk.set_channels(1).set_frame_rate(target_rate)

            # Round-trip through an in-memory WAV so the samples can be read
            # back as an array without touching the filesystem.
            buf = io.BytesIO()
            chunk.export(buf, format="wav")
            buf.seek(0)

            _rate, data = wavfile.read(buf)
            audios.append(data)

        return audios
    except Exception as e:
        # Keep the original exception as the cause so the real failure is
        # still visible in logs.
        raise gr.Error(
            f"Error processing audio file: {str(e)}. Make sure ffmpeg is installed."
        ) from e
|
55 |
|
|
|
96 |
|
97 |
# Score against all embeddings
|
98 |
scores = processor.score_multi_vector(query_embeddings, embeddings)
|
|
|
99 |
top_indices = scores[0].topk(top_k).indices.tolist()
|
100 |
|
101 |
# Move model back to CPU
|
|
|
115 |
def process_audio_rag(audio_file_path, query, chunk_length=30, use_openai=False, openai_key=None):
|
116 |
"""Main processing function"""
|
117 |
if not audio_file_path:
|
118 |
+
return "Please upload an audio file", None, None
|
119 |
|
120 |
if not query:
|
121 |
+
return "Please enter a search query", None, None
|
122 |
|
123 |
try:
|
124 |
# Chunk audio
|
|
|
132 |
|
133 |
# Prepare results
|
134 |
result_text = f"Found {len(top_indices)} relevant audio chunks:\n"
|
135 |
+
result_text += f"Chunk indices: {top_indices}\n\n"
|
|
|
136 |
|
137 |
# Save first result as audio file
|
138 |
first_chunk_path = "result_chunk.wav"
|
|
|
140 |
|
141 |
# Optional: Use OpenAI for answer generation
|
142 |
if use_openai and openai_key:
|
|
|
143 |
from openai import OpenAI
|
144 |
client = OpenAI(api_key=openai_key)
|
145 |
|
|
|
162 |
model="gpt-4o-audio-preview",
|
163 |
messages=[{"role": "user", "content": content}]
|
164 |
)
|
165 |
+
result_text += f"\nOpenAI Answer: {completion.choices[0].message.content}"
|
166 |
except Exception as e:
|
167 |
+
result_text += f"\nOpenAI Error: {str(e)}"
|
168 |
+
|
169 |
+
# Create audio visualization
|
170 |
+
import matplotlib.pyplot as plt
|
171 |
+
fig, ax = plt.subplots(figsize=(10, 4))
|
172 |
+
ax.plot(audios[top_indices[0]])
|
173 |
+
ax.set_title(f"Waveform of top matching chunk (#{top_indices[0]})")
|
174 |
+
ax.set_xlabel("Samples")
|
175 |
+
ax.set_ylabel("Amplitude")
|
176 |
+
plt.tight_layout()
|
177 |
|
178 |
+
return result_text, first_chunk_path, fig
|
179 |
|
180 |
except Exception as e:
|
181 |
+
return f"Error: {str(e)}", None, None
|
182 |
|
183 |
# Create Gradio interface
|
184 |
with gr.Blocks(title="AudioRAG Demo") as demo:
|