"""Speaker-verification service exposing a FastAPI endpoint and a Gradio UI."""

import io
import os

import gradio as gr
import numpy as np
import torch
import torchaudio
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.responses import JSONResponse
from speechbrain.inference import SpeakerRecognition

# Sample rate that every waveform is converted to before verification.
# It matches the default of ``get_similarity``.
MODEL_SAMPLE_RATE = 16000

# Initialize the speaker verification model (loaded once, at import time).
speaker_verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model"
)


def _to_model_input(samples, sample_rate, channels_first):
    """Convert raw audio samples to a mono float32 array at MODEL_SAMPLE_RATE.

    Args:
        samples: 1-D array of samples, or a 2-D array holding several channels.
        sample_rate: Sampling rate of ``samples`` in Hz.
        channels_first: True when a 2-D ``samples`` is laid out as
            (channels, samples), as returned by ``torchaudio.load``; False
            when it is (samples, channels), as delivered by ``gr.Audio``.

    Returns:
        A 1-D ``np.float32`` array sampled at ``MODEL_SAMPLE_RATE``.
    """
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: scale into [-1, 1] so it matches float-decoded audio.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim == 2:
        # Downmix to mono; otherwise each channel would be scored as a
        # separate batch item by the verifier.
        samples = samples.mean(axis=0 if channels_first else 1)

    if sample_rate != MODEL_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(
            torch.tensor(samples), sample_rate, MODEL_SAMPLE_RATE
        )
        samples = waveform.numpy()
    return samples


def _coerce_audio(audio):
    """Normalize one ``compare_voices`` input, or return None if unsupported.

    A bare ``np.ndarray`` is passed through unchanged (the caller has already
    prepared it). A ``(sample_rate, ndarray)`` pair, which is what
    ``gr.Audio(type="numpy")`` produces, is converted with ``_to_model_input``.
    """
    if isinstance(audio, np.ndarray):
        return audio
    if (
        isinstance(audio, (tuple, list))
        and len(audio) == 2
        and isinstance(audio[1], np.ndarray)
    ):
        sample_rate, samples = audio
        return _to_model_input(samples, int(sample_rate), channels_first=False)
    return None


def get_similarity(audio1, audio2, sample_rate=16000):
    """Score how likely two waveforms come from the same speaker.

    Args:
        audio1: First waveform (array-like), 1-D or already shaped (1, N).
        audio2: Second waveform, same conventions as ``audio1``.
        sample_rate: Sampling rate of both inputs in Hz. Inputs at a rate
            other than ``MODEL_SAMPLE_RATE`` are resampled first.

    Returns:
        ``(score, "Yes" | "No")`` on success, or ``(None, error_message)``
        if anything fails while preparing or verifying the signals.
    """
    try:
        # Cast to float32 so integer or float64 inputs do not reach the model.
        signal1 = torch.as_tensor(audio1, dtype=torch.float32)
        signal2 = torch.as_tensor(audio2, dtype=torch.float32)

        # Make sure the signals are 2-D tensors of shape (1, N).
        if signal1.ndimension() == 1:
            signal1 = signal1.unsqueeze(0)
        if signal2.ndimension() == 1:
            signal2 = signal2.unsqueeze(0)

        if sample_rate != MODEL_SAMPLE_RATE:
            signal1 = torchaudio.functional.resample(
                signal1, sample_rate, MODEL_SAMPLE_RATE
            )
            signal2 = torchaudio.functional.resample(
                signal2, sample_rate, MODEL_SAMPLE_RATE
            )

        # Get similarity score and prediction.
        score, prediction = speaker_verification.verify_batch(signal1, signal2)
        return float(score), "Yes" if prediction else "No"
    except Exception as e:
        # Errors are reported through the return value, not raised.
        return None, str(e)


def compare_voices(file1, file2):
    """Compare two audio inputs and report similarity as a JSON-friendly dict.

    Args:
        file1: Either a numpy array of samples, or a ``(sample_rate, ndarray)``
            pair as produced by ``gr.Audio(type="numpy")``.
        file2: Second input, same conventions as ``file1``.

    Returns:
        ``{"Similarity Score": str, "Same User Prediction": "Yes" | "No"}`` on
        success, or ``{"error": message}`` on invalid input or failure.
    """
    try:
        # Debugging: check the types of inputs.
        print(f"Received file1: {type(file1)}")
        print(f"Received file2: {type(file2)}")

        audio1 = _coerce_audio(file1)
        audio2 = _coerce_audio(file2)
        if audio1 is None or audio2 is None:
            return {"error": "Invalid input format. Both inputs must be numpy arrays."}

        score, is_same_user = get_similarity(audio1, audio2)

        if score is None:
            # On failure the second element carries the error message.
            return {"error": is_same_user}

        return {"Similarity Score": f"{score:.4f}", "Same User Prediction": is_same_user}
    except Exception as e:
        # Handle unexpected errors.
        return {"error": str(e)}


# FastAPI app
app = FastAPI()


@app.post("/compare_voices/")
async def compare_voices_api(file1: UploadFile = File(...), file2: UploadFile = File(...)):
    """
    Compare two audio files and return the similarity score and prediction.

    Both uploads are decoded with torchaudio, downmixed to mono and resampled
    to ``MODEL_SAMPLE_RATE`` before comparison. Any failure while reading or
    decoding is reported as HTTP 400.
    """
    try:
        file1_data = await file1.read()
        file2_data = await file2.read()

        # torchaudio.load returns (Tensor of shape (channels, N), sample_rate).
        waveform1, rate1 = torchaudio.load(io.BytesIO(file1_data))
        waveform2, rate2 = torchaudio.load(io.BytesIO(file2_data))

        audio1 = _to_model_input(waveform1.numpy(), rate1, channels_first=True)
        audio2 = _to_model_input(waveform2.numpy(), rate2, channels_first=True)

        # Compare the two audio files and return the result.
        return compare_voices(audio1, audio2)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e


def gradio_interface():
    """Build the Gradio interface that wraps ``compare_voices``."""
    return gr.Interface(
        fn=compare_voices,
        inputs=[
            # type="numpy" delivers each input as a (sample_rate, ndarray) pair.
            gr.Audio(type="numpy", label="First Audio File"),
            gr.Audio(type="numpy", label="Second Audio File")
        ],
        outputs="json",  # Output results as JSON
        live=False  # Run only on submit, not on every input change
    )


# Launch Gradio as a web interface
@app.on_event("startup")
async def startup():
    """Start the Gradio UI alongside the FastAPI app."""
    # prevent_thread_lock=True keeps launch() from blocking this handler;
    # without it FastAPI startup would never complete.
    # NOTE(review): share=True publishes a public URL -- confirm this is intended.
    gradio_interface().launch(share=True, inline=True, prevent_thread_lock=True)


# Running the FastAPI app with Gradio
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=5000)