import io

import gradio as gr
import numpy as np
import torch
import torchaudio
from fastapi import FastAPI, File, HTTPException, UploadFile
from speechbrain.inference import SpeakerRecognition

# Initialize the speaker verification model
speaker_verification = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmp_model"
)
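
# Note: this VoxCeleb-trained ECAPA model expects 16 kHz mono input; audio at
# other sample rates should be resampled by the caller (not enforced here).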

# Compute a similarity score and same-speaker prediction for two waveforms
def get_similarity(audio1, audio2):
    try:
        # Convert inputs to float32 tensors (SpeechBrain expects float waveforms)
        signal1 = torch.as_tensor(audio1, dtype=torch.float32)
        signal2 = torch.as_tensor(audio2, dtype=torch.float32)

        # verify_batch expects 2D tensors of shape (batch, time)
        if signal1.dim() == 1:
            signal1 = signal1.unsqueeze(0)
        if signal2.dim() == 1:
            signal2 = signal2.unsqueeze(0)

        # Get similarity score and same-speaker prediction (returned as tensors)
        score, prediction = speaker_verification.verify_batch(signal1, signal2)
        return float(score), "Yes" if bool(prediction) else "No"
    except Exception as e:
        return None, str(e)  # On failure, report the error via the second slot

# Shared comparison logic used by both the API endpoint and the Gradio UI
def compare_voices(file1, file2):
    try:
        # Gradio's numpy Audio component yields (sample_rate, np.ndarray) tuples,
        # while the API path passes bare numpy arrays. Accept both forms.
        if isinstance(file1, tuple):
            _, file1 = file1
        if isinstance(file2, tuple):
            _, file2 = file2

        if isinstance(file1, np.ndarray) and isinstance(file2, np.ndarray):
            audio1, audio2 = file1, file2
        else:
            return {"error": "Invalid input format. Both inputs must be numpy arrays."}

        # Get similarity score
        score, is_same_user = get_similarity(audio1, audio2)

        if score is None:
            # Return the error message if processing fails
            return {"error": is_same_user}

        # Return a dictionary with the similarity score and prediction
        return {"Similarity Score": f"{score:.4f}", "Same User Prediction": is_same_user}

    except Exception as e:
        # Handle unexpected errors
        return {"error": str(e)}

# FastAPI app
app = FastAPI()

@app.post("/compare_voices/")
async def compare_voices_api(file1: UploadFile = File(...), file2: UploadFile = File(...)):
    """
    Compare two audio files and return the similarity score and prediction.
    """
    try:
        # Read the raw bytes of both uploads
        file1_data = await file1.read()
        file2_data = await file2.read()

        # Decode the bytes into waveforms with torchaudio, then downmix to mono
        # so each signal is a 1D array
        audio1, _ = torchaudio.load(io.BytesIO(file1_data))  # (channels, time)
        audio2, _ = torchaudio.load(io.BytesIO(file2_data))

        audio1 = audio1.mean(dim=0).numpy()
        audio2 = audio2.mean(dim=0).numpy()

        # Compare the two audio files and return the result
        return compare_voices(audio1, audio2)

    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))

# Build the Gradio interface (numpy audio inputs match compare_voices above)
def gradio_interface():
    return gr.Interface(
        fn=compare_voices,
        inputs=[
            gr.Audio(type="numpy", label="First Audio File"),
            gr.Audio(type="numpy", label="Second Audio File"),
        ],
        outputs="json",  # Render results as JSON
        live=False,      # Compute only on submit, not continuously
    )

# Launch the Gradio UI alongside the API when FastAPI starts
@app.on_event("startup")
async def startup():
    gradio_interface().launch(share=True, inline=True)

# Running the FastAPI app with Gradio
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=5000)
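
# A minimal client sketch for exercising the endpoint locally. This assumes the
# server above is running on port 5000; "a.wav" and "b.wav" are hypothetical
# sample files, and the requests library is an extra dependency.
#
#   import requests
#
#   with open("a.wav", "rb") as f1, open("b.wav", "rb") as f2:
#       resp = requests.post(
#           "http://localhost:5000/compare_voices/",
#           files={"file1": f1, "file2": f2},
#       )
#   print(resp.json())  # {"Similarity Score": ..., "Same User Prediction": "Yes"/"No"}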