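# voiceGUARD Gradio demo: classifies an uploaded audio clip as a real human
# voice or the output of one of several neural vocoders, using a fine-tuned
# Wav2Vec2 sequence-classification model. (Summary comment added for context.)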
import gradio as gr
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import torchaudio

# Load the pretrained model and processor
model_name = "Mrkomiljon/voiceGUARD"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model.eval()

# Select the compute device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Class-index to label mapping
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavnet",
    6: "wavernn"
}

# Prediction function
def predict_audio(file_path):
    target_sample_rate = 16000
    max_length = target_sample_rate * 10  # cap input at 10 seconds of audio

    try:
        # Load the audio file
        waveform, sample_rate = torchaudio.load(file_path)

        # Resample if the sample rate does not match the model's expected rate
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = resampler(waveform)

        # Truncate or zero-pad to the fixed 10-second length
        if waveform.size(1) > max_length:
            waveform = waveform[:, :max_length]
        elif waveform.size(1) < max_length:
            waveform = torch.nn.functional.pad(waveform, (0, max_length - waveform.size(1)))

        # Keep only the first channel if the audio is multi-channel
        if waveform.ndim > 1:
            waveform = waveform[0]

        # Preprocess input
        inputs = processor(
            waveform.numpy(),
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True
        )
        input_values = inputs["input_values"].to(device)

        # Inference
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()

        class_name = id2label.get(predicted_label, "Unknown Class")

        # Return the predicted class and confidence as separate values
        return class_name, float(confidence)

    except Exception as e:
        # On failure, return the error message in place of a prediction
        return "Error", str(e)

# Gradio interface
iface = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(type="filepath"),  # hand the upload to predict_audio as a file path
    outputs=[
        gr.Label(label="Predicted Class"),
        gr.Label(label="Confidence")
    ],
    title="Human or AI-generated voice classification",
    description="Upload an audio file to classify it into one of the predefined categories."
)

if __name__ == "__main__":
    iface.launch()
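
# A minimal sketch of direct, non-UI usage (assumed, not part of the app):
# calling predict_audio on a local file; "sample.wav" is a hypothetical path.
#
#   label, confidence = predict_audio("sample.wav")
#   print(f"{label}: {confidence:.2%}")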