File size: 1,711 Bytes
f5790af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import wget
import json
import os

STT_FOLDER = "./STTModel"
STT_MODEL_NAME = "wav2vec2"
STT_MODEL_WEIGHTS = "pytorch_model.bin"
STT_CONFIG = "config.json"
STT_VOCAB = "vocab.json"
STT_MODEL_WEIGHTS_URL = "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/pytorch_model.bin"
STT_CONFIG_URL = "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/config.json"
STT_VOCAB_URL = "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
STT_FILES_URLS = [
    (STT_MODEL_WEIGHTS_URL, STT_MODEL_WEIGHTS),
    (STT_CONFIG_URL, STT_CONFIG),
    (STT_VOCAB_URL, STT_VOCAB),
]

def ensure_stt_files_exist():
    os.makedirs(STT_FOLDER, exist_ok=True)
    for url, filename in STT_FILES_URLS:
        filepath = os.path.join(STT_FOLDER, filename)
        if not os.path.exists(filepath):
            wget.download(url, out=filepath)

class Wav2Vec2ForCTC(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=5, stride=2, padding=2)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=2, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc = nn.Linear(32 * 39 * 40, num_classes) # Adjusted input size

    def forward(self, x):
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = x.view(x.size(0), -1)
        logits = self.fc(x)
        return logits