Update README.md
Browse files
README.md
CHANGED
@@ -20,7 +20,7 @@ The main difference with classical ASV embeddings is that here only the non-timb
|
|
20 |
|
21 |
The model has been derived from the self-supervised pretrained model [WavLM-large](https://huggingface.co/microsoft/wavlm-large).
|
22 |
|
23 |
-
See the section below for an explanation of how to compute the non-timbral embeddings.
|
24 |
|
25 |
# Publication
|
26 |
Details about the method used to build this model have been published at Interspeech 2024 in the paper entitled
|
@@ -43,7 +43,43 @@ Gengembre, N., Le Blouch, O., Gendrot, C. (2024) Disentangling prosody and timbr
|
|
43 |
```
|
44 |
|
45 |
# Usage
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
# Limitations
|
49 |
The fine-tuning data used to produce this model (VoxCeleb, VCTK) are mostly in English, which may affect the performance on other languages.
|
|
|
20 |
|
21 |
The model has been derived from the self-supervised pretrained model [WavLM-large](https://huggingface.co/microsoft/wavlm-large).
|
22 |
|
23 |
+
See the [Usage section](https://huggingface.co/ggmbr/wnt#usage) below for an explanation of how to compute the non-timbral embeddings.
|
24 |
|
25 |
# Publication
|
26 |
Details about the method used to build this model have been published at Interspeech 2024 in the paper entitled
|
|
|
43 |
```
|
44 |
|
45 |
# Usage
|
46 |
+
```
|
47 |
+
import torch
|
48 |
+
import torch.nn as nn
|
49 |
+
from transformers.models.wavlm.modeling_wavlm import WavLMPreTrainedModel, WavLMModel
|
50 |
+
|
51 |
+
class TopLayers(nn.Module):
    """Projection head that turns a statistics vector into a unit-norm embedding.

    The head stacks two kernel-size-1 convolutions. Each one is followed by a
    ReLU and then a non-affine batch normalization. The first position along
    the last axis of the result is L2-normalized and returned.

    Args:
        embd_size: Number of output channels, i.e. the embedding dimension.
        top_interm_size: Number of channels of the intermediate layer.
        in_size: Number of input channels. Defaults to 2048, the value that
            was previously hard-coded.
    """

    def __init__(self, embd_size=250, top_interm_size=512, in_size=2048):
        super().__init__()
        self.affine1 = nn.Conv1d(in_channels=in_size, out_channels=top_interm_size, kernel_size=1)
        self.batchnorm1 = nn.BatchNorm1d(num_features=top_interm_size, affine=False, eps=1e-03)
        self.affine2 = nn.Conv1d(in_channels=top_interm_size, out_channels=embd_size, kernel_size=1)
        self.batchnorm2 = nn.BatchNorm1d(num_features=embd_size, affine=False, eps=1e-03)
        self.activation = nn.ReLU(inplace=True)

    def forward(self, x):
        """Compute the embedding for ``x``.

        Args:
            x: Tensor with ``in_size`` channels on dimension 1 and at least one
                position on dimension 2 (the layout ``nn.Conv1d`` expects).

        Returns:
            Tensor of shape ``(x.shape[0], embd_size)`` whose rows are
            L2-normalized.
        """
        # The activation is applied before each batch norm, not after it.
        out = self.batchnorm1(self.activation(self.affine1(x)))
        out = self.batchnorm2(self.activation(self.affine2(out)))
        # Only position 0 of the last axis is kept before normalizing.
        return nn.functional.normalize(out[:, :, 0])
|
64 |
+
|
65 |
+
class EmbeddingsModel(WavLMPreTrainedModel):
    """WavLM backbone followed by a ``TopLayers`` projection head.

    Args:
        config: Model configuration. It must also provide ``embd_size`` and
            ``top_interm_size``, which are forwarded to ``TopLayers``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.wavlm = WavLMModel(config)
        self.top_layers = TopLayers(config.embd_size, config.top_interm_size)

    def forward(self, input_values):
        """Compute embeddings from ``input_values``.

        Args:
            input_values: Input tensor; statistics are taken over dimension 1
                (presumably ``(batch, samples)`` raw audio; confirm with the
                caller).

        Returns:
            The output of ``self.top_layers`` applied to the pooled statistics.
        """
        # Mean/variance normalization of each input row along dimension 1.
        row_mean = input_values.mean(dim=1).unsqueeze(1)
        row_std = input_values.std(dim=1).unsqueeze(1)
        normalized = (input_values - row_mean) / row_std

        backbone_out = self.wavlm(input_values=normalized, output_hidden_states=False)
        hidden = backbone_out.last_hidden_state

        # Statistics pooling over dimension 1: the mean is concatenated with
        # the standard deviation. The variance is clamped so the square root
        # never sees a value below 1e-10.
        pooled_mean = hidden.mean(dim=1)
        pooled_var = hidden.var(dim=1).clamp(min=1e-10)
        pooled = torch.cat((pooled_mean, pooled_var.pow(0.5)), dim=1)

        # A trailing axis of length 1 is added before the projection head.
        return self.top_layers(pooled.unsqueeze(dim=2))
|
78 |
+
|
79 |
+
# Load the configuration and pretrained weights from the Hub repository.
# Calling EmbeddingsModel(...) directly would require a config object, not a
# repository id, and would not load any weights.
nt_extractor = EmbeddingsModel.from_pretrained("ggmbr/wnt")
|
80 |
+
```
|
81 |
+
|
82 |
+
# Evaluations
|
83 |
|
84 |
# Limitations
|
85 |
The fine-tuning data used to produce this model (VoxCeleb, VCTK) are mostly in English, which may affect the performance on other languages.
|