Audio-to-Audio
Safetensors
torch
lucadellalib committed on
Commit
5c3331f
·
verified ·
1 Parent(s): 1434d48

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ focalcodec.png filter=lfs diff=lfs merge=lfs -text
LibriTTS960_12_5Hz.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "encoder_name": "WavLM",
3
+ "encoder_config": {
4
+ "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
5
+ "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
6
+ "strides": [5, 2, 2, 2, 2, 2, 2],
7
+ "num_layers": 6,
8
+ "dim": 1024,
9
+ "ffn_dim": 4096,
10
+ "num_heads": 16,
11
+ "num_buckets": 320,
12
+ "max_distance": 800,
13
+ "dropout": 0.0,
14
+ "conv_pos": 128,
15
+ "conv_pos_groups": 16
16
+ },
17
+ "compressor_name": "FocalEncoder",
18
+ "compressor_config": {
19
+ "input_dim": 1024,
20
+ "output_dim": 13,
21
+ "hidden_dims": [1024, 512, 256],
22
+ "downscale_factors": [2, 2, 1],
23
+ "focal_window": 7,
24
+ "focal_level": 2,
25
+ "focal_factor": 2,
26
+ "dropout": 0.0,
27
+ "use_post_norm": false,
28
+ "use_layerscale": false,
29
+ "layerscale_init": 0.0001,
30
+ "normalize_modulator": false
31
+ },
32
+ "quantizer_name": "BinarySphericalQuantizer",
33
+ "quantizer_config": {
34
+ "codebook_size": 8192
35
+ },
36
+ "decompressor_name": "FocalDecoder",
37
+ "decompressor_config": {
38
+ "input_dim": 13,
39
+ "output_dim": 1024,
40
+ "hidden_dims": [256, 512, 1024],
41
+ "upscale_factors": [1, 2, 2],
42
+ "focal_window": 7,
43
+ "focal_level": 2,
44
+ "focal_factor": 2,
45
+ "dropout": 0.0,
46
+ "use_post_norm": false,
47
+ "use_layerscale": false,
48
+ "layerscale_init": 0.0001,
49
+ "normalize_modulator": false
50
+ },
51
+ "decoder_name": "Vocos",
52
+ "decoder_config": {
53
+ "input_channels": 1024,
54
+ "num_layers": 8,
55
+ "dim": 512,
56
+ "ffn_dim": 1536,
57
+ "kernel_size": 7,
58
+ "padding": 3,
59
+ "layerscale_init": null,
60
+ "n_fft": 1024,
61
+ "hop_length": 320
62
+ }
63
+ }
LibriTTS960_12_5Hz.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5362c33ed75801d9bced7e8573f8eece674592ea3c3156451a85f4b924b1c1e5
3
+ size 581137532
README.md CHANGED
@@ -1,3 +1,377 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model:
4
+ - microsoft/wavlm-large
5
+ pipeline_tag: audio-to-audio
6
+ ---
7
+
8
+ # FocalCodec
9
+
10
+ A low-bitrate single-codebook 16 kHz speech codec based on [focal modulation](https://arxiv.org/abs/2203.11926).
11
+
12
+ - **Preprint**: https://arxiv.org/abs/2502.04465
13
+
14
+ - **Project Page**: https://lucadellalib.github.io/focalcodec-web/
15
+
16
+ - **GitHub**: https://github.com/lucadellalib/focalcodec
17
+
18
+ <img src="focalcodec.png" width="700">
19
+
20
+ ---------------------------------------------------------------------------------------------------------
21
+
22
+ ## ▶️ Quickstart
23
+
24
+ See the README at: https://github.com/lucadellalib/focalcodec
25
+
26
+ ---------------------------------------------------------------------------------------------------------
27
+
28
+ ## 📌 Available Checkpoints
29
+
30
+ | Checkpoint | Token Rate (Hz) | Bitrate (kbps) | Dataset |
31
+ |:-----------------------:|:---------------:|:--------------:|:-----------:|
32
+ | **LibriTTS960_50Hz** | 50.0 | 0.65 | LibriTTS960 |
33
+ | **LibriTTS960_25Hz** | 25.0 | 0.33 | LibriTTS960 |
34
+ | **LibriTTS960_12_5Hz** | 12.5 | 0.16 | LibriTTS960 |
35
+
36
+ ---------------------------------------------------------------------------------------------------------
37
+
38
+ ## @ Citing
39
+
40
+ ```bibtex
41
+ @article{dellalibera2025focalcodec,
42
+ title = {{FocalCodec}: Low-Bitrate Speech Coding via Focal Modulation Networks},
43
+ author = {Luca {Della Libera} and Francesco Paissan and Cem Subakan and Mirco Ravanelli},
44
+ journal = {arXiv preprint arXiv:2502.04465},
45
+ year = {2025},
46
+ }
47
+ ```
48
+
49
+ ---------------------------------------------------------------------------------------------------------
50
+
51
+ ## 📧 Contact
52
+
53
+ [luca.dellalib@gmail.com](mailto:luca.dellalib@gmail.com)
54
+
55
+ ---------------------------------------------------------------------------------------------------------
56
+
57
+ # File information
58
+
59
+ The repository contains the following files:
60
+
61
+ Filename: LibriTTS960_25Hz.json
62
+ Content: {
63
+ "encoder_name": "WavLM",
64
+ "encoder_config": {
65
+ "hidden_dims": [
66
+ 512,
67
+ 512,
68
+ 512,
69
+ 512,
70
+ 512,
71
+ 512,
72
+ 512
73
+ ],
74
+ "kernel_sizes": [
75
+ 10,
76
+ 3,
77
+ 3,
78
+ 3,
79
+ 3,
80
+ 2,
81
+ 2
82
+ ],
83
+ "strides": [
84
+ 5,
85
+ 2,
86
+ 2,
87
+ 2,
88
+ 2,
89
+ 2,
90
+ 2
91
+ ],
92
+ "num_layers": 6,
93
+ "dim": 1024,
94
+ "ffn_dim": 4096,
95
+ "num_heads": 16,
96
+ "num_buckets": 320,
97
+ "max_distance": 800,
98
+ "dropout": 0.0,
99
+ "conv_pos": 128,
100
+ "conv_pos_groups": 16
101
+ },
102
+ "compressor_name": "FocalEncoder",
103
+ "compressor_config": {
104
+ "input_dim": 1024,
105
+ "output_dim": 13,
106
+ "hidden_dims": [
107
+ 1024,
108
+ 512,
109
+ 256
110
+ ],
111
+ "downscale_factors": [
112
+ 2,
113
+ 1,
114
+ 1
115
+ ],
116
+ "focal_window": 7,
117
+ "focal_level": 2,
118
+ "focal_factor": 2,
119
+ "dropout": 0.0,
120
+ "use_post_norm": false,
121
+ "use_layerscale": false,
122
+ "layerscale_init": 0.0001,
123
+ "normalize_modulator": false
124
+ },
125
+ "quantizer_name": "BinarySphericalQuantizer",
126
+ "quantizer_config": {
127
+ "codebook_size": 8192
128
+ },
129
+ "decompressor_name": "FocalDecoder",
130
+ "decompressor_config": {
131
+ "input_dim": 13,
132
+ "output_dim": 1024,
133
+ "hidden_dims": [
134
+ 256,
135
+ 512,
136
+ 1024
137
+ ],
138
+ "upscale_factors": [
139
+ 1,
140
+ 1,
141
+ 2
142
+ ],
143
+ "focal_window": 7,
144
+ "focal_level": 2,
145
+ "focal_factor": 2,
146
+ "dropout": 0.0,
147
+ "use_post_norm": false,
148
+ "use_layerscale": false,
149
+ "layerscale_init": 0.0001,
150
+ "normalize_modulator": false
151
+ },
152
+ "decoder_name": "Vocos",
153
+ "decoder_config": {
154
+ "input_channels": 1024,
155
+ "num_layers": 8,
156
+ "dim": 512,
157
+ "ffn_dim": 1536,
158
+ "kernel_size": 7,
159
+ "padding": 3,
160
+ "layerscale_init": null,
161
+ "n_fft": 1024,
162
+ "hop_length": 320
163
+ }
164
+ }
165
+
166
+ Filename: focalcodec.png
167
+ Content: "Content of the file is larger than 50 KB, too long to display."
168
+
169
+ Filename: LibriTTS960_50Hz.json
170
+ Content: {
171
+ "encoder_name": "WavLM",
172
+ "encoder_config": {
173
+ "hidden_dims": [
174
+ 512,
175
+ 512,
176
+ 512,
177
+ 512,
178
+ 512,
179
+ 512,
180
+ 512
181
+ ],
182
+ "kernel_sizes": [
183
+ 10,
184
+ 3,
185
+ 3,
186
+ 3,
187
+ 3,
188
+ 2,
189
+ 2
190
+ ],
191
+ "strides": [
192
+ 5,
193
+ 2,
194
+ 2,
195
+ 2,
196
+ 2,
197
+ 2,
198
+ 2
199
+ ],
200
+ "num_layers": 6,
201
+ "dim": 1024,
202
+ "ffn_dim": 4096,
203
+ "num_heads": 16,
204
+ "num_buckets": 320,
205
+ "max_distance": 800,
206
+ "dropout": 0.0,
207
+ "conv_pos": 128,
208
+ "conv_pos_groups": 16
209
+ },
210
+ "compressor_name": "FocalEncoder",
211
+ "compressor_config": {
212
+ "input_dim": 1024,
213
+ "output_dim": 13,
214
+ "hidden_dims": [
215
+ 1024,
216
+ 512,
217
+ 256
218
+ ],
219
+ "downscale_factors": [
220
+ 1,
221
+ 1,
222
+ 1
223
+ ],
224
+ "focal_window": 7,
225
+ "focal_level": 2,
226
+ "focal_factor": 2,
227
+ "dropout": 0.0,
228
+ "use_post_norm": false,
229
+ "use_layerscale": false,
230
+ "layerscale_init": 0.0001,
231
+ "normalize_modulator": false
232
+ },
233
+ "quantizer_name": "BinarySphericalQuantizer",
234
+ "quantizer_config": {
235
+ "codebook_size": 8192
236
+ },
237
+ "decompressor_name": "FocalDecoder",
238
+ "decompressor_config": {
239
+ "input_dim": 13,
240
+ "output_dim": 1024,
241
+ "hidden_dims": [
242
+ 256,
243
+ 512,
244
+ 1024
245
+ ],
246
+ "upscale_factors": [
247
+ 1,
248
+ 1,
249
+ 1
250
+ ],
251
+ "focal_window": 7,
252
+ "focal_level": 2,
253
+ "focal_factor": 2,
254
+ "dropout": 0.0,
255
+ "use_post_norm": false,
256
+ "use_layerscale": false,
257
+ "layerscale_init": 0.0001,
258
+ "normalize_modulator": false
259
+ },
260
+ "decoder_name": "Vocos",
261
+ "decoder_config": {
262
+ "input_channels": 1024,
263
+ "num_layers": 8,
264
+ "dim": 512,
265
+ "ffn_dim": 1536,
266
+ "kernel_size": 7,
267
+ "padding": 3,
268
+ "layerscale_init": null,
269
+ "n_fft": 1024,
270
+ "hop_length": 320
271
+ }
272
+ }
273
+
274
+ Filename: LibriTTS960_12_5Hz.json
275
+ Content: {
276
+ "encoder_name": "WavLM",
277
+ "encoder_config": {
278
+ "hidden_dims": [
279
+ 512,
280
+ 512,
281
+ 512,
282
+ 512,
283
+ 512,
284
+ 512,
285
+ 512
286
+ ],
287
+ "kernel_sizes": [
288
+ 10,
289
+ 3,
290
+ 3,
291
+ 3,
292
+ 3,
293
+ 2,
294
+ 2
295
+ ],
296
+ "strides": [
297
+ 5,
298
+ 2,
299
+ 2,
300
+ 2,
301
+ 2,
302
+ 2,
303
+ 2
304
+ ],
305
+ "num_layers": 6,
306
+ "dim": 1024,
307
+ "ffn_dim": 4096,
308
+ "num_heads": 16,
309
+ "num_buckets": 320,
310
+ "max_distance": 800,
311
+ "dropout": 0.0,
312
+ "conv_pos": 128,
313
+ "conv_pos_groups": 16
314
+ },
315
+ "compressor_name": "FocalEncoder",
316
+ "compressor_config": {
317
+ "input_dim": 1024,
318
+ "output_dim": 13,
319
+ "hidden_dims": [
320
+ 1024,
321
+ 512,
322
+ 256
323
+ ],
324
+ "downscale_factors": [
325
+ 2,
326
+ 2,
327
+ 1
328
+ ],
329
+ "focal_window": 7,
330
+ "focal_level": 2,
331
+ "focal_factor": 2,
332
+ "dropout": 0.0,
333
+ "use_post_norm": false,
334
+ "use_layerscale": false,
335
+ "layerscale_init": 0.0001,
336
+ "normalize_modulator": false
337
+ },
338
+ "quantizer_name": "BinarySphericalQuantizer",
339
+ "quantizer_config": {
340
+ "codebook_size": 8192
341
+ },
342
+ "decompressor_name": "FocalDecoder",
343
+ "decompressor_config": {
344
+ "input_dim": 13,
345
+ "output_dim": 1024,
346
+ "hidden_dims": [
347
+ 256,
348
+ 512,
349
+ 1024
350
+ ],
351
+ "upscale_factors": [
352
+ 1,
353
+ 2,
354
+ 2
355
+ ],
356
+ "focal_window": 7,
357
+ "focal_level": 2,
358
+ "focal_factor": 2,
359
+ "dropout": 0.0,
360
+ "use_post_norm": false,
361
+ "use_layerscale": false,
362
+ "layerscale_init": 0.0001,
363
+ "normalize_modulator": false
364
+ },
365
+ "decoder_name": "Vocos",
366
+ "decoder_config": {
367
+ "input_channels": 1024,
368
+ "num_layers": 8,
369
+ "dim": 512,
370
+ "ffn_dim": 1536,
371
+ "kernel_size": 7,
372
+ "padding": 3,
373
+ "layerscale_init": null,
374
+ "n_fft": 1024,
375
+ "hop_length": 320
376
+ }
377
+ }
focalcodec.png ADDED

Git LFS Details

  • SHA256: 93eefb4b78b4ee860c678e8408456516082ef4f6fcf9cce9a831e234ea260b84
  • Pointer size: 131 Bytes
  • Size of remote file: 406 kB