Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
LibriTTS960_12_5Hz.json +63 -0
LibriTTS960_12_5Hz.safetensors +3 -0
README.md +377 -3
focalcodec.png +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+focalcodec.png filter=lfs diff=lfs merge=lfs -text

LibriTTS960_12_5Hz.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "encoder_name": "WavLM",
+  "encoder_config": {
+    "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
+    "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
+    "strides": [5, 2, 2, 2, 2, 2, 2],
+    "num_layers": 6,
+    "dim": 1024,
+    "ffn_dim": 4096,
+    "num_heads": 16,
+    "num_buckets": 320,
+    "max_distance": 800,
+    "dropout": 0.0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16
+  },
+  "compressor_name": "FocalEncoder",
+  "compressor_config": {
+    "input_dim": 1024,
+    "output_dim": 13,
+    "hidden_dims": [1024, 512, 256],
+    "downscale_factors": [2, 2, 1],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "quantizer_name": "BinarySphericalQuantizer",
+  "quantizer_config": {
+    "codebook_size": 8192
+  },
+  "decompressor_name": "FocalDecoder",
+  "decompressor_config": {
+    "input_dim": 13,
+    "output_dim": 1024,
+    "hidden_dims": [256, 512, 1024],
+    "upscale_factors": [1, 2, 2],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "decoder_name": "Vocos",
+  "decoder_config": {
+    "input_channels": 1024,
+    "num_layers": 8,
+    "dim": 512,
+    "ffn_dim": 1536,
+    "kernel_size": 7,
+    "padding": 3,
+    "layerscale_init": null,
+    "n_fft": 1024,
+    "hop_length": 320
+  }
+}

LibriTTS960_12_5Hz.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5362c33ed75801d9bced7e8573f8eece674592ea3c3156451a85f4b924b1c1e5
+size 581137532

README.md CHANGED Viewed

@@ -1,3 +1,377 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+base_model:
+- microsoft/wavlm-large
+pipeline_tag: audio-to-audio
+---
+# FocalCodec
+A low-bitrate single-codebook 16 kHz speech codec based on [focal modulation](https://arxiv.org/abs/2203.11926).
+- **Preprint**: https://arxiv.org/abs/2502.04465
+- **Project Page**: https://lucadellalib.github.io/focalcodec-web/
+- **GitHub**: https://github.com/lucadellalib/focalcodec
+
+<img src="focalcodec.png" width="700">
+
+---------------------------------------------------------------------------------------------------------
+## ▶️ Quickstart
+See the README at: https://github.com/lucadellalib/focalcodec
+
+---------------------------------------------------------------------------------------------------------
+## 📌 Available Checkpoints
+|       Checkpoint        | Token Rate (Hz) | Bitrate (kbps) |   Dataset   |
+|:-----------------------:|:---------------:|:--------------:|:-----------:|
+|  **LibriTTS960_50Hz**   |      50.0       |      0.65      | LibriTTS960 |
+|  **LibriTTS960_25Hz**   |      25.0       |      0.33      | LibriTTS960 |
+| **LibriTTS960_12_5Hz**  |      12.5       |      0.16      | LibriTTS960 |
+---------------------------------------------------------------------------------------------------------
+## @ Citing
+```
+@article{dellalibera2025focalcodec,
+    title   = {{FocalCodec}: Low-Bitrate Speech Coding via Focal Modulation Networks},
+    author  = {Luca {Della Libera} and Francesco Paissan and Cem Subakan and Mirco Ravanelli},
+    journal = {arXiv preprint arXiv:2502.04465},
+    year    = {2025},
+}
+```
+---------------------------------------------------------------------------------------------------------
+## 📧 Contact
+[luca.dellalib@gmail.com](mailto:luca.dellalib@gmail.com)
+
+---------------------------------------------------------------------------------------------------------
+# File information
+This repository contains the following files:
+Filename: LibriTTS960_25Hz.json
+Content: {
+  "encoder_name": "WavLM",
+  "encoder_config": {
+    "hidden_dims": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "kernel_sizes": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "strides": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "num_layers": 6,
+    "dim": 1024,
+    "ffn_dim": 4096,
+    "num_heads": 16,
+    "num_buckets": 320,
+    "max_distance": 800,
+    "dropout": 0.0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16
+  },
+  "compressor_name": "FocalEncoder",
+  "compressor_config": {
+    "input_dim": 1024,
+    "output_dim": 13,
+    "hidden_dims": [
+      1024,
+      512,
+      256
+    ],
+    "downscale_factors": [
+      2,
+      1,
+      1
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "quantizer_name": "BinarySphericalQuantizer",
+  "quantizer_config": {
+    "codebook_size": 8192
+  },
+  "decompressor_name": "FocalDecoder",
+  "decompressor_config": {
+    "input_dim": 13,
+    "output_dim": 1024,
+    "hidden_dims": [
+      256,
+      512,
+      1024
+    ],
+    "upscale_factors": [
+      1,
+      1,
+      2
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "decoder_name": "Vocos",
+  "decoder_config": {
+    "input_channels": 1024,
+    "num_layers": 8,
+    "dim": 512,
+    "ffn_dim": 1536,
+    "kernel_size": 7,
+    "padding": 3,
+    "layerscale_init": null,
+    "n_fft": 1024,
+    "hop_length": 320
+  }
+}
+Filename: focalcodec.png
+Content: "Content of the file is larger than 50 KB, too long to display."
+Filename: LibriTTS960_50Hz.json
+Content: {
+  "encoder_name": "WavLM",
+  "encoder_config": {
+    "hidden_dims": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "kernel_sizes": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "strides": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "num_layers": 6,
+    "dim": 1024,
+    "ffn_dim": 4096,
+    "num_heads": 16,
+    "num_buckets": 320,
+    "max_distance": 800,
+    "dropout": 0.0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16
+  },
+  "compressor_name": "FocalEncoder",
+  "compressor_config": {
+    "input_dim": 1024,
+    "output_dim": 13,
+    "hidden_dims": [
+      1024,
+      512,
+      256
+    ],
+    "downscale_factors": [
+      1,
+      1,
+      1
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "quantizer_name": "BinarySphericalQuantizer",
+  "quantizer_config": {
+    "codebook_size": 8192
+  },
+  "decompressor_name": "FocalDecoder",
+  "decompressor_config": {
+    "input_dim": 13,
+    "output_dim": 1024,
+    "hidden_dims": [
+      256,
+      512,
+      1024
+    ],
+    "upscale_factors": [
+      1,
+      1,
+      1
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "decoder_name": "Vocos",
+  "decoder_config": {
+    "input_channels": 1024,
+    "num_layers": 8,
+    "dim": 512,
+    "ffn_dim": 1536,
+    "kernel_size": 7,
+    "padding": 3,
+    "layerscale_init": null,
+    "n_fft": 1024,
+    "hop_length": 320
+  }
+}
+Filename: LibriTTS960_12_5Hz.json
+Content: {
+  "encoder_name": "WavLM",
+  "encoder_config": {
+    "hidden_dims": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "kernel_sizes": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "strides": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "num_layers": 6,
+    "dim": 1024,
+    "ffn_dim": 4096,
+    "num_heads": 16,
+    "num_buckets": 320,
+    "max_distance": 800,
+    "dropout": 0.0,
+    "conv_pos": 128,
+    "conv_pos_groups": 16
+  },
+  "compressor_name": "FocalEncoder",
+  "compressor_config": {
+    "input_dim": 1024,
+    "output_dim": 13,
+    "hidden_dims": [
+      1024,
+      512,
+      256
+    ],
+    "downscale_factors": [
+      2,
+      2,
+      1
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "quantizer_name": "BinarySphericalQuantizer",
+  "quantizer_config": {
+    "codebook_size": 8192
+  },
+  "decompressor_name": "FocalDecoder",
+  "decompressor_config": {
+    "input_dim": 13,
+    "output_dim": 1024,
+    "hidden_dims": [
+      256,
+      512,
+      1024
+    ],
+    "upscale_factors": [
+      1,
+      2,
+      2
+    ],
+    "focal_window": 7,
+    "focal_level": 2,
+    "focal_factor": 2,
+    "dropout": 0.0,
+    "use_post_norm": false,
+    "use_layerscale": false,
+    "layerscale_init": 0.0001,
+    "normalize_modulator": false
+  },
+  "decoder_name": "Vocos",
+  "decoder_config": {
+    "input_channels": 1024,
+    "num_layers": 8,
+    "dim": 512,
+    "ffn_dim": 1536,
+    "kernel_size": 7,
+    "padding": 3,
+    "layerscale_init": null,
+    "n_fft": 1024,
+    "hop_length": 320
+  }
+}

focalcodec.png ADDED Viewed

Git LFS Details

SHA256: 93eefb4b78b4ee860c678e8408456516082ef4f6fcf9cce9a831e234ea260b84
Pointer size: 131 Bytes
Size of remote file: 406 kB