# Model / training / inference configuration (OmegaConf-style `${...}` interpolation).
#
# NOTE(review): the nesting below was reconstructed from a copy whose indentation had
# been lost. Placement is anchored on the interpolation targets (`${model.dim}` requires
# `dim` under `model`; `${sr}` requires a root-level `sr`) and on key grouping. Spots that
# could not be anchored that way carry their own NOTE(review) — confirm against the code
# that loads this file.

# ---- run-level settings -------------------------------------------------------------
cfg_file:
precision: 'bf16-mixed' # ['16-mixed', 'bf16-mixed']
min_dur: 60    # presumably seconds — TODO confirm unit
max_dur: 150   # presumably seconds — TODO confirm unit
sr: 48000      # referenced below as ${sr}
# `dynamic_path` is a custom resolver not defined in this file; `???` is passed to it as-is.
pretrained_path: ${dynamic_path:???/songbloom_full_150s_dpo.pt}
continue_checkpoint:

# ---- dataset ------------------------------------------------------------------------
train_dataset:
  lyric_processor: phoneme
  prompt_len: 10

# ---- audio autoencoder --------------------------------------------------------------
vae:
  vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
  vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
  sr: ${sr}   # resolves to the root-level `sr`

# ---- model --------------------------------------------------------------------------
model:
  block_size: 16
  latent_dim: 64
  dim: 1536          # target of ${model.dim} in the conditioner entries below
  num_heads: 24
  lm_layers: 36
  diff_layers: 12
  num_pitch: 16384
  time_cond_type: prepend
  timestep_features_dim: 256
  diffusion_objective: rectified_flow
  timestep_sampler: logit_normal
  backend: llama
  rotary_base_val: 20000
  init_std: 0.02
  h_dropout: 0.05

  # One entry per conditioning input; the entry names are the ones listed in
  # `fuser_cfg` and `attribute_dropout`.
  condition_provider_cfg:
    prompt_wav:
      type: audio_tokenizer_wrapper
      output_dim: ${model.dim}
      # Left empty (null) here.
      # NOTE(review): `max_len` is placed as a sibling of `audio_tokenizer`, not a child — confirm.
      audio_tokenizer:
      max_len: 250 # 25.0 * 10s
    lyrics:
      type: phoneme_tokenizer
      output_dim: ${model.dim}
      # `load_yaml` is a custom resolver not defined in this file.
      vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
      max_len: 600
      max_sentence_per_structure: 50
      mode: sum   # NOTE(review): assumed to belong to the `lyrics` conditioner — confirm.

  # NOTE(review): `cfg_dropout`, `attribute_dropout` and `fuser_cfg` are placed under
  # `model`; the flattened source did not show their parent — confirm.
  cfg_dropout: 0.1
  attribute_dropout:
    text:
      lyrics: 0.
    wav:
      prompt_wav: 0.1

  # Each conditioner name appears in the list of the fusion method applied to it;
  # here both conditioners are under `prepend` and the other lists are empty.
  fuser_cfg:
    cross_attention_pos_emb: false
    cross_attention_pos_emb_scale: 1
    sum: []
    prepend: [lyrics, prompt_wav]
    cross: []
    input_interpolate: []

# ---- inference / sampling -----------------------------------------------------------
inference:
  cfg_coef: 1.5
  temp: 0.9
  diff_temp: 0.95
  top_k: 100
  penalty_repeat: True
  penalty_window: 50
  steps: 36
  dit_cfg_type: h