---
# Training / inference configuration.
#
# NOTE(review): the block structure below was reconstructed from a version of
# this file that had been collapsed onto a single line. The `${sr}` and
# `${model.dim}` interpolations fix `sr` at the top level and `dim` under
# `model`; the placement of `condition_provider_cfg`, `cfg_dropout`,
# `attribute_dropout` and `fuser_cfg` under `model`, and of `max_len` as a
# sibling of `audio_tokenizer`, is assumed -- confirm against the consumer.
#
# `${dynamic_path:...}` and `${load_yaml:...}` are custom resolvers that must
# be registered by the loading code before this file is resolved.

# Intentionally unset here.
cfg_file: null

precision: 'bf16-mixed'  # ['16-mixed', 'bf16-mixed']

min_dur: 60
max_dur: 150
sr: 48000

pretrained_path: ${dynamic_path:???/songbloom_full_150s_dpo.pt}
# Intentionally unset here.
continue_checkpoint: null

train_dataset:
  lyric_processor: phoneme
  prompt_len: 10

vae:
  vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
  vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
  # Reuses the top-level `sr` value.
  sr: ${sr}

model:
  block_size: 16
  latent_dim: 64
  dim: 1536
  num_heads: 24
  lm_layers: 36
  diff_layers: 12
  num_pitch: 16384
  time_cond_type: prepend
  timestep_features_dim: 256
  diffusion_objective: rectified_flow
  timestep_sampler: logit_normal
  backend: llama
  rotary_base_val: 20000
  init_std: 0.02
  h_dropout: 0.05

  condition_provider_cfg:
    prompt_wav:
      type: audio_tokenizer_wrapper
      # Follows `model.dim`.
      output_dim: ${model.dim}
      # Intentionally unset here.
      audio_tokenizer: null
      max_len: 250  # 25.0 * 10s
    lyrics:
      type: phoneme_tokenizer
      # Follows `model.dim`.
      output_dim: ${model.dim}
      vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
      max_len: 600
      max_sentence_per_structure: 50
      mode: sum

  cfg_dropout: 0.1
  attribute_dropout:
    text:
      lyrics: 0.0
    wav:
      prompt_wav: 0.1

  fuser_cfg:
    cross_attention_pos_emb: false
    cross_attention_pos_emb_scale: 1
    sum: []
    prepend: [lyrics, prompt_wav]
    cross: []
    input_interpolate: []

inference:
  cfg_coef: 1.5
  temp: 0.9
  diff_temp: 0.95
  top_k: 100
  penalty_repeat: true
  penalty_window: 50
  steps: 36
  dit_cfg_type: h