# Top-level run settings. `sr` is reused by other sections through ${sr}.
cfg_file: null  # explicit null: no value is set in this file
precision: 'bf16-mixed'  # one of: '16-mixed', 'bf16-mixed'
# NOTE(review): durations are presumably in seconds (the checkpoint name
# below mentions "150s") - confirm against the consuming code.
min_dur: 60
max_dur: 150
sr: 48000

# `???` is passed verbatim to the `dynamic_path` resolver, which is registered
# outside this file; presumably it is replaced with a base directory - confirm.
pretrained_path: ${dynamic_path:???/songbloom_full_150s_dpo.pt}
continue_checkpoint: null  # explicit null: no checkpoint is set in this file

# Dataset-side options.
train_dataset:
  # Selects how lyrics are processed; the `lyrics` condition under
  # model.condition_provider_cfg uses a `phoneme_tokenizer` type.
  lyric_processor: phoneme
  # NOTE(review): presumably the prompt length in seconds (the prompt_wav
  # max_len comment in this file uses "10s") - confirm.
  prompt_len: 10

# Autoencoder (VAE) configuration and checkpoint locations.
# Both paths go through the `dynamic_path` resolver with a `???` placeholder;
# the resolver is registered outside this file.
vae:
  vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
  vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
  # Interpolates the top-level `sr` value (48000 in this file).
  sr: ${sr}

# Model hyperparameters, conditioning providers and condition fusion.
# NOTE(review): the meaning of individual hyperparameters is defined by the
# consuming code, which is not visible from this file.
model:
  block_size: 16
  latent_dim: 64
  dim: 1536  # reused below through ${model.dim}
  num_heads: 24
  lm_layers: 36
  diff_layers: 12
  num_pitch: 16384
  time_cond_type: prepend
  timestep_features_dim: 256
  diffusion_objective: rectified_flow
  timestep_sampler: logit_normal
  backend: llama
  rotary_base_val: 20000
  init_std: 0.02
  h_dropout: 0.05

  # One entry per conditioning signal; each projects to ${model.dim}.
  condition_provider_cfg:
    prompt_wav:
      type: audio_tokenizer_wrapper
      output_dim: ${model.dim}
      # Explicit null: no tokenizer is configured in this file.
      # NOTE(review): presumably supplied by code at runtime - confirm.
      audio_tokenizer: null
      # 25.0 * 10s = 250.
      # NOTE(review): presumably latent frames per second times the prompt
      # length in seconds - confirm.
      max_len: 250
    lyrics:
      type: phoneme_tokenizer
      output_dim: ${model.dim}
      # Nested resolvers: `dynamic_path` produces the path, `load_yaml`
      # receives it. Both are registered outside this file.
      vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
      max_len: 600
      max_sentence_per_structure: 50
      mode: sum

  cfg_dropout: 0.1
  # Per-condition dropout values, grouped by condition kind (text / wav).
  attribute_dropout:
    text:
      lyrics: 0.0
    wav:
      prompt_wav: 0.1

  # Lists which conditions are fused by which method. Here both conditions
  # are listed under `prepend`; the other method lists are empty.
  fuser_cfg:
    cross_attention_pos_emb: false
    cross_attention_pos_emb_scale: 1
    sum: []
    prepend: [lyrics, prompt_wav]
    cross: []
    input_interpolate: []


# Generation-time settings.
# NOTE(review): the exact semantics of each key are defined by the consuming
# code, which is not visible from this file.
inference:
  cfg_coef: 1.5
  temp: 0.9
  diff_temp: 0.95
  top_k: 100
  penalty_repeat: true
  penalty_window: 50
  steps: 36
  dit_cfg_type: h