# SongBloom / songbloom_full_150s_dpo.yaml
# Uploaded by CypressYang with huggingface_hub (commit 7ba8fd2, verified).
# Top-level run settings. Values of the form ${name:arg} are resolver
# interpolations evaluated by the config loader, not literal strings.
cfg_file: null
# Author-listed choices for this field: '16-mixed', 'bf16-mixed'.
precision: 'bf16-mixed'
# NOTE(review): presumably clip-duration bounds in seconds -- confirm.
min_dur: 60
max_dur: 150
# Referenced elsewhere in this file as ${sr}.
sr: 48000
# The ??? segment is passed verbatim to the dynamic_path resolver.
pretrained_path: ${dynamic_path:???/songbloom_full_150s_dpo.pt}
continue_checkpoint: null
# Dataset-side settings. Nesting restored: without indentation
# `train_dataset` parses as null and its two settings become top-level keys.
train_dataset:
  lyric_processor: phoneme
  # NOTE(review): presumably the prompt length in seconds -- confirm.
  prompt_len: 10
# VAE configuration and checkpoint locations. Nesting restored: at top level
# `sr: ${sr}` would duplicate the root `sr` key and reference itself.
vae:
  vae_cfg: ${dynamic_path:???/stable_audio_1920_vae.json}
  vae_ckpt: ${dynamic_path:???/autoencoder_music_dsp1920.ckpt}
  # Interpolates the root-level `sr` value.
  sr: ${sr}
# Model hyperparameters. Nesting is reconstructed from the flattened source:
# the ${model.dim} interpolations below require `dim` to live under `model`,
# and the repeated `type` / `output_dim` / `max_len` keys must belong to
# separate child mappings to avoid duplicate keys.
model:
  block_size: 16
  latent_dim: 64
  dim: 1536
  num_heads: 24
  lm_layers: 36
  diff_layers: 12
  num_pitch: 16384
  time_cond_type: prepend
  timestep_features_dim: 256
  diffusion_objective: rectified_flow
  timestep_sampler: logit_normal
  backend: llama
  rotary_base_val: 20000
  init_std: 0.02
  h_dropout: 0.05
  # One child mapping per conditioning input; each declares its own `type`.
  condition_provider_cfg:
    prompt_wav:
      type: audio_tokenizer_wrapper
      output_dim: ${model.dim}
      # NOTE(review): empty in the original; written as an explicit null and
      # `max_len` is treated as its sibling rather than its child -- confirm.
      audio_tokenizer: null
      max_len: 250  # 25.0 * 10s
    lyrics:
      type: phoneme_tokenizer
      output_dim: ${model.dim}
      vocab_list: ${load_yaml:${dynamic_path:???/vocab_g2p.yaml}}
      max_len: 600
      max_sentence_per_structure: 50
      mode: sum
  # NOTE(review): `cfg_dropout`, `attribute_dropout` and `fuser_cfg` are
  # placed directly under `model` (not under `condition_provider_cfg`)
  # by inference -- confirm against the consuming code.
  cfg_dropout: 0.1
  # Per-condition dropout rates; leaf keys match the names declared under
  # `condition_provider_cfg`.
  attribute_dropout:
    text:
      lyrics: 0.0
    wav:
      prompt_wav: 0.1
  # Each list names the conditions assigned to that fusion method.
  fuser_cfg:
    cross_attention_pos_emb: false
    cross_attention_pos_emb_scale: 1
    sum: []
    prepend: [lyrics, prompt_wav]
    cross: []
    input_interpolate: []
# Generation-time settings. Nesting restored: without indentation
# `inference` parses as null and these settings become top-level keys.
inference:
  cfg_coef: 1.5
  temp: 0.9
  diff_temp: 0.95
  top_k: 100
  # Canonical lowercase boolean (was `True`).
  penalty_repeat: true
  penalty_window: 50
  steps: 36
  dit_cfg_type: h