model:
  name: EleutherAI/gpt-neo-125M
  reward_type: irl
  reward_model: matthieubou-imperial-college-london/bayes_irl_vi/posterior_bradley_terry_rkiq5pd8:v0
  base_model_name: EleutherAI/pythia-410m
  likelihood_type: bradley_terry
  use_raw_irl_score: true
  irl_normalization_strategy: none
  irl_reward_scale: 1.0
  n_posterior_samples: 100
  use_distance_sampling: false
  learning_rate: 2.0e-06
  batch_size: 128
  mini_batch_size: 8
  gradient_accumulation_steps: 16
  max_sequence_length: 512

generation:
  min_length: -1
  top_k: 0
  top_p: 0.8
  temperature: 0.7
  do_sample: true
  output_min_length: 20
  output_max_length: 30

rlhf:
  model:
    ppo_epochs: 3
    init_kl_coef: 0.2
    target: 1.0
    cliprange: 0.1
    cliprange_value: 0.2
    vf_coef: 0.1
    adap_kl_ctrl: true
    use_score_norm: true
    ratio_threshold: 10.0

training:
  num_train_epochs: 80
  save_freq: 2
  eval_freq: 2
  seed: 0

dataset:
  name: allenai/real-toxicity-prompts
  toxicity_metric: profanity
  toxicity_threshold: 0.7
  input_min_text_length: 10
  input_max_text_length: 100
  test_size: 0.1
  num_samples: 20000

output:
  push_to_hub: true
  push_checkpoints_to_hub: true
  push_final_model_to_hub: true
  hub_org: MattBou00
  repo_name_prefix: rlhf-checkpoint
  private: false

wandb:
  project: irl-rlhf-detox
  entity: matthieubou-imperial-college-london
  name: null

now: 2025-08-02_23-53-31

logging:
  use_wandb: true
  project_name: irl-rlhf-detox
  wandb_mode: online
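
The keys under rlhf.model and model match the hyperparameter names of trl's PPO implementation, so the sketch below shows one plausible way this file could be consumed. It assumes an older trl PPOConfig API (pre-0.12), a hypothetical config.yaml path, and the reconstructed nesting above; none of this wiring is stated in the dump itself.

import yaml
from trl import PPOConfig

# Assumption: the config above is saved as config.yaml; the nesting mirrors
# the reconstructed structure and is not confirmed by the original dump.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

ppo_cfg = cfg["rlhf"]["model"]
ppo_config = PPOConfig(
    model_name=cfg["model"]["name"],                   # EleutherAI/gpt-neo-125M
    learning_rate=cfg["model"]["learning_rate"],       # 2.0e-06
    batch_size=cfg["model"]["batch_size"],             # 128
    mini_batch_size=cfg["model"]["mini_batch_size"],   # 8
    gradient_accumulation_steps=cfg["model"]["gradient_accumulation_steps"],  # 16
    ppo_epochs=ppo_cfg["ppo_epochs"],                  # 3
    init_kl_coef=ppo_cfg["init_kl_coef"],              # 0.2
    adap_kl_ctrl=ppo_cfg["adap_kl_ctrl"],              # adaptive KL controller on
    target=ppo_cfg["target"],                          # target KL of 1.0
    cliprange=ppo_cfg["cliprange"],                    # 0.1
    cliprange_value=ppo_cfg["cliprange_value"],        # 0.2
    vf_coef=ppo_cfg["vf_coef"],                        # 0.1
    use_score_norm=ppo_cfg["use_score_norm"],          # normalize reward scores
    ratio_threshold=ppo_cfg["ratio_threshold"],        # skip batches above this ratio
    seed=cfg["training"]["seed"],                      # 0
)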