HectorHe committed
Commit f4cc9b8 · verified · 1 Parent(s): 1d1e44c

Training in progress, epoch 1
config.json CHANGED
@@ -28,6 +28,6 @@
    "tie_word_embeddings": false,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.51.0",
-   "use_cache": true,
+   "use_cache": false,
    "vocab_size": 50304
  }
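The `use_cache` flip from `true` to `false` lines up with `gradient_checkpointing=True` in the training log below: the KV cache is incompatible with gradient checkpointing, so it gets disabled for training. A minimal sketch of reproducing that state (the kwargs shown are illustrative, not necessarily how this run set them):

```python
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("allenai/OLMoE-1B-7B-0125")
config.use_cache = False  # KV caching conflicts with gradient checkpointing

model = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMoE-1B-7B-0125",
    config=config,
    torch_dtype="bfloat16",
)
# Matches gradient_checkpointing_kwargs={'use_reentrant': False} in the log.
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)
```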
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7deffd31534904794040aa12bfa54f0f5719fd3eccf03bf201d0e56cf13769e3
+ oid sha256:02f878a290f3f8182f489cc2773d0058985e50d14bef3346ddd259fa6790dd0f
  size 4997746208
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b07fdca73ed9f5508384fe0a336ce12e634bd48e51538af26df79cf1de6e8b49
+ oid sha256:f3fa1b6efd4cfc00a30f3266390e6befd45a36460be3049b004da17fb1f6cb33
  size 4997236504
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a80bc89b0b20b67f94ca67adac52c1fcb57ad77c4493d35c7915b8434f673456
+ oid sha256:4c9a507933a87f0d850c29f51c851a58e2771b1ccc69f869e2e27b42284b5ef5
  size 3843742800
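Each `.safetensors` entry is a Git LFS pointer: the repository tracks only an object id (the SHA-256 of the blob) and its byte size, while the shard itself lives in LFS storage. All three oids changed at identical sizes, consistent with the same checkpoint layout being re-saved after epoch 1. A downloaded shard can be checked against its pointer like this (a minimal sketch; the local path is illustrative):

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so multi-GB shards never need to fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Compare against the oid in the first pointer above.
expected = "02f878a290f3f8182f489cc2773d0058985e50d14bef3346ddd259fa6790dd0f"
assert sha256_of("model-00001-of-00003.safetensors") == expected
```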
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
  {
    "version": "1.0",
-   "truncation": null,
+   "truncation": {
+     "direction": "Right",
+     "max_length": 77,
+     "strategy": "LongestFirst",
+     "stride": 0
+   },
    "padding": null,
    "added_tokens": [
      {
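This is the serialized truncation state of a Hugging Face `tokenizers` fast tokenizer; enabling truncation writes exactly these four fields. A sketch of how such a state is produced (whether `max_length=77` was set deliberately for this run or inherited from another pipeline is not visible from the diff):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
# Serializes as {"direction": "Right", "max_length": 77,
#                "strategy": "LongestFirst", "stride": 0}
tok.enable_truncation(
    max_length=77, stride=0, strategy="longest_first", direction="right"
)
tok.save("tokenizer.json")
```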
training.log CHANGED
@@ -1,6 +1,6 @@
- 2025-09-23 17:54:28 - INFO - __main__ - Model parameters AuxFreeModelConfig(model_name_or_path='allenai/OLMoE-1B-7B-0125', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bias_update_speed=0.0001, remove_aux_loss=True, add_aux_free_loss=True, sinkhorn_routing=False, enable_forced_experts=False, num_forced_experts=2, bias_file_path='')
- 2025-09-23 17:54:28 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='fw407/Commonsense-15K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
- 2025-09-23 17:54:28 - INFO - __main__ - Training parameters SFTConfig(
+ 2025-09-23 20:45:40 - INFO - __main__ - Model parameters AuxFreeModelConfig(model_name_or_path='allenai/OLMoE-1B-7B-0125', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bias_update_speed=0.0001, remove_aux_loss=True, add_aux_free_loss=True, sinkhorn_routing=False, enable_forced_experts=False, num_forced_experts=2, bias_file_path='')
+ 2025-09-23 20:45:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='fw407/Commonsense-15K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
+ 2025-09-23 20:45:40 - INFO - __main__ - Training parameters SFTConfig(
  _n_gpu=1,
  accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
  adafactor=False,
@@ -81,7 +81,7 @@ local_rank=0,
  log_level=info,
  log_level_replica=warning,
  log_on_each_node=True,
- logging_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k/runs/Sep23_17-54-27_orchard-community-4,
+ logging_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k/runs/Sep23_20-45-39_orchard-community-4,
  logging_first_step=False,
  logging_nan_inf_filter=True,
  logging_steps=1,
@@ -151,11 +151,357 @@ warmup_ratio=0.1,
  warmup_steps=0,
  weight_decay=0.0,
  )
- 2025-09-23 17:54:31 - INFO - __main__ - *** Initializing model kwargs ***
- 2025-09-23 17:54:31 - INFO - __main__ - 🔧 Patching OLMOE with aux-free MoE blocks...
- 2025-09-23 17:54:31 - INFO - __main__ - ✅ Using standard aux-free routing
- 2025-09-23 17:55:17 - INFO - __main__ - *** Train ***
- 2025-09-23 17:55:17 - INFO - __main__ - OlmoeForCausalLM(
+ 2025-09-23 20:45:41 - INFO - __main__ - *** Initializing model kwargs ***
+ 2025-09-23 20:45:41 - INFO - __main__ - 🔧 Patching OLMOE with aux-free MoE blocks...
+ 2025-09-23 20:45:41 - INFO - __main__ - ✅ Using standard aux-free routing
+ 2025-09-23 20:46:13 - INFO - __main__ - *** Train ***
+ 2025-09-23 20:46:13 - INFO - __main__ - OlmoeForCausalLM(
+   (model): OlmoeModel(
+     (embed_tokens): Embedding(50304, 2048, padding_idx=1)
+     (layers): ModuleList(
+       (0-15): 16 x OlmoeDecoderLayer(
+         (self_attn): OlmoeFlashAttention2(
+           (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+           (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
+           (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
+           (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+           (q_norm): OlmoeRMSNorm((2048,), eps=1e-05)
+           (k_norm): OlmoeRMSNorm((2048,), eps=1e-05)
+         )
+         (mlp): AuxFreeOlmoeSparseMoeBlock(
+           (gate): Linear(in_features=2048, out_features=64, bias=False)
+           (experts): ModuleList(
+             (0-63): 64 x OlmoeMLP(
+               (gate_proj): Linear(in_features=2048, out_features=1024, bias=False)
+               (up_proj): Linear(in_features=2048, out_features=1024, bias=False)
+               (down_proj): Linear(in_features=1024, out_features=2048, bias=False)
+               (act_fn): SiLU()
+             )
+           )
+         )
+         (input_layernorm): OlmoeRMSNorm((2048,), eps=1e-05)
+         (post_attention_layernorm): OlmoeRMSNorm((2048,), eps=1e-05)
+       )
+     )
+     (norm): OlmoeRMSNorm((2048,), eps=1e-05)
+     (rotary_emb): OlmoeRotaryEmbedding()
+   )
+   (lm_head): Linear(in_features=2048, out_features=50304, bias=False)
+ )
+ 2025-09-23 22:05:59 - INFO - __main__ - Model parameters AuxFreeModelConfig(model_name_or_path='allenai/OLMoE-1B-7B-0125', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bias_update_speed=0.0001, remove_aux_loss=True, add_aux_free_loss=True, sinkhorn_routing=False, enable_forced_experts=False, num_forced_experts=2, bias_file_path='')
+ 2025-09-23 22:05:59 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='fw407/Commonsense-15K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
+ 2025-09-23 22:05:59 - INFO - __main__ - Training parameters SFTConfig(
+ _n_gpu=1,
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
+ adafactor=False,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=False,
+ batch_eval_metrics=False,
+ benchmarks=[],
+ bf16=True,
+ bf16_full_eval=False,
+ callbacks=[],
+ chars_per_token=<CHARS_PER_TOKEN>,
+ chat_template=None,
+ data_seed=None,
+ dataloader_drop_last=False,
+ dataloader_num_workers=0,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset_batch_size=None,
+ dataset_kwargs=None,
+ dataset_num_proc=None,
+ dataset_text_field=text,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=1800000000,
+ debug=[],
+ deepspeed=None,
+ disable_tqdm=False,
+ do_eval=True,
+ do_predict=False,
+ do_train=False,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_on_start=False,
+ eval_packing=None,
+ eval_steps=None,
+ eval_strategy=IntervalStrategy.NO,
+ eval_use_gather_object=False,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ fsdp=[],
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ gradient_accumulation_steps=1,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ greater_is_better=None,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hub_always_push=False,
+ hub_model_id=OLMoE-1B-7B-0125-aux-free-sft-commonsense15k,
+ hub_model_revision=main,
+ hub_private_repo=None,
+ hub_strategy=HubStrategy.EVERY_SAVE,
+ hub_token=<HUB_TOKEN>,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ learning_rate=1e-05,
+ length_column_name=length,
+ load_best_model_at_end=False,
+ local_rank=0,
+ log_level=info,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k/runs/Sep23_22-05-58_orchard-community-4,
+ logging_first_step=False,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=IntervalStrategy.STEPS,
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
+ max_grad_norm=1.0,
+ max_length=2048,
+ max_seq_length=None,
+ max_steps=-1,
+ metric_for_best_model=None,
+ model_init_kwargs=None,
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ no_cuda=False,
+ num_of_sequences=None,
+ num_train_epochs=1,
+ optim=OptimizerNames.ADAMW_TORCH,
+ optim_args=None,
+ optim_target_modules=None,
+ output_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k,
+ overwrite_hub_revision=False,
+ overwrite_output_dir=True,
+ packing=False,
+ past_index=-1,
+ per_device_eval_batch_size=16,
+ per_device_train_batch_size=8,
+ prediction_loss_only=False,
+ push_to_hub=True,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_revision=False,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ ray_scope=last,
+ remove_unused_columns=True,
+ report_to=['wandb'],
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ run_name=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=SaveStrategy.EPOCH,
+ save_total_limit=3,
+ seed=1234,
+ skip_memory_metrics=True,
+ system_prompt=None,
+ tf32=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torch_empty_cache_steps=None,
+ torchdynamo=None,
+ tp_size=0,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ use_cpu=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_liger=False,
+ use_liger_kernel=False,
+ use_mps_device=False,
+ wandb_entity=None,
+ wandb_project=None,
+ warmup_ratio=0.1,
+ warmup_steps=0,
+ weight_decay=0.0,
+ )
+ 2025-09-23 22:06:00 - INFO - __main__ - *** Initializing model kwargs ***
+ 2025-09-23 22:06:00 - INFO - __main__ - 🔧 Patching OLMOE with aux-free MoE blocks...
+ 2025-09-23 22:06:00 - INFO - __main__ - ✅ Using standard aux-free routing
+ 2025-09-23 22:17:25 - INFO - __main__ - Model parameters AuxFreeModelConfig(model_name_or_path='allenai/OLMoE-1B-7B-0125', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False, bias_update_speed=0.0001, remove_aux_loss=True, add_aux_free_loss=True, sinkhorn_routing=False, enable_forced_experts=False, num_forced_experts=2, bias_file_path='')
+ 2025-09-23 22:17:25 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='fw407/Commonsense-15K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
+ 2025-09-23 22:17:25 - INFO - __main__ - Training parameters SFTConfig(
+ _n_gpu=1,
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
+ adafactor=False,
+ adam_beta1=0.9,
+ adam_beta2=0.999,
+ adam_epsilon=1e-08,
+ auto_find_batch_size=False,
+ average_tokens_across_devices=False,
+ batch_eval_metrics=False,
+ benchmarks=[],
+ bf16=True,
+ bf16_full_eval=False,
+ callbacks=[],
+ chars_per_token=<CHARS_PER_TOKEN>,
+ chat_template=None,
+ data_seed=None,
+ dataloader_drop_last=False,
+ dataloader_num_workers=0,
+ dataloader_persistent_workers=False,
+ dataloader_pin_memory=True,
+ dataloader_prefetch_factor=None,
+ dataset_batch_size=None,
+ dataset_kwargs=None,
+ dataset_num_proc=None,
+ dataset_text_field=text,
+ ddp_backend=None,
+ ddp_broadcast_buffers=None,
+ ddp_bucket_cap_mb=None,
+ ddp_find_unused_parameters=None,
+ ddp_timeout=1800000000,
+ debug=[],
+ deepspeed=None,
+ disable_tqdm=False,
+ do_eval=True,
+ do_predict=False,
+ do_train=False,
+ eval_accumulation_steps=None,
+ eval_delay=0,
+ eval_do_concat_batches=True,
+ eval_on_start=False,
+ eval_packing=None,
+ eval_steps=None,
+ eval_strategy=IntervalStrategy.NO,
+ eval_use_gather_object=False,
+ fp16=False,
+ fp16_backend=auto,
+ fp16_full_eval=False,
+ fp16_opt_level=O1,
+ fsdp=[],
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
+ fsdp_min_num_params=0,
+ fsdp_transformer_layer_cls_to_wrap=None,
+ full_determinism=False,
+ gradient_accumulation_steps=1,
+ gradient_checkpointing=True,
+ gradient_checkpointing_kwargs={'use_reentrant': False},
+ greater_is_better=None,
+ group_by_length=False,
+ half_precision_backend=auto,
+ hub_always_push=False,
+ hub_model_id=OLMoE-1B-7B-0125-aux-free-sft-commonsense15k,
+ hub_model_revision=main,
+ hub_private_repo=None,
+ hub_strategy=HubStrategy.EVERY_SAVE,
+ hub_token=<HUB_TOKEN>,
+ ignore_data_skip=False,
+ include_for_metrics=[],
+ include_inputs_for_metrics=False,
+ include_num_input_tokens_seen=False,
+ include_tokens_per_second=False,
+ jit_mode_eval=False,
+ label_names=None,
+ label_smoothing_factor=0.0,
+ learning_rate=1e-05,
+ length_column_name=length,
+ load_best_model_at_end=False,
+ local_rank=0,
+ log_level=info,
+ log_level_replica=warning,
+ log_on_each_node=True,
+ logging_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k/runs/Sep23_22-17-24_orchard-community-4,
+ logging_first_step=False,
+ logging_nan_inf_filter=True,
+ logging_steps=1,
+ logging_strategy=IntervalStrategy.STEPS,
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
+ max_grad_norm=1.0,
+ max_length=2048,
+ max_seq_length=None,
+ max_steps=-1,
+ metric_for_best_model=None,
+ model_init_kwargs=None,
+ mp_parameters=,
+ neftune_noise_alpha=None,
+ no_cuda=False,
+ num_of_sequences=None,
+ num_train_epochs=1,
+ optim=OptimizerNames.ADAMW_TORCH,
+ optim_args=None,
+ optim_target_modules=None,
+ output_dir=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k,
+ overwrite_hub_revision=False,
+ overwrite_output_dir=True,
+ packing=False,
+ past_index=-1,
+ per_device_eval_batch_size=16,
+ per_device_train_batch_size=8,
+ prediction_loss_only=False,
+ push_to_hub=True,
+ push_to_hub_model_id=None,
+ push_to_hub_organization=None,
+ push_to_hub_revision=False,
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ ray_scope=last,
+ remove_unused_columns=True,
+ report_to=['wandb'],
+ restore_callback_states_from_checkpoint=False,
+ resume_from_checkpoint=None,
+ run_name=/tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k,
+ save_on_each_node=False,
+ save_only_model=False,
+ save_safetensors=True,
+ save_steps=500,
+ save_strategy=SaveStrategy.EPOCH,
+ save_total_limit=3,
+ seed=1234,
+ skip_memory_metrics=True,
+ system_prompt=None,
+ tf32=None,
+ torch_compile=False,
+ torch_compile_backend=None,
+ torch_compile_mode=None,
+ torch_empty_cache_steps=None,
+ torchdynamo=None,
+ tp_size=0,
+ tpu_metrics_debug=False,
+ tpu_num_cores=None,
+ use_cpu=False,
+ use_ipex=False,
+ use_legacy_prediction_loop=False,
+ use_liger=False,
+ use_liger_kernel=False,
+ use_mps_device=False,
+ wandb_entity=None,
+ wandb_project=None,
+ warmup_ratio=0.1,
+ warmup_steps=0,
+ weight_decay=0.0,
+ )
+ 2025-09-23 22:17:26 - INFO - __main__ - *** Initializing model kwargs ***
+ 2025-09-23 22:17:26 - INFO - __main__ - 🔧 Patching OLMOE with aux-free MoE blocks...
+ 2025-09-23 22:17:26 - INFO - __main__ - ✅ Using standard aux-free routing
+ 2025-09-23 22:17:57 - INFO - __main__ - *** Train ***
+ 2025-09-23 22:17:57 - INFO - __main__ - OlmoeForCausalLM(
    (model): OlmoeModel(
      (embed_tokens): Embedding(50304, 2048, padding_idx=1)
      (layers): ModuleList(
@@ -188,42 +534,3 @@ weight_decay=0.0,
    )
    (lm_head): Linear(in_features=2048, out_features=50304, bias=False)
  )
- 2025-09-23 18:18:04 - INFO - __main__ - *** Save model ***
- 2025-09-23 18:18:04 - INFO - __main__ - 💾 Saving MoE bias states...
- 2025-09-23 18:18:04 - INFO - __main__ - 🔍 Searching for MoE layers with bias states...
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.0.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.1.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.2.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.3.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.4.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.5.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.6.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.7.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.8.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.9.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.10.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.11.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.12.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.13.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.14.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - ✅ Saved bias from model.layers.15.mlp: 64 experts, update_speed=0.000100
- 2025-09-23 18:18:04 - INFO - __main__ - 🎉 Successfully saved 16 MoE bias states to /tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k/moe_bias_states.json
- 2025-09-23 18:18:04 - INFO - __main__ - 📊 Bias States Summary:
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.0.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.1.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.2.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.3.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.4.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.5.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.6.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.7.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.8.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.9.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.10.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.11.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.12.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.13.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.14.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:04 - INFO - __main__ - model.layers.15.mlp: 64 experts, range=[0.0000, 0.0000]
- 2025-09-23 18:18:51 - INFO - __main__ - Model saved to /tmp/data/OLMoE-1B-7B-0125/aux_free_sft/commonsense15k
- 2025-09-23 18:18:51 - INFO - __main__ - Pushing to hub...
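The removed lines record the previous run's save step: each of the 16 decoder layers carries a per-expert bias vector (64 experts) that the "aux-free" routing nudges at update_speed=0.0001 instead of training a load-balancing auxiliary loss. Below is a minimal sketch of that mechanism under stated assumptions: `AuxFreeRouter` and all names are illustrative rather than the repo's actual `AuxFreeOlmoeSparseMoeBlock`, and OLMoE's top-8 routing is assumed.

```python
import torch

class AuxFreeRouter(torch.nn.Module):
    """Sketch of auxiliary-loss-free MoE routing (DeepSeek-V3-style balancing)."""

    def __init__(self, hidden: int = 2048, n_experts: int = 64, top_k: int = 8,
                 bias_update_speed: float = 1e-4):
        super().__init__()
        self.gate = torch.nn.Linear(hidden, n_experts, bias=False)
        # Non-trainable per-expert bias, nudged by a balancing rule, not a loss.
        self.register_buffer("expert_bias", torch.zeros(n_experts))
        self.top_k = top_k
        self.update_speed = bias_update_speed

    def forward(self, x: torch.Tensor):
        scores = torch.softmax(self.gate(x), dim=-1)   # (tokens, n_experts)
        # The bias shifts *which* experts are selected, not the mixing weights.
        _, idx = torch.topk(scores + self.expert_bias, self.top_k, dim=-1)
        weights = scores.gather(-1, idx)
        if self.training:
            counts = torch.bincount(
                idx.flatten(), minlength=self.expert_bias.numel()
            ).to(scores.dtype)
            # Boost underloaded experts, damp overloaded ones.
            self.expert_bias += self.update_speed * torch.sign(counts.mean() - counts)
        return idx, weights
```

Because the bias only steers selection while the mixing weights still come from the unbiased softmax, load balancing happens without an extra gradient term; at an update speed of 0.0001, the all-zero ranges reported to four decimal places above are plausible for biases that had barely moved.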
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0457d8c6c5097c2ae04030c0d90d460c059b5425f9af93a7b7c6044ce1c5d671
+ oid sha256:c4c17d8b3e16f73e3940542bc301a4fbc3e7f443cf3d5c692fc68fa33d313774
  size 7544
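`training_args.bin` is a pickled copy of the SFTConfig dumped in the log, so the checkpoint's hyperparameters can be inspected without parsing the log. A sketch (recent PyTorch defaults to `weights_only=True`, so loading an arbitrary pickled object needs the flag below and a trusted source):

```python
import torch
from trl import SFTConfig  # the pickled class must be importable

args = torch.load("training_args.bin", weights_only=False)
# Per the log: learning_rate=1e-05, num_train_epochs=1, seed=1234
print(args.learning_rate, args.num_train_epochs, args.seed)
```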