{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2703, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05549389567147614, "grad_norm": 15.3125, "learning_rate": 4.907510173880873e-05, "loss": 1.186, "mean_token_accuracy": 0.7049988703429699, "step": 50 }, { "epoch": 0.11098779134295228, "grad_norm": 7.0, "learning_rate": 4.8150203477617464e-05, "loss": 1.1463, "mean_token_accuracy": 0.7061308635771275, "step": 100 }, { "epoch": 0.16648168701442842, "grad_norm": 9.1875, "learning_rate": 4.72253052164262e-05, "loss": 0.9759, "mean_token_accuracy": 0.7407757295668125, "step": 150 }, { "epoch": 0.22197558268590456, "grad_norm": 11.0625, "learning_rate": 4.6300406955234926e-05, "loss": 1.0052, "mean_token_accuracy": 0.7320078010857105, "step": 200 }, { "epoch": 0.27746947835738067, "grad_norm": 4.90625, "learning_rate": 4.5375508694043653e-05, "loss": 1.0511, "mean_token_accuracy": 0.7330576995015144, "step": 250 }, { "epoch": 0.33296337402885684, "grad_norm": 9.125, "learning_rate": 4.445061043285239e-05, "loss": 0.9694, "mean_token_accuracy": 0.7509057518839836, "step": 300 }, { "epoch": 0.38845726970033295, "grad_norm": 6.9375, "learning_rate": 4.352571217166112e-05, "loss": 1.029, "mean_token_accuracy": 0.7393789207935333, "step": 350 }, { "epoch": 0.4439511653718091, "grad_norm": 9.25, "learning_rate": 4.260081391046985e-05, "loss": 0.9199, "mean_token_accuracy": 0.760094360858202, "step": 400 }, { "epoch": 0.49944506104328523, "grad_norm": 9.125, "learning_rate": 4.167591564927858e-05, "loss": 0.9908, "mean_token_accuracy": 0.7447640253603458, "step": 450 }, { "epoch": 0.5549389567147613, "grad_norm": 8.25, "learning_rate": 4.075101738808732e-05, "loss": 0.8984, "mean_token_accuracy": 0.7630760133266449, "step": 500 }, { "epoch": 0.6104328523862376, "grad_norm": 9.1875, "learning_rate": 3.9826119126896046e-05, "loss": 0.9407, "mean_token_accuracy": 0.757173229753971, "step": 550 }, { "epoch": 0.6659267480577137, "grad_norm": 6.34375, "learning_rate": 3.890122086570477e-05, "loss": 0.9575, "mean_token_accuracy": 0.7462678501009941, "step": 600 }, { "epoch": 0.7214206437291898, "grad_norm": 7.40625, "learning_rate": 3.79763226045135e-05, "loss": 0.8748, "mean_token_accuracy": 0.7648530495911836, "step": 650 }, { "epoch": 0.7769145394006659, "grad_norm": 5.90625, "learning_rate": 3.705142434332224e-05, "loss": 0.9121, "mean_token_accuracy": 0.7616761503368616, "step": 700 }, { "epoch": 0.832408435072142, "grad_norm": 5.21875, "learning_rate": 3.612652608213097e-05, "loss": 0.8764, "mean_token_accuracy": 0.7672346414625645, "step": 750 }, { "epoch": 0.8879023307436182, "grad_norm": 5.5625, "learning_rate": 3.5201627820939697e-05, "loss": 0.8334, "mean_token_accuracy": 0.7711550496518612, "step": 800 }, { "epoch": 0.9433962264150944, "grad_norm": 4.90625, "learning_rate": 3.4276729559748424e-05, "loss": 0.8035, "mean_token_accuracy": 0.7846515120565891, "step": 850 }, { "epoch": 0.9988901220865705, "grad_norm": 4.90625, "learning_rate": 3.3351831298557165e-05, "loss": 0.784, "mean_token_accuracy": 0.7859167304635047, "step": 900 }, { "epoch": 1.0543840177580466, "grad_norm": 6.34375, "learning_rate": 3.242693303736589e-05, "loss": 0.4242, "mean_token_accuracy": 0.8643624025583267, "step": 950 }, { "epoch": 1.1098779134295227, "grad_norm": 6.46875, "learning_rate": 3.150203477617462e-05, "loss": 0.4209, "mean_token_accuracy": 0.8601007984578609, "step": 1000 }, { "epoch": 1.1653718091009988, "grad_norm": 3.734375, "learning_rate": 3.0577136514983354e-05, "loss": 0.4225, "mean_token_accuracy": 0.8565894016623496, "step": 1050 }, { "epoch": 1.220865704772475, "grad_norm": 8.3125, "learning_rate": 2.9652238253792085e-05, "loss": 0.3897, "mean_token_accuracy": 0.8680702060461044, "step": 1100 }, { "epoch": 1.2763596004439512, "grad_norm": 4.03125, "learning_rate": 2.8727339992600816e-05, "loss": 0.43, "mean_token_accuracy": 0.8627835646271705, "step": 1150 }, { "epoch": 1.3318534961154274, "grad_norm": 4.09375, "learning_rate": 2.7802441731409544e-05, "loss": 0.4254, "mean_token_accuracy": 0.8601828774809838, "step": 1200 }, { "epoch": 1.3873473917869035, "grad_norm": 3.953125, "learning_rate": 2.6877543470218275e-05, "loss": 0.404, "mean_token_accuracy": 0.8627345436811447, "step": 1250 }, { "epoch": 1.4428412874583796, "grad_norm": 3.796875, "learning_rate": 2.595264520902701e-05, "loss": 0.406, "mean_token_accuracy": 0.8670766687393189, "step": 1300 }, { "epoch": 1.4983351831298557, "grad_norm": 7.875, "learning_rate": 2.502774694783574e-05, "loss": 0.4123, "mean_token_accuracy": 0.8563414543867112, "step": 1350 }, { "epoch": 1.5538290788013318, "grad_norm": 4.5625, "learning_rate": 2.410284868664447e-05, "loss": 0.3906, "mean_token_accuracy": 0.8690173977613449, "step": 1400 }, { "epoch": 1.609322974472808, "grad_norm": 3.828125, "learning_rate": 2.31779504254532e-05, "loss": 0.3896, "mean_token_accuracy": 0.8703913018107414, "step": 1450 }, { "epoch": 1.6648168701442843, "grad_norm": 4.03125, "learning_rate": 2.2253052164261932e-05, "loss": 0.3916, "mean_token_accuracy": 0.8693778309226036, "step": 1500 }, { "epoch": 1.7203107658157601, "grad_norm": 4.4375, "learning_rate": 2.1328153903070663e-05, "loss": 0.3955, "mean_token_accuracy": 0.8697583265602589, "step": 1550 }, { "epoch": 1.7758046614872365, "grad_norm": 5.09375, "learning_rate": 2.0403255641879394e-05, "loss": 0.4101, "mean_token_accuracy": 0.8629062753915787, "step": 1600 }, { "epoch": 1.8312985571587126, "grad_norm": 3.59375, "learning_rate": 1.9478357380688125e-05, "loss": 0.3577, "mean_token_accuracy": 0.8781416714191437, "step": 1650 }, { "epoch": 1.8867924528301887, "grad_norm": 2.609375, "learning_rate": 1.8553459119496856e-05, "loss": 0.3991, "mean_token_accuracy": 0.8602015821635723, "step": 1700 }, { "epoch": 1.9422863485016648, "grad_norm": 3.5, "learning_rate": 1.7628560858305587e-05, "loss": 0.3717, "mean_token_accuracy": 0.8744880908727646, "step": 1750 }, { "epoch": 1.997780244173141, "grad_norm": 3.671875, "learning_rate": 1.670366259711432e-05, "loss": 0.3798, "mean_token_accuracy": 0.8699861750006675, "step": 1800 }, { "epoch": 2.0532741398446173, "grad_norm": 4.3125, "learning_rate": 1.577876433592305e-05, "loss": 0.156, "mean_token_accuracy": 0.9473638749122619, "step": 1850 }, { "epoch": 2.108768035516093, "grad_norm": 3.71875, "learning_rate": 1.4853866074731781e-05, "loss": 0.1442, "mean_token_accuracy": 0.9488543626666069, "step": 1900 }, { "epoch": 2.1642619311875695, "grad_norm": 4.96875, "learning_rate": 1.392896781354051e-05, "loss": 0.1507, "mean_token_accuracy": 0.9458903276920319, "step": 1950 }, { "epoch": 2.2197558268590454, "grad_norm": 3.734375, "learning_rate": 1.3004069552349243e-05, "loss": 0.1485, "mean_token_accuracy": 0.9459331405162811, "step": 2000 }, { "epoch": 2.2752497225305217, "grad_norm": 4.40625, "learning_rate": 1.2079171291157974e-05, "loss": 0.137, "mean_token_accuracy": 0.946778584420681, "step": 2050 }, { "epoch": 2.3307436182019976, "grad_norm": 4.65625, "learning_rate": 1.1154273029966705e-05, "loss": 0.1302, "mean_token_accuracy": 0.9501716086268425, "step": 2100 }, { "epoch": 2.386237513873474, "grad_norm": 3.875, "learning_rate": 1.0229374768775436e-05, "loss": 0.1487, "mean_token_accuracy": 0.9474012264609337, "step": 2150 }, { "epoch": 2.44173140954495, "grad_norm": 6.1875, "learning_rate": 9.304476507584166e-06, "loss": 0.1371, "mean_token_accuracy": 0.9514653950929641, "step": 2200 }, { "epoch": 2.497225305216426, "grad_norm": 6.1875, "learning_rate": 8.379578246392897e-06, "loss": 0.1384, "mean_token_accuracy": 0.9439902834594249, "step": 2250 }, { "epoch": 2.5527192008879025, "grad_norm": 2.71875, "learning_rate": 7.454679985201627e-06, "loss": 0.1357, "mean_token_accuracy": 0.9507204574346543, "step": 2300 }, { "epoch": 2.6082130965593784, "grad_norm": 5.71875, "learning_rate": 6.529781724010358e-06, "loss": 0.1284, "mean_token_accuracy": 0.9505923983454704, "step": 2350 }, { "epoch": 2.6637069922308547, "grad_norm": 2.90625, "learning_rate": 5.60488346281909e-06, "loss": 0.1327, "mean_token_accuracy": 0.9514957949519157, "step": 2400 }, { "epoch": 2.7192008879023306, "grad_norm": 3.796875, "learning_rate": 4.679985201627821e-06, "loss": 0.1422, "mean_token_accuracy": 0.950286613702774, "step": 2450 }, { "epoch": 2.774694783573807, "grad_norm": 4.03125, "learning_rate": 3.7550869404365522e-06, "loss": 0.1351, "mean_token_accuracy": 0.9470736330747604, "step": 2500 }, { "epoch": 2.830188679245283, "grad_norm": 5.71875, "learning_rate": 2.830188679245283e-06, "loss": 0.1247, "mean_token_accuracy": 0.9515706104040146, "step": 2550 }, { "epoch": 2.885682574916759, "grad_norm": 4.28125, "learning_rate": 1.9052904180540142e-06, "loss": 0.1368, "mean_token_accuracy": 0.9486922469735145, "step": 2600 }, { "epoch": 2.9411764705882355, "grad_norm": 3.796875, "learning_rate": 9.80392156862745e-07, "loss": 0.1282, "mean_token_accuracy": 0.9518065723776817, "step": 2650 }, { "epoch": 2.9966703662597114, "grad_norm": 4.15625, "learning_rate": 5.549389567147614e-08, "loss": 0.1399, "mean_token_accuracy": 0.9513461437821388, "step": 2700 } ], "logging_steps": 50, "max_steps": 2703, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0206819147904e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }