diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4984 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9987389659520807, + "eval_steps": 500, + "global_step": 495, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00201765447667087, + "grad_norm": 42.0, + "learning_rate": 0.0, + "loss": 3.6982, + "memory/device_mem_reserved(gib)": 49.98, + "memory/max_mem_active(gib)": 45.11, + "memory/max_mem_allocated(gib)": 45.11, + "step": 1 + }, + { + "epoch": 0.00403530895334174, + "grad_norm": 43.0, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.8197, + "memory/device_mem_reserved(gib)": 50.78, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 2 + }, + { + "epoch": 0.00605296343001261, + "grad_norm": 43.5, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.8927, + "memory/device_mem_reserved(gib)": 50.78, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 3 + }, + { + "epoch": 0.00807061790668348, + "grad_norm": 44.5, + "learning_rate": 6.000000000000001e-07, + "loss": 3.7968, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 4 + }, + { + "epoch": 0.01008827238335435, + "grad_norm": 42.25, + "learning_rate": 8.000000000000001e-07, + "loss": 3.8159, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 5 + }, + { + "epoch": 0.01210592686002522, + "grad_norm": 39.5, + "learning_rate": 1.0000000000000002e-06, + "loss": 3.8933, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 6 + }, + { + "epoch": 0.01412358133669609, + "grad_norm": 42.0, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.8945, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 7 + }, + { + "epoch": 0.01614123581336696, + "grad_norm": 44.5, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.9202, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 8 + }, + { + "epoch": 0.01815889029003783, + "grad_norm": 44.5, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.8472, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 9 + }, + { + "epoch": 0.0201765447667087, + "grad_norm": 44.25, + "learning_rate": 1.8000000000000001e-06, + "loss": 3.9382, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 10 + }, + { + "epoch": 0.02219419924337957, + "grad_norm": 46.75, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.8074, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 11 + }, + { + "epoch": 0.02421185372005044, + "grad_norm": 43.0, + "learning_rate": 2.2e-06, + "loss": 3.9049, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 12 + }, + { + "epoch": 0.02622950819672131, + "grad_norm": 46.25, + "learning_rate": 2.4000000000000003e-06, + "loss": 3.8222, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 13 + }, + { + "epoch": 0.02824716267339218, + "grad_norm": 39.75, + "learning_rate": 2.6e-06, + "loss": 3.7079, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 14 + }, + { + "epoch": 0.03026481715006305, + "grad_norm": 39.25, + "learning_rate": 2.8000000000000003e-06, + "loss": 3.6677, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 15 + }, + { + "epoch": 0.03228247162673392, + "grad_norm": 46.75, + "learning_rate": 3e-06, + "loss": 3.6726, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 16 + }, + { + "epoch": 0.03430012610340479, + "grad_norm": 43.0, + "learning_rate": 3.2000000000000003e-06, + "loss": 3.7075, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 17 + }, + { + "epoch": 0.03631778058007566, + "grad_norm": 47.25, + "learning_rate": 3.4000000000000005e-06, + "loss": 3.7656, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 18 + }, + { + "epoch": 0.03833543505674653, + "grad_norm": 42.25, + "learning_rate": 3.6000000000000003e-06, + "loss": 3.6015, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 19 + }, + { + "epoch": 0.0403530895334174, + "grad_norm": 45.25, + "learning_rate": 3.8000000000000005e-06, + "loss": 3.5949, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 20 + }, + { + "epoch": 0.04237074401008827, + "grad_norm": 43.5, + "learning_rate": 4.000000000000001e-06, + "loss": 3.514, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 21 + }, + { + "epoch": 0.04438839848675914, + "grad_norm": 44.75, + "learning_rate": 4.2000000000000004e-06, + "loss": 3.3632, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 22 + }, + { + "epoch": 0.04640605296343001, + "grad_norm": 40.75, + "learning_rate": 4.4e-06, + "loss": 3.4502, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 23 + }, + { + "epoch": 0.04842370744010088, + "grad_norm": 33.25, + "learning_rate": 4.600000000000001e-06, + "loss": 3.3191, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 24 + }, + { + "epoch": 0.05044136191677175, + "grad_norm": 39.0, + "learning_rate": 4.800000000000001e-06, + "loss": 3.3536, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 25 + }, + { + "epoch": 0.05245901639344262, + "grad_norm": 27.625, + "learning_rate": 5e-06, + "loss": 3.3645, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 26 + }, + { + "epoch": 0.05447667087011349, + "grad_norm": 26.375, + "learning_rate": 4.999944151382673e-06, + "loss": 3.2603, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 27 + }, + { + "epoch": 0.05649432534678436, + "grad_norm": 20.375, + "learning_rate": 4.999776608025946e-06, + "loss": 3.2394, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 28 + }, + { + "epoch": 0.05851197982345523, + "grad_norm": 20.375, + "learning_rate": 4.99949737741547e-06, + "loss": 3.237, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 29 + }, + { + "epoch": 0.0605296343001261, + "grad_norm": 14.25, + "learning_rate": 4.99910647202696e-06, + "loss": 3.0697, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 30 + }, + { + "epoch": 0.06254728877679698, + "grad_norm": 12.4375, + "learning_rate": 4.998603909325636e-06, + "loss": 3.2899, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 31 + }, + { + "epoch": 0.06456494325346784, + "grad_norm": 11.25, + "learning_rate": 4.997989711765447e-06, + "loss": 3.1902, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 32 + }, + { + "epoch": 0.06658259773013872, + "grad_norm": 7.5625, + "learning_rate": 4.9972639067880555e-06, + "loss": 3.0802, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 33 + }, + { + "epoch": 0.06860025220680958, + "grad_norm": 6.5, + "learning_rate": 4.996426526821629e-06, + "loss": 3.1234, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 34 + }, + { + "epoch": 0.07061790668348046, + "grad_norm": 6.25, + "learning_rate": 4.9954776092793755e-06, + "loss": 3.1298, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 35 + }, + { + "epoch": 0.07263556116015132, + "grad_norm": 5.9375, + "learning_rate": 4.994417196557884e-06, + "loss": 2.9955, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 36 + }, + { + "epoch": 0.0746532156368222, + "grad_norm": 5.46875, + "learning_rate": 4.993245336035219e-06, + "loss": 3.1321, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 37 + }, + { + "epoch": 0.07667087011349306, + "grad_norm": 3.875, + "learning_rate": 4.991962080068813e-06, + "loss": 3.0705, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 38 + }, + { + "epoch": 0.07868852459016394, + "grad_norm": 4.40625, + "learning_rate": 4.990567485993125e-06, + "loss": 3.072, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 39 + }, + { + "epoch": 0.0807061790668348, + "grad_norm": 3.859375, + "learning_rate": 4.989061616117073e-06, + "loss": 3.2298, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 40 + }, + { + "epoch": 0.08272383354350568, + "grad_norm": 3.46875, + "learning_rate": 4.98744453772126e-06, + "loss": 3.014, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 41 + }, + { + "epoch": 0.08474148802017654, + "grad_norm": 3.25, + "learning_rate": 4.985716323054959e-06, + "loss": 3.0271, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 42 + }, + { + "epoch": 0.08675914249684742, + "grad_norm": 3.40625, + "learning_rate": 4.983877049332889e-06, + "loss": 3.0976, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 43 + }, + { + "epoch": 0.08877679697351828, + "grad_norm": 2.671875, + "learning_rate": 4.981926798731767e-06, + "loss": 2.959, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 44 + }, + { + "epoch": 0.09079445145018916, + "grad_norm": 3.0, + "learning_rate": 4.97986565838663e-06, + "loss": 2.9811, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 45 + }, + { + "epoch": 0.09281210592686003, + "grad_norm": 2.6875, + "learning_rate": 4.977693720386951e-06, + "loss": 2.9736, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 46 + }, + { + "epoch": 0.0948297604035309, + "grad_norm": 2.203125, + "learning_rate": 4.975411081772516e-06, + "loss": 3.0368, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 47 + }, + { + "epoch": 0.09684741488020177, + "grad_norm": 1.96875, + "learning_rate": 4.9730178445290945e-06, + "loss": 3.0042, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 48 + }, + { + "epoch": 0.09886506935687264, + "grad_norm": 1.84375, + "learning_rate": 4.970514115583878e-06, + "loss": 3.0294, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 49 + }, + { + "epoch": 0.1008827238335435, + "grad_norm": 1.8359375, + "learning_rate": 4.967900006800708e-06, + "loss": 3.0115, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 50 + }, + { + "epoch": 0.10290037831021438, + "grad_norm": 1.78125, + "learning_rate": 4.965175634975072e-06, + "loss": 3.0661, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 51 + }, + { + "epoch": 0.10491803278688525, + "grad_norm": 1.6015625, + "learning_rate": 4.96234112182889e-06, + "loss": 3.0006, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 52 + }, + { + "epoch": 0.10693568726355612, + "grad_norm": 1.5859375, + "learning_rate": 4.959396594005073e-06, + "loss": 3.0469, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 53 + }, + { + "epoch": 0.10895334174022699, + "grad_norm": 1.453125, + "learning_rate": 4.95634218306187e-06, + "loss": 3.0325, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 54 + }, + { + "epoch": 0.11097099621689786, + "grad_norm": 1.296875, + "learning_rate": 4.953178025466981e-06, + "loss": 2.9788, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 55 + }, + { + "epoch": 0.11298865069356873, + "grad_norm": 1.3984375, + "learning_rate": 4.949904262591467e-06, + "loss": 3.0148, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 56 + }, + { + "epoch": 0.1150063051702396, + "grad_norm": 1.3828125, + "learning_rate": 4.946521040703434e-06, + "loss": 2.9016, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 57 + }, + { + "epoch": 0.11702395964691047, + "grad_norm": 1.21875, + "learning_rate": 4.943028510961492e-06, + "loss": 3.1604, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 58 + }, + { + "epoch": 0.11904161412358134, + "grad_norm": 1.2109375, + "learning_rate": 4.939426829408008e-06, + "loss": 2.9688, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 59 + }, + { + "epoch": 0.1210592686002522, + "grad_norm": 1.203125, + "learning_rate": 4.9357161569621275e-06, + "loss": 2.8605, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 60 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 0.93359375, + "learning_rate": 4.931896659412593e-06, + "loss": 3.0057, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 61 + }, + { + "epoch": 0.12509457755359396, + "grad_norm": 0.96484375, + "learning_rate": 4.92796850741033e-06, + "loss": 2.9342, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 62 + }, + { + "epoch": 0.1271122320302648, + "grad_norm": 0.9921875, + "learning_rate": 4.9239318764608245e-06, + "loss": 2.949, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 63 + }, + { + "epoch": 0.1291298865069357, + "grad_norm": 0.95703125, + "learning_rate": 4.919786946916282e-06, + "loss": 2.9372, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 64 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.79296875, + "learning_rate": 4.91553390396757e-06, + "loss": 2.9564, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 65 + }, + { + "epoch": 0.13316519546027744, + "grad_norm": 0.90234375, + "learning_rate": 4.911172937635942e-06, + "loss": 2.9389, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 66 + }, + { + "epoch": 0.1351828499369483, + "grad_norm": 0.79296875, + "learning_rate": 4.906704242764551e-06, + "loss": 2.8962, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 67 + }, + { + "epoch": 0.13720050441361917, + "grad_norm": 0.76953125, + "learning_rate": 4.902128019009741e-06, + "loss": 2.9853, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 68 + }, + { + "epoch": 0.13921815889029004, + "grad_norm": 0.77734375, + "learning_rate": 4.8974444708321265e-06, + "loss": 2.9407, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 69 + }, + { + "epoch": 0.14123581336696092, + "grad_norm": 0.78125, + "learning_rate": 4.892653807487461e-06, + "loss": 2.9395, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 70 + }, + { + "epoch": 0.14325346784363177, + "grad_norm": 0.72265625, + "learning_rate": 4.887756243017282e-06, + "loss": 3.0041, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 71 + }, + { + "epoch": 0.14527112232030265, + "grad_norm": 0.63671875, + "learning_rate": 4.882751996239352e-06, + "loss": 2.9692, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 72 + }, + { + "epoch": 0.14728877679697353, + "grad_norm": 0.6875, + "learning_rate": 4.8776412907378845e-06, + "loss": 2.9279, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 73 + }, + { + "epoch": 0.1493064312736444, + "grad_norm": 0.7109375, + "learning_rate": 4.872424354853545e-06, + "loss": 2.9776, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 74 + }, + { + "epoch": 0.15132408575031525, + "grad_norm": 0.80859375, + "learning_rate": 4.867101421673261e-06, + "loss": 3.0093, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 75 + }, + { + "epoch": 0.15334174022698613, + "grad_norm": 0.62890625, + "learning_rate": 4.861672729019798e-06, + "loss": 2.9489, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 76 + }, + { + "epoch": 0.155359394703657, + "grad_norm": 0.6171875, + "learning_rate": 4.856138519441137e-06, + "loss": 2.8465, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 77 + }, + { + "epoch": 0.15737704918032788, + "grad_norm": 0.7265625, + "learning_rate": 4.8504990401996434e-06, + "loss": 2.9363, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 78 + }, + { + "epoch": 0.15939470365699873, + "grad_norm": 0.53125, + "learning_rate": 4.8447545432610095e-06, + "loss": 2.8965, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 79 + }, + { + "epoch": 0.1614123581336696, + "grad_norm": 0.61328125, + "learning_rate": 4.8389052852830055e-06, + "loss": 2.96, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 80 + }, + { + "epoch": 0.16343001261034049, + "grad_norm": 0.578125, + "learning_rate": 4.832951527604007e-06, + "loss": 2.9506, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 81 + }, + { + "epoch": 0.16544766708701136, + "grad_norm": 0.59375, + "learning_rate": 4.826893536231322e-06, + "loss": 2.8551, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 82 + }, + { + "epoch": 0.1674653215636822, + "grad_norm": 0.625, + "learning_rate": 4.820731581829303e-06, + "loss": 3.0159, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 83 + }, + { + "epoch": 0.1694829760403531, + "grad_norm": 0.5234375, + "learning_rate": 4.814465939707259e-06, + "loss": 2.9186, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 84 + }, + { + "epoch": 0.17150063051702397, + "grad_norm": 0.51171875, + "learning_rate": 4.808096889807147e-06, + "loss": 3.0161, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 85 + }, + { + "epoch": 0.17351828499369484, + "grad_norm": 0.57421875, + "learning_rate": 4.801624716691072e-06, + "loss": 2.9589, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 86 + }, + { + "epoch": 0.1755359394703657, + "grad_norm": 0.5625, + "learning_rate": 4.795049709528571e-06, + "loss": 2.9643, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 87 + }, + { + "epoch": 0.17755359394703657, + "grad_norm": 0.578125, + "learning_rate": 4.78837216208369e-06, + "loss": 2.9542, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 88 + }, + { + "epoch": 0.17957124842370745, + "grad_norm": 0.50390625, + "learning_rate": 4.7815923727018625e-06, + "loss": 2.9755, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 89 + }, + { + "epoch": 0.18158890290037832, + "grad_norm": 0.54296875, + "learning_rate": 4.774710644296579e-06, + "loss": 2.9595, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 90 + }, + { + "epoch": 0.18360655737704917, + "grad_norm": 0.48828125, + "learning_rate": 4.767727284335852e-06, + "loss": 2.9172, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 91 + }, + { + "epoch": 0.18562421185372005, + "grad_norm": 0.515625, + "learning_rate": 4.760642604828482e-06, + "loss": 2.9182, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 92 + }, + { + "epoch": 0.18764186633039093, + "grad_norm": 0.59375, + "learning_rate": 4.753456922310109e-06, + "loss": 2.9225, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 93 + }, + { + "epoch": 0.1896595208070618, + "grad_norm": 0.515625, + "learning_rate": 4.746170557829084e-06, + "loss": 2.8674, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 94 + }, + { + "epoch": 0.19167717528373265, + "grad_norm": 0.4921875, + "learning_rate": 4.738783836932109e-06, + "loss": 2.8713, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 95 + }, + { + "epoch": 0.19369482976040353, + "grad_norm": 0.478515625, + "learning_rate": 4.731297089649704e-06, + "loss": 3.0112, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 96 + }, + { + "epoch": 0.1957124842370744, + "grad_norm": 0.46875, + "learning_rate": 4.723710650481456e-06, + "loss": 2.9569, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 97 + }, + { + "epoch": 0.19773013871374528, + "grad_norm": 0.48828125, + "learning_rate": 4.7160248583810755e-06, + "loss": 2.9466, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 98 + }, + { + "epoch": 0.19974779319041613, + "grad_norm": 0.48828125, + "learning_rate": 4.708240056741253e-06, + "loss": 2.9399, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 99 + }, + { + "epoch": 0.201765447667087, + "grad_norm": 0.51171875, + "learning_rate": 4.700356593378312e-06, + "loss": 2.9445, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 100 + }, + { + "epoch": 0.2037831021437579, + "grad_norm": 0.51953125, + "learning_rate": 4.692374820516679e-06, + "loss": 2.9657, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 101 + }, + { + "epoch": 0.20580075662042877, + "grad_norm": 0.53125, + "learning_rate": 4.684295094773134e-06, + "loss": 3.0236, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 102 + }, + { + "epoch": 0.20781841109709961, + "grad_norm": 0.470703125, + "learning_rate": 4.676117777140887e-06, + "loss": 2.9442, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 103 + }, + { + "epoch": 0.2098360655737705, + "grad_norm": 0.490234375, + "learning_rate": 4.667843232973444e-06, + "loss": 2.9491, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 104 + }, + { + "epoch": 0.21185372005044137, + "grad_norm": 0.5390625, + "learning_rate": 4.659471831968285e-06, + "loss": 3.0061, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 105 + }, + { + "epoch": 0.21387137452711225, + "grad_norm": 0.474609375, + "learning_rate": 4.651003948150349e-06, + "loss": 2.8666, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 106 + }, + { + "epoch": 0.2158890290037831, + "grad_norm": 0.4375, + "learning_rate": 4.642439959855316e-06, + "loss": 2.9507, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 107 + }, + { + "epoch": 0.21790668348045397, + "grad_norm": 0.48828125, + "learning_rate": 4.633780249712712e-06, + "loss": 2.8984, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 108 + }, + { + "epoch": 0.21992433795712485, + "grad_norm": 0.421875, + "learning_rate": 4.625025204628806e-06, + "loss": 2.8593, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 109 + }, + { + "epoch": 0.22194199243379573, + "grad_norm": 0.470703125, + "learning_rate": 4.616175215769328e-06, + "loss": 2.8705, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 110 + }, + { + "epoch": 0.22395964691046658, + "grad_norm": 0.482421875, + "learning_rate": 4.607230678541993e-06, + "loss": 2.916, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 111 + }, + { + "epoch": 0.22597730138713745, + "grad_norm": 0.39453125, + "learning_rate": 4.5981919925788285e-06, + "loss": 2.921, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 112 + }, + { + "epoch": 0.22799495586380833, + "grad_norm": 0.40625, + "learning_rate": 4.5890595617183254e-06, + "loss": 2.8851, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 113 + }, + { + "epoch": 0.2300126103404792, + "grad_norm": 0.443359375, + "learning_rate": 4.579833793987393e-06, + "loss": 2.9397, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 114 + }, + { + "epoch": 0.23203026481715006, + "grad_norm": 0.47265625, + "learning_rate": 4.570515101583128e-06, + "loss": 2.8029, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 115 + }, + { + "epoch": 0.23404791929382093, + "grad_norm": 0.447265625, + "learning_rate": 4.561103900854401e-06, + "loss": 2.9667, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 116 + }, + { + "epoch": 0.2360655737704918, + "grad_norm": 0.435546875, + "learning_rate": 4.551600612283249e-06, + "loss": 3.009, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 117 + }, + { + "epoch": 0.2380832282471627, + "grad_norm": 0.427734375, + "learning_rate": 4.542005660466095e-06, + "loss": 2.7894, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 118 + }, + { + "epoch": 0.24010088272383354, + "grad_norm": 0.45703125, + "learning_rate": 4.532319474094769e-06, + "loss": 2.9377, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 119 + }, + { + "epoch": 0.2421185372005044, + "grad_norm": 0.4609375, + "learning_rate": 4.522542485937369e-06, + "loss": 2.8978, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 120 + }, + { + "epoch": 0.2441361916771753, + "grad_norm": 0.45703125, + "learning_rate": 4.512675132818908e-06, + "loss": 2.9091, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 121 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 0.4375, + "learning_rate": 4.5027178556018095e-06, + "loss": 2.9647, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 122 + }, + { + "epoch": 0.24817150063051702, + "grad_norm": 0.431640625, + "learning_rate": 4.492671099166204e-06, + "loss": 2.94, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 123 + }, + { + "epoch": 0.2501891551071879, + "grad_norm": 0.412109375, + "learning_rate": 4.482535312390059e-06, + "loss": 2.8902, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 124 + }, + { + "epoch": 0.25220680958385877, + "grad_norm": 0.421875, + "learning_rate": 4.472310948129113e-06, + "loss": 3.0486, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 125 + }, + { + "epoch": 0.2542244640605296, + "grad_norm": 0.47265625, + "learning_rate": 4.461998463196653e-06, + "loss": 2.848, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 126 + }, + { + "epoch": 0.2562421185372005, + "grad_norm": 0.447265625, + "learning_rate": 4.451598318343099e-06, + "loss": 2.9275, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 127 + }, + { + "epoch": 0.2582597730138714, + "grad_norm": 0.443359375, + "learning_rate": 4.441110978235419e-06, + "loss": 3.0023, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 128 + }, + { + "epoch": 0.2602774274905422, + "grad_norm": 0.421875, + "learning_rate": 4.430536911436368e-06, + "loss": 2.8752, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 129 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.447265625, + "learning_rate": 4.419876590383554e-06, + "loss": 2.9107, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 130 + }, + { + "epoch": 0.264312736443884, + "grad_norm": 0.376953125, + "learning_rate": 4.409130491368331e-06, + "loss": 2.9529, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 131 + }, + { + "epoch": 0.2663303909205549, + "grad_norm": 0.400390625, + "learning_rate": 4.398299094514515e-06, + "loss": 2.9129, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 132 + }, + { + "epoch": 0.26834804539722573, + "grad_norm": 0.376953125, + "learning_rate": 4.387382883756938e-06, + "loss": 2.9802, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 133 + }, + { + "epoch": 0.2703656998738966, + "grad_norm": 0.39453125, + "learning_rate": 4.37638234681982e-06, + "loss": 2.8098, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 134 + }, + { + "epoch": 0.2723833543505675, + "grad_norm": 0.365234375, + "learning_rate": 4.365297975194984e-06, + "loss": 2.9357, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 135 + }, + { + "epoch": 0.27440100882723834, + "grad_norm": 0.48828125, + "learning_rate": 4.354130264119894e-06, + "loss": 2.9512, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 136 + }, + { + "epoch": 0.2764186633039092, + "grad_norm": 0.52734375, + "learning_rate": 4.342879712555528e-06, + "loss": 2.9478, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 137 + }, + { + "epoch": 0.2784363177805801, + "grad_norm": 0.423828125, + "learning_rate": 4.331546823164083e-06, + "loss": 3.0044, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 138 + }, + { + "epoch": 0.28045397225725094, + "grad_norm": 0.388671875, + "learning_rate": 4.320132102286524e-06, + "loss": 2.9148, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 139 + }, + { + "epoch": 0.28247162673392184, + "grad_norm": 0.36328125, + "learning_rate": 4.308636059919952e-06, + "loss": 2.8966, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 140 + }, + { + "epoch": 0.2844892812105927, + "grad_norm": 0.40625, + "learning_rate": 4.297059209694824e-06, + "loss": 2.7564, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 141 + }, + { + "epoch": 0.28650693568726354, + "grad_norm": 0.400390625, + "learning_rate": 4.2854020688520025e-06, + "loss": 2.809, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 142 + }, + { + "epoch": 0.28852459016393445, + "grad_norm": 0.416015625, + "learning_rate": 4.273665158219645e-06, + "loss": 2.9237, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 143 + }, + { + "epoch": 0.2905422446406053, + "grad_norm": 0.3828125, + "learning_rate": 4.261849002189939e-06, + "loss": 2.9274, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 144 + }, + { + "epoch": 0.29255989911727615, + "grad_norm": 0.40234375, + "learning_rate": 4.249954128695662e-06, + "loss": 2.909, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 145 + }, + { + "epoch": 0.29457755359394705, + "grad_norm": 0.359375, + "learning_rate": 4.237981069186606e-06, + "loss": 2.8917, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 146 + }, + { + "epoch": 0.2965952080706179, + "grad_norm": 0.39453125, + "learning_rate": 4.225930358605827e-06, + "loss": 2.8702, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 147 + }, + { + "epoch": 0.2986128625472888, + "grad_norm": 0.373046875, + "learning_rate": 4.213802535365741e-06, + "loss": 2.9231, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 148 + }, + { + "epoch": 0.30063051702395965, + "grad_norm": 0.376953125, + "learning_rate": 4.201598141324078e-06, + "loss": 2.8554, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 149 + }, + { + "epoch": 0.3026481715006305, + "grad_norm": 0.376953125, + "learning_rate": 4.189317721759663e-06, + "loss": 2.8243, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 150 + }, + { + "epoch": 0.3046658259773014, + "grad_norm": 0.400390625, + "learning_rate": 4.176961825348059e-06, + "loss": 3.031, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 151 + }, + { + "epoch": 0.30668348045397226, + "grad_norm": 0.37109375, + "learning_rate": 4.16453100413705e-06, + "loss": 2.8808, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 152 + }, + { + "epoch": 0.3087011349306431, + "grad_norm": 0.36328125, + "learning_rate": 4.152025813521976e-06, + "loss": 2.8957, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 153 + }, + { + "epoch": 0.310718789407314, + "grad_norm": 0.400390625, + "learning_rate": 4.1394468122209245e-06, + "loss": 3.0874, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 154 + }, + { + "epoch": 0.31273644388398486, + "grad_norm": 0.392578125, + "learning_rate": 4.1267945622497566e-06, + "loss": 2.9214, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 155 + }, + { + "epoch": 0.31475409836065577, + "grad_norm": 0.37890625, + "learning_rate": 4.114069628897006e-06, + "loss": 2.9121, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 156 + }, + { + "epoch": 0.3167717528373266, + "grad_norm": 0.388671875, + "learning_rate": 4.101272580698621e-06, + "loss": 2.944, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 157 + }, + { + "epoch": 0.31878940731399746, + "grad_norm": 0.40234375, + "learning_rate": 4.08840398941256e-06, + "loss": 2.928, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 158 + }, + { + "epoch": 0.32080706179066837, + "grad_norm": 0.390625, + "learning_rate": 4.075464429993244e-06, + "loss": 2.8135, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 159 + }, + { + "epoch": 0.3228247162673392, + "grad_norm": 0.369140625, + "learning_rate": 4.0624544805658795e-06, + "loss": 2.93, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 160 + }, + { + "epoch": 0.32484237074401007, + "grad_norm": 0.3515625, + "learning_rate": 4.049374722400613e-06, + "loss": 2.7283, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 161 + }, + { + "epoch": 0.32686002522068097, + "grad_norm": 0.380859375, + "learning_rate": 4.0362257398865715e-06, + "loss": 2.9056, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 162 + }, + { + "epoch": 0.3288776796973518, + "grad_norm": 0.37890625, + "learning_rate": 4.02300812050575e-06, + "loss": 2.9232, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 163 + }, + { + "epoch": 0.3308953341740227, + "grad_norm": 0.380859375, + "learning_rate": 4.009722454806762e-06, + "loss": 2.8507, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 164 + }, + { + "epoch": 0.3329129886506936, + "grad_norm": 0.388671875, + "learning_rate": 3.9963693363784544e-06, + "loss": 2.8988, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 165 + }, + { + "epoch": 0.3349306431273644, + "grad_norm": 0.35546875, + "learning_rate": 3.982949361823388e-06, + "loss": 2.8836, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 166 + }, + { + "epoch": 0.33694829760403533, + "grad_norm": 0.369140625, + "learning_rate": 3.969463130731183e-06, + "loss": 2.8122, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 167 + }, + { + "epoch": 0.3389659520807062, + "grad_norm": 0.361328125, + "learning_rate": 3.955911245651726e-06, + "loss": 2.9137, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 168 + }, + { + "epoch": 0.34098360655737703, + "grad_norm": 0.359375, + "learning_rate": 3.942294312068252e-06, + "loss": 2.9557, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 169 + }, + { + "epoch": 0.34300126103404793, + "grad_norm": 0.380859375, + "learning_rate": 3.928612938370292e-06, + "loss": 2.8614, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 170 + }, + { + "epoch": 0.3450189155107188, + "grad_norm": 0.376953125, + "learning_rate": 3.914867735826489e-06, + "loss": 2.9181, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 171 + }, + { + "epoch": 0.3470365699873897, + "grad_norm": 0.35546875, + "learning_rate": 3.901059318557287e-06, + "loss": 2.8829, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 172 + }, + { + "epoch": 0.34905422446406054, + "grad_norm": 0.421875, + "learning_rate": 3.8871883035074975e-06, + "loss": 2.9295, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 173 + }, + { + "epoch": 0.3510718789407314, + "grad_norm": 0.36328125, + "learning_rate": 3.87325531041873e-06, + "loss": 2.8678, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 174 + }, + { + "epoch": 0.3530895334174023, + "grad_norm": 0.376953125, + "learning_rate": 3.859260961801702e-06, + "loss": 2.871, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 175 + }, + { + "epoch": 0.35510718789407314, + "grad_norm": 0.3515625, + "learning_rate": 3.845205882908432e-06, + "loss": 2.9155, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 176 + }, + { + "epoch": 0.357124842370744, + "grad_norm": 0.341796875, + "learning_rate": 3.8310907017042966e-06, + "loss": 2.8222, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 177 + }, + { + "epoch": 0.3591424968474149, + "grad_norm": 0.37890625, + "learning_rate": 3.816916048839979e-06, + "loss": 2.9431, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 178 + }, + { + "epoch": 0.36116015132408574, + "grad_norm": 0.421875, + "learning_rate": 3.8026825576232906e-06, + "loss": 2.8602, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 179 + }, + { + "epoch": 0.36317780580075665, + "grad_norm": 0.380859375, + "learning_rate": 3.7883908639908752e-06, + "loss": 2.9468, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 180 + }, + { + "epoch": 0.3651954602774275, + "grad_norm": 0.365234375, + "learning_rate": 3.774041606479794e-06, + "loss": 2.9297, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 181 + }, + { + "epoch": 0.36721311475409835, + "grad_norm": 0.404296875, + "learning_rate": 3.759635426199001e-06, + "loss": 2.8849, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 182 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 0.373046875, + "learning_rate": 3.7451729668006974e-06, + "loss": 2.8075, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 183 + }, + { + "epoch": 0.3712484237074401, + "grad_norm": 0.359375, + "learning_rate": 3.730654874451569e-06, + "loss": 2.7931, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 184 + }, + { + "epoch": 0.37326607818411095, + "grad_norm": 0.375, + "learning_rate": 3.7160817978039256e-06, + "loss": 2.9012, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 185 + }, + { + "epoch": 0.37528373266078185, + "grad_norm": 0.34765625, + "learning_rate": 3.7014543879667097e-06, + "loss": 2.8396, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 186 + }, + { + "epoch": 0.3773013871374527, + "grad_norm": 0.345703125, + "learning_rate": 3.6867732984764144e-06, + "loss": 2.8584, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 187 + }, + { + "epoch": 0.3793190416141236, + "grad_norm": 0.373046875, + "learning_rate": 3.6720391852678783e-06, + "loss": 2.9347, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 188 + }, + { + "epoch": 0.38133669609079446, + "grad_norm": 0.353515625, + "learning_rate": 3.657252706644982e-06, + "loss": 2.9129, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 189 + }, + { + "epoch": 0.3833543505674653, + "grad_norm": 0.365234375, + "learning_rate": 3.6424145232512337e-06, + "loss": 2.8445, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 190 + }, + { + "epoch": 0.3853720050441362, + "grad_norm": 0.365234375, + "learning_rate": 3.627525298040255e-06, + "loss": 2.8571, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 191 + }, + { + "epoch": 0.38738965952080706, + "grad_norm": 0.36328125, + "learning_rate": 3.612585696246158e-06, + "loss": 2.9072, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 192 + }, + { + "epoch": 0.3894073139974779, + "grad_norm": 0.35546875, + "learning_rate": 3.5975963853538273e-06, + "loss": 2.9549, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 193 + }, + { + "epoch": 0.3914249684741488, + "grad_norm": 0.365234375, + "learning_rate": 3.5825580350690914e-06, + "loss": 2.8663, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 194 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.3515625, + "learning_rate": 3.5674713172888075e-06, + "loss": 3.0238, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 195 + }, + { + "epoch": 0.39546027742749057, + "grad_norm": 0.3671875, + "learning_rate": 3.552336906070838e-06, + "loss": 2.922, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 196 + }, + { + "epoch": 0.3974779319041614, + "grad_norm": 0.34375, + "learning_rate": 3.5371554776039344e-06, + "loss": 2.8983, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 197 + }, + { + "epoch": 0.39949558638083227, + "grad_norm": 0.3515625, + "learning_rate": 3.52192771017753e-06, + "loss": 2.9008, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 198 + }, + { + "epoch": 0.4015132408575032, + "grad_norm": 0.357421875, + "learning_rate": 3.5066542841514275e-06, + "loss": 2.9236, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 199 + }, + { + "epoch": 0.403530895334174, + "grad_norm": 0.400390625, + "learning_rate": 3.491335881925407e-06, + "loss": 2.8329, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 200 + }, + { + "epoch": 0.40554854981084487, + "grad_norm": 0.37890625, + "learning_rate": 3.4759731879087373e-06, + "loss": 2.8975, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 201 + }, + { + "epoch": 0.4075662042875158, + "grad_norm": 0.359375, + "learning_rate": 3.460566888489593e-06, + "loss": 2.9363, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 202 + }, + { + "epoch": 0.4095838587641866, + "grad_norm": 0.35546875, + "learning_rate": 3.4451176720043906e-06, + "loss": 2.9014, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 203 + }, + { + "epoch": 0.41160151324085753, + "grad_norm": 0.35546875, + "learning_rate": 3.4296262287070337e-06, + "loss": 2.8582, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 204 + }, + { + "epoch": 0.4136191677175284, + "grad_norm": 0.375, + "learning_rate": 3.4140932507380727e-06, + "loss": 2.9568, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 205 + }, + { + "epoch": 0.41563682219419923, + "grad_norm": 0.359375, + "learning_rate": 3.398519432093782e-06, + "loss": 2.7844, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 206 + }, + { + "epoch": 0.41765447667087013, + "grad_norm": 0.341796875, + "learning_rate": 3.3829054685951535e-06, + "loss": 2.9679, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 207 + }, + { + "epoch": 0.419672131147541, + "grad_norm": 0.353515625, + "learning_rate": 3.3672520578568018e-06, + "loss": 3.0012, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 208 + }, + { + "epoch": 0.42168978562421183, + "grad_norm": 0.359375, + "learning_rate": 3.351559899255806e-06, + "loss": 2.8042, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 209 + }, + { + "epoch": 0.42370744010088274, + "grad_norm": 0.34765625, + "learning_rate": 3.335829693900455e-06, + "loss": 2.8814, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 210 + }, + { + "epoch": 0.4257250945775536, + "grad_norm": 0.36328125, + "learning_rate": 3.3200621445989227e-06, + "loss": 2.8684, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 211 + }, + { + "epoch": 0.4277427490542245, + "grad_norm": 0.3671875, + "learning_rate": 3.304257955827872e-06, + "loss": 2.8368, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 212 + }, + { + "epoch": 0.42976040353089534, + "grad_norm": 0.36328125, + "learning_rate": 3.2884178337009764e-06, + "loss": 2.8657, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 213 + }, + { + "epoch": 0.4317780580075662, + "grad_norm": 0.353515625, + "learning_rate": 3.272542485937369e-06, + "loss": 2.8492, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 214 + }, + { + "epoch": 0.4337957124842371, + "grad_norm": 0.33984375, + "learning_rate": 3.2566326218300287e-06, + "loss": 2.904, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 215 + }, + { + "epoch": 0.43581336696090794, + "grad_norm": 0.41796875, + "learning_rate": 3.2406889522140854e-06, + "loss": 2.9362, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 216 + }, + { + "epoch": 0.4378310214375788, + "grad_norm": 0.34375, + "learning_rate": 3.2247121894350614e-06, + "loss": 2.8333, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 217 + }, + { + "epoch": 0.4398486759142497, + "grad_norm": 0.3515625, + "learning_rate": 3.208703047317045e-06, + "loss": 2.9378, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 218 + }, + { + "epoch": 0.44186633039092055, + "grad_norm": 0.330078125, + "learning_rate": 3.1926622411307985e-06, + "loss": 2.8866, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 219 + }, + { + "epoch": 0.44388398486759145, + "grad_norm": 0.33203125, + "learning_rate": 3.1765904875617977e-06, + "loss": 2.8879, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 220 + }, + { + "epoch": 0.4459016393442623, + "grad_norm": 0.5390625, + "learning_rate": 3.1604885046782158e-06, + "loss": 2.8542, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 221 + }, + { + "epoch": 0.44791929382093315, + "grad_norm": 0.35546875, + "learning_rate": 3.1443570118988357e-06, + "loss": 2.9413, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 222 + }, + { + "epoch": 0.44993694829760406, + "grad_norm": 0.353515625, + "learning_rate": 3.128196729960912e-06, + "loss": 2.9176, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 223 + }, + { + "epoch": 0.4519546027742749, + "grad_norm": 0.396484375, + "learning_rate": 3.1120083808879666e-06, + "loss": 2.9672, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 224 + }, + { + "epoch": 0.45397225725094575, + "grad_norm": 0.33984375, + "learning_rate": 3.095792687957528e-06, + "loss": 2.9543, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 225 + }, + { + "epoch": 0.45598991172761666, + "grad_norm": 0.33984375, + "learning_rate": 3.0795503756688212e-06, + "loss": 2.7878, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 226 + }, + { + "epoch": 0.4580075662042875, + "grad_norm": 0.365234375, + "learning_rate": 3.063282169710392e-06, + "loss": 2.9394, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 227 + }, + { + "epoch": 0.4600252206809584, + "grad_norm": 0.373046875, + "learning_rate": 3.046988796927688e-06, + "loss": 2.838, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 228 + }, + { + "epoch": 0.46204287515762926, + "grad_norm": 0.357421875, + "learning_rate": 3.0306709852905824e-06, + "loss": 2.9316, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 229 + }, + { + "epoch": 0.4640605296343001, + "grad_norm": 0.337890625, + "learning_rate": 3.014329463860849e-06, + "loss": 2.8895, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 230 + }, + { + "epoch": 0.466078184110971, + "grad_norm": 0.375, + "learning_rate": 2.9979649627595904e-06, + "loss": 2.9382, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 231 + }, + { + "epoch": 0.46809583858764187, + "grad_norm": 0.375, + "learning_rate": 2.981578213134614e-06, + "loss": 2.9179, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 232 + }, + { + "epoch": 0.4701134930643127, + "grad_norm": 0.314453125, + "learning_rate": 2.9651699471277664e-06, + "loss": 2.803, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 233 + }, + { + "epoch": 0.4721311475409836, + "grad_norm": 0.453125, + "learning_rate": 2.9487408978422233e-06, + "loss": 2.8782, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 234 + }, + { + "epoch": 0.47414880201765447, + "grad_norm": 0.357421875, + "learning_rate": 2.932291799309734e-06, + "loss": 2.9224, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 235 + }, + { + "epoch": 0.4761664564943254, + "grad_norm": 0.322265625, + "learning_rate": 2.9158233864578256e-06, + "loss": 2.8951, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 236 + }, + { + "epoch": 0.4781841109709962, + "grad_norm": 0.400390625, + "learning_rate": 2.8993363950769685e-06, + "loss": 2.8369, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 237 + }, + { + "epoch": 0.4802017654476671, + "grad_norm": 0.357421875, + "learning_rate": 2.8828315617877006e-06, + "loss": 2.9335, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 238 + }, + { + "epoch": 0.482219419924338, + "grad_norm": 0.337890625, + "learning_rate": 2.866309624007717e-06, + "loss": 2.838, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 239 + }, + { + "epoch": 0.4842370744010088, + "grad_norm": 0.333984375, + "learning_rate": 2.849771319918922e-06, + "loss": 2.9317, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 240 + }, + { + "epoch": 0.4862547288776797, + "grad_norm": 0.3671875, + "learning_rate": 2.8332173884344477e-06, + "loss": 2.8651, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 241 + }, + { + "epoch": 0.4882723833543506, + "grad_norm": 0.349609375, + "learning_rate": 2.8166485691656425e-06, + "loss": 2.8215, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 242 + }, + { + "epoch": 0.49029003783102143, + "grad_norm": 0.33203125, + "learning_rate": 2.8000656023890245e-06, + "loss": 2.909, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 243 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 0.361328125, + "learning_rate": 2.7834692290132054e-06, + "loss": 2.8417, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 244 + }, + { + "epoch": 0.4943253467843632, + "grad_norm": 0.35546875, + "learning_rate": 2.766860190545791e-06, + "loss": 2.8922, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 245 + }, + { + "epoch": 0.49634300126103403, + "grad_norm": 0.3515625, + "learning_rate": 2.7502392290602463e-06, + "loss": 2.8867, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 246 + }, + { + "epoch": 0.49836065573770494, + "grad_norm": 0.3203125, + "learning_rate": 2.7336070871627467e-06, + "loss": 2.8258, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 247 + }, + { + "epoch": 0.5003783102143758, + "grad_norm": 0.345703125, + "learning_rate": 2.716964507958994e-06, + "loss": 2.8912, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 248 + }, + { + "epoch": 0.5023959646910466, + "grad_norm": 0.349609375, + "learning_rate": 2.7003122350210185e-06, + "loss": 2.9522, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 249 + }, + { + "epoch": 0.5044136191677175, + "grad_norm": 0.33984375, + "learning_rate": 2.6836510123539556e-06, + "loss": 2.8601, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 250 + }, + { + "epoch": 0.5064312736443884, + "grad_norm": 0.341796875, + "learning_rate": 2.6669815843628043e-06, + "loss": 2.8434, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 251 + }, + { + "epoch": 0.5084489281210592, + "grad_norm": 0.3359375, + "learning_rate": 2.650304695819168e-06, + "loss": 2.9957, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 252 + }, + { + "epoch": 0.5104665825977301, + "grad_norm": 0.388671875, + "learning_rate": 2.6336210918279807e-06, + "loss": 2.9345, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 253 + }, + { + "epoch": 0.512484237074401, + "grad_norm": 0.373046875, + "learning_rate": 2.6169315177942134e-06, + "loss": 2.8346, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 254 + }, + { + "epoch": 0.5145018915510718, + "grad_norm": 0.341796875, + "learning_rate": 2.6002367193895733e-06, + "loss": 2.8084, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 255 + }, + { + "epoch": 0.5165195460277427, + "grad_norm": 0.333984375, + "learning_rate": 2.5835374425191867e-06, + "loss": 2.9052, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 256 + }, + { + "epoch": 0.5185372005044137, + "grad_norm": 0.3359375, + "learning_rate": 2.566834433288272e-06, + "loss": 2.8175, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 257 + }, + { + "epoch": 0.5205548549810844, + "grad_norm": 0.341796875, + "learning_rate": 2.5501284379688067e-06, + "loss": 2.8486, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 258 + }, + { + "epoch": 0.5225725094577554, + "grad_norm": 0.35546875, + "learning_rate": 2.533420202966182e-06, + "loss": 2.8192, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 259 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.46484375, + "learning_rate": 2.516710474785856e-06, + "loss": 2.9291, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 260 + }, + { + "epoch": 0.526607818411097, + "grad_norm": 0.34765625, + "learning_rate": 2.5e-06, + "loss": 2.9709, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 261 + }, + { + "epoch": 0.528625472887768, + "grad_norm": 0.310546875, + "learning_rate": 2.483289525214145e-06, + "loss": 2.8498, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 262 + }, + { + "epoch": 0.5306431273644389, + "grad_norm": 0.353515625, + "learning_rate": 2.4665797970338183e-06, + "loss": 2.9485, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 263 + }, + { + "epoch": 0.5326607818411098, + "grad_norm": 0.33984375, + "learning_rate": 2.4498715620311937e-06, + "loss": 2.9009, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 264 + }, + { + "epoch": 0.5346784363177806, + "grad_norm": 0.337890625, + "learning_rate": 2.4331655667117284e-06, + "loss": 2.8577, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 265 + }, + { + "epoch": 0.5366960907944515, + "grad_norm": 0.38671875, + "learning_rate": 2.4164625574808145e-06, + "loss": 2.8691, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 266 + }, + { + "epoch": 0.5387137452711224, + "grad_norm": 0.447265625, + "learning_rate": 2.3997632806104275e-06, + "loss": 2.8994, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 267 + }, + { + "epoch": 0.5407313997477932, + "grad_norm": 0.330078125, + "learning_rate": 2.383068482205788e-06, + "loss": 2.8892, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 268 + }, + { + "epoch": 0.5427490542244641, + "grad_norm": 0.373046875, + "learning_rate": 2.36637890817202e-06, + "loss": 2.9277, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 269 + }, + { + "epoch": 0.544766708701135, + "grad_norm": 0.361328125, + "learning_rate": 2.3496953041808327e-06, + "loss": 2.7727, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 270 + }, + { + "epoch": 0.5467843631778058, + "grad_norm": 0.310546875, + "learning_rate": 2.333018415637196e-06, + "loss": 2.8423, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 271 + }, + { + "epoch": 0.5488020176544767, + "grad_norm": 0.328125, + "learning_rate": 2.3163489876460453e-06, + "loss": 2.7515, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 272 + }, + { + "epoch": 0.5508196721311476, + "grad_norm": 0.369140625, + "learning_rate": 2.2996877649789815e-06, + "loss": 3.039, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 273 + }, + { + "epoch": 0.5528373266078184, + "grad_norm": 0.33203125, + "learning_rate": 2.2830354920410066e-06, + "loss": 2.8529, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 274 + }, + { + "epoch": 0.5548549810844893, + "grad_norm": 0.33203125, + "learning_rate": 2.2663929128372537e-06, + "loss": 2.8775, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 275 + }, + { + "epoch": 0.5568726355611602, + "grad_norm": 0.333984375, + "learning_rate": 2.249760770939754e-06, + "loss": 2.9064, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 276 + }, + { + "epoch": 0.558890290037831, + "grad_norm": 0.3359375, + "learning_rate": 2.2331398094542097e-06, + "loss": 2.8687, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 277 + }, + { + "epoch": 0.5609079445145019, + "grad_norm": 0.333984375, + "learning_rate": 2.2165307709867954e-06, + "loss": 2.9269, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 278 + }, + { + "epoch": 0.5629255989911728, + "grad_norm": 0.34375, + "learning_rate": 2.199934397610976e-06, + "loss": 2.877, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 279 + }, + { + "epoch": 0.5649432534678437, + "grad_norm": 0.3203125, + "learning_rate": 2.1833514308343583e-06, + "loss": 2.9292, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 280 + }, + { + "epoch": 0.5669609079445145, + "grad_norm": 0.345703125, + "learning_rate": 2.1667826115655536e-06, + "loss": 2.8412, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 281 + }, + { + "epoch": 0.5689785624211854, + "grad_norm": 0.330078125, + "learning_rate": 2.150228680081079e-06, + "loss": 2.88, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 282 + }, + { + "epoch": 0.5709962168978563, + "grad_norm": 0.3515625, + "learning_rate": 2.1336903759922838e-06, + "loss": 2.7502, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 283 + }, + { + "epoch": 0.5730138713745271, + "grad_norm": 0.408203125, + "learning_rate": 2.1171684382123002e-06, + "loss": 2.8315, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 284 + }, + { + "epoch": 0.575031525851198, + "grad_norm": 0.322265625, + "learning_rate": 2.1006636049230327e-06, + "loss": 2.941, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 285 + }, + { + "epoch": 0.5770491803278689, + "grad_norm": 0.328125, + "learning_rate": 2.0841766135421753e-06, + "loss": 2.9317, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 286 + }, + { + "epoch": 0.5790668348045397, + "grad_norm": 0.330078125, + "learning_rate": 2.0677082006902673e-06, + "loss": 2.931, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 287 + }, + { + "epoch": 0.5810844892812106, + "grad_norm": 0.349609375, + "learning_rate": 2.0512591021577775e-06, + "loss": 2.8567, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 288 + }, + { + "epoch": 0.5831021437578815, + "grad_norm": 0.345703125, + "learning_rate": 2.034830052872235e-06, + "loss": 2.7867, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 289 + }, + { + "epoch": 0.5851197982345523, + "grad_norm": 0.34765625, + "learning_rate": 2.018421786865387e-06, + "loss": 2.9489, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 290 + }, + { + "epoch": 0.5871374527112232, + "grad_norm": 0.34765625, + "learning_rate": 2.0020350372404104e-06, + "loss": 2.8389, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 291 + }, + { + "epoch": 0.5891551071878941, + "grad_norm": 0.33203125, + "learning_rate": 1.985670536139151e-06, + "loss": 2.9196, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 292 + }, + { + "epoch": 0.5911727616645649, + "grad_norm": 0.32421875, + "learning_rate": 1.9693290147094184e-06, + "loss": 2.9489, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 293 + }, + { + "epoch": 0.5931904161412358, + "grad_norm": 0.392578125, + "learning_rate": 1.9530112030723123e-06, + "loss": 2.958, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 294 + }, + { + "epoch": 0.5952080706179067, + "grad_norm": 0.34765625, + "learning_rate": 1.9367178302896087e-06, + "loss": 2.7799, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 295 + }, + { + "epoch": 0.5972257250945776, + "grad_norm": 0.376953125, + "learning_rate": 1.920449624331179e-06, + "loss": 2.9718, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 296 + }, + { + "epoch": 0.5992433795712484, + "grad_norm": 0.337890625, + "learning_rate": 1.9042073120424727e-06, + "loss": 2.9414, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 297 + }, + { + "epoch": 0.6012610340479193, + "grad_norm": 0.33203125, + "learning_rate": 1.887991619112035e-06, + "loss": 2.9257, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 298 + }, + { + "epoch": 0.6032786885245902, + "grad_norm": 0.349609375, + "learning_rate": 1.8718032700390887e-06, + "loss": 2.9388, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 299 + }, + { + "epoch": 0.605296343001261, + "grad_norm": 0.353515625, + "learning_rate": 1.8556429881011655e-06, + "loss": 2.8914, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 300 + }, + { + "epoch": 0.6073139974779319, + "grad_norm": 0.37109375, + "learning_rate": 1.8395114953217853e-06, + "loss": 2.8707, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 301 + }, + { + "epoch": 0.6093316519546028, + "grad_norm": 0.310546875, + "learning_rate": 1.8234095124382031e-06, + "loss": 2.8361, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 302 + }, + { + "epoch": 0.6113493064312736, + "grad_norm": 0.357421875, + "learning_rate": 1.8073377588692026e-06, + "loss": 2.8958, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 303 + }, + { + "epoch": 0.6133669609079445, + "grad_norm": 0.34375, + "learning_rate": 1.791296952682956e-06, + "loss": 2.9175, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 304 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.333984375, + "learning_rate": 1.775287810564939e-06, + "loss": 2.8774, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 305 + }, + { + "epoch": 0.6174022698612862, + "grad_norm": 0.326171875, + "learning_rate": 1.7593110477859155e-06, + "loss": 2.9021, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 306 + }, + { + "epoch": 0.6194199243379571, + "grad_norm": 0.349609375, + "learning_rate": 1.7433673781699717e-06, + "loss": 2.919, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 307 + }, + { + "epoch": 0.621437578814628, + "grad_norm": 0.357421875, + "learning_rate": 1.7274575140626318e-06, + "loss": 2.9148, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 308 + }, + { + "epoch": 0.6234552332912988, + "grad_norm": 0.35546875, + "learning_rate": 1.7115821662990246e-06, + "loss": 2.8786, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 309 + }, + { + "epoch": 0.6254728877679697, + "grad_norm": 0.361328125, + "learning_rate": 1.6957420441721285e-06, + "loss": 2.9553, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 310 + }, + { + "epoch": 0.6274905422446406, + "grad_norm": 0.328125, + "learning_rate": 1.6799378554010773e-06, + "loss": 2.8745, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 311 + }, + { + "epoch": 0.6295081967213115, + "grad_norm": 0.400390625, + "learning_rate": 1.6641703060995456e-06, + "loss": 2.8396, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 312 + }, + { + "epoch": 0.6315258511979823, + "grad_norm": 0.33984375, + "learning_rate": 1.6484401007441938e-06, + "loss": 2.8615, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 313 + }, + { + "epoch": 0.6335435056746532, + "grad_norm": 0.349609375, + "learning_rate": 1.6327479421431984e-06, + "loss": 2.8957, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 314 + }, + { + "epoch": 0.6355611601513241, + "grad_norm": 0.35546875, + "learning_rate": 1.6170945314048476e-06, + "loss": 2.9217, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 315 + }, + { + "epoch": 0.6375788146279949, + "grad_norm": 0.34765625, + "learning_rate": 1.6014805679062185e-06, + "loss": 2.8061, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 316 + }, + { + "epoch": 0.6395964691046658, + "grad_norm": 0.31640625, + "learning_rate": 1.5859067492619284e-06, + "loss": 2.7589, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 317 + }, + { + "epoch": 0.6416141235813367, + "grad_norm": 0.306640625, + "learning_rate": 1.5703737712929674e-06, + "loss": 2.8038, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 318 + }, + { + "epoch": 0.6436317780580075, + "grad_norm": 0.345703125, + "learning_rate": 1.5548823279956104e-06, + "loss": 2.9054, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 319 + }, + { + "epoch": 0.6456494325346784, + "grad_norm": 0.37109375, + "learning_rate": 1.5394331115104074e-06, + "loss": 3.0088, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 320 + }, + { + "epoch": 0.6476670870113493, + "grad_norm": 0.33984375, + "learning_rate": 1.5240268120912631e-06, + "loss": 2.9129, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 321 + }, + { + "epoch": 0.6496847414880201, + "grad_norm": 0.34375, + "learning_rate": 1.5086641180745934e-06, + "loss": 2.9135, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 322 + }, + { + "epoch": 0.651702395964691, + "grad_norm": 0.33203125, + "learning_rate": 1.493345715848574e-06, + "loss": 2.9974, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 323 + }, + { + "epoch": 0.6537200504413619, + "grad_norm": 0.3359375, + "learning_rate": 1.478072289822471e-06, + "loss": 2.8818, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 324 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.34375, + "learning_rate": 1.462844522396066e-06, + "loss": 2.9075, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 325 + }, + { + "epoch": 0.6577553593947036, + "grad_norm": 0.33984375, + "learning_rate": 1.4476630939291631e-06, + "loss": 2.8859, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 326 + }, + { + "epoch": 0.6597730138713745, + "grad_norm": 0.32421875, + "learning_rate": 1.4325286827111931e-06, + "loss": 2.8487, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 327 + }, + { + "epoch": 0.6617906683480455, + "grad_norm": 0.328125, + "learning_rate": 1.417441964930909e-06, + "loss": 2.9071, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 328 + }, + { + "epoch": 0.6638083228247162, + "grad_norm": 0.328125, + "learning_rate": 1.4024036146461734e-06, + "loss": 2.8911, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 329 + }, + { + "epoch": 0.6658259773013872, + "grad_norm": 0.3671875, + "learning_rate": 1.3874143037538417e-06, + "loss": 2.8569, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 330 + }, + { + "epoch": 0.6678436317780581, + "grad_norm": 0.3359375, + "learning_rate": 1.372474701959745e-06, + "loss": 2.9294, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 331 + }, + { + "epoch": 0.6698612862547288, + "grad_norm": 0.341796875, + "learning_rate": 1.357585476748766e-06, + "loss": 2.9338, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 332 + }, + { + "epoch": 0.6718789407313998, + "grad_norm": 0.318359375, + "learning_rate": 1.342747293355019e-06, + "loss": 2.7899, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 333 + }, + { + "epoch": 0.6738965952080707, + "grad_norm": 0.34765625, + "learning_rate": 1.3279608147321223e-06, + "loss": 2.8819, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 334 + }, + { + "epoch": 0.6759142496847415, + "grad_norm": 0.369140625, + "learning_rate": 1.3132267015235862e-06, + "loss": 2.9289, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 335 + }, + { + "epoch": 0.6779319041614124, + "grad_norm": 0.349609375, + "learning_rate": 1.2985456120332907e-06, + "loss": 2.8636, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 336 + }, + { + "epoch": 0.6799495586380833, + "grad_norm": 0.30078125, + "learning_rate": 1.2839182021960753e-06, + "loss": 2.9377, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 337 + }, + { + "epoch": 0.6819672131147541, + "grad_norm": 0.36328125, + "learning_rate": 1.2693451255484314e-06, + "loss": 2.884, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 338 + }, + { + "epoch": 0.683984867591425, + "grad_norm": 0.333984375, + "learning_rate": 1.2548270331993034e-06, + "loss": 2.87, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 339 + }, + { + "epoch": 0.6860025220680959, + "grad_norm": 0.306640625, + "learning_rate": 1.2403645738009998e-06, + "loss": 2.9084, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 340 + }, + { + "epoch": 0.6880201765447667, + "grad_norm": 0.333984375, + "learning_rate": 1.2259583935202063e-06, + "loss": 2.9395, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 341 + }, + { + "epoch": 0.6900378310214376, + "grad_norm": 0.34375, + "learning_rate": 1.2116091360091262e-06, + "loss": 2.9742, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 342 + }, + { + "epoch": 0.6920554854981085, + "grad_norm": 0.3515625, + "learning_rate": 1.1973174423767098e-06, + "loss": 2.9324, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 343 + }, + { + "epoch": 0.6940731399747794, + "grad_norm": 0.326171875, + "learning_rate": 1.1830839511600211e-06, + "loss": 2.8262, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 344 + }, + { + "epoch": 0.6960907944514502, + "grad_norm": 0.365234375, + "learning_rate": 1.168909298295704e-06, + "loss": 2.8697, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 345 + }, + { + "epoch": 0.6981084489281211, + "grad_norm": 0.419921875, + "learning_rate": 1.1547941170915686e-06, + "loss": 2.922, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 346 + }, + { + "epoch": 0.700126103404792, + "grad_norm": 0.376953125, + "learning_rate": 1.140739038198298e-06, + "loss": 2.8969, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 347 + }, + { + "epoch": 0.7021437578814628, + "grad_norm": 0.333984375, + "learning_rate": 1.1267446895812704e-06, + "loss": 2.8233, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 348 + }, + { + "epoch": 0.7041614123581337, + "grad_norm": 0.330078125, + "learning_rate": 1.1128116964925023e-06, + "loss": 2.9209, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 349 + }, + { + "epoch": 0.7061790668348046, + "grad_norm": 0.306640625, + "learning_rate": 1.098940681442713e-06, + "loss": 2.855, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 350 + }, + { + "epoch": 0.7081967213114754, + "grad_norm": 0.341796875, + "learning_rate": 1.0851322641735119e-06, + "loss": 2.8907, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 351 + }, + { + "epoch": 0.7102143757881463, + "grad_norm": 0.333984375, + "learning_rate": 1.0713870616297093e-06, + "loss": 2.7215, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 352 + }, + { + "epoch": 0.7122320302648172, + "grad_norm": 0.341796875, + "learning_rate": 1.0577056879317486e-06, + "loss": 2.8413, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 353 + }, + { + "epoch": 0.714249684741488, + "grad_norm": 0.3515625, + "learning_rate": 1.0440887543482747e-06, + "loss": 2.8578, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 354 + }, + { + "epoch": 0.7162673392181589, + "grad_norm": 0.3359375, + "learning_rate": 1.0305368692688175e-06, + "loss": 2.9202, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 355 + }, + { + "epoch": 0.7182849936948298, + "grad_norm": 0.412109375, + "learning_rate": 1.0170506381766121e-06, + "loss": 2.962, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 356 + }, + { + "epoch": 0.7203026481715006, + "grad_norm": 0.330078125, + "learning_rate": 1.0036306636215462e-06, + "loss": 2.9256, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 357 + }, + { + "epoch": 0.7223203026481715, + "grad_norm": 0.302734375, + "learning_rate": 9.902775451932387e-07, + "loss": 2.9143, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 358 + }, + { + "epoch": 0.7243379571248424, + "grad_norm": 0.322265625, + "learning_rate": 9.769918794942511e-07, + "loss": 2.8691, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 359 + }, + { + "epoch": 0.7263556116015133, + "grad_norm": 0.298828125, + "learning_rate": 9.637742601134287e-07, + "loss": 2.8137, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 360 + }, + { + "epoch": 0.7283732660781841, + "grad_norm": 0.337890625, + "learning_rate": 9.506252775993882e-07, + "loss": 2.8396, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 361 + }, + { + "epoch": 0.730390920554855, + "grad_norm": 0.3203125, + "learning_rate": 9.375455194341215e-07, + "loss": 2.8566, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 362 + }, + { + "epoch": 0.7324085750315259, + "grad_norm": 0.318359375, + "learning_rate": 9.24535570006756e-07, + "loss": 2.8701, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 363 + }, + { + "epoch": 0.7344262295081967, + "grad_norm": 0.330078125, + "learning_rate": 9.115960105874411e-07, + "loss": 2.8288, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 364 + }, + { + "epoch": 0.7364438839848676, + "grad_norm": 0.287109375, + "learning_rate": 8.987274193013792e-07, + "loss": 2.8787, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 365 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.34765625, + "learning_rate": 8.85930371102994e-07, + "loss": 2.7868, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 366 + }, + { + "epoch": 0.7404791929382093, + "grad_norm": 0.32421875, + "learning_rate": 8.732054377502442e-07, + "loss": 2.9876, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 367 + }, + { + "epoch": 0.7424968474148802, + "grad_norm": 0.33984375, + "learning_rate": 8.605531877790762e-07, + "loss": 2.9011, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 368 + }, + { + "epoch": 0.7445145018915511, + "grad_norm": 0.36328125, + "learning_rate": 8.479741864780236e-07, + "loss": 2.8392, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 369 + }, + { + "epoch": 0.7465321563682219, + "grad_norm": 0.310546875, + "learning_rate": 8.354689958629514e-07, + "loss": 2.8629, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 370 + }, + { + "epoch": 0.7485498108448928, + "grad_norm": 0.3046875, + "learning_rate": 8.23038174651942e-07, + "loss": 2.9016, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 371 + }, + { + "epoch": 0.7505674653215637, + "grad_norm": 0.328125, + "learning_rate": 8.106822782403376e-07, + "loss": 2.9677, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 372 + }, + { + "epoch": 0.7525851197982345, + "grad_norm": 0.373046875, + "learning_rate": 7.984018586759227e-07, + "loss": 2.8286, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 373 + }, + { + "epoch": 0.7546027742749054, + "grad_norm": 0.326171875, + "learning_rate": 7.861974646342596e-07, + "loss": 2.826, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 374 + }, + { + "epoch": 0.7566204287515763, + "grad_norm": 0.333984375, + "learning_rate": 7.740696413941745e-07, + "loss": 2.8649, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 375 + }, + { + "epoch": 0.7586380832282472, + "grad_norm": 0.33984375, + "learning_rate": 7.620189308133943e-07, + "loss": 2.8342, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 376 + }, + { + "epoch": 0.760655737704918, + "grad_norm": 0.330078125, + "learning_rate": 7.500458713043385e-07, + "loss": 2.8452, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 377 + }, + { + "epoch": 0.7626733921815889, + "grad_norm": 0.345703125, + "learning_rate": 7.381509978100626e-07, + "loss": 2.7791, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 378 + }, + { + "epoch": 0.7646910466582598, + "grad_norm": 0.306640625, + "learning_rate": 7.263348417803545e-07, + "loss": 2.8265, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 379 + }, + { + "epoch": 0.7667087011349306, + "grad_norm": 0.314453125, + "learning_rate": 7.145979311479986e-07, + "loss": 2.9179, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 380 + }, + { + "epoch": 0.7687263556116015, + "grad_norm": 0.3046875, + "learning_rate": 7.029407903051771e-07, + "loss": 2.9605, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 381 + }, + { + "epoch": 0.7707440100882724, + "grad_norm": 0.306640625, + "learning_rate": 6.91363940080049e-07, + "loss": 2.8854, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 382 + }, + { + "epoch": 0.7727616645649432, + "grad_norm": 0.32421875, + "learning_rate": 6.798678977134768e-07, + "loss": 2.8363, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 383 + }, + { + "epoch": 0.7747793190416141, + "grad_norm": 0.318359375, + "learning_rate": 6.684531768359173e-07, + "loss": 2.9301, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 384 + }, + { + "epoch": 0.776796973518285, + "grad_norm": 0.345703125, + "learning_rate": 6.57120287444473e-07, + "loss": 2.8542, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 385 + }, + { + "epoch": 0.7788146279949558, + "grad_norm": 0.345703125, + "learning_rate": 6.458697358801061e-07, + "loss": 2.8322, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 386 + }, + { + "epoch": 0.7808322824716267, + "grad_norm": 0.314453125, + "learning_rate": 6.34702024805016e-07, + "loss": 2.8331, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 387 + }, + { + "epoch": 0.7828499369482976, + "grad_norm": 0.314453125, + "learning_rate": 6.236176531801813e-07, + "loss": 2.8607, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 388 + }, + { + "epoch": 0.7848675914249684, + "grad_norm": 0.345703125, + "learning_rate": 6.126171162430636e-07, + "loss": 2.9638, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 389 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.33984375, + "learning_rate": 6.017009054854858e-07, + "loss": 2.9402, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 390 + }, + { + "epoch": 0.7889029003783102, + "grad_norm": 0.3125, + "learning_rate": 5.908695086316701e-07, + "loss": 2.8842, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 391 + }, + { + "epoch": 0.7909205548549811, + "grad_norm": 0.427734375, + "learning_rate": 5.801234096164468e-07, + "loss": 2.87, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 392 + }, + { + "epoch": 0.7929382093316519, + "grad_norm": 0.34765625, + "learning_rate": 5.694630885636332e-07, + "loss": 2.8928, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 393 + }, + { + "epoch": 0.7949558638083228, + "grad_norm": 0.322265625, + "learning_rate": 5.588890217645821e-07, + "loss": 2.7904, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 394 + }, + { + "epoch": 0.7969735182849937, + "grad_norm": 0.341796875, + "learning_rate": 5.484016816569015e-07, + "loss": 2.9444, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 395 + }, + { + "epoch": 0.7989911727616645, + "grad_norm": 0.3203125, + "learning_rate": 5.380015368033476e-07, + "loss": 2.9772, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 396 + }, + { + "epoch": 0.8010088272383354, + "grad_norm": 0.34375, + "learning_rate": 5.276890518708885e-07, + "loss": 2.7561, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 397 + }, + { + "epoch": 0.8030264817150063, + "grad_norm": 0.32421875, + "learning_rate": 5.174646876099421e-07, + "loss": 2.8828, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 398 + }, + { + "epoch": 0.8050441361916771, + "grad_norm": 0.302734375, + "learning_rate": 5.073289008337967e-07, + "loss": 2.8515, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 399 + }, + { + "epoch": 0.807061790668348, + "grad_norm": 0.32421875, + "learning_rate": 4.972821443981921e-07, + "loss": 2.8422, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 400 + }, + { + "epoch": 0.809079445145019, + "grad_norm": 0.31640625, + "learning_rate": 4.873248671810929e-07, + "loss": 2.9368, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 401 + }, + { + "epoch": 0.8110970996216897, + "grad_norm": 0.373046875, + "learning_rate": 4.774575140626317e-07, + "loss": 2.955, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 402 + }, + { + "epoch": 0.8131147540983606, + "grad_norm": 0.302734375, + "learning_rate": 4.6768052590523053e-07, + "loss": 2.9267, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 403 + }, + { + "epoch": 0.8151324085750316, + "grad_norm": 0.328125, + "learning_rate": 4.579943395339062e-07, + "loss": 2.8792, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 404 + }, + { + "epoch": 0.8171500630517023, + "grad_norm": 0.328125, + "learning_rate": 4.4839938771675115e-07, + "loss": 2.9025, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 405 + }, + { + "epoch": 0.8191677175283733, + "grad_norm": 0.328125, + "learning_rate": 4.388960991455998e-07, + "loss": 2.9083, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 406 + }, + { + "epoch": 0.8211853720050442, + "grad_norm": 0.326171875, + "learning_rate": 4.294848984168723e-07, + "loss": 2.9084, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 407 + }, + { + "epoch": 0.8232030264817151, + "grad_norm": 0.310546875, + "learning_rate": 4.20166206012608e-07, + "loss": 2.8086, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 408 + }, + { + "epoch": 0.8252206809583859, + "grad_norm": 0.3359375, + "learning_rate": 4.109404382816756e-07, + "loss": 2.8263, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 409 + }, + { + "epoch": 0.8272383354350568, + "grad_norm": 0.326171875, + "learning_rate": 4.0180800742117246e-07, + "loss": 2.9327, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 410 + }, + { + "epoch": 0.8292559899117277, + "grad_norm": 0.333984375, + "learning_rate": 3.927693214580075e-07, + "loss": 2.8724, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 411 + }, + { + "epoch": 0.8312736443883985, + "grad_norm": 0.3359375, + "learning_rate": 3.8382478423067163e-07, + "loss": 2.8089, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 412 + }, + { + "epoch": 0.8332912988650694, + "grad_norm": 0.35546875, + "learning_rate": 3.7497479537119435e-07, + "loss": 2.9261, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 413 + }, + { + "epoch": 0.8353089533417403, + "grad_norm": 0.34765625, + "learning_rate": 3.662197502872886e-07, + "loss": 2.9533, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 414 + }, + { + "epoch": 0.8373266078184111, + "grad_norm": 0.3359375, + "learning_rate": 3.575600401446841e-07, + "loss": 2.9823, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 415 + }, + { + "epoch": 0.839344262295082, + "grad_norm": 0.349609375, + "learning_rate": 3.489960518496521e-07, + "loss": 2.9184, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 416 + }, + { + "epoch": 0.8413619167717529, + "grad_norm": 0.357421875, + "learning_rate": 3.405281680317149e-07, + "loss": 2.8453, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 417 + }, + { + "epoch": 0.8433795712484237, + "grad_norm": 0.314453125, + "learning_rate": 3.3215676702655687e-07, + "loss": 2.8601, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 418 + }, + { + "epoch": 0.8453972257250946, + "grad_norm": 0.32421875, + "learning_rate": 3.2388222285911373e-07, + "loss": 2.9082, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 419 + }, + { + "epoch": 0.8474148802017655, + "grad_norm": 0.322265625, + "learning_rate": 3.1570490522686624e-07, + "loss": 2.8188, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 420 + }, + { + "epoch": 0.8494325346784363, + "grad_norm": 0.3515625, + "learning_rate": 3.076251794833213e-07, + "loss": 2.8224, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 421 + }, + { + "epoch": 0.8514501891551072, + "grad_norm": 0.32421875, + "learning_rate": 2.9964340662168774e-07, + "loss": 2.8893, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 422 + }, + { + "epoch": 0.8534678436317781, + "grad_norm": 0.3203125, + "learning_rate": 2.9175994325874783e-07, + "loss": 2.8318, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 423 + }, + { + "epoch": 0.855485498108449, + "grad_norm": 0.31640625, + "learning_rate": 2.8397514161892484e-07, + "loss": 2.8396, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 424 + }, + { + "epoch": 0.8575031525851198, + "grad_norm": 0.31640625, + "learning_rate": 2.7628934951854506e-07, + "loss": 2.7336, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 425 + }, + { + "epoch": 0.8595208070617907, + "grad_norm": 0.326171875, + "learning_rate": 2.6870291035029724e-07, + "loss": 2.8688, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 426 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 0.30859375, + "learning_rate": 2.612161630678922e-07, + "loss": 2.7664, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 427 + }, + { + "epoch": 0.8635561160151324, + "grad_norm": 0.3515625, + "learning_rate": 2.5382944217091725e-07, + "loss": 2.8153, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 428 + }, + { + "epoch": 0.8655737704918033, + "grad_norm": 0.33984375, + "learning_rate": 2.465430776898911e-07, + "loss": 2.818, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 429 + }, + { + "epoch": 0.8675914249684742, + "grad_norm": 0.29296875, + "learning_rate": 2.3935739517151916e-07, + "loss": 2.9183, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 430 + }, + { + "epoch": 0.869609079445145, + "grad_norm": 0.359375, + "learning_rate": 2.3227271566414827e-07, + "loss": 2.874, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 431 + }, + { + "epoch": 0.8716267339218159, + "grad_norm": 0.3359375, + "learning_rate": 2.2528935570342165e-07, + "loss": 2.9182, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 432 + }, + { + "epoch": 0.8736443883984868, + "grad_norm": 0.341796875, + "learning_rate": 2.1840762729813808e-07, + "loss": 2.8589, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 433 + }, + { + "epoch": 0.8756620428751576, + "grad_norm": 0.31640625, + "learning_rate": 2.116278379163106e-07, + "loss": 2.938, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 434 + }, + { + "epoch": 0.8776796973518285, + "grad_norm": 0.353515625, + "learning_rate": 2.0495029047142983e-07, + "loss": 2.9281, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 435 + }, + { + "epoch": 0.8796973518284994, + "grad_norm": 0.330078125, + "learning_rate": 1.9837528330892781e-07, + "loss": 2.9188, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 436 + }, + { + "epoch": 0.8817150063051702, + "grad_norm": 0.32421875, + "learning_rate": 1.9190311019285368e-07, + "loss": 2.9296, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 437 + }, + { + "epoch": 0.8837326607818411, + "grad_norm": 0.31640625, + "learning_rate": 1.855340602927419e-07, + "loss": 2.8826, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 438 + }, + { + "epoch": 0.885750315258512, + "grad_norm": 0.359375, + "learning_rate": 1.7926841817069717e-07, + "loss": 2.8862, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 439 + }, + { + "epoch": 0.8877679697351829, + "grad_norm": 0.37890625, + "learning_rate": 1.7310646376867885e-07, + "loss": 2.9719, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 440 + }, + { + "epoch": 0.8897856242118537, + "grad_norm": 0.326171875, + "learning_rate": 1.6704847239599364e-07, + "loss": 2.8783, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 441 + }, + { + "epoch": 0.8918032786885246, + "grad_norm": 0.36328125, + "learning_rate": 1.6109471471699557e-07, + "loss": 2.9328, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 442 + }, + { + "epoch": 0.8938209331651955, + "grad_norm": 0.314453125, + "learning_rate": 1.5524545673899106e-07, + "loss": 2.8429, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 443 + }, + { + "epoch": 0.8958385876418663, + "grad_norm": 0.3515625, + "learning_rate": 1.4950095980035772e-07, + "loss": 2.9344, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 444 + }, + { + "epoch": 0.8978562421185372, + "grad_norm": 0.330078125, + "learning_rate": 1.438614805588634e-07, + "loss": 2.8765, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 445 + }, + { + "epoch": 0.8998738965952081, + "grad_norm": 0.37109375, + "learning_rate": 1.3832727098020333e-07, + "loss": 2.7767, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 446 + }, + { + "epoch": 0.9018915510718789, + "grad_norm": 0.330078125, + "learning_rate": 1.3289857832673947e-07, + "loss": 2.9776, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 447 + }, + { + "epoch": 0.9039092055485498, + "grad_norm": 0.404296875, + "learning_rate": 1.2757564514645492e-07, + "loss": 2.8624, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 448 + }, + { + "epoch": 0.9059268600252207, + "grad_norm": 0.330078125, + "learning_rate": 1.223587092621162e-07, + "loss": 2.993, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 449 + }, + { + "epoch": 0.9079445145018915, + "grad_norm": 0.31640625, + "learning_rate": 1.1724800376064799e-07, + "loss": 2.8225, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 450 + }, + { + "epoch": 0.9099621689785624, + "grad_norm": 0.353515625, + "learning_rate": 1.1224375698271894e-07, + "loss": 2.928, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 451 + }, + { + "epoch": 0.9119798234552333, + "grad_norm": 0.32421875, + "learning_rate": 1.0734619251253963e-07, + "loss": 2.8997, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 452 + }, + { + "epoch": 0.9139974779319041, + "grad_norm": 0.337890625, + "learning_rate": 1.0255552916787343e-07, + "loss": 2.9446, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 453 + }, + { + "epoch": 0.916015132408575, + "grad_norm": 0.326171875, + "learning_rate": 9.78719809902598e-08, + "loss": 2.9681, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 454 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.33984375, + "learning_rate": 9.329575723544925e-08, + "loss": 2.9225, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 455 + }, + { + "epoch": 0.9200504413619168, + "grad_norm": 0.314453125, + "learning_rate": 8.882706236405886e-08, + "loss": 2.885, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 456 + }, + { + "epoch": 0.9220680958385876, + "grad_norm": 0.296875, + "learning_rate": 8.446609603243117e-08, + "loss": 2.8799, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 457 + }, + { + "epoch": 0.9240857503152585, + "grad_norm": 0.328125, + "learning_rate": 8.021305308371891e-08, + "loss": 2.8873, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 458 + }, + { + "epoch": 0.9261034047919294, + "grad_norm": 0.333984375, + "learning_rate": 7.606812353917636e-08, + "loss": 2.8536, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 459 + }, + { + "epoch": 0.9281210592686002, + "grad_norm": 0.32421875, + "learning_rate": 7.203149258967035e-08, + "loss": 2.8754, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 460 + }, + { + "epoch": 0.9301387137452711, + "grad_norm": 0.341796875, + "learning_rate": 6.810334058740736e-08, + "loss": 2.804, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 461 + }, + { + "epoch": 0.932156368221942, + "grad_norm": 0.357421875, + "learning_rate": 6.428384303787282e-08, + "loss": 2.8765, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 462 + }, + { + "epoch": 0.9341740226986128, + "grad_norm": 0.3359375, + "learning_rate": 6.05731705919932e-08, + "loss": 2.8596, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 463 + }, + { + "epoch": 0.9361916771752837, + "grad_norm": 0.326171875, + "learning_rate": 5.697148903850869e-08, + "loss": 2.8129, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 464 + }, + { + "epoch": 0.9382093316519546, + "grad_norm": 0.34765625, + "learning_rate": 5.347895929656649e-08, + "loss": 2.8172, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 465 + }, + { + "epoch": 0.9402269861286254, + "grad_norm": 0.31640625, + "learning_rate": 5.009573740853313e-08, + "loss": 2.8026, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 466 + }, + { + "epoch": 0.9422446406052963, + "grad_norm": 0.376953125, + "learning_rate": 4.682197453301951e-08, + "loss": 2.8827, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 467 + }, + { + "epoch": 0.9442622950819672, + "grad_norm": 0.330078125, + "learning_rate": 4.365781693813048e-08, + "loss": 2.8079, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 468 + }, + { + "epoch": 0.946279949558638, + "grad_norm": 0.4609375, + "learning_rate": 4.060340599492646e-08, + "loss": 2.8384, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 469 + }, + { + "epoch": 0.9482976040353089, + "grad_norm": 0.330078125, + "learning_rate": 3.765887817111069e-08, + "loss": 2.8274, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 470 + }, + { + "epoch": 0.9503152585119798, + "grad_norm": 0.328125, + "learning_rate": 3.4824365024928585e-08, + "loss": 3.0094, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 471 + }, + { + "epoch": 0.9523329129886507, + "grad_norm": 0.326171875, + "learning_rate": 3.209999319929269e-08, + "loss": 2.8901, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 472 + }, + { + "epoch": 0.9543505674653215, + "grad_norm": 0.345703125, + "learning_rate": 2.9485884416122213e-08, + "loss": 2.9069, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 473 + }, + { + "epoch": 0.9563682219419924, + "grad_norm": 0.34375, + "learning_rate": 2.698215547090599e-08, + "loss": 2.8147, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 474 + }, + { + "epoch": 0.9583858764186634, + "grad_norm": 0.33984375, + "learning_rate": 2.458891822748444e-08, + "loss": 2.8549, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 475 + }, + { + "epoch": 0.9604035308953341, + "grad_norm": 0.349609375, + "learning_rate": 2.230627961304993e-08, + "loss": 2.7994, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 476 + }, + { + "epoch": 0.962421185372005, + "grad_norm": 0.310546875, + "learning_rate": 2.0134341613370633e-08, + "loss": 2.9219, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 477 + }, + { + "epoch": 0.964438839848676, + "grad_norm": 0.330078125, + "learning_rate": 1.8073201268234142e-08, + "loss": 2.8302, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 478 + }, + { + "epoch": 0.9664564943253467, + "grad_norm": 0.31640625, + "learning_rate": 1.612295066711095e-08, + "loss": 2.8299, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 479 + }, + { + "epoch": 0.9684741488020177, + "grad_norm": 0.37890625, + "learning_rate": 1.4283676945041348e-08, + "loss": 2.9729, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 480 + }, + { + "epoch": 0.9704918032786886, + "grad_norm": 0.314453125, + "learning_rate": 1.255546227873966e-08, + "loss": 2.8149, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 481 + }, + { + "epoch": 0.9725094577553594, + "grad_norm": 0.318359375, + "learning_rate": 1.0938383882926618e-08, + "loss": 2.848, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 482 + }, + { + "epoch": 0.9745271122320303, + "grad_norm": 0.310546875, + "learning_rate": 9.432514006875725e-09, + "loss": 2.8566, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 483 + }, + { + "epoch": 0.9765447667087012, + "grad_norm": 0.349609375, + "learning_rate": 8.037919931187243e-09, + "loss": 2.8315, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 484 + }, + { + "epoch": 0.978562421185372, + "grad_norm": 0.34765625, + "learning_rate": 6.754663964781971e-09, + "loss": 2.9492, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 485 + }, + { + "epoch": 0.9805800756620429, + "grad_norm": 0.326171875, + "learning_rate": 5.582803442117091e-09, + "loss": 2.7673, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 486 + }, + { + "epoch": 0.9825977301387138, + "grad_norm": 0.310546875, + "learning_rate": 4.522390720624603e-09, + "loss": 2.8186, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 487 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 0.31640625, + "learning_rate": 3.573473178371534e-09, + "loss": 2.8907, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 488 + }, + { + "epoch": 0.9866330390920555, + "grad_norm": 0.349609375, + "learning_rate": 2.736093211944679e-09, + "loss": 2.8521, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 489 + }, + { + "epoch": 0.9886506935687264, + "grad_norm": 0.3671875, + "learning_rate": 2.0102882345540696e-09, + "loss": 2.9154, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 490 + }, + { + "epoch": 0.9906683480453973, + "grad_norm": 0.333984375, + "learning_rate": 1.3960906743634706e-09, + "loss": 2.9166, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 491 + }, + { + "epoch": 0.9926860025220681, + "grad_norm": 0.349609375, + "learning_rate": 8.935279730407087e-10, + "loss": 2.8178, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 492 + }, + { + "epoch": 0.994703656998739, + "grad_norm": 0.3203125, + "learning_rate": 5.026225845308763e-10, + "loss": 2.8598, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 493 + }, + { + "epoch": 0.9967213114754099, + "grad_norm": 0.310546875, + "learning_rate": 2.2339197405490952e-10, + "loss": 2.9274, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 494 + }, + { + "epoch": 0.9987389659520807, + "grad_norm": 0.34765625, + "learning_rate": 5.5848617327436404e-11, + "loss": 3.005, + "memory/device_mem_reserved(gib)": 50.82, + "memory/max_mem_active(gib)": 45.14, + "memory/max_mem_allocated(gib)": 45.14, + "step": 495 + } + ], + "logging_steps": 1, + "max_steps": 495, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 248, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.453534295012147e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}