{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987389659520807, "eval_steps": 500, "global_step": 495, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00201765447667087, "grad_norm": 42.0, "learning_rate": 0.0, "loss": 3.6982, "memory/device_mem_reserved(gib)": 49.98, "memory/max_mem_active(gib)": 45.11, "memory/max_mem_allocated(gib)": 45.11, "step": 1 }, { "epoch": 0.00403530895334174, "grad_norm": 43.0, "learning_rate": 2.0000000000000002e-07, "loss": 3.8197, "memory/device_mem_reserved(gib)": 50.78, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 2 }, { "epoch": 0.00605296343001261, "grad_norm": 43.5, "learning_rate": 4.0000000000000003e-07, "loss": 3.8927, "memory/device_mem_reserved(gib)": 50.78, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 3 }, { "epoch": 0.00807061790668348, "grad_norm": 44.5, "learning_rate": 6.000000000000001e-07, "loss": 3.7968, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 4 }, { "epoch": 0.01008827238335435, "grad_norm": 42.25, "learning_rate": 8.000000000000001e-07, "loss": 3.8159, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 5 }, { "epoch": 0.01210592686002522, "grad_norm": 39.5, "learning_rate": 1.0000000000000002e-06, "loss": 3.8933, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 6 }, { "epoch": 0.01412358133669609, "grad_norm": 42.0, "learning_rate": 1.2000000000000002e-06, "loss": 3.8945, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 7 }, { "epoch": 0.01614123581336696, "grad_norm": 44.5, "learning_rate": 1.4000000000000001e-06, "loss": 3.9202, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 8 }, { "epoch": 0.01815889029003783, "grad_norm": 44.5, "learning_rate": 1.6000000000000001e-06, "loss": 3.8472, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 9 }, { "epoch": 0.0201765447667087, "grad_norm": 44.25, "learning_rate": 1.8000000000000001e-06, "loss": 3.9382, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 10 }, { "epoch": 0.02219419924337957, "grad_norm": 46.75, "learning_rate": 2.0000000000000003e-06, "loss": 3.8074, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 11 }, { "epoch": 0.02421185372005044, "grad_norm": 43.0, "learning_rate": 2.2e-06, "loss": 3.9049, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 12 }, { "epoch": 0.02622950819672131, "grad_norm": 46.25, "learning_rate": 2.4000000000000003e-06, "loss": 3.8222, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 13 }, { "epoch": 0.02824716267339218, "grad_norm": 39.75, "learning_rate": 2.6e-06, "loss": 3.7079, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 14 }, { "epoch": 0.03026481715006305, "grad_norm": 39.25, "learning_rate": 2.8000000000000003e-06, "loss": 3.6677, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 15 }, { "epoch": 0.03228247162673392, "grad_norm": 46.75, "learning_rate": 3e-06, "loss": 3.6726, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 16 }, { "epoch": 0.03430012610340479, "grad_norm": 43.0, "learning_rate": 3.2000000000000003e-06, "loss": 3.7075, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 17 }, { "epoch": 0.03631778058007566, "grad_norm": 47.25, "learning_rate": 3.4000000000000005e-06, "loss": 3.7656, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 18 }, { "epoch": 0.03833543505674653, "grad_norm": 42.25, "learning_rate": 3.6000000000000003e-06, "loss": 3.6015, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 19 }, { "epoch": 0.0403530895334174, "grad_norm": 45.25, "learning_rate": 3.8000000000000005e-06, "loss": 3.5949, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 20 }, { "epoch": 0.04237074401008827, "grad_norm": 43.5, "learning_rate": 4.000000000000001e-06, "loss": 3.514, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 21 }, { "epoch": 0.04438839848675914, "grad_norm": 44.75, "learning_rate": 4.2000000000000004e-06, "loss": 3.3632, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 22 }, { "epoch": 0.04640605296343001, "grad_norm": 40.75, "learning_rate": 4.4e-06, "loss": 3.4502, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 23 }, { "epoch": 0.04842370744010088, "grad_norm": 33.25, "learning_rate": 4.600000000000001e-06, "loss": 3.3191, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 24 }, { "epoch": 0.05044136191677175, "grad_norm": 39.0, "learning_rate": 4.800000000000001e-06, "loss": 3.3536, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 25 }, { "epoch": 0.05245901639344262, "grad_norm": 27.625, "learning_rate": 5e-06, "loss": 3.3645, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 26 }, { "epoch": 0.05447667087011349, "grad_norm": 26.375, "learning_rate": 4.999944151382673e-06, "loss": 3.2603, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 27 }, { "epoch": 0.05649432534678436, "grad_norm": 20.375, "learning_rate": 4.999776608025946e-06, "loss": 3.2394, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 28 }, { "epoch": 0.05851197982345523, "grad_norm": 20.375, "learning_rate": 4.99949737741547e-06, "loss": 3.237, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 29 }, { "epoch": 0.0605296343001261, "grad_norm": 14.25, "learning_rate": 4.99910647202696e-06, "loss": 3.0697, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 30 }, { "epoch": 0.06254728877679698, "grad_norm": 12.4375, "learning_rate": 4.998603909325636e-06, "loss": 3.2899, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 31 }, { "epoch": 0.06456494325346784, "grad_norm": 11.25, "learning_rate": 4.997989711765447e-06, "loss": 3.1902, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 32 }, { "epoch": 0.06658259773013872, "grad_norm": 7.5625, "learning_rate": 4.9972639067880555e-06, "loss": 3.0802, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 33 }, { "epoch": 0.06860025220680958, "grad_norm": 6.5, "learning_rate": 4.996426526821629e-06, "loss": 3.1234, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 34 }, { "epoch": 0.07061790668348046, "grad_norm": 6.25, "learning_rate": 4.9954776092793755e-06, "loss": 3.1298, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 35 }, { "epoch": 0.07263556116015132, "grad_norm": 5.9375, "learning_rate": 4.994417196557884e-06, "loss": 2.9955, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 36 }, { "epoch": 0.0746532156368222, "grad_norm": 5.46875, "learning_rate": 4.993245336035219e-06, "loss": 3.1321, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 37 }, { "epoch": 0.07667087011349306, "grad_norm": 3.875, "learning_rate": 4.991962080068813e-06, "loss": 3.0705, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 38 }, { "epoch": 0.07868852459016394, "grad_norm": 4.40625, "learning_rate": 4.990567485993125e-06, "loss": 3.072, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 39 }, { "epoch": 0.0807061790668348, "grad_norm": 3.859375, "learning_rate": 4.989061616117073e-06, "loss": 3.2298, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 40 }, { "epoch": 0.08272383354350568, "grad_norm": 3.46875, "learning_rate": 4.98744453772126e-06, "loss": 3.014, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 41 }, { "epoch": 0.08474148802017654, "grad_norm": 3.25, "learning_rate": 4.985716323054959e-06, "loss": 3.0271, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 42 }, { "epoch": 0.08675914249684742, "grad_norm": 3.40625, "learning_rate": 4.983877049332889e-06, "loss": 3.0976, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 43 }, { "epoch": 0.08877679697351828, "grad_norm": 2.671875, "learning_rate": 4.981926798731767e-06, "loss": 2.959, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 44 }, { "epoch": 0.09079445145018916, "grad_norm": 3.0, "learning_rate": 4.97986565838663e-06, "loss": 2.9811, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 45 }, { "epoch": 0.09281210592686003, "grad_norm": 2.6875, "learning_rate": 4.977693720386951e-06, "loss": 2.9736, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 46 }, { "epoch": 0.0948297604035309, "grad_norm": 2.203125, "learning_rate": 4.975411081772516e-06, "loss": 3.0368, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 47 }, { "epoch": 0.09684741488020177, "grad_norm": 1.96875, "learning_rate": 4.9730178445290945e-06, "loss": 3.0042, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 48 }, { "epoch": 0.09886506935687264, "grad_norm": 1.84375, "learning_rate": 4.970514115583878e-06, "loss": 3.0294, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 49 }, { "epoch": 0.1008827238335435, "grad_norm": 1.8359375, "learning_rate": 4.967900006800708e-06, "loss": 3.0115, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 50 }, { "epoch": 0.10290037831021438, "grad_norm": 1.78125, "learning_rate": 4.965175634975072e-06, "loss": 3.0661, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 51 }, { "epoch": 0.10491803278688525, "grad_norm": 1.6015625, "learning_rate": 4.96234112182889e-06, "loss": 3.0006, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 52 }, { "epoch": 0.10693568726355612, "grad_norm": 1.5859375, "learning_rate": 4.959396594005073e-06, "loss": 3.0469, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 53 }, { "epoch": 0.10895334174022699, "grad_norm": 1.453125, "learning_rate": 4.95634218306187e-06, "loss": 3.0325, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 54 }, { "epoch": 0.11097099621689786, "grad_norm": 1.296875, "learning_rate": 4.953178025466981e-06, "loss": 2.9788, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 55 }, { "epoch": 0.11298865069356873, "grad_norm": 1.3984375, "learning_rate": 4.949904262591467e-06, "loss": 3.0148, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 56 }, { "epoch": 0.1150063051702396, "grad_norm": 1.3828125, "learning_rate": 4.946521040703434e-06, "loss": 2.9016, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 57 }, { "epoch": 0.11702395964691047, "grad_norm": 1.21875, "learning_rate": 4.943028510961492e-06, "loss": 3.1604, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 58 }, { "epoch": 0.11904161412358134, "grad_norm": 1.2109375, "learning_rate": 4.939426829408008e-06, "loss": 2.9688, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 59 }, { "epoch": 0.1210592686002522, "grad_norm": 1.203125, "learning_rate": 4.9357161569621275e-06, "loss": 2.8605, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 60 }, { "epoch": 0.12307692307692308, "grad_norm": 0.93359375, "learning_rate": 4.931896659412593e-06, "loss": 3.0057, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 61 }, { "epoch": 0.12509457755359396, "grad_norm": 0.96484375, "learning_rate": 4.92796850741033e-06, "loss": 2.9342, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 62 }, { "epoch": 0.1271122320302648, "grad_norm": 0.9921875, "learning_rate": 4.9239318764608245e-06, "loss": 2.949, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 63 }, { "epoch": 0.1291298865069357, "grad_norm": 0.95703125, "learning_rate": 4.919786946916282e-06, "loss": 2.9372, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 64 }, { "epoch": 0.13114754098360656, "grad_norm": 0.79296875, "learning_rate": 4.91553390396757e-06, "loss": 2.9564, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 65 }, { "epoch": 0.13316519546027744, "grad_norm": 0.90234375, "learning_rate": 4.911172937635942e-06, "loss": 2.9389, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 66 }, { "epoch": 0.1351828499369483, "grad_norm": 0.79296875, "learning_rate": 4.906704242764551e-06, "loss": 2.8962, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 67 }, { "epoch": 0.13720050441361917, "grad_norm": 0.76953125, "learning_rate": 4.902128019009741e-06, "loss": 2.9853, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 68 }, { "epoch": 0.13921815889029004, "grad_norm": 0.77734375, "learning_rate": 4.8974444708321265e-06, "loss": 2.9407, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 69 }, { "epoch": 0.14123581336696092, "grad_norm": 0.78125, "learning_rate": 4.892653807487461e-06, "loss": 2.9395, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 70 }, { "epoch": 0.14325346784363177, "grad_norm": 0.72265625, "learning_rate": 4.887756243017282e-06, "loss": 3.0041, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 71 }, { "epoch": 0.14527112232030265, "grad_norm": 0.63671875, "learning_rate": 4.882751996239352e-06, "loss": 2.9692, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 72 }, { "epoch": 0.14728877679697353, "grad_norm": 0.6875, "learning_rate": 4.8776412907378845e-06, "loss": 2.9279, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 73 }, { "epoch": 0.1493064312736444, "grad_norm": 0.7109375, "learning_rate": 4.872424354853545e-06, "loss": 2.9776, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 74 }, { "epoch": 0.15132408575031525, "grad_norm": 0.80859375, "learning_rate": 4.867101421673261e-06, "loss": 3.0093, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 75 }, { "epoch": 0.15334174022698613, "grad_norm": 0.62890625, "learning_rate": 4.861672729019798e-06, "loss": 2.9489, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 76 }, { "epoch": 0.155359394703657, "grad_norm": 0.6171875, "learning_rate": 4.856138519441137e-06, "loss": 2.8465, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 77 }, { "epoch": 0.15737704918032788, "grad_norm": 0.7265625, "learning_rate": 4.8504990401996434e-06, "loss": 2.9363, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 78 }, { "epoch": 0.15939470365699873, "grad_norm": 0.53125, "learning_rate": 4.8447545432610095e-06, "loss": 2.8965, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 79 }, { "epoch": 0.1614123581336696, "grad_norm": 0.61328125, "learning_rate": 4.8389052852830055e-06, "loss": 2.96, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 80 }, { "epoch": 0.16343001261034049, "grad_norm": 0.578125, "learning_rate": 4.832951527604007e-06, "loss": 2.9506, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 81 }, { "epoch": 0.16544766708701136, "grad_norm": 0.59375, "learning_rate": 4.826893536231322e-06, "loss": 2.8551, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 82 }, { "epoch": 0.1674653215636822, "grad_norm": 0.625, "learning_rate": 4.820731581829303e-06, "loss": 3.0159, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 83 }, { "epoch": 0.1694829760403531, "grad_norm": 0.5234375, "learning_rate": 4.814465939707259e-06, "loss": 2.9186, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 84 }, { "epoch": 0.17150063051702397, "grad_norm": 0.51171875, "learning_rate": 4.808096889807147e-06, "loss": 3.0161, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 85 }, { "epoch": 0.17351828499369484, "grad_norm": 0.57421875, "learning_rate": 4.801624716691072e-06, "loss": 2.9589, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 86 }, { "epoch": 0.1755359394703657, "grad_norm": 0.5625, "learning_rate": 4.795049709528571e-06, "loss": 2.9643, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 87 }, { "epoch": 0.17755359394703657, "grad_norm": 0.578125, "learning_rate": 4.78837216208369e-06, "loss": 2.9542, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 88 }, { "epoch": 0.17957124842370745, "grad_norm": 0.50390625, "learning_rate": 4.7815923727018625e-06, "loss": 2.9755, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 89 }, { "epoch": 0.18158890290037832, "grad_norm": 0.54296875, "learning_rate": 4.774710644296579e-06, "loss": 2.9595, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 90 }, { "epoch": 0.18360655737704917, "grad_norm": 0.48828125, "learning_rate": 4.767727284335852e-06, "loss": 2.9172, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 91 }, { "epoch": 0.18562421185372005, "grad_norm": 0.515625, "learning_rate": 4.760642604828482e-06, "loss": 2.9182, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 92 }, { "epoch": 0.18764186633039093, "grad_norm": 0.59375, "learning_rate": 4.753456922310109e-06, "loss": 2.9225, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 93 }, { "epoch": 0.1896595208070618, "grad_norm": 0.515625, "learning_rate": 4.746170557829084e-06, "loss": 2.8674, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 94 }, { "epoch": 0.19167717528373265, "grad_norm": 0.4921875, "learning_rate": 4.738783836932109e-06, "loss": 2.8713, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 95 }, { "epoch": 0.19369482976040353, "grad_norm": 0.478515625, "learning_rate": 4.731297089649704e-06, "loss": 3.0112, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 96 }, { "epoch": 0.1957124842370744, "grad_norm": 0.46875, "learning_rate": 4.723710650481456e-06, "loss": 2.9569, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 97 }, { "epoch": 0.19773013871374528, "grad_norm": 0.48828125, "learning_rate": 4.7160248583810755e-06, "loss": 2.9466, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 98 }, { "epoch": 0.19974779319041613, "grad_norm": 0.48828125, "learning_rate": 4.708240056741253e-06, "loss": 2.9399, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 99 }, { "epoch": 0.201765447667087, "grad_norm": 0.51171875, "learning_rate": 4.700356593378312e-06, "loss": 2.9445, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 100 }, { "epoch": 0.2037831021437579, "grad_norm": 0.51953125, "learning_rate": 4.692374820516679e-06, "loss": 2.9657, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 101 }, { "epoch": 0.20580075662042877, "grad_norm": 0.53125, "learning_rate": 4.684295094773134e-06, "loss": 3.0236, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 102 }, { "epoch": 0.20781841109709961, "grad_norm": 0.470703125, "learning_rate": 4.676117777140887e-06, "loss": 2.9442, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 103 }, { "epoch": 0.2098360655737705, "grad_norm": 0.490234375, "learning_rate": 4.667843232973444e-06, "loss": 2.9491, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 104 }, { "epoch": 0.21185372005044137, "grad_norm": 0.5390625, "learning_rate": 4.659471831968285e-06, "loss": 3.0061, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 105 }, { "epoch": 0.21387137452711225, "grad_norm": 0.474609375, "learning_rate": 4.651003948150349e-06, "loss": 2.8666, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 106 }, { "epoch": 0.2158890290037831, "grad_norm": 0.4375, "learning_rate": 4.642439959855316e-06, "loss": 2.9507, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 107 }, { "epoch": 0.21790668348045397, "grad_norm": 0.48828125, "learning_rate": 4.633780249712712e-06, "loss": 2.8984, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 108 }, { "epoch": 0.21992433795712485, "grad_norm": 0.421875, "learning_rate": 4.625025204628806e-06, "loss": 2.8593, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 109 }, { "epoch": 0.22194199243379573, "grad_norm": 0.470703125, "learning_rate": 4.616175215769328e-06, "loss": 2.8705, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 110 }, { "epoch": 0.22395964691046658, "grad_norm": 0.482421875, "learning_rate": 4.607230678541993e-06, "loss": 2.916, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 111 }, { "epoch": 0.22597730138713745, "grad_norm": 0.39453125, "learning_rate": 4.5981919925788285e-06, "loss": 2.921, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 112 }, { "epoch": 0.22799495586380833, "grad_norm": 0.40625, "learning_rate": 4.5890595617183254e-06, "loss": 2.8851, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 113 }, { "epoch": 0.2300126103404792, "grad_norm": 0.443359375, "learning_rate": 4.579833793987393e-06, "loss": 2.9397, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 114 }, { "epoch": 0.23203026481715006, "grad_norm": 0.47265625, "learning_rate": 4.570515101583128e-06, "loss": 2.8029, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 115 }, { "epoch": 0.23404791929382093, "grad_norm": 0.447265625, "learning_rate": 4.561103900854401e-06, "loss": 2.9667, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 116 }, { "epoch": 0.2360655737704918, "grad_norm": 0.435546875, "learning_rate": 4.551600612283249e-06, "loss": 3.009, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 117 }, { "epoch": 0.2380832282471627, "grad_norm": 0.427734375, "learning_rate": 4.542005660466095e-06, "loss": 2.7894, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 118 }, { "epoch": 0.24010088272383354, "grad_norm": 0.45703125, "learning_rate": 4.532319474094769e-06, "loss": 2.9377, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 119 }, { "epoch": 0.2421185372005044, "grad_norm": 0.4609375, "learning_rate": 4.522542485937369e-06, "loss": 2.8978, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 120 }, { "epoch": 0.2441361916771753, "grad_norm": 0.45703125, "learning_rate": 4.512675132818908e-06, "loss": 2.9091, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 121 }, { "epoch": 0.24615384615384617, "grad_norm": 0.4375, "learning_rate": 4.5027178556018095e-06, "loss": 2.9647, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 122 }, { "epoch": 0.24817150063051702, "grad_norm": 0.431640625, "learning_rate": 4.492671099166204e-06, "loss": 2.94, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 123 }, { "epoch": 0.2501891551071879, "grad_norm": 0.412109375, "learning_rate": 4.482535312390059e-06, "loss": 2.8902, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 124 }, { "epoch": 0.25220680958385877, "grad_norm": 0.421875, "learning_rate": 4.472310948129113e-06, "loss": 3.0486, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 125 }, { "epoch": 0.2542244640605296, "grad_norm": 0.47265625, "learning_rate": 4.461998463196653e-06, "loss": 2.848, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 126 }, { "epoch": 0.2562421185372005, "grad_norm": 0.447265625, "learning_rate": 4.451598318343099e-06, "loss": 2.9275, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 127 }, { "epoch": 0.2582597730138714, "grad_norm": 0.443359375, "learning_rate": 4.441110978235419e-06, "loss": 3.0023, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 128 }, { "epoch": 0.2602774274905422, "grad_norm": 0.421875, "learning_rate": 4.430536911436368e-06, "loss": 2.8752, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 129 }, { "epoch": 0.26229508196721313, "grad_norm": 0.447265625, "learning_rate": 4.419876590383554e-06, "loss": 2.9107, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 130 }, { "epoch": 0.264312736443884, "grad_norm": 0.376953125, "learning_rate": 4.409130491368331e-06, "loss": 2.9529, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 131 }, { "epoch": 0.2663303909205549, "grad_norm": 0.400390625, "learning_rate": 4.398299094514515e-06, "loss": 2.9129, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 132 }, { "epoch": 0.26834804539722573, "grad_norm": 0.376953125, "learning_rate": 4.387382883756938e-06, "loss": 2.9802, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 133 }, { "epoch": 0.2703656998738966, "grad_norm": 0.39453125, "learning_rate": 4.37638234681982e-06, "loss": 2.8098, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 134 }, { "epoch": 0.2723833543505675, "grad_norm": 0.365234375, "learning_rate": 4.365297975194984e-06, "loss": 2.9357, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 135 }, { "epoch": 0.27440100882723834, "grad_norm": 0.48828125, "learning_rate": 4.354130264119894e-06, "loss": 2.9512, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 136 }, { "epoch": 0.2764186633039092, "grad_norm": 0.52734375, "learning_rate": 4.342879712555528e-06, "loss": 2.9478, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 137 }, { "epoch": 0.2784363177805801, "grad_norm": 0.423828125, "learning_rate": 4.331546823164083e-06, "loss": 3.0044, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 138 }, { "epoch": 0.28045397225725094, "grad_norm": 0.388671875, "learning_rate": 4.320132102286524e-06, "loss": 2.9148, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 139 }, { "epoch": 0.28247162673392184, "grad_norm": 0.36328125, "learning_rate": 4.308636059919952e-06, "loss": 2.8966, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 140 }, { "epoch": 0.2844892812105927, "grad_norm": 0.40625, "learning_rate": 4.297059209694824e-06, "loss": 2.7564, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 141 }, { "epoch": 0.28650693568726354, "grad_norm": 0.400390625, "learning_rate": 4.2854020688520025e-06, "loss": 2.809, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 142 }, { "epoch": 0.28852459016393445, "grad_norm": 0.416015625, "learning_rate": 4.273665158219645e-06, "loss": 2.9237, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 143 }, { "epoch": 0.2905422446406053, "grad_norm": 0.3828125, "learning_rate": 4.261849002189939e-06, "loss": 2.9274, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 144 }, { "epoch": 0.29255989911727615, "grad_norm": 0.40234375, "learning_rate": 4.249954128695662e-06, "loss": 2.909, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 145 }, { "epoch": 0.29457755359394705, "grad_norm": 0.359375, "learning_rate": 4.237981069186606e-06, "loss": 2.8917, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 146 }, { "epoch": 0.2965952080706179, "grad_norm": 0.39453125, "learning_rate": 4.225930358605827e-06, "loss": 2.8702, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 147 }, { "epoch": 0.2986128625472888, "grad_norm": 0.373046875, "learning_rate": 4.213802535365741e-06, "loss": 2.9231, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 148 }, { "epoch": 0.30063051702395965, "grad_norm": 0.376953125, "learning_rate": 4.201598141324078e-06, "loss": 2.8554, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 149 }, { "epoch": 0.3026481715006305, "grad_norm": 0.376953125, "learning_rate": 4.189317721759663e-06, "loss": 2.8243, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 150 }, { "epoch": 0.3046658259773014, "grad_norm": 0.400390625, "learning_rate": 4.176961825348059e-06, "loss": 3.031, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 151 }, { "epoch": 0.30668348045397226, "grad_norm": 0.37109375, "learning_rate": 4.16453100413705e-06, "loss": 2.8808, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 152 }, { "epoch": 0.3087011349306431, "grad_norm": 0.36328125, "learning_rate": 4.152025813521976e-06, "loss": 2.8957, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 153 }, { "epoch": 0.310718789407314, "grad_norm": 0.400390625, "learning_rate": 4.1394468122209245e-06, "loss": 3.0874, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 154 }, { "epoch": 0.31273644388398486, "grad_norm": 0.392578125, "learning_rate": 4.1267945622497566e-06, "loss": 2.9214, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 155 }, { "epoch": 0.31475409836065577, "grad_norm": 0.37890625, "learning_rate": 4.114069628897006e-06, "loss": 2.9121, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 156 }, { "epoch": 0.3167717528373266, "grad_norm": 0.388671875, "learning_rate": 4.101272580698621e-06, "loss": 2.944, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 157 }, { "epoch": 0.31878940731399746, "grad_norm": 0.40234375, "learning_rate": 4.08840398941256e-06, "loss": 2.928, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 158 }, { "epoch": 0.32080706179066837, "grad_norm": 0.390625, "learning_rate": 4.075464429993244e-06, "loss": 2.8135, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 159 }, { "epoch": 0.3228247162673392, "grad_norm": 0.369140625, "learning_rate": 4.0624544805658795e-06, "loss": 2.93, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 160 }, { "epoch": 0.32484237074401007, "grad_norm": 0.3515625, "learning_rate": 4.049374722400613e-06, "loss": 2.7283, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 161 }, { "epoch": 0.32686002522068097, "grad_norm": 0.380859375, "learning_rate": 4.0362257398865715e-06, "loss": 2.9056, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 162 }, { "epoch": 0.3288776796973518, "grad_norm": 0.37890625, "learning_rate": 4.02300812050575e-06, "loss": 2.9232, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 163 }, { "epoch": 0.3308953341740227, "grad_norm": 0.380859375, "learning_rate": 4.009722454806762e-06, "loss": 2.8507, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 164 }, { "epoch": 0.3329129886506936, "grad_norm": 0.388671875, "learning_rate": 3.9963693363784544e-06, "loss": 2.8988, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 165 }, { "epoch": 0.3349306431273644, "grad_norm": 0.35546875, "learning_rate": 3.982949361823388e-06, "loss": 2.8836, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 166 }, { "epoch": 0.33694829760403533, "grad_norm": 0.369140625, "learning_rate": 3.969463130731183e-06, "loss": 2.8122, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 167 }, { "epoch": 0.3389659520807062, "grad_norm": 0.361328125, "learning_rate": 3.955911245651726e-06, "loss": 2.9137, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 168 }, { "epoch": 0.34098360655737703, "grad_norm": 0.359375, "learning_rate": 3.942294312068252e-06, "loss": 2.9557, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 169 }, { "epoch": 0.34300126103404793, "grad_norm": 0.380859375, "learning_rate": 3.928612938370292e-06, "loss": 2.8614, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 170 }, { "epoch": 0.3450189155107188, "grad_norm": 0.376953125, "learning_rate": 3.914867735826489e-06, "loss": 2.9181, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 171 }, { "epoch": 0.3470365699873897, "grad_norm": 0.35546875, "learning_rate": 3.901059318557287e-06, "loss": 2.8829, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 172 }, { "epoch": 0.34905422446406054, "grad_norm": 0.421875, "learning_rate": 3.8871883035074975e-06, "loss": 2.9295, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 173 }, { "epoch": 0.3510718789407314, "grad_norm": 0.36328125, "learning_rate": 3.87325531041873e-06, "loss": 2.8678, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 174 }, { "epoch": 0.3530895334174023, "grad_norm": 0.376953125, "learning_rate": 3.859260961801702e-06, "loss": 2.871, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 175 }, { "epoch": 0.35510718789407314, "grad_norm": 0.3515625, "learning_rate": 3.845205882908432e-06, "loss": 2.9155, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 176 }, { "epoch": 0.357124842370744, "grad_norm": 0.341796875, "learning_rate": 3.8310907017042966e-06, "loss": 2.8222, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 177 }, { "epoch": 0.3591424968474149, "grad_norm": 0.37890625, "learning_rate": 3.816916048839979e-06, "loss": 2.9431, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 178 }, { "epoch": 0.36116015132408574, "grad_norm": 0.421875, "learning_rate": 3.8026825576232906e-06, "loss": 2.8602, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 179 }, { "epoch": 0.36317780580075665, "grad_norm": 0.380859375, "learning_rate": 3.7883908639908752e-06, "loss": 2.9468, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 180 }, { "epoch": 0.3651954602774275, "grad_norm": 0.365234375, "learning_rate": 3.774041606479794e-06, "loss": 2.9297, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 181 }, { "epoch": 0.36721311475409835, "grad_norm": 0.404296875, "learning_rate": 3.759635426199001e-06, "loss": 2.8849, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 182 }, { "epoch": 0.36923076923076925, "grad_norm": 0.373046875, "learning_rate": 3.7451729668006974e-06, "loss": 2.8075, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 183 }, { "epoch": 0.3712484237074401, "grad_norm": 0.359375, "learning_rate": 3.730654874451569e-06, "loss": 2.7931, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 184 }, { "epoch": 0.37326607818411095, "grad_norm": 0.375, "learning_rate": 3.7160817978039256e-06, "loss": 2.9012, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 185 }, { "epoch": 0.37528373266078185, "grad_norm": 0.34765625, "learning_rate": 3.7014543879667097e-06, "loss": 2.8396, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 186 }, { "epoch": 0.3773013871374527, "grad_norm": 0.345703125, "learning_rate": 3.6867732984764144e-06, "loss": 2.8584, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 187 }, { "epoch": 0.3793190416141236, "grad_norm": 0.373046875, "learning_rate": 3.6720391852678783e-06, "loss": 2.9347, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 188 }, { "epoch": 0.38133669609079446, "grad_norm": 0.353515625, "learning_rate": 3.657252706644982e-06, "loss": 2.9129, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 189 }, { "epoch": 0.3833543505674653, "grad_norm": 0.365234375, "learning_rate": 3.6424145232512337e-06, "loss": 2.8445, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 190 }, { "epoch": 0.3853720050441362, "grad_norm": 0.365234375, "learning_rate": 3.627525298040255e-06, "loss": 2.8571, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 191 }, { "epoch": 0.38738965952080706, "grad_norm": 0.36328125, "learning_rate": 3.612585696246158e-06, "loss": 2.9072, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 192 }, { "epoch": 0.3894073139974779, "grad_norm": 0.35546875, "learning_rate": 3.5975963853538273e-06, "loss": 2.9549, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 193 }, { "epoch": 0.3914249684741488, "grad_norm": 0.365234375, "learning_rate": 3.5825580350690914e-06, "loss": 2.8663, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 194 }, { "epoch": 0.39344262295081966, "grad_norm": 0.3515625, "learning_rate": 3.5674713172888075e-06, "loss": 3.0238, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 195 }, { "epoch": 0.39546027742749057, "grad_norm": 0.3671875, "learning_rate": 3.552336906070838e-06, "loss": 2.922, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 196 }, { "epoch": 0.3974779319041614, "grad_norm": 0.34375, "learning_rate": 3.5371554776039344e-06, "loss": 2.8983, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 197 }, { "epoch": 0.39949558638083227, "grad_norm": 0.3515625, "learning_rate": 3.52192771017753e-06, "loss": 2.9008, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 198 }, { "epoch": 0.4015132408575032, "grad_norm": 0.357421875, "learning_rate": 3.5066542841514275e-06, "loss": 2.9236, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 199 }, { "epoch": 0.403530895334174, "grad_norm": 0.400390625, "learning_rate": 3.491335881925407e-06, "loss": 2.8329, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 200 }, { "epoch": 0.40554854981084487, "grad_norm": 0.37890625, "learning_rate": 3.4759731879087373e-06, "loss": 2.8975, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 201 }, { "epoch": 0.4075662042875158, "grad_norm": 0.359375, "learning_rate": 3.460566888489593e-06, "loss": 2.9363, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 202 }, { "epoch": 0.4095838587641866, "grad_norm": 0.35546875, "learning_rate": 3.4451176720043906e-06, "loss": 2.9014, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 203 }, { "epoch": 0.41160151324085753, "grad_norm": 0.35546875, "learning_rate": 3.4296262287070337e-06, "loss": 2.8582, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 204 }, { "epoch": 0.4136191677175284, "grad_norm": 0.375, "learning_rate": 3.4140932507380727e-06, "loss": 2.9568, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 205 }, { "epoch": 0.41563682219419923, "grad_norm": 0.359375, "learning_rate": 3.398519432093782e-06, "loss": 2.7844, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 206 }, { "epoch": 0.41765447667087013, "grad_norm": 0.341796875, "learning_rate": 3.3829054685951535e-06, "loss": 2.9679, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 207 }, { "epoch": 0.419672131147541, "grad_norm": 0.353515625, "learning_rate": 3.3672520578568018e-06, "loss": 3.0012, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 208 }, { "epoch": 0.42168978562421183, "grad_norm": 0.359375, "learning_rate": 3.351559899255806e-06, "loss": 2.8042, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 209 }, { "epoch": 0.42370744010088274, "grad_norm": 0.34765625, "learning_rate": 3.335829693900455e-06, "loss": 2.8814, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 210 }, { "epoch": 0.4257250945775536, "grad_norm": 0.36328125, "learning_rate": 3.3200621445989227e-06, "loss": 2.8684, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 211 }, { "epoch": 0.4277427490542245, "grad_norm": 0.3671875, "learning_rate": 3.304257955827872e-06, "loss": 2.8368, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 212 }, { "epoch": 0.42976040353089534, "grad_norm": 0.36328125, "learning_rate": 3.2884178337009764e-06, "loss": 2.8657, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 213 }, { "epoch": 0.4317780580075662, "grad_norm": 0.353515625, "learning_rate": 3.272542485937369e-06, "loss": 2.8492, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 214 }, { "epoch": 0.4337957124842371, "grad_norm": 0.33984375, "learning_rate": 3.2566326218300287e-06, "loss": 2.904, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 215 }, { "epoch": 0.43581336696090794, "grad_norm": 0.41796875, "learning_rate": 3.2406889522140854e-06, "loss": 2.9362, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 216 }, { "epoch": 0.4378310214375788, "grad_norm": 0.34375, "learning_rate": 3.2247121894350614e-06, "loss": 2.8333, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 217 }, { "epoch": 0.4398486759142497, "grad_norm": 0.3515625, "learning_rate": 3.208703047317045e-06, "loss": 2.9378, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 218 }, { "epoch": 0.44186633039092055, "grad_norm": 0.330078125, "learning_rate": 3.1926622411307985e-06, "loss": 2.8866, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 219 }, { "epoch": 0.44388398486759145, "grad_norm": 0.33203125, "learning_rate": 3.1765904875617977e-06, "loss": 2.8879, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 220 }, { "epoch": 0.4459016393442623, "grad_norm": 0.5390625, "learning_rate": 3.1604885046782158e-06, "loss": 2.8542, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 221 }, { "epoch": 0.44791929382093315, "grad_norm": 0.35546875, "learning_rate": 3.1443570118988357e-06, "loss": 2.9413, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 222 }, { "epoch": 0.44993694829760406, "grad_norm": 0.353515625, "learning_rate": 3.128196729960912e-06, "loss": 2.9176, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 223 }, { "epoch": 0.4519546027742749, "grad_norm": 0.396484375, "learning_rate": 3.1120083808879666e-06, "loss": 2.9672, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 224 }, { "epoch": 0.45397225725094575, "grad_norm": 0.33984375, "learning_rate": 3.095792687957528e-06, "loss": 2.9543, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 225 }, { "epoch": 0.45598991172761666, "grad_norm": 0.33984375, "learning_rate": 3.0795503756688212e-06, "loss": 2.7878, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 226 }, { "epoch": 0.4580075662042875, "grad_norm": 0.365234375, "learning_rate": 3.063282169710392e-06, "loss": 2.9394, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 227 }, { "epoch": 0.4600252206809584, "grad_norm": 0.373046875, "learning_rate": 3.046988796927688e-06, "loss": 2.838, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 228 }, { "epoch": 0.46204287515762926, "grad_norm": 0.357421875, "learning_rate": 3.0306709852905824e-06, "loss": 2.9316, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 229 }, { "epoch": 0.4640605296343001, "grad_norm": 0.337890625, "learning_rate": 3.014329463860849e-06, "loss": 2.8895, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 230 }, { "epoch": 0.466078184110971, "grad_norm": 0.375, "learning_rate": 2.9979649627595904e-06, "loss": 2.9382, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 231 }, { "epoch": 0.46809583858764187, "grad_norm": 0.375, "learning_rate": 2.981578213134614e-06, "loss": 2.9179, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 232 }, { "epoch": 0.4701134930643127, "grad_norm": 0.314453125, "learning_rate": 2.9651699471277664e-06, "loss": 2.803, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 233 }, { "epoch": 0.4721311475409836, "grad_norm": 0.453125, "learning_rate": 2.9487408978422233e-06, "loss": 2.8782, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 234 }, { "epoch": 0.47414880201765447, "grad_norm": 0.357421875, "learning_rate": 2.932291799309734e-06, "loss": 2.9224, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 235 }, { "epoch": 0.4761664564943254, "grad_norm": 0.322265625, "learning_rate": 2.9158233864578256e-06, "loss": 2.8951, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 236 }, { "epoch": 0.4781841109709962, "grad_norm": 0.400390625, "learning_rate": 2.8993363950769685e-06, "loss": 2.8369, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 237 }, { "epoch": 0.4802017654476671, "grad_norm": 0.357421875, "learning_rate": 2.8828315617877006e-06, "loss": 2.9335, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 238 }, { "epoch": 0.482219419924338, "grad_norm": 0.337890625, "learning_rate": 2.866309624007717e-06, "loss": 2.838, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 239 }, { "epoch": 0.4842370744010088, "grad_norm": 0.333984375, "learning_rate": 2.849771319918922e-06, "loss": 2.9317, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 240 }, { "epoch": 0.4862547288776797, "grad_norm": 0.3671875, "learning_rate": 2.8332173884344477e-06, "loss": 2.8651, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 241 }, { "epoch": 0.4882723833543506, "grad_norm": 0.349609375, "learning_rate": 2.8166485691656425e-06, "loss": 2.8215, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 242 }, { "epoch": 0.49029003783102143, "grad_norm": 0.33203125, "learning_rate": 2.8000656023890245e-06, "loss": 2.909, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 243 }, { "epoch": 0.49230769230769234, "grad_norm": 0.361328125, "learning_rate": 2.7834692290132054e-06, "loss": 2.8417, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 244 }, { "epoch": 0.4943253467843632, "grad_norm": 0.35546875, "learning_rate": 2.766860190545791e-06, "loss": 2.8922, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 245 }, { "epoch": 0.49634300126103403, "grad_norm": 0.3515625, "learning_rate": 2.7502392290602463e-06, "loss": 2.8867, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 246 }, { "epoch": 0.49836065573770494, "grad_norm": 0.3203125, "learning_rate": 2.7336070871627467e-06, "loss": 2.8258, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 247 }, { "epoch": 0.5003783102143758, "grad_norm": 0.345703125, "learning_rate": 2.716964507958994e-06, "loss": 2.8912, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 248 }, { "epoch": 0.5023959646910466, "grad_norm": 0.349609375, "learning_rate": 2.7003122350210185e-06, "loss": 2.9522, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 249 }, { "epoch": 0.5044136191677175, "grad_norm": 0.33984375, "learning_rate": 2.6836510123539556e-06, "loss": 2.8601, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 250 }, { "epoch": 0.5064312736443884, "grad_norm": 0.341796875, "learning_rate": 2.6669815843628043e-06, "loss": 2.8434, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 251 }, { "epoch": 0.5084489281210592, "grad_norm": 0.3359375, "learning_rate": 2.650304695819168e-06, "loss": 2.9957, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 252 }, { "epoch": 0.5104665825977301, "grad_norm": 0.388671875, "learning_rate": 2.6336210918279807e-06, "loss": 2.9345, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 253 }, { "epoch": 0.512484237074401, "grad_norm": 0.373046875, "learning_rate": 2.6169315177942134e-06, "loss": 2.8346, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 254 }, { "epoch": 0.5145018915510718, "grad_norm": 0.341796875, "learning_rate": 2.6002367193895733e-06, "loss": 2.8084, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 255 }, { "epoch": 0.5165195460277427, "grad_norm": 0.333984375, "learning_rate": 2.5835374425191867e-06, "loss": 2.9052, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 256 }, { "epoch": 0.5185372005044137, "grad_norm": 0.3359375, "learning_rate": 2.566834433288272e-06, "loss": 2.8175, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 257 }, { "epoch": 0.5205548549810844, "grad_norm": 0.341796875, "learning_rate": 2.5501284379688067e-06, "loss": 2.8486, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 258 }, { "epoch": 0.5225725094577554, "grad_norm": 0.35546875, "learning_rate": 2.533420202966182e-06, "loss": 2.8192, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 259 }, { "epoch": 0.5245901639344263, "grad_norm": 0.46484375, "learning_rate": 2.516710474785856e-06, "loss": 2.9291, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 260 }, { "epoch": 0.526607818411097, "grad_norm": 0.34765625, "learning_rate": 2.5e-06, "loss": 2.9709, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 261 }, { "epoch": 0.528625472887768, "grad_norm": 0.310546875, "learning_rate": 2.483289525214145e-06, "loss": 2.8498, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 262 }, { "epoch": 0.5306431273644389, "grad_norm": 0.353515625, "learning_rate": 2.4665797970338183e-06, "loss": 2.9485, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 263 }, { "epoch": 0.5326607818411098, "grad_norm": 0.33984375, "learning_rate": 2.4498715620311937e-06, "loss": 2.9009, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 264 }, { "epoch": 0.5346784363177806, "grad_norm": 0.337890625, "learning_rate": 2.4331655667117284e-06, "loss": 2.8577, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 265 }, { "epoch": 0.5366960907944515, "grad_norm": 0.38671875, "learning_rate": 2.4164625574808145e-06, "loss": 2.8691, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 266 }, { "epoch": 0.5387137452711224, "grad_norm": 0.447265625, "learning_rate": 2.3997632806104275e-06, "loss": 2.8994, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 267 }, { "epoch": 0.5407313997477932, "grad_norm": 0.330078125, "learning_rate": 2.383068482205788e-06, "loss": 2.8892, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 268 }, { "epoch": 0.5427490542244641, "grad_norm": 0.373046875, "learning_rate": 2.36637890817202e-06, "loss": 2.9277, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 269 }, { "epoch": 0.544766708701135, "grad_norm": 0.361328125, "learning_rate": 2.3496953041808327e-06, "loss": 2.7727, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 270 }, { "epoch": 0.5467843631778058, "grad_norm": 0.310546875, "learning_rate": 2.333018415637196e-06, "loss": 2.8423, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 271 }, { "epoch": 0.5488020176544767, "grad_norm": 0.328125, "learning_rate": 2.3163489876460453e-06, "loss": 2.7515, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 272 }, { "epoch": 0.5508196721311476, "grad_norm": 0.369140625, "learning_rate": 2.2996877649789815e-06, "loss": 3.039, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 273 }, { "epoch": 0.5528373266078184, "grad_norm": 0.33203125, "learning_rate": 2.2830354920410066e-06, "loss": 2.8529, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 274 }, { "epoch": 0.5548549810844893, "grad_norm": 0.33203125, "learning_rate": 2.2663929128372537e-06, "loss": 2.8775, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 275 }, { "epoch": 0.5568726355611602, "grad_norm": 0.333984375, "learning_rate": 2.249760770939754e-06, "loss": 2.9064, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 276 }, { "epoch": 0.558890290037831, "grad_norm": 0.3359375, "learning_rate": 2.2331398094542097e-06, "loss": 2.8687, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 277 }, { "epoch": 0.5609079445145019, "grad_norm": 0.333984375, "learning_rate": 2.2165307709867954e-06, "loss": 2.9269, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 278 }, { "epoch": 0.5629255989911728, "grad_norm": 0.34375, "learning_rate": 2.199934397610976e-06, "loss": 2.877, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 279 }, { "epoch": 0.5649432534678437, "grad_norm": 0.3203125, "learning_rate": 2.1833514308343583e-06, "loss": 2.9292, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 280 }, { "epoch": 0.5669609079445145, "grad_norm": 0.345703125, "learning_rate": 2.1667826115655536e-06, "loss": 2.8412, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 281 }, { "epoch": 0.5689785624211854, "grad_norm": 0.330078125, "learning_rate": 2.150228680081079e-06, "loss": 2.88, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 282 }, { "epoch": 0.5709962168978563, "grad_norm": 0.3515625, "learning_rate": 2.1336903759922838e-06, "loss": 2.7502, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 283 }, { "epoch": 0.5730138713745271, "grad_norm": 0.408203125, "learning_rate": 2.1171684382123002e-06, "loss": 2.8315, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 284 }, { "epoch": 0.575031525851198, "grad_norm": 0.322265625, "learning_rate": 2.1006636049230327e-06, "loss": 2.941, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 285 }, { "epoch": 0.5770491803278689, "grad_norm": 0.328125, "learning_rate": 2.0841766135421753e-06, "loss": 2.9317, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 286 }, { "epoch": 0.5790668348045397, "grad_norm": 0.330078125, "learning_rate": 2.0677082006902673e-06, "loss": 2.931, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 287 }, { "epoch": 0.5810844892812106, "grad_norm": 0.349609375, "learning_rate": 2.0512591021577775e-06, "loss": 2.8567, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 288 }, { "epoch": 0.5831021437578815, "grad_norm": 0.345703125, "learning_rate": 2.034830052872235e-06, "loss": 2.7867, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 289 }, { "epoch": 0.5851197982345523, "grad_norm": 0.34765625, "learning_rate": 2.018421786865387e-06, "loss": 2.9489, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 290 }, { "epoch": 0.5871374527112232, "grad_norm": 0.34765625, "learning_rate": 2.0020350372404104e-06, "loss": 2.8389, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 291 }, { "epoch": 0.5891551071878941, "grad_norm": 0.33203125, "learning_rate": 1.985670536139151e-06, "loss": 2.9196, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 292 }, { "epoch": 0.5911727616645649, "grad_norm": 0.32421875, "learning_rate": 1.9693290147094184e-06, "loss": 2.9489, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 293 }, { "epoch": 0.5931904161412358, "grad_norm": 0.392578125, "learning_rate": 1.9530112030723123e-06, "loss": 2.958, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 294 }, { "epoch": 0.5952080706179067, "grad_norm": 0.34765625, "learning_rate": 1.9367178302896087e-06, "loss": 2.7799, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 295 }, { "epoch": 0.5972257250945776, "grad_norm": 0.376953125, "learning_rate": 1.920449624331179e-06, "loss": 2.9718, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 296 }, { "epoch": 0.5992433795712484, "grad_norm": 0.337890625, "learning_rate": 1.9042073120424727e-06, "loss": 2.9414, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 297 }, { "epoch": 0.6012610340479193, "grad_norm": 0.33203125, "learning_rate": 1.887991619112035e-06, "loss": 2.9257, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 298 }, { "epoch": 0.6032786885245902, "grad_norm": 0.349609375, "learning_rate": 1.8718032700390887e-06, "loss": 2.9388, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 299 }, { "epoch": 0.605296343001261, "grad_norm": 0.353515625, "learning_rate": 1.8556429881011655e-06, "loss": 2.8914, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 300 }, { "epoch": 0.6073139974779319, "grad_norm": 0.37109375, "learning_rate": 1.8395114953217853e-06, "loss": 2.8707, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 301 }, { "epoch": 0.6093316519546028, "grad_norm": 0.310546875, "learning_rate": 1.8234095124382031e-06, "loss": 2.8361, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 302 }, { "epoch": 0.6113493064312736, "grad_norm": 0.357421875, "learning_rate": 1.8073377588692026e-06, "loss": 2.8958, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 303 }, { "epoch": 0.6133669609079445, "grad_norm": 0.34375, "learning_rate": 1.791296952682956e-06, "loss": 2.9175, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 304 }, { "epoch": 0.6153846153846154, "grad_norm": 0.333984375, "learning_rate": 1.775287810564939e-06, "loss": 2.8774, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 305 }, { "epoch": 0.6174022698612862, "grad_norm": 0.326171875, "learning_rate": 1.7593110477859155e-06, "loss": 2.9021, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 306 }, { "epoch": 0.6194199243379571, "grad_norm": 0.349609375, "learning_rate": 1.7433673781699717e-06, "loss": 2.919, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 307 }, { "epoch": 0.621437578814628, "grad_norm": 0.357421875, "learning_rate": 1.7274575140626318e-06, "loss": 2.9148, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 308 }, { "epoch": 0.6234552332912988, "grad_norm": 0.35546875, "learning_rate": 1.7115821662990246e-06, "loss": 2.8786, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 309 }, { "epoch": 0.6254728877679697, "grad_norm": 0.361328125, "learning_rate": 1.6957420441721285e-06, "loss": 2.9553, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 310 }, { "epoch": 0.6274905422446406, "grad_norm": 0.328125, "learning_rate": 1.6799378554010773e-06, "loss": 2.8745, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 311 }, { "epoch": 0.6295081967213115, "grad_norm": 0.400390625, "learning_rate": 1.6641703060995456e-06, "loss": 2.8396, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 312 }, { "epoch": 0.6315258511979823, "grad_norm": 0.33984375, "learning_rate": 1.6484401007441938e-06, "loss": 2.8615, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 313 }, { "epoch": 0.6335435056746532, "grad_norm": 0.349609375, "learning_rate": 1.6327479421431984e-06, "loss": 2.8957, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 314 }, { "epoch": 0.6355611601513241, "grad_norm": 0.35546875, "learning_rate": 1.6170945314048476e-06, "loss": 2.9217, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 315 }, { "epoch": 0.6375788146279949, "grad_norm": 0.34765625, "learning_rate": 1.6014805679062185e-06, "loss": 2.8061, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 316 }, { "epoch": 0.6395964691046658, "grad_norm": 0.31640625, "learning_rate": 1.5859067492619284e-06, "loss": 2.7589, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 317 }, { "epoch": 0.6416141235813367, "grad_norm": 0.306640625, "learning_rate": 1.5703737712929674e-06, "loss": 2.8038, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 318 }, { "epoch": 0.6436317780580075, "grad_norm": 0.345703125, "learning_rate": 1.5548823279956104e-06, "loss": 2.9054, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 319 }, { "epoch": 0.6456494325346784, "grad_norm": 0.37109375, "learning_rate": 1.5394331115104074e-06, "loss": 3.0088, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 320 }, { "epoch": 0.6476670870113493, "grad_norm": 0.33984375, "learning_rate": 1.5240268120912631e-06, "loss": 2.9129, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 321 }, { "epoch": 0.6496847414880201, "grad_norm": 0.34375, "learning_rate": 1.5086641180745934e-06, "loss": 2.9135, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 322 }, { "epoch": 0.651702395964691, "grad_norm": 0.33203125, "learning_rate": 1.493345715848574e-06, "loss": 2.9974, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 323 }, { "epoch": 0.6537200504413619, "grad_norm": 0.3359375, "learning_rate": 1.478072289822471e-06, "loss": 2.8818, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 324 }, { "epoch": 0.6557377049180327, "grad_norm": 0.34375, "learning_rate": 1.462844522396066e-06, "loss": 2.9075, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 325 }, { "epoch": 0.6577553593947036, "grad_norm": 0.33984375, "learning_rate": 1.4476630939291631e-06, "loss": 2.8859, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 326 }, { "epoch": 0.6597730138713745, "grad_norm": 0.32421875, "learning_rate": 1.4325286827111931e-06, "loss": 2.8487, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 327 }, { "epoch": 0.6617906683480455, "grad_norm": 0.328125, "learning_rate": 1.417441964930909e-06, "loss": 2.9071, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 328 }, { "epoch": 0.6638083228247162, "grad_norm": 0.328125, "learning_rate": 1.4024036146461734e-06, "loss": 2.8911, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 329 }, { "epoch": 0.6658259773013872, "grad_norm": 0.3671875, "learning_rate": 1.3874143037538417e-06, "loss": 2.8569, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 330 }, { "epoch": 0.6678436317780581, "grad_norm": 0.3359375, "learning_rate": 1.372474701959745e-06, "loss": 2.9294, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 331 }, { "epoch": 0.6698612862547288, "grad_norm": 0.341796875, "learning_rate": 1.357585476748766e-06, "loss": 2.9338, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 332 }, { "epoch": 0.6718789407313998, "grad_norm": 0.318359375, "learning_rate": 1.342747293355019e-06, "loss": 2.7899, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 333 }, { "epoch": 0.6738965952080707, "grad_norm": 0.34765625, "learning_rate": 1.3279608147321223e-06, "loss": 2.8819, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 334 }, { "epoch": 0.6759142496847415, "grad_norm": 0.369140625, "learning_rate": 1.3132267015235862e-06, "loss": 2.9289, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 335 }, { "epoch": 0.6779319041614124, "grad_norm": 0.349609375, "learning_rate": 1.2985456120332907e-06, "loss": 2.8636, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 336 }, { "epoch": 0.6799495586380833, "grad_norm": 0.30078125, "learning_rate": 1.2839182021960753e-06, "loss": 2.9377, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 337 }, { "epoch": 0.6819672131147541, "grad_norm": 0.36328125, "learning_rate": 1.2693451255484314e-06, "loss": 2.884, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 338 }, { "epoch": 0.683984867591425, "grad_norm": 0.333984375, "learning_rate": 1.2548270331993034e-06, "loss": 2.87, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 339 }, { "epoch": 0.6860025220680959, "grad_norm": 0.306640625, "learning_rate": 1.2403645738009998e-06, "loss": 2.9084, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 340 }, { "epoch": 0.6880201765447667, "grad_norm": 0.333984375, "learning_rate": 1.2259583935202063e-06, "loss": 2.9395, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 341 }, { "epoch": 0.6900378310214376, "grad_norm": 0.34375, "learning_rate": 1.2116091360091262e-06, "loss": 2.9742, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 342 }, { "epoch": 0.6920554854981085, "grad_norm": 0.3515625, "learning_rate": 1.1973174423767098e-06, "loss": 2.9324, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 343 }, { "epoch": 0.6940731399747794, "grad_norm": 0.326171875, "learning_rate": 1.1830839511600211e-06, "loss": 2.8262, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 344 }, { "epoch": 0.6960907944514502, "grad_norm": 0.365234375, "learning_rate": 1.168909298295704e-06, "loss": 2.8697, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 345 }, { "epoch": 0.6981084489281211, "grad_norm": 0.419921875, "learning_rate": 1.1547941170915686e-06, "loss": 2.922, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 346 }, { "epoch": 0.700126103404792, "grad_norm": 0.376953125, "learning_rate": 1.140739038198298e-06, "loss": 2.8969, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 347 }, { "epoch": 0.7021437578814628, "grad_norm": 0.333984375, "learning_rate": 1.1267446895812704e-06, "loss": 2.8233, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 348 }, { "epoch": 0.7041614123581337, "grad_norm": 0.330078125, "learning_rate": 1.1128116964925023e-06, "loss": 2.9209, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 349 }, { "epoch": 0.7061790668348046, "grad_norm": 0.306640625, "learning_rate": 1.098940681442713e-06, "loss": 2.855, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 350 }, { "epoch": 0.7081967213114754, "grad_norm": 0.341796875, "learning_rate": 1.0851322641735119e-06, "loss": 2.8907, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 351 }, { "epoch": 0.7102143757881463, "grad_norm": 0.333984375, "learning_rate": 1.0713870616297093e-06, "loss": 2.7215, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 352 }, { "epoch": 0.7122320302648172, "grad_norm": 0.341796875, "learning_rate": 1.0577056879317486e-06, "loss": 2.8413, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 353 }, { "epoch": 0.714249684741488, "grad_norm": 0.3515625, "learning_rate": 1.0440887543482747e-06, "loss": 2.8578, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 354 }, { "epoch": 0.7162673392181589, "grad_norm": 0.3359375, "learning_rate": 1.0305368692688175e-06, "loss": 2.9202, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 355 }, { "epoch": 0.7182849936948298, "grad_norm": 0.412109375, "learning_rate": 1.0170506381766121e-06, "loss": 2.962, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 356 }, { "epoch": 0.7203026481715006, "grad_norm": 0.330078125, "learning_rate": 1.0036306636215462e-06, "loss": 2.9256, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 357 }, { "epoch": 0.7223203026481715, "grad_norm": 0.302734375, "learning_rate": 9.902775451932387e-07, "loss": 2.9143, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 358 }, { "epoch": 0.7243379571248424, "grad_norm": 0.322265625, "learning_rate": 9.769918794942511e-07, "loss": 2.8691, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 359 }, { "epoch": 0.7263556116015133, "grad_norm": 0.298828125, "learning_rate": 9.637742601134287e-07, "loss": 2.8137, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 360 }, { "epoch": 0.7283732660781841, "grad_norm": 0.337890625, "learning_rate": 9.506252775993882e-07, "loss": 2.8396, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 361 }, { "epoch": 0.730390920554855, "grad_norm": 0.3203125, "learning_rate": 9.375455194341215e-07, "loss": 2.8566, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 362 }, { "epoch": 0.7324085750315259, "grad_norm": 0.318359375, "learning_rate": 9.24535570006756e-07, "loss": 2.8701, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 363 }, { "epoch": 0.7344262295081967, "grad_norm": 0.330078125, "learning_rate": 9.115960105874411e-07, "loss": 2.8288, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 364 }, { "epoch": 0.7364438839848676, "grad_norm": 0.287109375, "learning_rate": 8.987274193013792e-07, "loss": 2.8787, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 365 }, { "epoch": 0.7384615384615385, "grad_norm": 0.34765625, "learning_rate": 8.85930371102994e-07, "loss": 2.7868, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 366 }, { "epoch": 0.7404791929382093, "grad_norm": 0.32421875, "learning_rate": 8.732054377502442e-07, "loss": 2.9876, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 367 }, { "epoch": 0.7424968474148802, "grad_norm": 0.33984375, "learning_rate": 8.605531877790762e-07, "loss": 2.9011, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 368 }, { "epoch": 0.7445145018915511, "grad_norm": 0.36328125, "learning_rate": 8.479741864780236e-07, "loss": 2.8392, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 369 }, { "epoch": 0.7465321563682219, "grad_norm": 0.310546875, "learning_rate": 8.354689958629514e-07, "loss": 2.8629, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 370 }, { "epoch": 0.7485498108448928, "grad_norm": 0.3046875, "learning_rate": 8.23038174651942e-07, "loss": 2.9016, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 371 }, { "epoch": 0.7505674653215637, "grad_norm": 0.328125, "learning_rate": 8.106822782403376e-07, "loss": 2.9677, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 372 }, { "epoch": 0.7525851197982345, "grad_norm": 0.373046875, "learning_rate": 7.984018586759227e-07, "loss": 2.8286, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 373 }, { "epoch": 0.7546027742749054, "grad_norm": 0.326171875, "learning_rate": 7.861974646342596e-07, "loss": 2.826, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 374 }, { "epoch": 0.7566204287515763, "grad_norm": 0.333984375, "learning_rate": 7.740696413941745e-07, "loss": 2.8649, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 375 }, { "epoch": 0.7586380832282472, "grad_norm": 0.33984375, "learning_rate": 7.620189308133943e-07, "loss": 2.8342, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 376 }, { "epoch": 0.760655737704918, "grad_norm": 0.330078125, "learning_rate": 7.500458713043385e-07, "loss": 2.8452, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 377 }, { "epoch": 0.7626733921815889, "grad_norm": 0.345703125, "learning_rate": 7.381509978100626e-07, "loss": 2.7791, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 378 }, { "epoch": 0.7646910466582598, "grad_norm": 0.306640625, "learning_rate": 7.263348417803545e-07, "loss": 2.8265, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 379 }, { "epoch": 0.7667087011349306, "grad_norm": 0.314453125, "learning_rate": 7.145979311479986e-07, "loss": 2.9179, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 380 }, { "epoch": 0.7687263556116015, "grad_norm": 0.3046875, "learning_rate": 7.029407903051771e-07, "loss": 2.9605, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 381 }, { "epoch": 0.7707440100882724, "grad_norm": 0.306640625, "learning_rate": 6.91363940080049e-07, "loss": 2.8854, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 382 }, { "epoch": 0.7727616645649432, "grad_norm": 0.32421875, "learning_rate": 6.798678977134768e-07, "loss": 2.8363, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 383 }, { "epoch": 0.7747793190416141, "grad_norm": 0.318359375, "learning_rate": 6.684531768359173e-07, "loss": 2.9301, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 384 }, { "epoch": 0.776796973518285, "grad_norm": 0.345703125, "learning_rate": 6.57120287444473e-07, "loss": 2.8542, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 385 }, { "epoch": 0.7788146279949558, "grad_norm": 0.345703125, "learning_rate": 6.458697358801061e-07, "loss": 2.8322, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 386 }, { "epoch": 0.7808322824716267, "grad_norm": 0.314453125, "learning_rate": 6.34702024805016e-07, "loss": 2.8331, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 387 }, { "epoch": 0.7828499369482976, "grad_norm": 0.314453125, "learning_rate": 6.236176531801813e-07, "loss": 2.8607, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 388 }, { "epoch": 0.7848675914249684, "grad_norm": 0.345703125, "learning_rate": 6.126171162430636e-07, "loss": 2.9638, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 389 }, { "epoch": 0.7868852459016393, "grad_norm": 0.33984375, "learning_rate": 6.017009054854858e-07, "loss": 2.9402, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 390 }, { "epoch": 0.7889029003783102, "grad_norm": 0.3125, "learning_rate": 5.908695086316701e-07, "loss": 2.8842, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 391 }, { "epoch": 0.7909205548549811, "grad_norm": 0.427734375, "learning_rate": 5.801234096164468e-07, "loss": 2.87, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 392 }, { "epoch": 0.7929382093316519, "grad_norm": 0.34765625, "learning_rate": 5.694630885636332e-07, "loss": 2.8928, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 393 }, { "epoch": 0.7949558638083228, "grad_norm": 0.322265625, "learning_rate": 5.588890217645821e-07, "loss": 2.7904, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 394 }, { "epoch": 0.7969735182849937, "grad_norm": 0.341796875, "learning_rate": 5.484016816569015e-07, "loss": 2.9444, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 395 }, { "epoch": 0.7989911727616645, "grad_norm": 0.3203125, "learning_rate": 5.380015368033476e-07, "loss": 2.9772, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 396 }, { "epoch": 0.8010088272383354, "grad_norm": 0.34375, "learning_rate": 5.276890518708885e-07, "loss": 2.7561, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 397 }, { "epoch": 0.8030264817150063, "grad_norm": 0.32421875, "learning_rate": 5.174646876099421e-07, "loss": 2.8828, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 398 }, { "epoch": 0.8050441361916771, "grad_norm": 0.302734375, "learning_rate": 5.073289008337967e-07, "loss": 2.8515, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 399 }, { "epoch": 0.807061790668348, "grad_norm": 0.32421875, "learning_rate": 4.972821443981921e-07, "loss": 2.8422, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 400 }, { "epoch": 0.809079445145019, "grad_norm": 0.31640625, "learning_rate": 4.873248671810929e-07, "loss": 2.9368, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 401 }, { "epoch": 0.8110970996216897, "grad_norm": 0.373046875, "learning_rate": 4.774575140626317e-07, "loss": 2.955, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 402 }, { "epoch": 0.8131147540983606, "grad_norm": 0.302734375, "learning_rate": 4.6768052590523053e-07, "loss": 2.9267, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 403 }, { "epoch": 0.8151324085750316, "grad_norm": 0.328125, "learning_rate": 4.579943395339062e-07, "loss": 2.8792, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 404 }, { "epoch": 0.8171500630517023, "grad_norm": 0.328125, "learning_rate": 4.4839938771675115e-07, "loss": 2.9025, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 405 }, { "epoch": 0.8191677175283733, "grad_norm": 0.328125, "learning_rate": 4.388960991455998e-07, "loss": 2.9083, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 406 }, { "epoch": 0.8211853720050442, "grad_norm": 0.326171875, "learning_rate": 4.294848984168723e-07, "loss": 2.9084, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 407 }, { "epoch": 0.8232030264817151, "grad_norm": 0.310546875, "learning_rate": 4.20166206012608e-07, "loss": 2.8086, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 408 }, { "epoch": 0.8252206809583859, "grad_norm": 0.3359375, "learning_rate": 4.109404382816756e-07, "loss": 2.8263, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 409 }, { "epoch": 0.8272383354350568, "grad_norm": 0.326171875, "learning_rate": 4.0180800742117246e-07, "loss": 2.9327, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 410 }, { "epoch": 0.8292559899117277, "grad_norm": 0.333984375, "learning_rate": 3.927693214580075e-07, "loss": 2.8724, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 411 }, { "epoch": 0.8312736443883985, "grad_norm": 0.3359375, "learning_rate": 3.8382478423067163e-07, "loss": 2.8089, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 412 }, { "epoch": 0.8332912988650694, "grad_norm": 0.35546875, "learning_rate": 3.7497479537119435e-07, "loss": 2.9261, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 413 }, { "epoch": 0.8353089533417403, "grad_norm": 0.34765625, "learning_rate": 3.662197502872886e-07, "loss": 2.9533, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 414 }, { "epoch": 0.8373266078184111, "grad_norm": 0.3359375, "learning_rate": 3.575600401446841e-07, "loss": 2.9823, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 415 }, { "epoch": 0.839344262295082, "grad_norm": 0.349609375, "learning_rate": 3.489960518496521e-07, "loss": 2.9184, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 416 }, { "epoch": 0.8413619167717529, "grad_norm": 0.357421875, "learning_rate": 3.405281680317149e-07, "loss": 2.8453, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 417 }, { "epoch": 0.8433795712484237, "grad_norm": 0.314453125, "learning_rate": 3.3215676702655687e-07, "loss": 2.8601, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 418 }, { "epoch": 0.8453972257250946, "grad_norm": 0.32421875, "learning_rate": 3.2388222285911373e-07, "loss": 2.9082, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 419 }, { "epoch": 0.8474148802017655, "grad_norm": 0.322265625, "learning_rate": 3.1570490522686624e-07, "loss": 2.8188, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 420 }, { "epoch": 0.8494325346784363, "grad_norm": 0.3515625, "learning_rate": 3.076251794833213e-07, "loss": 2.8224, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 421 }, { "epoch": 0.8514501891551072, "grad_norm": 0.32421875, "learning_rate": 2.9964340662168774e-07, "loss": 2.8893, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 422 }, { "epoch": 0.8534678436317781, "grad_norm": 0.3203125, "learning_rate": 2.9175994325874783e-07, "loss": 2.8318, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 423 }, { "epoch": 0.855485498108449, "grad_norm": 0.31640625, "learning_rate": 2.8397514161892484e-07, "loss": 2.8396, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 424 }, { "epoch": 0.8575031525851198, "grad_norm": 0.31640625, "learning_rate": 2.7628934951854506e-07, "loss": 2.7336, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 425 }, { "epoch": 0.8595208070617907, "grad_norm": 0.326171875, "learning_rate": 2.6870291035029724e-07, "loss": 2.8688, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 426 }, { "epoch": 0.8615384615384616, "grad_norm": 0.30859375, "learning_rate": 2.612161630678922e-07, "loss": 2.7664, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 427 }, { "epoch": 0.8635561160151324, "grad_norm": 0.3515625, "learning_rate": 2.5382944217091725e-07, "loss": 2.8153, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 428 }, { "epoch": 0.8655737704918033, "grad_norm": 0.33984375, "learning_rate": 2.465430776898911e-07, "loss": 2.818, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 429 }, { "epoch": 0.8675914249684742, "grad_norm": 0.29296875, "learning_rate": 2.3935739517151916e-07, "loss": 2.9183, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 430 }, { "epoch": 0.869609079445145, "grad_norm": 0.359375, "learning_rate": 2.3227271566414827e-07, "loss": 2.874, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 431 }, { "epoch": 0.8716267339218159, "grad_norm": 0.3359375, "learning_rate": 2.2528935570342165e-07, "loss": 2.9182, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 432 }, { "epoch": 0.8736443883984868, "grad_norm": 0.341796875, "learning_rate": 2.1840762729813808e-07, "loss": 2.8589, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 433 }, { "epoch": 0.8756620428751576, "grad_norm": 0.31640625, "learning_rate": 2.116278379163106e-07, "loss": 2.938, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 434 }, { "epoch": 0.8776796973518285, "grad_norm": 0.353515625, "learning_rate": 2.0495029047142983e-07, "loss": 2.9281, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 435 }, { "epoch": 0.8796973518284994, "grad_norm": 0.330078125, "learning_rate": 1.9837528330892781e-07, "loss": 2.9188, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 436 }, { "epoch": 0.8817150063051702, "grad_norm": 0.32421875, "learning_rate": 1.9190311019285368e-07, "loss": 2.9296, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 437 }, { "epoch": 0.8837326607818411, "grad_norm": 0.31640625, "learning_rate": 1.855340602927419e-07, "loss": 2.8826, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 438 }, { "epoch": 0.885750315258512, "grad_norm": 0.359375, "learning_rate": 1.7926841817069717e-07, "loss": 2.8862, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 439 }, { "epoch": 0.8877679697351829, "grad_norm": 0.37890625, "learning_rate": 1.7310646376867885e-07, "loss": 2.9719, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 440 }, { "epoch": 0.8897856242118537, "grad_norm": 0.326171875, "learning_rate": 1.6704847239599364e-07, "loss": 2.8783, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 441 }, { "epoch": 0.8918032786885246, "grad_norm": 0.36328125, "learning_rate": 1.6109471471699557e-07, "loss": 2.9328, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 442 }, { "epoch": 0.8938209331651955, "grad_norm": 0.314453125, "learning_rate": 1.5524545673899106e-07, "loss": 2.8429, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 443 }, { "epoch": 0.8958385876418663, "grad_norm": 0.3515625, "learning_rate": 1.4950095980035772e-07, "loss": 2.9344, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 444 }, { "epoch": 0.8978562421185372, "grad_norm": 0.330078125, "learning_rate": 1.438614805588634e-07, "loss": 2.8765, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 445 }, { "epoch": 0.8998738965952081, "grad_norm": 0.37109375, "learning_rate": 1.3832727098020333e-07, "loss": 2.7767, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 446 }, { "epoch": 0.9018915510718789, "grad_norm": 0.330078125, "learning_rate": 1.3289857832673947e-07, "loss": 2.9776, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 447 }, { "epoch": 0.9039092055485498, "grad_norm": 0.404296875, "learning_rate": 1.2757564514645492e-07, "loss": 2.8624, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 448 }, { "epoch": 0.9059268600252207, "grad_norm": 0.330078125, "learning_rate": 1.223587092621162e-07, "loss": 2.993, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 449 }, { "epoch": 0.9079445145018915, "grad_norm": 0.31640625, "learning_rate": 1.1724800376064799e-07, "loss": 2.8225, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 450 }, { "epoch": 0.9099621689785624, "grad_norm": 0.353515625, "learning_rate": 1.1224375698271894e-07, "loss": 2.928, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 451 }, { "epoch": 0.9119798234552333, "grad_norm": 0.32421875, "learning_rate": 1.0734619251253963e-07, "loss": 2.8997, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 452 }, { "epoch": 0.9139974779319041, "grad_norm": 0.337890625, "learning_rate": 1.0255552916787343e-07, "loss": 2.9446, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 453 }, { "epoch": 0.916015132408575, "grad_norm": 0.326171875, "learning_rate": 9.78719809902598e-08, "loss": 2.9681, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 454 }, { "epoch": 0.9180327868852459, "grad_norm": 0.33984375, "learning_rate": 9.329575723544925e-08, "loss": 2.9225, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 455 }, { "epoch": 0.9200504413619168, "grad_norm": 0.314453125, "learning_rate": 8.882706236405886e-08, "loss": 2.885, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 456 }, { "epoch": 0.9220680958385876, "grad_norm": 0.296875, "learning_rate": 8.446609603243117e-08, "loss": 2.8799, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 457 }, { "epoch": 0.9240857503152585, "grad_norm": 0.328125, "learning_rate": 8.021305308371891e-08, "loss": 2.8873, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 458 }, { "epoch": 0.9261034047919294, "grad_norm": 0.333984375, "learning_rate": 7.606812353917636e-08, "loss": 2.8536, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 459 }, { "epoch": 0.9281210592686002, "grad_norm": 0.32421875, "learning_rate": 7.203149258967035e-08, "loss": 2.8754, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 460 }, { "epoch": 0.9301387137452711, "grad_norm": 0.341796875, "learning_rate": 6.810334058740736e-08, "loss": 2.804, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 461 }, { "epoch": 0.932156368221942, "grad_norm": 0.357421875, "learning_rate": 6.428384303787282e-08, "loss": 2.8765, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 462 }, { "epoch": 0.9341740226986128, "grad_norm": 0.3359375, "learning_rate": 6.05731705919932e-08, "loss": 2.8596, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 463 }, { "epoch": 0.9361916771752837, "grad_norm": 0.326171875, "learning_rate": 5.697148903850869e-08, "loss": 2.8129, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 464 }, { "epoch": 0.9382093316519546, "grad_norm": 0.34765625, "learning_rate": 5.347895929656649e-08, "loss": 2.8172, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 465 }, { "epoch": 0.9402269861286254, "grad_norm": 0.31640625, "learning_rate": 5.009573740853313e-08, "loss": 2.8026, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 466 }, { "epoch": 0.9422446406052963, "grad_norm": 0.376953125, "learning_rate": 4.682197453301951e-08, "loss": 2.8827, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 467 }, { "epoch": 0.9442622950819672, "grad_norm": 0.330078125, "learning_rate": 4.365781693813048e-08, "loss": 2.8079, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 468 }, { "epoch": 0.946279949558638, "grad_norm": 0.4609375, "learning_rate": 4.060340599492646e-08, "loss": 2.8384, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 469 }, { "epoch": 0.9482976040353089, "grad_norm": 0.330078125, "learning_rate": 3.765887817111069e-08, "loss": 2.8274, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 470 }, { "epoch": 0.9503152585119798, "grad_norm": 0.328125, "learning_rate": 3.4824365024928585e-08, "loss": 3.0094, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 471 }, { "epoch": 0.9523329129886507, "grad_norm": 0.326171875, "learning_rate": 3.209999319929269e-08, "loss": 2.8901, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 472 }, { "epoch": 0.9543505674653215, "grad_norm": 0.345703125, "learning_rate": 2.9485884416122213e-08, "loss": 2.9069, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 473 }, { "epoch": 0.9563682219419924, "grad_norm": 0.34375, "learning_rate": 2.698215547090599e-08, "loss": 2.8147, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 474 }, { "epoch": 0.9583858764186634, "grad_norm": 0.33984375, "learning_rate": 2.458891822748444e-08, "loss": 2.8549, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 475 }, { "epoch": 0.9604035308953341, "grad_norm": 0.349609375, "learning_rate": 2.230627961304993e-08, "loss": 2.7994, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 476 }, { "epoch": 0.962421185372005, "grad_norm": 0.310546875, "learning_rate": 2.0134341613370633e-08, "loss": 2.9219, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 477 }, { "epoch": 0.964438839848676, "grad_norm": 0.330078125, "learning_rate": 1.8073201268234142e-08, "loss": 2.8302, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 478 }, { "epoch": 0.9664564943253467, "grad_norm": 0.31640625, "learning_rate": 1.612295066711095e-08, "loss": 2.8299, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 479 }, { "epoch": 0.9684741488020177, "grad_norm": 0.37890625, "learning_rate": 1.4283676945041348e-08, "loss": 2.9729, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 480 }, { "epoch": 0.9704918032786886, "grad_norm": 0.314453125, "learning_rate": 1.255546227873966e-08, "loss": 2.8149, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 481 }, { "epoch": 0.9725094577553594, "grad_norm": 0.318359375, "learning_rate": 1.0938383882926618e-08, "loss": 2.848, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 482 }, { "epoch": 0.9745271122320303, "grad_norm": 0.310546875, "learning_rate": 9.432514006875725e-09, "loss": 2.8566, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 483 }, { "epoch": 0.9765447667087012, "grad_norm": 0.349609375, "learning_rate": 8.037919931187243e-09, "loss": 2.8315, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 484 }, { "epoch": 0.978562421185372, "grad_norm": 0.34765625, "learning_rate": 6.754663964781971e-09, "loss": 2.9492, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 485 }, { "epoch": 0.9805800756620429, "grad_norm": 0.326171875, "learning_rate": 5.582803442117091e-09, "loss": 2.7673, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 486 }, { "epoch": 0.9825977301387138, "grad_norm": 0.310546875, "learning_rate": 4.522390720624603e-09, "loss": 2.8186, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 487 }, { "epoch": 0.9846153846153847, "grad_norm": 0.31640625, "learning_rate": 3.573473178371534e-09, "loss": 2.8907, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 488 }, { "epoch": 0.9866330390920555, "grad_norm": 0.349609375, "learning_rate": 2.736093211944679e-09, "loss": 2.8521, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 489 }, { "epoch": 0.9886506935687264, "grad_norm": 0.3671875, "learning_rate": 2.0102882345540696e-09, "loss": 2.9154, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 490 }, { "epoch": 0.9906683480453973, "grad_norm": 0.333984375, "learning_rate": 1.3960906743634706e-09, "loss": 2.9166, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 491 }, { "epoch": 0.9926860025220681, "grad_norm": 0.349609375, "learning_rate": 8.935279730407087e-10, "loss": 2.8178, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 492 }, { "epoch": 0.994703656998739, "grad_norm": 0.3203125, "learning_rate": 5.026225845308763e-10, "loss": 2.8598, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 493 }, { "epoch": 0.9967213114754099, "grad_norm": 0.310546875, "learning_rate": 2.2339197405490952e-10, "loss": 2.9274, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 494 }, { "epoch": 0.9987389659520807, "grad_norm": 0.34765625, "learning_rate": 5.5848617327436404e-11, "loss": 3.005, "memory/device_mem_reserved(gib)": 50.82, "memory/max_mem_active(gib)": 45.14, "memory/max_mem_allocated(gib)": 45.14, "step": 495 } ], "logging_steps": 1, "max_steps": 495, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 248, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.453534295012147e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }