{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 56.492855072021484, "learning_rate": 0.0, "loss": 3.4465, "mean_token_accuracy": 0.43144476413726807, "step": 1 }, { "epoch": 0.008438818565400843, "grad_norm": 67.59725189208984, "learning_rate": 4.1666666666666667e-07, "loss": 3.5603, "mean_token_accuracy": 0.4281432032585144, "step": 2 }, { "epoch": 0.012658227848101266, "grad_norm": 57.31755065917969, "learning_rate": 8.333333333333333e-07, "loss": 3.5257, "mean_token_accuracy": 0.4222416579723358, "step": 3 }, { "epoch": 0.016877637130801686, "grad_norm": 60.031646728515625, "learning_rate": 1.25e-06, "loss": 3.5073, "mean_token_accuracy": 0.4332667887210846, "step": 4 }, { "epoch": 0.02109704641350211, "grad_norm": 52.10445022583008, "learning_rate": 1.6666666666666667e-06, "loss": 3.3834, "mean_token_accuracy": 0.44102635979652405, "step": 5 }, { "epoch": 0.02531645569620253, "grad_norm": 49.48136520385742, "learning_rate": 2.0833333333333334e-06, "loss": 3.2116, "mean_token_accuracy": 0.4614466726779938, "step": 6 }, { "epoch": 0.029535864978902954, "grad_norm": 57.7535400390625, "learning_rate": 2.5e-06, "loss": 3.1654, "mean_token_accuracy": 0.4769577383995056, "step": 7 }, { "epoch": 0.03375527426160337, "grad_norm": 55.622840881347656, "learning_rate": 2.916666666666667e-06, "loss": 2.7736, "mean_token_accuracy": 0.5238496661186218, "step": 8 }, { "epoch": 0.0379746835443038, "grad_norm": 53.84645080566406, "learning_rate": 3.3333333333333333e-06, "loss": 2.7429, "mean_token_accuracy": 0.5103054046630859, "step": 9 }, { "epoch": 0.04219409282700422, "grad_norm": 49.74085998535156, "learning_rate": 3.7500000000000005e-06, "loss": 2.7235, "mean_token_accuracy": 0.5065439343452454, "step": 10 }, { "epoch": 0.046413502109704644, "grad_norm": 55.516387939453125, "learning_rate": 4.166666666666667e-06, "loss": 2.1734, "mean_token_accuracy": 0.5752225518226624, "step": 11 }, { "epoch": 0.05063291139240506, "grad_norm": 51.78127670288086, "learning_rate": 4.583333333333333e-06, "loss": 1.9645, "mean_token_accuracy": 0.5979234576225281, "step": 12 }, { "epoch": 0.05485232067510549, "grad_norm": 41.615440368652344, "learning_rate": 5e-06, "loss": 1.9885, "mean_token_accuracy": 0.5931289196014404, "step": 13 }, { "epoch": 0.05907172995780591, "grad_norm": 18.521259307861328, "learning_rate": 5.416666666666667e-06, "loss": 1.6583, "mean_token_accuracy": 0.6572901606559753, "step": 14 }, { "epoch": 0.06329113924050633, "grad_norm": 16.77658462524414, "learning_rate": 5.833333333333334e-06, "loss": 1.2774, "mean_token_accuracy": 0.7345430254936218, "step": 15 }, { "epoch": 0.06751054852320675, "grad_norm": 16.902246475219727, "learning_rate": 6.25e-06, "loss": 1.4676, "mean_token_accuracy": 0.6940106153488159, "step": 16 }, { "epoch": 0.07172995780590717, "grad_norm": 4.634657382965088, "learning_rate": 6.666666666666667e-06, "loss": 1.331, "mean_token_accuracy": 0.7168394327163696, "step": 17 }, { "epoch": 0.0759493670886076, "grad_norm": 2.60319447517395, "learning_rate": 7.083333333333335e-06, "loss": 1.2297, "mean_token_accuracy": 0.738610565662384, "step": 18 }, { "epoch": 0.08016877637130802, "grad_norm": 2.7165443897247314, "learning_rate": 7.500000000000001e-06, "loss": 1.0861, "mean_token_accuracy": 0.7637795209884644, "step": 19 }, { "epoch": 0.08438818565400844, "grad_norm": 3.0834693908691406, "learning_rate": 7.916666666666667e-06, "loss": 1.2613, "mean_token_accuracy": 0.7090403437614441, "step": 20 }, { "epoch": 0.08860759493670886, "grad_norm": 2.5113463401794434, "learning_rate": 8.333333333333334e-06, "loss": 1.1188, "mean_token_accuracy": 0.7468186020851135, "step": 21 }, { "epoch": 0.09282700421940929, "grad_norm": 2.150643825531006, "learning_rate": 8.750000000000001e-06, "loss": 1.2615, "mean_token_accuracy": 0.7228550910949707, "step": 22 }, { "epoch": 0.0970464135021097, "grad_norm": 2.469194173812866, "learning_rate": 9.166666666666666e-06, "loss": 1.198, "mean_token_accuracy": 0.7393919825553894, "step": 23 }, { "epoch": 0.10126582278481013, "grad_norm": 2.4474411010742188, "learning_rate": 9.583333333333335e-06, "loss": 1.096, "mean_token_accuracy": 0.7572800517082214, "step": 24 }, { "epoch": 0.10548523206751055, "grad_norm": 2.088087320327759, "learning_rate": 1e-05, "loss": 0.985, "mean_token_accuracy": 0.7962148785591125, "step": 25 }, { "epoch": 0.10970464135021098, "grad_norm": 1.9643747806549072, "learning_rate": 9.999510542279196e-06, "loss": 1.209, "mean_token_accuracy": 0.729297935962677, "step": 26 }, { "epoch": 0.11392405063291139, "grad_norm": 2.398698568344116, "learning_rate": 9.998042275591827e-06, "loss": 1.1201, "mean_token_accuracy": 0.755721390247345, "step": 27 }, { "epoch": 0.11814345991561181, "grad_norm": 2.144404888153076, "learning_rate": 9.995595519339882e-06, "loss": 1.2033, "mean_token_accuracy": 0.737983763217926, "step": 28 }, { "epoch": 0.12236286919831224, "grad_norm": 1.771583914756775, "learning_rate": 9.992170805782799e-06, "loss": 1.2612, "mean_token_accuracy": 0.7275724411010742, "step": 29 }, { "epoch": 0.12658227848101267, "grad_norm": 1.857326626777649, "learning_rate": 9.987768879921685e-06, "loss": 1.089, "mean_token_accuracy": 0.7615371942520142, "step": 30 }, { "epoch": 0.1308016877637131, "grad_norm": 1.5715709924697876, "learning_rate": 9.982390699337253e-06, "loss": 1.1158, "mean_token_accuracy": 0.748799741268158, "step": 31 }, { "epoch": 0.1350210970464135, "grad_norm": 1.6975351572036743, "learning_rate": 9.976037433981505e-06, "loss": 1.2293, "mean_token_accuracy": 0.7248790264129639, "step": 32 }, { "epoch": 0.13924050632911392, "grad_norm": 1.6748303174972534, "learning_rate": 9.968710465923233e-06, "loss": 1.0872, "mean_token_accuracy": 0.7640039920806885, "step": 33 }, { "epoch": 0.14345991561181434, "grad_norm": 1.5389747619628906, "learning_rate": 9.960411389047366e-06, "loss": 1.091, "mean_token_accuracy": 0.7640097737312317, "step": 34 }, { "epoch": 0.14767932489451477, "grad_norm": 1.5563571453094482, "learning_rate": 9.951142008708238e-06, "loss": 1.1572, "mean_token_accuracy": 0.7440212368965149, "step": 35 }, { "epoch": 0.1518987341772152, "grad_norm": 1.5131391286849976, "learning_rate": 9.940904341336859e-06, "loss": 1.2426, "mean_token_accuracy": 0.7158628106117249, "step": 36 }, { "epoch": 0.15611814345991562, "grad_norm": 1.5075509548187256, "learning_rate": 9.929700614002265e-06, "loss": 1.1656, "mean_token_accuracy": 0.7480515837669373, "step": 37 }, { "epoch": 0.16033755274261605, "grad_norm": 1.4747117757797241, "learning_rate": 9.91753326392706e-06, "loss": 1.2723, "mean_token_accuracy": 0.7222742438316345, "step": 38 }, { "epoch": 0.16455696202531644, "grad_norm": 1.5765788555145264, "learning_rate": 9.904404937957213e-06, "loss": 1.1191, "mean_token_accuracy": 0.7518899440765381, "step": 39 }, { "epoch": 0.16877637130801687, "grad_norm": 1.4843032360076904, "learning_rate": 9.890318491986282e-06, "loss": 1.1076, "mean_token_accuracy": 0.7483693361282349, "step": 40 }, { "epoch": 0.1729957805907173, "grad_norm": 1.5332094430923462, "learning_rate": 9.87527699033415e-06, "loss": 1.118, "mean_token_accuracy": 0.7508232593536377, "step": 41 }, { "epoch": 0.17721518987341772, "grad_norm": 1.5774556398391724, "learning_rate": 9.85928370508043e-06, "loss": 1.1516, "mean_token_accuracy": 0.7587430477142334, "step": 42 }, { "epoch": 0.18143459915611815, "grad_norm": 1.5169801712036133, "learning_rate": 9.842342115352647e-06, "loss": 1.0526, "mean_token_accuracy": 0.7587090730667114, "step": 43 }, { "epoch": 0.18565400843881857, "grad_norm": 1.476678729057312, "learning_rate": 9.824455906569423e-06, "loss": 1.1723, "mean_token_accuracy": 0.7248958349227905, "step": 44 }, { "epoch": 0.189873417721519, "grad_norm": 1.8056585788726807, "learning_rate": 9.805628969638757e-06, "loss": 1.2247, "mean_token_accuracy": 0.728428065776825, "step": 45 }, { "epoch": 0.1940928270042194, "grad_norm": 1.4431560039520264, "learning_rate": 9.785865400111593e-06, "loss": 1.1691, "mean_token_accuracy": 0.7389825582504272, "step": 46 }, { "epoch": 0.19831223628691982, "grad_norm": 1.5770704746246338, "learning_rate": 9.765169497290908e-06, "loss": 1.2224, "mean_token_accuracy": 0.7342731356620789, "step": 47 }, { "epoch": 0.20253164556962025, "grad_norm": 1.505393624305725, "learning_rate": 9.743545763296451e-06, "loss": 1.1314, "mean_token_accuracy": 0.731634795665741, "step": 48 }, { "epoch": 0.20675105485232068, "grad_norm": 1.5118578672409058, "learning_rate": 9.720998902085354e-06, "loss": 1.0991, "mean_token_accuracy": 0.7543997764587402, "step": 49 }, { "epoch": 0.2109704641350211, "grad_norm": 1.4458030462265015, "learning_rate": 9.697533818428863e-06, "loss": 1.101, "mean_token_accuracy": 0.7582892775535583, "step": 50 }, { "epoch": 0.21518987341772153, "grad_norm": 1.3428678512573242, "learning_rate": 9.673155616845362e-06, "loss": 1.219, "mean_token_accuracy": 0.7282519936561584, "step": 51 }, { "epoch": 0.21940928270042195, "grad_norm": 1.4228544235229492, "learning_rate": 9.647869600489954e-06, "loss": 1.3229, "mean_token_accuracy": 0.7029238343238831, "step": 52 }, { "epoch": 0.22362869198312235, "grad_norm": 1.427255630493164, "learning_rate": 9.621681270000833e-06, "loss": 1.0733, "mean_token_accuracy": 0.7589775919914246, "step": 53 }, { "epoch": 0.22784810126582278, "grad_norm": 1.4552180767059326, "learning_rate": 9.594596322302688e-06, "loss": 1.0936, "mean_token_accuracy": 0.7616468071937561, "step": 54 }, { "epoch": 0.2320675105485232, "grad_norm": 1.6069968938827515, "learning_rate": 9.566620649367418e-06, "loss": 1.2073, "mean_token_accuracy": 0.7224194407463074, "step": 55 }, { "epoch": 0.23628691983122363, "grad_norm": 1.412222146987915, "learning_rate": 9.537760336932406e-06, "loss": 1.1143, "mean_token_accuracy": 0.7492688298225403, "step": 56 }, { "epoch": 0.24050632911392406, "grad_norm": 1.4546841382980347, "learning_rate": 9.508021663176648e-06, "loss": 1.1594, "mean_token_accuracy": 0.7387874126434326, "step": 57 }, { "epoch": 0.24472573839662448, "grad_norm": 1.3827760219573975, "learning_rate": 9.477411097355025e-06, "loss": 1.1539, "mean_token_accuracy": 0.7320411801338196, "step": 58 }, { "epoch": 0.2489451476793249, "grad_norm": 1.4128773212432861, "learning_rate": 9.445935298390994e-06, "loss": 1.1729, "mean_token_accuracy": 0.7295903563499451, "step": 59 }, { "epoch": 0.25316455696202533, "grad_norm": 1.436807632446289, "learning_rate": 9.413601113428032e-06, "loss": 1.1122, "mean_token_accuracy": 0.7527050375938416, "step": 60 }, { "epoch": 0.25738396624472576, "grad_norm": 1.4155519008636475, "learning_rate": 9.380415576340127e-06, "loss": 1.134, "mean_token_accuracy": 0.748845100402832, "step": 61 }, { "epoch": 0.2616033755274262, "grad_norm": 1.3568230867385864, "learning_rate": 9.346385906201653e-06, "loss": 1.0927, "mean_token_accuracy": 0.7473200559616089, "step": 62 }, { "epoch": 0.26582278481012656, "grad_norm": 1.3414636850357056, "learning_rate": 9.311519505716963e-06, "loss": 1.116, "mean_token_accuracy": 0.7434830665588379, "step": 63 }, { "epoch": 0.270042194092827, "grad_norm": 1.4687381982803345, "learning_rate": 9.275823959610019e-06, "loss": 1.1996, "mean_token_accuracy": 0.7335405349731445, "step": 64 }, { "epoch": 0.2742616033755274, "grad_norm": 1.3659785985946655, "learning_rate": 9.239307032974438e-06, "loss": 1.1548, "mean_token_accuracy": 0.7464311718940735, "step": 65 }, { "epoch": 0.27848101265822783, "grad_norm": 1.3824652433395386, "learning_rate": 9.201976669584299e-06, "loss": 1.0442, "mean_token_accuracy": 0.7634408473968506, "step": 66 }, { "epoch": 0.28270042194092826, "grad_norm": 1.3764418363571167, "learning_rate": 9.163840990166085e-06, "loss": 1.163, "mean_token_accuracy": 0.7362051606178284, "step": 67 }, { "epoch": 0.2869198312236287, "grad_norm": 1.4887207746505737, "learning_rate": 9.124908290632119e-06, "loss": 1.2148, "mean_token_accuracy": 0.7365679144859314, "step": 68 }, { "epoch": 0.2911392405063291, "grad_norm": 1.412133812904358, "learning_rate": 9.085187040275903e-06, "loss": 1.1769, "mean_token_accuracy": 0.7347424626350403, "step": 69 }, { "epoch": 0.29535864978902954, "grad_norm": 1.4466365575790405, "learning_rate": 9.044685879929734e-06, "loss": 1.1561, "mean_token_accuracy": 0.736750066280365, "step": 70 }, { "epoch": 0.29957805907172996, "grad_norm": 1.452510952949524, "learning_rate": 9.003413620085002e-06, "loss": 1.0451, "mean_token_accuracy": 0.7633549571037292, "step": 71 }, { "epoch": 0.3037974683544304, "grad_norm": 1.4171886444091797, "learning_rate": 8.961379238975594e-06, "loss": 1.1657, "mean_token_accuracy": 0.7385682463645935, "step": 72 }, { "epoch": 0.3080168776371308, "grad_norm": 1.3945873975753784, "learning_rate": 8.918591880624783e-06, "loss": 1.154, "mean_token_accuracy": 0.7416154742240906, "step": 73 }, { "epoch": 0.31223628691983124, "grad_norm": 1.3587473630905151, "learning_rate": 8.875060852856082e-06, "loss": 1.1147, "mean_token_accuracy": 0.7544776201248169, "step": 74 }, { "epoch": 0.31645569620253167, "grad_norm": 1.3392646312713623, "learning_rate": 8.830795625268437e-06, "loss": 1.0643, "mean_token_accuracy": 0.7560762763023376, "step": 75 }, { "epoch": 0.3206751054852321, "grad_norm": 1.3936909437179565, "learning_rate": 8.785805827176256e-06, "loss": 1.0533, "mean_token_accuracy": 0.7552059292793274, "step": 76 }, { "epoch": 0.32489451476793246, "grad_norm": 1.3902052640914917, "learning_rate": 8.740101245514659e-06, "loss": 0.9963, "mean_token_accuracy": 0.784263014793396, "step": 77 }, { "epoch": 0.3291139240506329, "grad_norm": 1.37273371219635, "learning_rate": 8.69369182271048e-06, "loss": 1.0645, "mean_token_accuracy": 0.7502308487892151, "step": 78 }, { "epoch": 0.3333333333333333, "grad_norm": 1.4530737400054932, "learning_rate": 8.646587654519413e-06, "loss": 1.048, "mean_token_accuracy": 0.765147864818573, "step": 79 }, { "epoch": 0.33755274261603374, "grad_norm": 1.351982593536377, "learning_rate": 8.598798987829816e-06, "loss": 1.1773, "mean_token_accuracy": 0.7323355674743652, "step": 80 }, { "epoch": 0.34177215189873417, "grad_norm": 1.4358004331588745, "learning_rate": 8.550336218433631e-06, "loss": 1.0669, "mean_token_accuracy": 0.7604398727416992, "step": 81 }, { "epoch": 0.3459915611814346, "grad_norm": 1.4607288837432861, "learning_rate": 8.501209888764928e-06, "loss": 1.1499, "mean_token_accuracy": 0.7327597737312317, "step": 82 }, { "epoch": 0.350210970464135, "grad_norm": 1.3822574615478516, "learning_rate": 8.451430685606532e-06, "loss": 1.0957, "mean_token_accuracy": 0.7444460988044739, "step": 83 }, { "epoch": 0.35443037974683544, "grad_norm": 1.3216774463653564, "learning_rate": 8.401009437765248e-06, "loss": 1.0046, "mean_token_accuracy": 0.767160177230835, "step": 84 }, { "epoch": 0.35864978902953587, "grad_norm": 1.6017900705337524, "learning_rate": 8.349957113716213e-06, "loss": 1.0585, "mean_token_accuracy": 0.7692307829856873, "step": 85 }, { "epoch": 0.3628691983122363, "grad_norm": 1.3391270637512207, "learning_rate": 8.29828481921683e-06, "loss": 1.2253, "mean_token_accuracy": 0.7242193818092346, "step": 86 }, { "epoch": 0.3670886075949367, "grad_norm": 1.373449683189392, "learning_rate": 8.246003794890885e-06, "loss": 1.0709, "mean_token_accuracy": 0.757455587387085, "step": 87 }, { "epoch": 0.37130801687763715, "grad_norm": 1.4508413076400757, "learning_rate": 8.19312541378326e-06, "loss": 1.0736, "mean_token_accuracy": 0.7621853947639465, "step": 88 }, { "epoch": 0.3755274261603376, "grad_norm": 1.3206504583358765, "learning_rate": 8.139661178885912e-06, "loss": 1.1301, "mean_token_accuracy": 0.7476067543029785, "step": 89 }, { "epoch": 0.379746835443038, "grad_norm": 1.3155889511108398, "learning_rate": 8.085622720635536e-06, "loss": 1.1623, "mean_token_accuracy": 0.743880569934845, "step": 90 }, { "epoch": 0.38396624472573837, "grad_norm": 1.4986933469772339, "learning_rate": 8.031021794383513e-06, "loss": 0.9997, "mean_token_accuracy": 0.7751938104629517, "step": 91 }, { "epoch": 0.3881856540084388, "grad_norm": 1.3492070436477661, "learning_rate": 7.975870277838695e-06, "loss": 1.1146, "mean_token_accuracy": 0.7491279244422913, "step": 92 }, { "epoch": 0.3924050632911392, "grad_norm": 1.3896995782852173, "learning_rate": 7.920180168483565e-06, "loss": 1.0942, "mean_token_accuracy": 0.7597774267196655, "step": 93 }, { "epoch": 0.39662447257383965, "grad_norm": 1.3334070444107056, "learning_rate": 7.863963580964344e-06, "loss": 1.0688, "mean_token_accuracy": 0.7475572228431702, "step": 94 }, { "epoch": 0.4008438818565401, "grad_norm": 1.4157761335372925, "learning_rate": 7.80723274445561e-06, "loss": 1.1208, "mean_token_accuracy": 0.731880784034729, "step": 95 }, { "epoch": 0.4050632911392405, "grad_norm": 1.4356557130813599, "learning_rate": 7.75e-06, "loss": 1.0452, "mean_token_accuracy": 0.7568075060844421, "step": 96 }, { "epoch": 0.4092827004219409, "grad_norm": 1.4047561883926392, "learning_rate": 7.692277797823585e-06, "loss": 1.0809, "mean_token_accuracy": 0.7537745237350464, "step": 97 }, { "epoch": 0.41350210970464135, "grad_norm": 1.3407511711120605, "learning_rate": 7.634078694627483e-06, "loss": 1.1126, "mean_token_accuracy": 0.7462708353996277, "step": 98 }, { "epoch": 0.4177215189873418, "grad_norm": 1.371414303779602, "learning_rate": 7.575415350856316e-06, "loss": 1.1067, "mean_token_accuracy": 0.7437711954116821, "step": 99 }, { "epoch": 0.4219409282700422, "grad_norm": 1.3650238513946533, "learning_rate": 7.516300527944104e-06, "loss": 1.0487, "mean_token_accuracy": 0.7662240862846375, "step": 100 }, { "epoch": 0.42616033755274263, "grad_norm": 1.3607968091964722, "learning_rate": 7.456747085538173e-06, "loss": 1.0891, "mean_token_accuracy": 0.7554155588150024, "step": 101 }, { "epoch": 0.43037974683544306, "grad_norm": 1.3348058462142944, "learning_rate": 7.3967679787017166e-06, "loss": 1.147, "mean_token_accuracy": 0.7373363971710205, "step": 102 }, { "epoch": 0.4345991561181435, "grad_norm": 1.4145907163619995, "learning_rate": 7.336376255095592e-06, "loss": 1.093, "mean_token_accuracy": 0.7504742741584778, "step": 103 }, { "epoch": 0.4388185654008439, "grad_norm": 1.3660974502563477, "learning_rate": 7.275585052139975e-06, "loss": 1.2449, "mean_token_accuracy": 0.7118442058563232, "step": 104 }, { "epoch": 0.4430379746835443, "grad_norm": 1.3254221677780151, "learning_rate": 7.2144075941564835e-06, "loss": 1.1414, "mean_token_accuracy": 0.7412657737731934, "step": 105 }, { "epoch": 0.4472573839662447, "grad_norm": 1.3703272342681885, "learning_rate": 7.152857189491406e-06, "loss": 1.1501, "mean_token_accuracy": 0.7333996891975403, "step": 106 }, { "epoch": 0.45147679324894513, "grad_norm": 1.3824867010116577, "learning_rate": 7.090947227620646e-06, "loss": 1.0566, "mean_token_accuracy": 0.761897087097168, "step": 107 }, { "epoch": 0.45569620253164556, "grad_norm": 1.353622317314148, "learning_rate": 7.028691176237018e-06, "loss": 1.1275, "mean_token_accuracy": 0.7441860437393188, "step": 108 }, { "epoch": 0.459915611814346, "grad_norm": 1.343522548675537, "learning_rate": 6.966102578320531e-06, "loss": 1.0808, "mean_token_accuracy": 0.750849723815918, "step": 109 }, { "epoch": 0.4641350210970464, "grad_norm": 1.2934088706970215, "learning_rate": 6.903195049192285e-06, "loss": 1.0279, "mean_token_accuracy": 0.7599811553955078, "step": 110 }, { "epoch": 0.46835443037974683, "grad_norm": 1.288739562034607, "learning_rate": 6.839982273552651e-06, "loss": 1.1493, "mean_token_accuracy": 0.7389063835144043, "step": 111 }, { "epoch": 0.47257383966244726, "grad_norm": 1.377466082572937, "learning_rate": 6.776478002504335e-06, "loss": 1.0465, "mean_token_accuracy": 0.7633302211761475, "step": 112 }, { "epoch": 0.4767932489451477, "grad_norm": 1.2997636795043945, "learning_rate": 6.712696050561014e-06, "loss": 1.129, "mean_token_accuracy": 0.7459065914154053, "step": 113 }, { "epoch": 0.4810126582278481, "grad_norm": 1.320365071296692, "learning_rate": 6.648650292642166e-06, "loss": 1.1049, "mean_token_accuracy": 0.7416779398918152, "step": 114 }, { "epoch": 0.48523206751054854, "grad_norm": 1.377439260482788, "learning_rate": 6.584354661054765e-06, "loss": 1.004, "mean_token_accuracy": 0.7778915166854858, "step": 115 }, { "epoch": 0.48945147679324896, "grad_norm": 1.2823392152786255, "learning_rate": 6.519823142462501e-06, "loss": 1.0903, "mean_token_accuracy": 0.7483987808227539, "step": 116 }, { "epoch": 0.4936708860759494, "grad_norm": 1.3214064836502075, "learning_rate": 6.4550697748431545e-06, "loss": 1.1488, "mean_token_accuracy": 0.7386626601219177, "step": 117 }, { "epoch": 0.4978902953586498, "grad_norm": 1.4070407152175903, "learning_rate": 6.390108644434828e-06, "loss": 1.1147, "mean_token_accuracy": 0.7577263712882996, "step": 118 }, { "epoch": 0.5021097046413502, "grad_norm": 1.3729873895645142, "learning_rate": 6.32495388267167e-06, "loss": 1.1647, "mean_token_accuracy": 0.7341417670249939, "step": 119 }, { "epoch": 0.5063291139240507, "grad_norm": 1.3975082635879517, "learning_rate": 6.259619663109762e-06, "loss": 1.0578, "mean_token_accuracy": 0.7666885852813721, "step": 120 }, { "epoch": 0.510548523206751, "grad_norm": 1.3510339260101318, "learning_rate": 6.19412019834386e-06, "loss": 1.0236, "mean_token_accuracy": 0.7666836380958557, "step": 121 }, { "epoch": 0.5147679324894515, "grad_norm": 1.2858039140701294, "learning_rate": 6.1284697369156276e-06, "loss": 1.1033, "mean_token_accuracy": 0.7595654726028442, "step": 122 }, { "epoch": 0.5189873417721519, "grad_norm": 1.325225830078125, "learning_rate": 6.062682560214053e-06, "loss": 1.0949, "mean_token_accuracy": 0.749600350856781, "step": 123 }, { "epoch": 0.5232067510548524, "grad_norm": 1.3359731435775757, "learning_rate": 5.996772979368715e-06, "loss": 1.0567, "mean_token_accuracy": 0.7532724738121033, "step": 124 }, { "epoch": 0.5274261603375527, "grad_norm": 1.315093755722046, "learning_rate": 5.930755332136604e-06, "loss": 1.0807, "mean_token_accuracy": 0.7582170367240906, "step": 125 }, { "epoch": 0.5316455696202531, "grad_norm": 1.3243067264556885, "learning_rate": 5.86464397978312e-06, "loss": 1.1816, "mean_token_accuracy": 0.7343863248825073, "step": 126 }, { "epoch": 0.5358649789029536, "grad_norm": 1.2779967784881592, "learning_rate": 5.798453303957968e-06, "loss": 1.2504, "mean_token_accuracy": 0.7217909097671509, "step": 127 }, { "epoch": 0.540084388185654, "grad_norm": 1.363757848739624, "learning_rate": 5.73219770356663e-06, "loss": 1.1925, "mean_token_accuracy": 0.7341822981834412, "step": 128 }, { "epoch": 0.5443037974683544, "grad_norm": 1.3086532354354858, "learning_rate": 5.665891591638063e-06, "loss": 1.1309, "mean_token_accuracy": 0.7532561421394348, "step": 129 }, { "epoch": 0.5485232067510548, "grad_norm": 1.367034673690796, "learning_rate": 5.5995493921893415e-06, "loss": 0.9554, "mean_token_accuracy": 0.7872604727745056, "step": 130 }, { "epoch": 0.5527426160337553, "grad_norm": 1.3385508060455322, "learning_rate": 5.533185537087906e-06, "loss": 1.0339, "mean_token_accuracy": 0.76613849401474, "step": 131 }, { "epoch": 0.5569620253164557, "grad_norm": 1.315830945968628, "learning_rate": 5.4668144629120945e-06, "loss": 1.1457, "mean_token_accuracy": 0.7451325058937073, "step": 132 }, { "epoch": 0.5611814345991561, "grad_norm": 1.4468579292297363, "learning_rate": 5.40045060781066e-06, "loss": 1.0689, "mean_token_accuracy": 0.7672370076179504, "step": 133 }, { "epoch": 0.5654008438818565, "grad_norm": 1.419268250465393, "learning_rate": 5.33410840836194e-06, "loss": 1.0781, "mean_token_accuracy": 0.7676616907119751, "step": 134 }, { "epoch": 0.569620253164557, "grad_norm": 1.3178187608718872, "learning_rate": 5.267802296433372e-06, "loss": 1.0813, "mean_token_accuracy": 0.7537773847579956, "step": 135 }, { "epoch": 0.5738396624472574, "grad_norm": 1.352331280708313, "learning_rate": 5.201546696042033e-06, "loss": 1.1175, "mean_token_accuracy": 0.74346524477005, "step": 136 }, { "epoch": 0.5780590717299579, "grad_norm": 1.2893943786621094, "learning_rate": 5.13535602021688e-06, "loss": 1.146, "mean_token_accuracy": 0.7399700880050659, "step": 137 }, { "epoch": 0.5822784810126582, "grad_norm": 1.373579978942871, "learning_rate": 5.069244667863397e-06, "loss": 1.0754, "mean_token_accuracy": 0.747799277305603, "step": 138 }, { "epoch": 0.5864978902953587, "grad_norm": 1.3490039110183716, "learning_rate": 5.003227020631287e-06, "loss": 1.064, "mean_token_accuracy": 0.764332115650177, "step": 139 }, { "epoch": 0.5907172995780591, "grad_norm": 1.341293454170227, "learning_rate": 4.937317439785949e-06, "loss": 1.16, "mean_token_accuracy": 0.7380607724189758, "step": 140 }, { "epoch": 0.5949367088607594, "grad_norm": 1.2629183530807495, "learning_rate": 4.871530263084373e-06, "loss": 1.0139, "mean_token_accuracy": 0.7695739269256592, "step": 141 }, { "epoch": 0.5991561181434599, "grad_norm": 1.3168911933898926, "learning_rate": 4.80587980165614e-06, "loss": 1.1634, "mean_token_accuracy": 0.7377873659133911, "step": 142 }, { "epoch": 0.6033755274261603, "grad_norm": 1.3361846208572388, "learning_rate": 4.74038033689024e-06, "loss": 1.126, "mean_token_accuracy": 0.7410577535629272, "step": 143 }, { "epoch": 0.6075949367088608, "grad_norm": 1.3500585556030273, "learning_rate": 4.675046117328333e-06, "loss": 1.0654, "mean_token_accuracy": 0.7601693868637085, "step": 144 }, { "epoch": 0.6118143459915611, "grad_norm": 1.4159785509109497, "learning_rate": 4.609891355565172e-06, "loss": 1.0892, "mean_token_accuracy": 0.7582122683525085, "step": 145 }, { "epoch": 0.6160337552742616, "grad_norm": 1.3257557153701782, "learning_rate": 4.544930225156847e-06, "loss": 1.0994, "mean_token_accuracy": 0.7577795386314392, "step": 146 }, { "epoch": 0.620253164556962, "grad_norm": 1.3204015493392944, "learning_rate": 4.480176857537499e-06, "loss": 1.1573, "mean_token_accuracy": 0.7304109334945679, "step": 147 }, { "epoch": 0.6244725738396625, "grad_norm": 1.2947508096694946, "learning_rate": 4.415645338945236e-06, "loss": 1.1672, "mean_token_accuracy": 0.7346352934837341, "step": 148 }, { "epoch": 0.6286919831223629, "grad_norm": 1.4755604267120361, "learning_rate": 4.351349707357836e-06, "loss": 1.0376, "mean_token_accuracy": 0.7728136777877808, "step": 149 }, { "epoch": 0.6329113924050633, "grad_norm": 1.375398874282837, "learning_rate": 4.287303949438987e-06, "loss": 1.1152, "mean_token_accuracy": 0.751935601234436, "step": 150 }, { "epoch": 0.6371308016877637, "grad_norm": 1.297082543373108, "learning_rate": 4.223521997495665e-06, "loss": 1.1076, "mean_token_accuracy": 0.7536709904670715, "step": 151 }, { "epoch": 0.6413502109704642, "grad_norm": 1.289937138557434, "learning_rate": 4.160017726447352e-06, "loss": 1.0683, "mean_token_accuracy": 0.7612445950508118, "step": 152 }, { "epoch": 0.6455696202531646, "grad_norm": 1.3394392728805542, "learning_rate": 4.096804950807717e-06, "loss": 1.1548, "mean_token_accuracy": 0.7326341271400452, "step": 153 }, { "epoch": 0.6497890295358649, "grad_norm": 1.3397821187973022, "learning_rate": 4.033897421679472e-06, "loss": 1.0725, "mean_token_accuracy": 0.7469461560249329, "step": 154 }, { "epoch": 0.6540084388185654, "grad_norm": 1.314746379852295, "learning_rate": 3.971308823762983e-06, "loss": 1.021, "mean_token_accuracy": 0.7617459893226624, "step": 155 }, { "epoch": 0.6582278481012658, "grad_norm": 1.3997515439987183, "learning_rate": 3.9090527723793545e-06, "loss": 0.9554, "mean_token_accuracy": 0.783367931842804, "step": 156 }, { "epoch": 0.6624472573839663, "grad_norm": 1.3897455930709839, "learning_rate": 3.847142810508596e-06, "loss": 1.0588, "mean_token_accuracy": 0.7604950666427612, "step": 157 }, { "epoch": 0.6666666666666666, "grad_norm": 1.29047429561615, "learning_rate": 3.785592405843518e-06, "loss": 0.9476, "mean_token_accuracy": 0.7861513495445251, "step": 158 }, { "epoch": 0.6708860759493671, "grad_norm": 1.3335379362106323, "learning_rate": 3.724414947860027e-06, "loss": 1.1159, "mean_token_accuracy": 0.7479957938194275, "step": 159 }, { "epoch": 0.6751054852320675, "grad_norm": 1.3141283988952637, "learning_rate": 3.6636237449044077e-06, "loss": 1.0423, "mean_token_accuracy": 0.7584823966026306, "step": 160 }, { "epoch": 0.679324894514768, "grad_norm": 1.2607753276824951, "learning_rate": 3.603232021298284e-06, "loss": 1.3062, "mean_token_accuracy": 0.704585075378418, "step": 161 }, { "epoch": 0.6835443037974683, "grad_norm": 1.414121150970459, "learning_rate": 3.5432529144618287e-06, "loss": 1.1291, "mean_token_accuracy": 0.7416020631790161, "step": 162 }, { "epoch": 0.6877637130801688, "grad_norm": 1.4330233335494995, "learning_rate": 3.483699472055897e-06, "loss": 1.0693, "mean_token_accuracy": 0.7502612471580505, "step": 163 }, { "epoch": 0.6919831223628692, "grad_norm": 1.2901924848556519, "learning_rate": 3.424584649143685e-06, "loss": 1.0724, "mean_token_accuracy": 0.7590579986572266, "step": 164 }, { "epoch": 0.6962025316455697, "grad_norm": 1.3194284439086914, "learning_rate": 3.365921305372519e-06, "loss": 1.0791, "mean_token_accuracy": 0.755479633808136, "step": 165 }, { "epoch": 0.70042194092827, "grad_norm": 1.3910928964614868, "learning_rate": 3.307722202176417e-06, "loss": 1.0845, "mean_token_accuracy": 0.7579113841056824, "step": 166 }, { "epoch": 0.7046413502109705, "grad_norm": 1.3976861238479614, "learning_rate": 3.2500000000000015e-06, "loss": 0.9902, "mean_token_accuracy": 0.7839325666427612, "step": 167 }, { "epoch": 0.7088607594936709, "grad_norm": 1.344634771347046, "learning_rate": 3.19276725554439e-06, "loss": 1.004, "mean_token_accuracy": 0.7765747904777527, "step": 168 }, { "epoch": 0.7130801687763713, "grad_norm": 1.4013010263442993, "learning_rate": 3.136036419035656e-06, "loss": 0.9893, "mean_token_accuracy": 0.7824804782867432, "step": 169 }, { "epoch": 0.7172995780590717, "grad_norm": 1.2817473411560059, "learning_rate": 3.0798198315164345e-06, "loss": 1.0701, "mean_token_accuracy": 0.7619868516921997, "step": 170 }, { "epoch": 0.7215189873417721, "grad_norm": 1.3074806928634644, "learning_rate": 3.024129722161305e-06, "loss": 1.1384, "mean_token_accuracy": 0.7425307631492615, "step": 171 }, { "epoch": 0.7257383966244726, "grad_norm": 1.3982939720153809, "learning_rate": 2.9689782056164874e-06, "loss": 1.1811, "mean_token_accuracy": 0.7242642045021057, "step": 172 }, { "epoch": 0.729957805907173, "grad_norm": 1.3266452550888062, "learning_rate": 2.914377279364464e-06, "loss": 1.1075, "mean_token_accuracy": 0.739130437374115, "step": 173 }, { "epoch": 0.7341772151898734, "grad_norm": 1.354162573814392, "learning_rate": 2.8603388211140886e-06, "loss": 1.0977, "mean_token_accuracy": 0.7508479952812195, "step": 174 }, { "epoch": 0.7383966244725738, "grad_norm": 1.3136208057403564, "learning_rate": 2.8068745862167423e-06, "loss": 1.0426, "mean_token_accuracy": 0.7577039003372192, "step": 175 }, { "epoch": 0.7426160337552743, "grad_norm": 1.264014720916748, "learning_rate": 2.7539962051091185e-06, "loss": 1.1094, "mean_token_accuracy": 0.7489492893218994, "step": 176 }, { "epoch": 0.7468354430379747, "grad_norm": 1.2856388092041016, "learning_rate": 2.70171518078317e-06, "loss": 0.9631, "mean_token_accuracy": 0.7795804738998413, "step": 177 }, { "epoch": 0.7510548523206751, "grad_norm": 1.2878668308258057, "learning_rate": 2.6500428862837878e-06, "loss": 1.1275, "mean_token_accuracy": 0.7491475343704224, "step": 178 }, { "epoch": 0.7552742616033755, "grad_norm": 1.2011187076568604, "learning_rate": 2.5989905622347538e-06, "loss": 1.0777, "mean_token_accuracy": 0.7586644291877747, "step": 179 }, { "epoch": 0.759493670886076, "grad_norm": 1.2956939935684204, "learning_rate": 2.5485693143934704e-06, "loss": 1.078, "mean_token_accuracy": 0.7640592455863953, "step": 180 }, { "epoch": 0.7637130801687764, "grad_norm": 1.278355598449707, "learning_rate": 2.498790111235072e-06, "loss": 1.0595, "mean_token_accuracy": 0.7603550553321838, "step": 181 }, { "epoch": 0.7679324894514767, "grad_norm": 1.3742746114730835, "learning_rate": 2.4496637815663697e-06, "loss": 0.9225, "mean_token_accuracy": 0.7922610640525818, "step": 182 }, { "epoch": 0.7721518987341772, "grad_norm": 1.2835192680358887, "learning_rate": 2.4012010121701853e-06, "loss": 1.2256, "mean_token_accuracy": 0.7330555319786072, "step": 183 }, { "epoch": 0.7763713080168776, "grad_norm": 1.303166389465332, "learning_rate": 2.353412345480587e-06, "loss": 1.0578, "mean_token_accuracy": 0.7578330039978027, "step": 184 }, { "epoch": 0.7805907172995781, "grad_norm": 1.3752117156982422, "learning_rate": 2.3063081772895203e-06, "loss": 1.0769, "mean_token_accuracy": 0.761528730392456, "step": 185 }, { "epoch": 0.7848101265822784, "grad_norm": 1.3514289855957031, "learning_rate": 2.2598987544853428e-06, "loss": 1.1491, "mean_token_accuracy": 0.7445311546325684, "step": 186 }, { "epoch": 0.7890295358649789, "grad_norm": 1.342996597290039, "learning_rate": 2.2141941728237467e-06, "loss": 1.1003, "mean_token_accuracy": 0.7531036734580994, "step": 187 }, { "epoch": 0.7932489451476793, "grad_norm": 1.3291714191436768, "learning_rate": 2.1692043747315628e-06, "loss": 1.0323, "mean_token_accuracy": 0.7538182139396667, "step": 188 }, { "epoch": 0.7974683544303798, "grad_norm": 1.3257864713668823, "learning_rate": 2.1249391471439206e-06, "loss": 0.9746, "mean_token_accuracy": 0.7849647402763367, "step": 189 }, { "epoch": 0.8016877637130801, "grad_norm": 1.2531793117523193, "learning_rate": 2.081408119375219e-06, "loss": 1.0839, "mean_token_accuracy": 0.7478733062744141, "step": 190 }, { "epoch": 0.8059071729957806, "grad_norm": 1.3834004402160645, "learning_rate": 2.0386207610244073e-06, "loss": 1.1095, "mean_token_accuracy": 0.7386733889579773, "step": 191 }, { "epoch": 0.810126582278481, "grad_norm": 1.2729295492172241, "learning_rate": 1.9965863799149988e-06, "loss": 1.0797, "mean_token_accuracy": 0.7641941905021667, "step": 192 }, { "epoch": 0.8143459915611815, "grad_norm": 1.3292949199676514, "learning_rate": 1.955314120070269e-06, "loss": 1.0022, "mean_token_accuracy": 0.7704975605010986, "step": 193 }, { "epoch": 0.8185654008438819, "grad_norm": 1.3078043460845947, "learning_rate": 1.9148129597240984e-06, "loss": 1.1246, "mean_token_accuracy": 0.7446621656417847, "step": 194 }, { "epoch": 0.8227848101265823, "grad_norm": 1.3361434936523438, "learning_rate": 1.8750917093678824e-06, "loss": 1.1086, "mean_token_accuracy": 0.7590915560722351, "step": 195 }, { "epoch": 0.8270042194092827, "grad_norm": 1.3019007444381714, "learning_rate": 1.8361590098339168e-06, "loss": 1.1935, "mean_token_accuracy": 0.7272245287895203, "step": 196 }, { "epoch": 0.8312236286919831, "grad_norm": 1.2692325115203857, "learning_rate": 1.7980233304157025e-06, "loss": 1.0738, "mean_token_accuracy": 0.7482107281684875, "step": 197 }, { "epoch": 0.8354430379746836, "grad_norm": 1.2975091934204102, "learning_rate": 1.760692967025564e-06, "loss": 1.1519, "mean_token_accuracy": 0.7438170313835144, "step": 198 }, { "epoch": 0.8396624472573839, "grad_norm": 1.3825592994689941, "learning_rate": 1.724176040389982e-06, "loss": 1.0687, "mean_token_accuracy": 0.7617847323417664, "step": 199 }, { "epoch": 0.8438818565400844, "grad_norm": 1.2977200746536255, "learning_rate": 1.6884804942830373e-06, "loss": 1.0253, "mean_token_accuracy": 0.7679784893989563, "step": 200 }, { "epoch": 0.8481012658227848, "grad_norm": 1.2826578617095947, "learning_rate": 1.6536140937983469e-06, "loss": 0.9576, "mean_token_accuracy": 0.7874261736869812, "step": 201 }, { "epoch": 0.8523206751054853, "grad_norm": 1.2712202072143555, "learning_rate": 1.619584423659875e-06, "loss": 1.1153, "mean_token_accuracy": 0.7474706768989563, "step": 202 }, { "epoch": 0.8565400843881856, "grad_norm": 1.3052817583084106, "learning_rate": 1.5863988865719702e-06, "loss": 1.1858, "mean_token_accuracy": 0.7376619577407837, "step": 203 }, { "epoch": 0.8607594936708861, "grad_norm": 1.2947852611541748, "learning_rate": 1.5540647016090066e-06, "loss": 1.1035, "mean_token_accuracy": 0.7478904724121094, "step": 204 }, { "epoch": 0.8649789029535865, "grad_norm": 1.3773938417434692, "learning_rate": 1.5225889026449754e-06, "loss": 1.0303, "mean_token_accuracy": 0.7679511308670044, "step": 205 }, { "epoch": 0.869198312236287, "grad_norm": 1.3938205242156982, "learning_rate": 1.4919783368233525e-06, "loss": 1.0553, "mean_token_accuracy": 0.7651947140693665, "step": 206 }, { "epoch": 0.8734177215189873, "grad_norm": 1.3168855905532837, "learning_rate": 1.462239663067596e-06, "loss": 1.1293, "mean_token_accuracy": 0.7346562743186951, "step": 207 }, { "epoch": 0.8776371308016878, "grad_norm": 1.2759041786193848, "learning_rate": 1.4333793506325832e-06, "loss": 1.1841, "mean_token_accuracy": 0.7245597243309021, "step": 208 }, { "epoch": 0.8818565400843882, "grad_norm": 1.2906190156936646, "learning_rate": 1.4054036776973123e-06, "loss": 1.1123, "mean_token_accuracy": 0.7488489747047424, "step": 209 }, { "epoch": 0.8860759493670886, "grad_norm": 1.3179748058319092, "learning_rate": 1.378318729999169e-06, "loss": 1.1058, "mean_token_accuracy": 0.7502433657646179, "step": 210 }, { "epoch": 0.890295358649789, "grad_norm": 1.3706414699554443, "learning_rate": 1.3521303995100479e-06, "loss": 0.9837, "mean_token_accuracy": 0.7637858986854553, "step": 211 }, { "epoch": 0.8945147679324894, "grad_norm": 1.302856683731079, "learning_rate": 1.32684438315464e-06, "loss": 1.0673, "mean_token_accuracy": 0.7459544539451599, "step": 212 }, { "epoch": 0.8987341772151899, "grad_norm": 1.2482903003692627, "learning_rate": 1.3024661815711387e-06, "loss": 1.1662, "mean_token_accuracy": 0.7322877049446106, "step": 213 }, { "epoch": 0.9029535864978903, "grad_norm": 1.2914732694625854, "learning_rate": 1.2790010979146467e-06, "loss": 0.984, "mean_token_accuracy": 0.7709949016571045, "step": 214 }, { "epoch": 0.9071729957805907, "grad_norm": 1.3376529216766357, "learning_rate": 1.2564542367035502e-06, "loss": 1.1432, "mean_token_accuracy": 0.7400686144828796, "step": 215 }, { "epoch": 0.9113924050632911, "grad_norm": 1.3194866180419922, "learning_rate": 1.2348305027090923e-06, "loss": 1.1384, "mean_token_accuracy": 0.7444621324539185, "step": 216 }, { "epoch": 0.9156118143459916, "grad_norm": 1.2742234468460083, "learning_rate": 1.2141345998884092e-06, "loss": 1.1669, "mean_token_accuracy": 0.7380778789520264, "step": 217 }, { "epoch": 0.919831223628692, "grad_norm": 1.3064154386520386, "learning_rate": 1.1943710303612459e-06, "loss": 1.0964, "mean_token_accuracy": 0.7555238604545593, "step": 218 }, { "epoch": 0.9240506329113924, "grad_norm": 1.261291742324829, "learning_rate": 1.175544093430577e-06, "loss": 1.107, "mean_token_accuracy": 0.751292884349823, "step": 219 }, { "epoch": 0.9282700421940928, "grad_norm": 1.2909098863601685, "learning_rate": 1.1576578846473558e-06, "loss": 1.0531, "mean_token_accuracy": 0.7663896679878235, "step": 220 }, { "epoch": 0.9324894514767933, "grad_norm": 1.3752104043960571, "learning_rate": 1.1407162949195732e-06, "loss": 1.0887, "mean_token_accuracy": 0.7531322240829468, "step": 221 }, { "epoch": 0.9367088607594937, "grad_norm": 1.297967791557312, "learning_rate": 1.1247230096658498e-06, "loss": 1.0218, "mean_token_accuracy": 0.7693407535552979, "step": 222 }, { "epoch": 0.9409282700421941, "grad_norm": 1.298868179321289, "learning_rate": 1.1096815080137196e-06, "loss": 0.9778, "mean_token_accuracy": 0.7718530297279358, "step": 223 }, { "epoch": 0.9451476793248945, "grad_norm": 1.3202515840530396, "learning_rate": 1.0955950620427883e-06, "loss": 1.0321, "mean_token_accuracy": 0.7685170769691467, "step": 224 }, { "epoch": 0.9493670886075949, "grad_norm": 1.2280904054641724, "learning_rate": 1.0824667360729408e-06, "loss": 1.2625, "mean_token_accuracy": 0.7183030843734741, "step": 225 }, { "epoch": 0.9535864978902954, "grad_norm": 1.2865333557128906, "learning_rate": 1.070299385997735e-06, "loss": 1.1937, "mean_token_accuracy": 0.7263870239257812, "step": 226 }, { "epoch": 0.9578059071729957, "grad_norm": 1.3348139524459839, "learning_rate": 1.0590956586631432e-06, "loss": 1.1109, "mean_token_accuracy": 0.743833601474762, "step": 227 }, { "epoch": 0.9620253164556962, "grad_norm": 1.3875958919525146, "learning_rate": 1.0488579912917633e-06, "loss": 1.017, "mean_token_accuracy": 0.7680346369743347, "step": 228 }, { "epoch": 0.9662447257383966, "grad_norm": 1.3476899862289429, "learning_rate": 1.0395886109526346e-06, "loss": 1.0319, "mean_token_accuracy": 0.7578980922698975, "step": 229 }, { "epoch": 0.9704641350210971, "grad_norm": 1.4462693929672241, "learning_rate": 1.0312895340767674e-06, "loss": 1.1329, "mean_token_accuracy": 0.761381208896637, "step": 230 }, { "epoch": 0.9746835443037974, "grad_norm": 1.2707955837249756, "learning_rate": 1.0239625660184965e-06, "loss": 1.2035, "mean_token_accuracy": 0.7266802787780762, "step": 231 }, { "epoch": 0.9789029535864979, "grad_norm": 1.331416368484497, "learning_rate": 1.0176093006627485e-06, "loss": 1.1598, "mean_token_accuracy": 0.7420715689659119, "step": 232 }, { "epoch": 0.9831223628691983, "grad_norm": 1.3013097047805786, "learning_rate": 1.0122311200783154e-06, "loss": 0.9994, "mean_token_accuracy": 0.7827889323234558, "step": 233 }, { "epoch": 0.9873417721518988, "grad_norm": 1.3347793817520142, "learning_rate": 1.0078291942172018e-06, "loss": 1.0984, "mean_token_accuracy": 0.7526082992553711, "step": 234 }, { "epoch": 0.9915611814345991, "grad_norm": 1.4117809534072876, "learning_rate": 1.0044044806601188e-06, "loss": 0.9413, "mean_token_accuracy": 0.7868911623954773, "step": 235 }, { "epoch": 0.9957805907172996, "grad_norm": 1.3429399728775024, "learning_rate": 1.0019577244081736e-06, "loss": 1.0494, "mean_token_accuracy": 0.7685214877128601, "step": 236 }, { "epoch": 1.0, "grad_norm": 1.2474669218063354, "learning_rate": 1.0004894577208057e-06, "loss": 1.0154, "mean_token_accuracy": 0.7552247643470764, "step": 237 }, { "epoch": 1.0, "step": 237, "total_flos": 1.3663861802532864e+17, "train_loss": 1.2117820017448457, "train_runtime": 1211.3807, "train_samples_per_second": 12.481, "train_steps_per_second": 0.196 } ], "logging_steps": 1, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3663861802532864e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }