{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004219409282700422, "grad_norm": 257.7193908691406, "learning_rate": 0.0, "loss": 5.6878, "mean_token_accuracy": 0.2766798436641693, "step": 1 }, { "epoch": 0.008438818565400843, "grad_norm": 258.8243713378906, "learning_rate": 4.1666666666666667e-07, "loss": 5.5809, "mean_token_accuracy": 0.2673267424106598, "step": 2 }, { "epoch": 0.012658227848101266, "grad_norm": 261.4031677246094, "learning_rate": 8.333333333333333e-07, "loss": 5.6727, "mean_token_accuracy": 0.2783300280570984, "step": 3 }, { "epoch": 0.016877637130801686, "grad_norm": 257.4659729003906, "learning_rate": 1.25e-06, "loss": 5.5759, "mean_token_accuracy": 0.2854291498661041, "step": 4 }, { "epoch": 0.02109704641350211, "grad_norm": 252.2744903564453, "learning_rate": 1.6666666666666667e-06, "loss": 5.3613, "mean_token_accuracy": 0.26824456453323364, "step": 5 }, { "epoch": 0.02531645569620253, "grad_norm": 239.39894104003906, "learning_rate": 2.0833333333333334e-06, "loss": 4.6217, "mean_token_accuracy": 0.3745020031929016, "step": 6 }, { "epoch": 0.029535864978902954, "grad_norm": 232.35324096679688, "learning_rate": 2.5e-06, "loss": 4.1933, "mean_token_accuracy": 0.4015747904777527, "step": 7 }, { "epoch": 0.03375527426160337, "grad_norm": 208.7158660888672, "learning_rate": 2.916666666666667e-06, "loss": 2.6697, "mean_token_accuracy": 0.5498008131980896, "step": 8 }, { "epoch": 0.0379746835443038, "grad_norm": 201.87884521484375, "learning_rate": 3.3333333333333333e-06, "loss": 2.4149, "mean_token_accuracy": 0.6184738874435425, "step": 9 }, { "epoch": 0.04219409282700422, "grad_norm": 191.26608276367188, "learning_rate": 3.7500000000000005e-06, "loss": 2.0262, "mean_token_accuracy": 0.6733067631721497, "step": 10 }, { "epoch": 0.046413502109704644, "grad_norm": 168.3002166748047, "learning_rate": 4.166666666666667e-06, "loss": 0.9979, "mean_token_accuracy": 0.7944111824035645, "step": 11 }, { "epoch": 0.05063291139240506, "grad_norm": 108.35690307617188, "learning_rate": 4.583333333333333e-06, "loss": 0.733, "mean_token_accuracy": 0.8169291615486145, "step": 12 }, { "epoch": 0.05485232067510549, "grad_norm": 53.91748809814453, "learning_rate": 5e-06, "loss": 0.6272, "mean_token_accuracy": 0.8043912053108215, "step": 13 }, { "epoch": 0.05907172995780591, "grad_norm": 68.18785095214844, "learning_rate": 5.416666666666667e-06, "loss": 0.3842, "mean_token_accuracy": 0.9404761791229248, "step": 14 }, { "epoch": 0.06329113924050633, "grad_norm": 13.696602821350098, "learning_rate": 5.833333333333334e-06, "loss": 0.1424, "mean_token_accuracy": 0.9363816976547241, "step": 15 }, { "epoch": 0.06751054852320675, "grad_norm": 8.211609840393066, "learning_rate": 6.25e-06, "loss": 0.1467, "mean_token_accuracy": 0.9279999732971191, "step": 16 }, { "epoch": 0.07172995780590717, "grad_norm": 3.198794364929199, "learning_rate": 6.666666666666667e-06, "loss": 0.1236, "mean_token_accuracy": 0.954365074634552, "step": 17 }, { "epoch": 0.0759493670886076, "grad_norm": 4.237670421600342, "learning_rate": 7.083333333333335e-06, "loss": 0.1245, "mean_token_accuracy": 0.942800760269165, "step": 18 }, { "epoch": 0.08016877637130802, "grad_norm": 3.2434728145599365, "learning_rate": 7.500000000000001e-06, "loss": 0.1209, "mean_token_accuracy": 0.9414141178131104, "step": 19 }, { "epoch": 0.08438818565400844, "grad_norm": 2.094727039337158, "learning_rate": 7.916666666666667e-06, "loss": 0.0984, "mean_token_accuracy": 0.9604743123054504, "step": 20 }, { "epoch": 0.08860759493670886, "grad_norm": 3.929363965988159, "learning_rate": 8.333333333333334e-06, "loss": 0.101, "mean_token_accuracy": 0.958742618560791, "step": 21 }, { "epoch": 0.09282700421940929, "grad_norm": 3.9261255264282227, "learning_rate": 8.750000000000001e-06, "loss": 0.1267, "mean_token_accuracy": 0.9467455744743347, "step": 22 }, { "epoch": 0.0970464135021097, "grad_norm": 2.7543833255767822, "learning_rate": 9.166666666666666e-06, "loss": 0.1115, "mean_token_accuracy": 0.9559118151664734, "step": 23 }, { "epoch": 0.10126582278481013, "grad_norm": 2.6712000370025635, "learning_rate": 9.583333333333335e-06, "loss": 0.1176, "mean_token_accuracy": 0.9442231059074402, "step": 24 }, { "epoch": 0.10548523206751055, "grad_norm": 3.4311094284057617, "learning_rate": 1e-05, "loss": 0.1051, "mean_token_accuracy": 0.9540917873382568, "step": 25 }, { "epoch": 0.10970464135021098, "grad_norm": 2.4751687049865723, "learning_rate": 9.999510542279196e-06, "loss": 0.095, "mean_token_accuracy": 0.9620000123977661, "step": 26 }, { "epoch": 0.11392405063291139, "grad_norm": 3.9214606285095215, "learning_rate": 9.998042275591827e-06, "loss": 0.0855, "mean_token_accuracy": 0.9601593613624573, "step": 27 }, { "epoch": 0.11814345991561181, "grad_norm": 5.87342643737793, "learning_rate": 9.995595519339882e-06, "loss": 0.1058, "mean_token_accuracy": 0.9500998258590698, "step": 28 }, { "epoch": 0.12236286919831224, "grad_norm": 5.889449596405029, "learning_rate": 9.992170805782799e-06, "loss": 0.1205, "mean_token_accuracy": 0.9528487324714661, "step": 29 }, { "epoch": 0.12658227848101267, "grad_norm": 3.171309232711792, "learning_rate": 9.987768879921685e-06, "loss": 0.0988, "mean_token_accuracy": 0.9580838084220886, "step": 30 }, { "epoch": 0.1308016877637131, "grad_norm": 2.4540059566497803, "learning_rate": 9.982390699337253e-06, "loss": 0.0794, "mean_token_accuracy": 0.9580000042915344, "step": 31 }, { "epoch": 0.1350210970464135, "grad_norm": 1.8785423040390015, "learning_rate": 9.976037433981505e-06, "loss": 0.1037, "mean_token_accuracy": 0.9466403126716614, "step": 32 }, { "epoch": 0.13924050632911392, "grad_norm": 2.374983310699463, "learning_rate": 9.968710465923233e-06, "loss": 0.1021, "mean_token_accuracy": 0.9561753273010254, "step": 33 }, { "epoch": 0.14345991561181434, "grad_norm": 2.279914140701294, "learning_rate": 9.960411389047366e-06, "loss": 0.0868, "mean_token_accuracy": 0.9538152813911438, "step": 34 }, { "epoch": 0.14767932489451477, "grad_norm": 2.292590856552124, "learning_rate": 9.951142008708238e-06, "loss": 0.0934, "mean_token_accuracy": 0.9584158658981323, "step": 35 }, { "epoch": 0.1518987341772152, "grad_norm": 3.0072073936462402, "learning_rate": 9.940904341336859e-06, "loss": 0.1071, "mean_token_accuracy": 0.9422310590744019, "step": 36 }, { "epoch": 0.15611814345991562, "grad_norm": 3.487196683883667, "learning_rate": 9.929700614002265e-06, "loss": 0.1221, "mean_token_accuracy": 0.9484127163887024, "step": 37 }, { "epoch": 0.16033755274261605, "grad_norm": 1.7234104871749878, "learning_rate": 9.91753326392706e-06, "loss": 0.0856, "mean_token_accuracy": 0.9584158658981323, "step": 38 }, { "epoch": 0.16455696202531644, "grad_norm": 3.139375925064087, "learning_rate": 9.904404937957213e-06, "loss": 0.0954, "mean_token_accuracy": 0.9582505226135254, "step": 39 }, { "epoch": 0.16877637130801687, "grad_norm": 2.0649945735931396, "learning_rate": 9.890318491986282e-06, "loss": 0.0765, "mean_token_accuracy": 0.9681274890899658, "step": 40 }, { "epoch": 0.1729957805907173, "grad_norm": 3.751356601715088, "learning_rate": 9.87527699033415e-06, "loss": 0.0823, "mean_token_accuracy": 0.9624505639076233, "step": 41 }, { "epoch": 0.17721518987341772, "grad_norm": 5.050260543823242, "learning_rate": 9.85928370508043e-06, "loss": 0.0854, "mean_token_accuracy": 0.9662027955055237, "step": 42 }, { "epoch": 0.18143459915611815, "grad_norm": 2.9303841590881348, "learning_rate": 9.842342115352647e-06, "loss": 0.0913, "mean_token_accuracy": 0.9683168530464172, "step": 43 }, { "epoch": 0.18565400843881857, "grad_norm": 3.8630435466766357, "learning_rate": 9.824455906569423e-06, "loss": 0.1028, "mean_token_accuracy": 0.9541832804679871, "step": 44 }, { "epoch": 0.189873417721519, "grad_norm": 3.1331076622009277, "learning_rate": 9.805628969638757e-06, "loss": 0.1007, "mean_token_accuracy": 0.954901933670044, "step": 45 }, { "epoch": 0.1940928270042194, "grad_norm": 2.5344247817993164, "learning_rate": 9.785865400111593e-06, "loss": 0.0999, "mean_token_accuracy": 0.9584980010986328, "step": 46 }, { "epoch": 0.19831223628691982, "grad_norm": 2.0629115104675293, "learning_rate": 9.765169497290908e-06, "loss": 0.0847, "mean_token_accuracy": 0.966269850730896, "step": 47 }, { "epoch": 0.20253164556962025, "grad_norm": 1.3927879333496094, "learning_rate": 9.743545763296451e-06, "loss": 0.0716, "mean_token_accuracy": 0.9720559120178223, "step": 48 }, { "epoch": 0.20675105485232068, "grad_norm": 2.489097833633423, "learning_rate": 9.720998902085354e-06, "loss": 0.0878, "mean_token_accuracy": 0.9644268751144409, "step": 49 }, { "epoch": 0.2109704641350211, "grad_norm": 3.036329984664917, "learning_rate": 9.697533818428863e-06, "loss": 0.0773, "mean_token_accuracy": 0.9701789021492004, "step": 50 }, { "epoch": 0.21518987341772153, "grad_norm": 4.313502311706543, "learning_rate": 9.673155616845362e-06, "loss": 0.1073, "mean_token_accuracy": 0.9522863030433655, "step": 51 }, { "epoch": 0.21940928270042195, "grad_norm": 3.9843344688415527, "learning_rate": 9.647869600489954e-06, "loss": 0.0925, "mean_token_accuracy": 0.9664031863212585, "step": 52 }, { "epoch": 0.22362869198312235, "grad_norm": 4.010601043701172, "learning_rate": 9.621681270000833e-06, "loss": 0.0766, "mean_token_accuracy": 0.9642857313156128, "step": 53 }, { "epoch": 0.22784810126582278, "grad_norm": 2.856581687927246, "learning_rate": 9.594596322302688e-06, "loss": 0.0598, "mean_token_accuracy": 0.9723865985870361, "step": 54 }, { "epoch": 0.2320675105485232, "grad_norm": 3.642458200454712, "learning_rate": 9.566620649367418e-06, "loss": 0.0885, "mean_token_accuracy": 0.9601593613624573, "step": 55 }, { "epoch": 0.23628691983122363, "grad_norm": 2.2541604042053223, "learning_rate": 9.537760336932406e-06, "loss": 0.1007, "mean_token_accuracy": 0.9602385759353638, "step": 56 }, { "epoch": 0.24050632911392406, "grad_norm": 2.00062894821167, "learning_rate": 9.508021663176648e-06, "loss": 0.0773, "mean_token_accuracy": 0.9597585797309875, "step": 57 }, { "epoch": 0.24472573839662448, "grad_norm": 3.8268930912017822, "learning_rate": 9.477411097355025e-06, "loss": 0.0888, "mean_token_accuracy": 0.9621514081954956, "step": 58 }, { "epoch": 0.2489451476793249, "grad_norm": 2.16796612739563, "learning_rate": 9.445935298390994e-06, "loss": 0.0882, "mean_token_accuracy": 0.9720559120178223, "step": 59 }, { "epoch": 0.25316455696202533, "grad_norm": 1.7303292751312256, "learning_rate": 9.413601113428032e-06, "loss": 0.0735, "mean_token_accuracy": 0.9682539701461792, "step": 60 }, { "epoch": 0.25738396624472576, "grad_norm": 3.067509889602661, "learning_rate": 9.380415576340127e-06, "loss": 0.0719, "mean_token_accuracy": 0.9679999947547913, "step": 61 }, { "epoch": 0.2616033755274262, "grad_norm": 3.767890691757202, "learning_rate": 9.346385906201653e-06, "loss": 0.0932, "mean_token_accuracy": 0.9643564224243164, "step": 62 }, { "epoch": 0.26582278481012656, "grad_norm": 2.2629635334014893, "learning_rate": 9.311519505716963e-06, "loss": 0.0839, "mean_token_accuracy": 0.9663366079330444, "step": 63 }, { "epoch": 0.270042194092827, "grad_norm": 7.801998615264893, "learning_rate": 9.275823959610019e-06, "loss": 0.109, "mean_token_accuracy": 0.9603174328804016, "step": 64 }, { "epoch": 0.2742616033755274, "grad_norm": 3.947153091430664, "learning_rate": 9.239307032974438e-06, "loss": 0.09, "mean_token_accuracy": 0.9625246524810791, "step": 65 }, { "epoch": 0.27848101265822783, "grad_norm": 2.642404556274414, "learning_rate": 9.201976669584299e-06, "loss": 0.0926, "mean_token_accuracy": 0.9644268751144409, "step": 66 }, { "epoch": 0.28270042194092826, "grad_norm": 3.429384469985962, "learning_rate": 9.163840990166085e-06, "loss": 0.095, "mean_token_accuracy": 0.954365074634552, "step": 67 }, { "epoch": 0.2869198312236287, "grad_norm": 4.0457940101623535, "learning_rate": 9.124908290632119e-06, "loss": 0.0963, "mean_token_accuracy": 0.9601593613624573, "step": 68 }, { "epoch": 0.2911392405063291, "grad_norm": 2.69016170501709, "learning_rate": 9.085187040275903e-06, "loss": 0.1046, "mean_token_accuracy": 0.9563491940498352, "step": 69 }, { "epoch": 0.29535864978902954, "grad_norm": 2.045168161392212, "learning_rate": 9.044685879929734e-06, "loss": 0.0947, "mean_token_accuracy": 0.958167314529419, "step": 70 }, { "epoch": 0.29957805907172996, "grad_norm": 1.8094499111175537, "learning_rate": 9.003413620085002e-06, "loss": 0.0779, "mean_token_accuracy": 0.9682539701461792, "step": 71 }, { "epoch": 0.3037974683544304, "grad_norm": 2.036627769470215, "learning_rate": 8.961379238975594e-06, "loss": 0.0851, "mean_token_accuracy": 0.9598393440246582, "step": 72 }, { "epoch": 0.3080168776371308, "grad_norm": 2.552898406982422, "learning_rate": 8.918591880624783e-06, "loss": 0.084, "mean_token_accuracy": 0.9600798487663269, "step": 73 }, { "epoch": 0.31223628691983124, "grad_norm": 1.793794870376587, "learning_rate": 8.875060852856082e-06, "loss": 0.0647, "mean_token_accuracy": 0.9743589758872986, "step": 74 }, { "epoch": 0.31645569620253167, "grad_norm": 2.2364492416381836, "learning_rate": 8.830795625268437e-06, "loss": 0.0627, "mean_token_accuracy": 0.9803149700164795, "step": 75 }, { "epoch": 0.3206751054852321, "grad_norm": 3.5208191871643066, "learning_rate": 8.785805827176256e-06, "loss": 0.1191, "mean_token_accuracy": 0.9485148787498474, "step": 76 }, { "epoch": 0.32489451476793246, "grad_norm": 2.1724491119384766, "learning_rate": 8.740101245514659e-06, "loss": 0.0631, "mean_token_accuracy": 0.9701789021492004, "step": 77 }, { "epoch": 0.3291139240506329, "grad_norm": 3.6974575519561768, "learning_rate": 8.69369182271048e-06, "loss": 0.1018, "mean_token_accuracy": 0.9563491940498352, "step": 78 }, { "epoch": 0.3333333333333333, "grad_norm": 2.6320230960845947, "learning_rate": 8.646587654519413e-06, "loss": 0.0711, "mean_token_accuracy": 0.959919810295105, "step": 79 }, { "epoch": 0.33755274261603374, "grad_norm": 3.240654945373535, "learning_rate": 8.598798987829816e-06, "loss": 0.1013, "mean_token_accuracy": 0.9541832804679871, "step": 80 }, { "epoch": 0.34177215189873417, "grad_norm": 2.3383774757385254, "learning_rate": 8.550336218433631e-06, "loss": 0.085, "mean_token_accuracy": 0.9719439148902893, "step": 81 }, { "epoch": 0.3459915611814346, "grad_norm": 3.362210273742676, "learning_rate": 8.501209888764928e-06, "loss": 0.0814, "mean_token_accuracy": 0.9723320007324219, "step": 82 }, { "epoch": 0.350210970464135, "grad_norm": 3.1673924922943115, "learning_rate": 8.451430685606532e-06, "loss": 0.0948, "mean_token_accuracy": 0.9642857313156128, "step": 83 }, { "epoch": 0.35443037974683544, "grad_norm": 2.021212339401245, "learning_rate": 8.401009437765248e-06, "loss": 0.0649, "mean_token_accuracy": 0.9722222089767456, "step": 84 }, { "epoch": 0.35864978902953587, "grad_norm": 2.806187391281128, "learning_rate": 8.349957113716213e-06, "loss": 0.0912, "mean_token_accuracy": 0.9623762369155884, "step": 85 }, { "epoch": 0.3628691983122363, "grad_norm": 3.192763328552246, "learning_rate": 8.29828481921683e-06, "loss": 0.1094, "mean_token_accuracy": 0.954365074634552, "step": 86 }, { "epoch": 0.3670886075949367, "grad_norm": 2.1149048805236816, "learning_rate": 8.246003794890885e-06, "loss": 0.0782, "mean_token_accuracy": 0.9800000190734863, "step": 87 }, { "epoch": 0.37130801687763715, "grad_norm": 2.255167245864868, "learning_rate": 8.19312541378326e-06, "loss": 0.0816, "mean_token_accuracy": 0.970355749130249, "step": 88 }, { "epoch": 0.3755274261603376, "grad_norm": 2.019317626953125, "learning_rate": 8.139661178885912e-06, "loss": 0.0903, "mean_token_accuracy": 0.9681908488273621, "step": 89 }, { "epoch": 0.379746835443038, "grad_norm": 2.3725459575653076, "learning_rate": 8.085622720635536e-06, "loss": 0.0964, "mean_token_accuracy": 0.9582505226135254, "step": 90 }, { "epoch": 0.38396624472573837, "grad_norm": 1.497867226600647, "learning_rate": 8.031021794383513e-06, "loss": 0.0658, "mean_token_accuracy": 0.9761431217193604, "step": 91 }, { "epoch": 0.3881856540084388, "grad_norm": 3.016174793243408, "learning_rate": 7.975870277838695e-06, "loss": 0.0922, "mean_token_accuracy": 0.9666011929512024, "step": 92 }, { "epoch": 0.3924050632911392, "grad_norm": 3.2005457878112793, "learning_rate": 7.920180168483565e-06, "loss": 0.0697, "mean_token_accuracy": 0.9663366079330444, "step": 93 }, { "epoch": 0.39662447257383965, "grad_norm": 1.80280339717865, "learning_rate": 7.863963580964344e-06, "loss": 0.0745, "mean_token_accuracy": 0.9620758295059204, "step": 94 }, { "epoch": 0.4008438818565401, "grad_norm": 2.3204383850097656, "learning_rate": 7.80723274445561e-06, "loss": 0.0726, "mean_token_accuracy": 0.9718875288963318, "step": 95 }, { "epoch": 0.4050632911392405, "grad_norm": 1.8595603704452515, "learning_rate": 7.75e-06, "loss": 0.0688, "mean_token_accuracy": 0.9660678505897522, "step": 96 }, { "epoch": 0.4092827004219409, "grad_norm": 3.164539098739624, "learning_rate": 7.692277797823585e-06, "loss": 0.0552, "mean_token_accuracy": 0.9742574095726013, "step": 97 }, { "epoch": 0.41350210970464135, "grad_norm": 1.4435194730758667, "learning_rate": 7.634078694627483e-06, "loss": 0.0475, "mean_token_accuracy": 0.9801980257034302, "step": 98 }, { "epoch": 0.4177215189873418, "grad_norm": 3.531642436981201, "learning_rate": 7.575415350856316e-06, "loss": 0.0702, "mean_token_accuracy": 0.974155068397522, "step": 99 }, { "epoch": 0.4219409282700422, "grad_norm": 3.010582685470581, "learning_rate": 7.516300527944104e-06, "loss": 0.0626, "mean_token_accuracy": 0.9841897487640381, "step": 100 }, { "epoch": 0.42616033755274263, "grad_norm": 3.7502403259277344, "learning_rate": 7.456747085538173e-06, "loss": 0.0749, "mean_token_accuracy": 0.9682539701461792, "step": 101 }, { "epoch": 0.43037974683544306, "grad_norm": 7.236377239227295, "learning_rate": 7.3967679787017166e-06, "loss": 0.0811, "mean_token_accuracy": 0.9721115827560425, "step": 102 }, { "epoch": 0.4345991561181435, "grad_norm": 3.3287315368652344, "learning_rate": 7.336376255095592e-06, "loss": 0.076, "mean_token_accuracy": 0.9701789021492004, "step": 103 }, { "epoch": 0.4388185654008439, "grad_norm": 2.148923635482788, "learning_rate": 7.275585052139975e-06, "loss": 0.0492, "mean_token_accuracy": 0.9822134375572205, "step": 104 }, { "epoch": 0.4430379746835443, "grad_norm": 5.421411991119385, "learning_rate": 7.2144075941564835e-06, "loss": 0.0805, "mean_token_accuracy": 0.9665354490280151, "step": 105 }, { "epoch": 0.4472573839662447, "grad_norm": 2.904890775680542, "learning_rate": 7.152857189491406e-06, "loss": 0.0794, "mean_token_accuracy": 0.9683168530464172, "step": 106 }, { "epoch": 0.45147679324894513, "grad_norm": 2.801551103591919, "learning_rate": 7.090947227620646e-06, "loss": 0.0719, "mean_token_accuracy": 0.9680638909339905, "step": 107 }, { "epoch": 0.45569620253164556, "grad_norm": 1.7664284706115723, "learning_rate": 7.028691176237018e-06, "loss": 0.062, "mean_token_accuracy": 0.9723865985870361, "step": 108 }, { "epoch": 0.459915611814346, "grad_norm": 2.012096405029297, "learning_rate": 6.966102578320531e-06, "loss": 0.0794, "mean_token_accuracy": 0.9664031863212585, "step": 109 }, { "epoch": 0.4641350210970464, "grad_norm": 1.9269474744796753, "learning_rate": 6.903195049192285e-06, "loss": 0.074, "mean_token_accuracy": 0.9742574095726013, "step": 110 }, { "epoch": 0.46835443037974683, "grad_norm": 2.175663471221924, "learning_rate": 6.839982273552651e-06, "loss": 0.078, "mean_token_accuracy": 0.9681908488273621, "step": 111 }, { "epoch": 0.47257383966244726, "grad_norm": 2.57498836517334, "learning_rate": 6.776478002504335e-06, "loss": 0.0802, "mean_token_accuracy": 0.9661354422569275, "step": 112 }, { "epoch": 0.4767932489451477, "grad_norm": 1.9922643899917603, "learning_rate": 6.712696050561014e-06, "loss": 0.0635, "mean_token_accuracy": 0.9743589758872986, "step": 113 }, { "epoch": 0.4810126582278481, "grad_norm": 1.8098024129867554, "learning_rate": 6.648650292642166e-06, "loss": 0.0749, "mean_token_accuracy": 0.9660000205039978, "step": 114 }, { "epoch": 0.48523206751054854, "grad_norm": 2.073256492614746, "learning_rate": 6.584354661054765e-06, "loss": 0.0693, "mean_token_accuracy": 0.970355749130249, "step": 115 }, { "epoch": 0.48945147679324896, "grad_norm": 4.054864883422852, "learning_rate": 6.519823142462501e-06, "loss": 0.085, "mean_token_accuracy": 0.9603960514068604, "step": 116 }, { "epoch": 0.4936708860759494, "grad_norm": 3.5610010623931885, "learning_rate": 6.4550697748431545e-06, "loss": 0.0687, "mean_token_accuracy": 0.974459707736969, "step": 117 }, { "epoch": 0.4978902953586498, "grad_norm": 3.0145444869995117, "learning_rate": 6.390108644434828e-06, "loss": 0.0783, "mean_token_accuracy": 0.9704142212867737, "step": 118 }, { "epoch": 0.5021097046413502, "grad_norm": 2.752378463745117, "learning_rate": 6.32495388267167e-06, "loss": 0.0572, "mean_token_accuracy": 0.9780439138412476, "step": 119 }, { "epoch": 0.5063291139240507, "grad_norm": 3.063128709793091, "learning_rate": 6.259619663109762e-06, "loss": 0.0903, "mean_token_accuracy": 0.9681274890899658, "step": 120 }, { "epoch": 0.510548523206751, "grad_norm": 2.5731825828552246, "learning_rate": 6.19412019834386e-06, "loss": 0.082, "mean_token_accuracy": 0.9659318923950195, "step": 121 }, { "epoch": 0.5147679324894515, "grad_norm": 2.734591484069824, "learning_rate": 6.1284697369156276e-06, "loss": 0.0914, "mean_token_accuracy": 0.9644268751144409, "step": 122 }, { "epoch": 0.5189873417721519, "grad_norm": 3.0813310146331787, "learning_rate": 6.062682560214053e-06, "loss": 0.0744, "mean_token_accuracy": 0.9684418439865112, "step": 123 }, { "epoch": 0.5232067510548524, "grad_norm": 2.8704934120178223, "learning_rate": 5.996772979368715e-06, "loss": 0.0922, "mean_token_accuracy": 0.9605522751808167, "step": 124 }, { "epoch": 0.5274261603375527, "grad_norm": 1.9350239038467407, "learning_rate": 5.930755332136604e-06, "loss": 0.0631, "mean_token_accuracy": 0.974155068397522, "step": 125 }, { "epoch": 0.5316455696202531, "grad_norm": 2.241611957550049, "learning_rate": 5.86464397978312e-06, "loss": 0.077, "mean_token_accuracy": 0.9681274890899658, "step": 126 }, { "epoch": 0.5358649789029536, "grad_norm": 1.976808786392212, "learning_rate": 5.798453303957968e-06, "loss": 0.0658, "mean_token_accuracy": 0.9782178401947021, "step": 127 }, { "epoch": 0.540084388185654, "grad_norm": 1.5725995302200317, "learning_rate": 5.73219770356663e-06, "loss": 0.0627, "mean_token_accuracy": 0.9781312346458435, "step": 128 }, { "epoch": 0.5443037974683544, "grad_norm": 1.8940309286117554, "learning_rate": 5.665891591638063e-06, "loss": 0.0689, "mean_token_accuracy": 0.976331353187561, "step": 129 }, { "epoch": 0.5485232067510548, "grad_norm": 3.2091996669769287, "learning_rate": 5.5995493921893415e-06, "loss": 0.0752, "mean_token_accuracy": 0.9680638909339905, "step": 130 }, { "epoch": 0.5527426160337553, "grad_norm": 2.6282997131347656, "learning_rate": 5.533185537087906e-06, "loss": 0.0707, "mean_token_accuracy": 0.9681274890899658, "step": 131 }, { "epoch": 0.5569620253164557, "grad_norm": 1.856486201286316, "learning_rate": 5.4668144629120945e-06, "loss": 0.0669, "mean_token_accuracy": 0.9760956168174744, "step": 132 }, { "epoch": 0.5611814345991561, "grad_norm": 2.383890390396118, "learning_rate": 5.40045060781066e-06, "loss": 0.0721, "mean_token_accuracy": 0.9721670150756836, "step": 133 }, { "epoch": 0.5654008438818565, "grad_norm": 2.2145943641662598, "learning_rate": 5.33410840836194e-06, "loss": 0.0531, "mean_token_accuracy": 0.9782178401947021, "step": 134 }, { "epoch": 0.569620253164557, "grad_norm": 2.1483144760131836, "learning_rate": 5.267802296433372e-06, "loss": 0.0839, "mean_token_accuracy": 0.9702970385551453, "step": 135 }, { "epoch": 0.5738396624472574, "grad_norm": 3.366399049758911, "learning_rate": 5.201546696042033e-06, "loss": 0.0913, "mean_token_accuracy": 0.9683168530464172, "step": 136 }, { "epoch": 0.5780590717299579, "grad_norm": 2.2231791019439697, "learning_rate": 5.13535602021688e-06, "loss": 0.055, "mean_token_accuracy": 0.9701789021492004, "step": 137 }, { "epoch": 0.5822784810126582, "grad_norm": 3.9542038440704346, "learning_rate": 5.069244667863397e-06, "loss": 0.0719, "mean_token_accuracy": 0.9702380895614624, "step": 138 }, { "epoch": 0.5864978902953587, "grad_norm": 1.733921766281128, "learning_rate": 5.003227020631287e-06, "loss": 0.0572, "mean_token_accuracy": 0.9822134375572205, "step": 139 }, { "epoch": 0.5907172995780591, "grad_norm": 1.9163316488265991, "learning_rate": 4.937317439785949e-06, "loss": 0.0519, "mean_token_accuracy": 0.984000027179718, "step": 140 }, { "epoch": 0.5949367088607594, "grad_norm": 2.5794224739074707, "learning_rate": 4.871530263084373e-06, "loss": 0.0487, "mean_token_accuracy": 0.9841583967208862, "step": 141 }, { "epoch": 0.5991561181434599, "grad_norm": 2.0642895698547363, "learning_rate": 4.80587980165614e-06, "loss": 0.0572, "mean_token_accuracy": 0.9840954542160034, "step": 142 }, { "epoch": 0.6033755274261603, "grad_norm": 2.106585741043091, "learning_rate": 4.74038033689024e-06, "loss": 0.0828, "mean_token_accuracy": 0.9660678505897522, "step": 143 }, { "epoch": 0.6075949367088608, "grad_norm": 2.404993772506714, "learning_rate": 4.675046117328333e-06, "loss": 0.0543, "mean_token_accuracy": 0.9760956168174744, "step": 144 }, { "epoch": 0.6118143459915611, "grad_norm": 2.5523087978363037, "learning_rate": 4.609891355565172e-06, "loss": 0.066, "mean_token_accuracy": 0.972000002861023, "step": 145 }, { "epoch": 0.6160337552742616, "grad_norm": 1.8869218826293945, "learning_rate": 4.544930225156847e-06, "loss": 0.0456, "mean_token_accuracy": 0.9782178401947021, "step": 146 }, { "epoch": 0.620253164556962, "grad_norm": 3.2489407062530518, "learning_rate": 4.480176857537499e-06, "loss": 0.0644, "mean_token_accuracy": 0.9682539701461792, "step": 147 }, { "epoch": 0.6244725738396625, "grad_norm": 2.1657814979553223, "learning_rate": 4.415645338945236e-06, "loss": 0.0457, "mean_token_accuracy": 0.9821428656578064, "step": 148 }, { "epoch": 0.6286919831223629, "grad_norm": 2.359807252883911, "learning_rate": 4.351349707357836e-06, "loss": 0.0594, "mean_token_accuracy": 0.9761431217193604, "step": 149 }, { "epoch": 0.6329113924050633, "grad_norm": 2.6776392459869385, "learning_rate": 4.287303949438987e-06, "loss": 0.0731, "mean_token_accuracy": 0.9684418439865112, "step": 150 }, { "epoch": 0.6371308016877637, "grad_norm": 1.798113465309143, "learning_rate": 4.223521997495665e-06, "loss": 0.0453, "mean_token_accuracy": 0.9782178401947021, "step": 151 }, { "epoch": 0.6413502109704642, "grad_norm": 1.7288978099822998, "learning_rate": 4.160017726447352e-06, "loss": 0.0458, "mean_token_accuracy": 0.9822134375572205, "step": 152 }, { "epoch": 0.6455696202531646, "grad_norm": 2.06483793258667, "learning_rate": 4.096804950807717e-06, "loss": 0.071, "mean_token_accuracy": 0.9698188900947571, "step": 153 }, { "epoch": 0.6497890295358649, "grad_norm": 2.3226966857910156, "learning_rate": 4.033897421679472e-06, "loss": 0.0536, "mean_token_accuracy": 0.97826087474823, "step": 154 }, { "epoch": 0.6540084388185654, "grad_norm": 2.3752081394195557, "learning_rate": 3.971308823762983e-06, "loss": 0.0557, "mean_token_accuracy": 0.9742063283920288, "step": 155 }, { "epoch": 0.6582278481012658, "grad_norm": 2.3697054386138916, "learning_rate": 3.9090527723793545e-06, "loss": 0.0606, "mean_token_accuracy": 0.9780439138412476, "step": 156 }, { "epoch": 0.6624472573839663, "grad_norm": 2.4742555618286133, "learning_rate": 3.847142810508596e-06, "loss": 0.0749, "mean_token_accuracy": 0.9679999947547913, "step": 157 }, { "epoch": 0.6666666666666666, "grad_norm": 3.0662121772766113, "learning_rate": 3.785592405843518e-06, "loss": 0.0816, "mean_token_accuracy": 0.9620000123977661, "step": 158 }, { "epoch": 0.6708860759493671, "grad_norm": 2.4840102195739746, "learning_rate": 3.724414947860027e-06, "loss": 0.0807, "mean_token_accuracy": 0.9680638909339905, "step": 159 }, { "epoch": 0.6751054852320675, "grad_norm": 2.195005416870117, "learning_rate": 3.6636237449044077e-06, "loss": 0.0514, "mean_token_accuracy": 0.9822485446929932, "step": 160 }, { "epoch": 0.679324894514768, "grad_norm": 2.2611730098724365, "learning_rate": 3.603232021298284e-06, "loss": 0.0568, "mean_token_accuracy": 0.974459707736969, "step": 161 }, { "epoch": 0.6835443037974683, "grad_norm": 3.0416808128356934, "learning_rate": 3.5432529144618287e-06, "loss": 0.0717, "mean_token_accuracy": 0.9680638909339905, "step": 162 }, { "epoch": 0.6877637130801688, "grad_norm": 2.9663596153259277, "learning_rate": 3.483699472055897e-06, "loss": 0.0733, "mean_token_accuracy": 0.9640718698501587, "step": 163 }, { "epoch": 0.6919831223628692, "grad_norm": 2.2681233882904053, "learning_rate": 3.424584649143685e-06, "loss": 0.0579, "mean_token_accuracy": 0.9722222089767456, "step": 164 }, { "epoch": 0.6962025316455697, "grad_norm": 3.0692615509033203, "learning_rate": 3.365921305372519e-06, "loss": 0.0617, "mean_token_accuracy": 0.9724950790405273, "step": 165 }, { "epoch": 0.70042194092827, "grad_norm": 1.608898639678955, "learning_rate": 3.307722202176417e-06, "loss": 0.0524, "mean_token_accuracy": 0.9799196720123291, "step": 166 }, { "epoch": 0.7046413502109705, "grad_norm": 2.2708873748779297, "learning_rate": 3.2500000000000015e-06, "loss": 0.0701, "mean_token_accuracy": 0.974155068397522, "step": 167 }, { "epoch": 0.7088607594936709, "grad_norm": 2.1685729026794434, "learning_rate": 3.19276725554439e-06, "loss": 0.0682, "mean_token_accuracy": 0.9704142212867737, "step": 168 }, { "epoch": 0.7130801687763713, "grad_norm": 1.4980052709579468, "learning_rate": 3.136036419035656e-06, "loss": 0.057, "mean_token_accuracy": 0.9721115827560425, "step": 169 }, { "epoch": 0.7172995780590717, "grad_norm": 1.4526597261428833, "learning_rate": 3.0798198315164345e-06, "loss": 0.044, "mean_token_accuracy": 0.9840954542160034, "step": 170 }, { "epoch": 0.7215189873417721, "grad_norm": 2.120936155319214, "learning_rate": 3.024129722161305e-06, "loss": 0.0553, "mean_token_accuracy": 0.976190447807312, "step": 171 }, { "epoch": 0.7257383966244726, "grad_norm": 3.176001787185669, "learning_rate": 2.9689782056164874e-06, "loss": 0.0901, "mean_token_accuracy": 0.9600798487663269, "step": 172 }, { "epoch": 0.729957805907173, "grad_norm": 1.7018901109695435, "learning_rate": 2.914377279364464e-06, "loss": 0.0446, "mean_token_accuracy": 0.9820716977119446, "step": 173 }, { "epoch": 0.7341772151898734, "grad_norm": 1.6599562168121338, "learning_rate": 2.8603388211140886e-06, "loss": 0.0535, "mean_token_accuracy": 0.9842519760131836, "step": 174 }, { "epoch": 0.7383966244725738, "grad_norm": 2.3324365615844727, "learning_rate": 2.8068745862167423e-06, "loss": 0.067, "mean_token_accuracy": 0.9781312346458435, "step": 175 }, { "epoch": 0.7426160337552743, "grad_norm": 2.444692373275757, "learning_rate": 2.7539962051091185e-06, "loss": 0.0564, "mean_token_accuracy": 0.9801192879676819, "step": 176 }, { "epoch": 0.7468354430379747, "grad_norm": 1.9699876308441162, "learning_rate": 2.70171518078317e-06, "loss": 0.0728, "mean_token_accuracy": 0.9702380895614624, "step": 177 }, { "epoch": 0.7510548523206751, "grad_norm": 2.7244060039520264, "learning_rate": 2.6500428862837878e-06, "loss": 0.0673, "mean_token_accuracy": 0.9704142212867737, "step": 178 }, { "epoch": 0.7552742616033755, "grad_norm": 1.956512689590454, "learning_rate": 2.5989905622347538e-06, "loss": 0.0448, "mean_token_accuracy": 0.9841583967208862, "step": 179 }, { "epoch": 0.759493670886076, "grad_norm": 1.6712697744369507, "learning_rate": 2.5485693143934704e-06, "loss": 0.0582, "mean_token_accuracy": 0.9802371263504028, "step": 180 }, { "epoch": 0.7637130801687764, "grad_norm": 2.2149744033813477, "learning_rate": 2.498790111235072e-06, "loss": 0.0578, "mean_token_accuracy": 0.9718875288963318, "step": 181 }, { "epoch": 0.7679324894514767, "grad_norm": 2.661464214324951, "learning_rate": 2.4496637815663697e-06, "loss": 0.0892, "mean_token_accuracy": 0.9582505226135254, "step": 182 }, { "epoch": 0.7721518987341772, "grad_norm": 2.306964874267578, "learning_rate": 2.4012010121701853e-06, "loss": 0.0484, "mean_token_accuracy": 0.9799599051475525, "step": 183 }, { "epoch": 0.7763713080168776, "grad_norm": 2.2716338634490967, "learning_rate": 2.353412345480587e-06, "loss": 0.0927, "mean_token_accuracy": 0.958167314529419, "step": 184 }, { "epoch": 0.7805907172995781, "grad_norm": 1.5946310758590698, "learning_rate": 2.3063081772895203e-06, "loss": 0.0398, "mean_token_accuracy": 0.9821782112121582, "step": 185 }, { "epoch": 0.7848101265822784, "grad_norm": 2.167217493057251, "learning_rate": 2.2598987544853428e-06, "loss": 0.0547, "mean_token_accuracy": 0.9762845635414124, "step": 186 }, { "epoch": 0.7890295358649789, "grad_norm": 1.6822096109390259, "learning_rate": 2.2141941728237467e-06, "loss": 0.0424, "mean_token_accuracy": 0.980079710483551, "step": 187 }, { "epoch": 0.7932489451476793, "grad_norm": 2.1831417083740234, "learning_rate": 2.1692043747315628e-06, "loss": 0.0712, "mean_token_accuracy": 0.9722222089767456, "step": 188 }, { "epoch": 0.7974683544303798, "grad_norm": 1.7368559837341309, "learning_rate": 2.1249391471439206e-06, "loss": 0.0511, "mean_token_accuracy": 0.9762375950813293, "step": 189 }, { "epoch": 0.8016877637130801, "grad_norm": 2.0790603160858154, "learning_rate": 2.081408119375219e-06, "loss": 0.0477, "mean_token_accuracy": 0.9801587462425232, "step": 190 }, { "epoch": 0.8059071729957806, "grad_norm": 2.1118156909942627, "learning_rate": 2.0386207610244073e-06, "loss": 0.0551, "mean_token_accuracy": 0.9679358601570129, "step": 191 }, { "epoch": 0.810126582278481, "grad_norm": 1.796213984489441, "learning_rate": 1.9965863799149988e-06, "loss": 0.07, "mean_token_accuracy": 0.9742063283920288, "step": 192 }, { "epoch": 0.8143459915611815, "grad_norm": 1.400124192237854, "learning_rate": 1.955314120070269e-06, "loss": 0.0439, "mean_token_accuracy": 0.9860557913780212, "step": 193 }, { "epoch": 0.8185654008438819, "grad_norm": 2.0059828758239746, "learning_rate": 1.9148129597240984e-06, "loss": 0.0727, "mean_token_accuracy": 0.9722222089767456, "step": 194 }, { "epoch": 0.8227848101265823, "grad_norm": 2.615203857421875, "learning_rate": 1.8750917093678824e-06, "loss": 0.0528, "mean_token_accuracy": 0.9780876636505127, "step": 195 }, { "epoch": 0.8270042194092827, "grad_norm": 1.4755908250808716, "learning_rate": 1.8361590098339168e-06, "loss": 0.0621, "mean_token_accuracy": 0.9721670150756836, "step": 196 }, { "epoch": 0.8312236286919831, "grad_norm": 3.2548787593841553, "learning_rate": 1.7980233304157025e-06, "loss": 0.0701, "mean_token_accuracy": 0.9724409580230713, "step": 197 }, { "epoch": 0.8354430379746836, "grad_norm": 2.5661020278930664, "learning_rate": 1.760692967025564e-06, "loss": 0.0723, "mean_token_accuracy": 0.9740518927574158, "step": 198 }, { "epoch": 0.8396624472573839, "grad_norm": 2.8426754474639893, "learning_rate": 1.724176040389982e-06, "loss": 0.0937, "mean_token_accuracy": 0.9666011929512024, "step": 199 }, { "epoch": 0.8438818565400844, "grad_norm": 1.9045084714889526, "learning_rate": 1.6884804942830373e-06, "loss": 0.0394, "mean_token_accuracy": 0.9881423115730286, "step": 200 }, { "epoch": 0.8481012658227848, "grad_norm": 1.5193547010421753, "learning_rate": 1.6536140937983469e-06, "loss": 0.0546, "mean_token_accuracy": 0.9841583967208862, "step": 201 }, { "epoch": 0.8523206751054853, "grad_norm": 2.2213330268859863, "learning_rate": 1.619584423659875e-06, "loss": 0.0617, "mean_token_accuracy": 0.9763779640197754, "step": 202 }, { "epoch": 0.8565400843881856, "grad_norm": 1.9233527183532715, "learning_rate": 1.5863988865719702e-06, "loss": 0.063, "mean_token_accuracy": 0.9800398945808411, "step": 203 }, { "epoch": 0.8607594936708861, "grad_norm": 2.0455269813537598, "learning_rate": 1.5540647016090066e-06, "loss": 0.0608, "mean_token_accuracy": 0.97817462682724, "step": 204 }, { "epoch": 0.8649789029535865, "grad_norm": 1.5122522115707397, "learning_rate": 1.5225889026449754e-06, "loss": 0.0473, "mean_token_accuracy": 0.9781312346458435, "step": 205 }, { "epoch": 0.869198312236287, "grad_norm": 2.244276762008667, "learning_rate": 1.4919783368233525e-06, "loss": 0.0478, "mean_token_accuracy": 0.9779559373855591, "step": 206 }, { "epoch": 0.8734177215189873, "grad_norm": 3.4484386444091797, "learning_rate": 1.462239663067596e-06, "loss": 0.0763, "mean_token_accuracy": 0.9681908488273621, "step": 207 }, { "epoch": 0.8776371308016878, "grad_norm": 2.3605382442474365, "learning_rate": 1.4333793506325832e-06, "loss": 0.0416, "mean_token_accuracy": 0.9821073412895203, "step": 208 }, { "epoch": 0.8818565400843882, "grad_norm": 2.4336893558502197, "learning_rate": 1.4054036776973123e-06, "loss": 0.0839, "mean_token_accuracy": 0.9642857313156128, "step": 209 }, { "epoch": 0.8860759493670886, "grad_norm": 2.4874982833862305, "learning_rate": 1.378318729999169e-06, "loss": 0.0754, "mean_token_accuracy": 0.9704724550247192, "step": 210 }, { "epoch": 0.890295358649789, "grad_norm": 1.8259137868881226, "learning_rate": 1.3521303995100479e-06, "loss": 0.0526, "mean_token_accuracy": 0.9760956168174744, "step": 211 }, { "epoch": 0.8945147679324894, "grad_norm": 1.8145296573638916, "learning_rate": 1.32684438315464e-06, "loss": 0.0449, "mean_token_accuracy": 0.9801980257034302, "step": 212 }, { "epoch": 0.8987341772151899, "grad_norm": 1.5290073156356812, "learning_rate": 1.3024661815711387e-06, "loss": 0.0638, "mean_token_accuracy": 0.9720559120178223, "step": 213 }, { "epoch": 0.9029535864978903, "grad_norm": 1.492910385131836, "learning_rate": 1.2790010979146467e-06, "loss": 0.0441, "mean_token_accuracy": 0.9841269850730896, "step": 214 }, { "epoch": 0.9071729957805907, "grad_norm": 2.1390316486358643, "learning_rate": 1.2564542367035502e-06, "loss": 0.05, "mean_token_accuracy": 0.9840637445449829, "step": 215 }, { "epoch": 0.9113924050632911, "grad_norm": 2.724358320236206, "learning_rate": 1.2348305027090923e-06, "loss": 0.041, "mean_token_accuracy": 0.988095223903656, "step": 216 }, { "epoch": 0.9156118143459916, "grad_norm": 2.140127420425415, "learning_rate": 1.2141345998884092e-06, "loss": 0.0628, "mean_token_accuracy": 0.9781312346458435, "step": 217 }, { "epoch": 0.919831223628692, "grad_norm": 2.0484468936920166, "learning_rate": 1.1943710303612459e-06, "loss": 0.0704, "mean_token_accuracy": 0.9723320007324219, "step": 218 }, { "epoch": 0.9240506329113924, "grad_norm": 2.2169361114501953, "learning_rate": 1.175544093430577e-06, "loss": 0.0675, "mean_token_accuracy": 0.9641434550285339, "step": 219 }, { "epoch": 0.9282700421940928, "grad_norm": 2.46567964553833, "learning_rate": 1.1576578846473558e-06, "loss": 0.0689, "mean_token_accuracy": 0.9702970385551453, "step": 220 }, { "epoch": 0.9324894514767933, "grad_norm": 1.8127048015594482, "learning_rate": 1.1407162949195732e-06, "loss": 0.0377, "mean_token_accuracy": 0.9860835075378418, "step": 221 }, { "epoch": 0.9367088607594937, "grad_norm": 1.7486697435379028, "learning_rate": 1.1247230096658498e-06, "loss": 0.053, "mean_token_accuracy": 0.9662027955055237, "step": 222 }, { "epoch": 0.9409282700421941, "grad_norm": 3.1288397312164307, "learning_rate": 1.1096815080137196e-06, "loss": 0.0729, "mean_token_accuracy": 0.9722222089767456, "step": 223 }, { "epoch": 0.9451476793248945, "grad_norm": 1.9057952165603638, "learning_rate": 1.0955950620427883e-06, "loss": 0.0528, "mean_token_accuracy": 0.9802371263504028, "step": 224 }, { "epoch": 0.9493670886075949, "grad_norm": 2.11747145652771, "learning_rate": 1.0824667360729408e-06, "loss": 0.0389, "mean_token_accuracy": 0.9901185631752014, "step": 225 }, { "epoch": 0.9535864978902954, "grad_norm": 3.3241822719573975, "learning_rate": 1.070299385997735e-06, "loss": 0.0711, "mean_token_accuracy": 0.9701195359230042, "step": 226 }, { "epoch": 0.9578059071729957, "grad_norm": 2.2276864051818848, "learning_rate": 1.0590956586631432e-06, "loss": 0.0647, "mean_token_accuracy": 0.9780876636505127, "step": 227 }, { "epoch": 0.9620253164556962, "grad_norm": 1.8504068851470947, "learning_rate": 1.0488579912917633e-06, "loss": 0.0708, "mean_token_accuracy": 0.9701789021492004, "step": 228 }, { "epoch": 0.9662447257383966, "grad_norm": 1.7956818342208862, "learning_rate": 1.0395886109526346e-06, "loss": 0.0379, "mean_token_accuracy": 0.9881188273429871, "step": 229 }, { "epoch": 0.9704641350210971, "grad_norm": 2.5056283473968506, "learning_rate": 1.0312895340767674e-06, "loss": 0.0629, "mean_token_accuracy": 0.9722772240638733, "step": 230 }, { "epoch": 0.9746835443037974, "grad_norm": 2.2064919471740723, "learning_rate": 1.0239625660184965e-06, "loss": 0.0451, "mean_token_accuracy": 0.9782178401947021, "step": 231 }, { "epoch": 0.9789029535864979, "grad_norm": 1.9372010231018066, "learning_rate": 1.0176093006627485e-06, "loss": 0.0491, "mean_token_accuracy": 0.9763779640197754, "step": 232 }, { "epoch": 0.9831223628691983, "grad_norm": 2.014017105102539, "learning_rate": 1.0122311200783154e-06, "loss": 0.0703, "mean_token_accuracy": 0.9721115827560425, "step": 233 }, { "epoch": 0.9873417721518988, "grad_norm": 1.827635407447815, "learning_rate": 1.0078291942172018e-06, "loss": 0.0489, "mean_token_accuracy": 0.9819639325141907, "step": 234 }, { "epoch": 0.9915611814345991, "grad_norm": 3.6782145500183105, "learning_rate": 1.0044044806601188e-06, "loss": 0.0771, "mean_token_accuracy": 0.9680638909339905, "step": 235 }, { "epoch": 0.9957805907172996, "grad_norm": 2.860877752304077, "learning_rate": 1.0019577244081736e-06, "loss": 0.0497, "mean_token_accuracy": 0.9800000190734863, "step": 236 }, { "epoch": 1.0, "grad_norm": 2.177860975265503, "learning_rate": 1.0004894577208057e-06, "loss": 0.0484, "mean_token_accuracy": 0.9841583967208862, "step": 237 }, { "epoch": 1.0, "step": 237, "total_flos": 1.3663861802532864e+17, "train_loss": 0.2661279478122162, "train_runtime": 1218.7153, "train_samples_per_second": 12.406, "train_steps_per_second": 0.194 } ], "logging_steps": 1, "max_steps": 237, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3663861802532864e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }