diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999663084127893, + "eval_steps": 500, + "global_step": 14840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.7383174421347e-05, + "grad_norm": 32.09206302403183, + "learning_rate": 6.738544474393531e-10, + "loss": 3.2042, + "step": 1 + }, + { + "epoch": 0.00033691587210673494, + "grad_norm": 29.85124796764343, + "learning_rate": 3.3692722371967655e-09, + "loss": 3.0738, + "step": 5 + }, + { + "epoch": 0.0006738317442134699, + "grad_norm": 35.348710336485944, + "learning_rate": 6.738544474393531e-09, + "loss": 3.0573, + "step": 10 + }, + { + "epoch": 0.001010747616320205, + "grad_norm": 35.40963118986941, + "learning_rate": 1.0107816711590296e-08, + "loss": 3.0608, + "step": 15 + }, + { + "epoch": 0.0013476634884269398, + "grad_norm": 32.39762203020303, + "learning_rate": 1.3477088948787062e-08, + "loss": 3.0198, + "step": 20 + }, + { + "epoch": 0.0016845793605336748, + "grad_norm": 30.251139866969535, + "learning_rate": 1.6846361185983825e-08, + "loss": 3.0206, + "step": 25 + }, + { + "epoch": 0.00202149523264041, + "grad_norm": 26.501991719428602, + "learning_rate": 2.021563342318059e-08, + "loss": 2.9709, + "step": 30 + }, + { + "epoch": 0.0023584111047471445, + "grad_norm": 31.481478748265243, + "learning_rate": 2.3584905660377358e-08, + "loss": 3.0565, + "step": 35 + }, + { + "epoch": 0.0026953269768538795, + "grad_norm": 34.70890491380774, + "learning_rate": 2.6954177897574124e-08, + "loss": 3.0505, + "step": 40 + }, + { + "epoch": 0.0030322428489606146, + "grad_norm": 31.691755532137382, + "learning_rate": 3.032345013477089e-08, + "loss": 3.0692, + "step": 45 + }, + { + "epoch": 0.0033691587210673496, + "grad_norm": 36.9671149567999, + "learning_rate": 3.369272237196765e-08, + "loss": 3.1505, + "step": 50 + }, + { + "epoch": 0.0037060745931740842, + "grad_norm": 30.246848965440783, + "learning_rate": 3.706199460916442e-08, + "loss": 3.08, + "step": 55 + }, + { + "epoch": 0.00404299046528082, + "grad_norm": 33.313935844298754, + "learning_rate": 4.043126684636118e-08, + "loss": 2.9876, + "step": 60 + }, + { + "epoch": 0.004379906337387554, + "grad_norm": 29.15727651429879, + "learning_rate": 4.380053908355795e-08, + "loss": 3.0608, + "step": 65 + }, + { + "epoch": 0.004716822209494289, + "grad_norm": 32.676729128831575, + "learning_rate": 4.7169811320754715e-08, + "loss": 3.1247, + "step": 70 + }, + { + "epoch": 0.005053738081601024, + "grad_norm": 31.46286104321294, + "learning_rate": 5.053908355795148e-08, + "loss": 3.0343, + "step": 75 + }, + { + "epoch": 0.005390653953707759, + "grad_norm": 25.222824338343894, + "learning_rate": 5.390835579514825e-08, + "loss": 2.9254, + "step": 80 + }, + { + "epoch": 0.005727569825814494, + "grad_norm": 22.637008002000595, + "learning_rate": 5.727762803234501e-08, + "loss": 2.7987, + "step": 85 + }, + { + "epoch": 0.006064485697921229, + "grad_norm": 27.769504016534167, + "learning_rate": 6.064690026954177e-08, + "loss": 2.9506, + "step": 90 + }, + { + "epoch": 0.006401401570027964, + "grad_norm": 26.957750943106987, + "learning_rate": 6.401617250673854e-08, + "loss": 2.9574, + "step": 95 + }, + { + "epoch": 0.006738317442134699, + "grad_norm": 26.113167121031843, + "learning_rate": 6.73854447439353e-08, + "loss": 2.9626, + "step": 100 + }, + { + "epoch": 0.007075233314241434, + "grad_norm": 27.596533306623297, + "learning_rate": 7.075471698113207e-08, + "loss": 3.0049, + "step": 105 + }, + { + "epoch": 0.0074121491863481685, + "grad_norm": 21.936343647967842, + "learning_rate": 7.412398921832884e-08, + "loss": 2.9212, + "step": 110 + }, + { + "epoch": 0.0077490650584549035, + "grad_norm": 17.587509277494334, + "learning_rate": 7.749326145552561e-08, + "loss": 2.8243, + "step": 115 + }, + { + "epoch": 0.00808598093056164, + "grad_norm": 17.039393337653188, + "learning_rate": 8.086253369272237e-08, + "loss": 2.7713, + "step": 120 + }, + { + "epoch": 0.008422896802668374, + "grad_norm": 17.11446090656002, + "learning_rate": 8.423180592991913e-08, + "loss": 2.7353, + "step": 125 + }, + { + "epoch": 0.008759812674775108, + "grad_norm": 17.005432651209553, + "learning_rate": 8.76010781671159e-08, + "loss": 2.83, + "step": 130 + }, + { + "epoch": 0.009096728546881844, + "grad_norm": 16.994628911009993, + "learning_rate": 9.097035040431267e-08, + "loss": 2.8259, + "step": 135 + }, + { + "epoch": 0.009433644418988578, + "grad_norm": 14.381055662834557, + "learning_rate": 9.433962264150943e-08, + "loss": 2.6585, + "step": 140 + }, + { + "epoch": 0.009770560291095314, + "grad_norm": 12.855855412057881, + "learning_rate": 9.770889487870619e-08, + "loss": 2.7127, + "step": 145 + }, + { + "epoch": 0.010107476163202048, + "grad_norm": 9.33136232394542, + "learning_rate": 1.0107816711590296e-07, + "loss": 2.6163, + "step": 150 + }, + { + "epoch": 0.010444392035308784, + "grad_norm": 8.974119069707546, + "learning_rate": 1.0444743935309973e-07, + "loss": 2.6374, + "step": 155 + }, + { + "epoch": 0.010781307907415518, + "grad_norm": 9.295477060485023, + "learning_rate": 1.078167115902965e-07, + "loss": 2.606, + "step": 160 + }, + { + "epoch": 0.011118223779522254, + "grad_norm": 9.296285754353445, + "learning_rate": 1.1118598382749325e-07, + "loss": 2.5663, + "step": 165 + }, + { + "epoch": 0.011455139651628988, + "grad_norm": 8.107059371834286, + "learning_rate": 1.1455525606469002e-07, + "loss": 2.5504, + "step": 170 + }, + { + "epoch": 0.011792055523735722, + "grad_norm": 8.843539027040743, + "learning_rate": 1.1792452830188679e-07, + "loss": 2.557, + "step": 175 + }, + { + "epoch": 0.012128971395842458, + "grad_norm": 7.057056300901147, + "learning_rate": 1.2129380053908355e-07, + "loss": 2.5615, + "step": 180 + }, + { + "epoch": 0.012465887267949192, + "grad_norm": 7.38030846809641, + "learning_rate": 1.2466307277628032e-07, + "loss": 2.5312, + "step": 185 + }, + { + "epoch": 0.012802803140055928, + "grad_norm": 7.8962233248696805, + "learning_rate": 1.280323450134771e-07, + "loss": 2.4867, + "step": 190 + }, + { + "epoch": 0.013139719012162663, + "grad_norm": 6.554681755642436, + "learning_rate": 1.3140161725067383e-07, + "loss": 2.5143, + "step": 195 + }, + { + "epoch": 0.013476634884269399, + "grad_norm": 7.140796549890275, + "learning_rate": 1.347708894878706e-07, + "loss": 2.4714, + "step": 200 + }, + { + "epoch": 0.013813550756376133, + "grad_norm": 6.487040702144441, + "learning_rate": 1.3814016172506737e-07, + "loss": 2.5227, + "step": 205 + }, + { + "epoch": 0.014150466628482869, + "grad_norm": 6.8916036053852645, + "learning_rate": 1.4150943396226414e-07, + "loss": 2.5232, + "step": 210 + }, + { + "epoch": 0.014487382500589603, + "grad_norm": 6.419262649425003, + "learning_rate": 1.448787061994609e-07, + "loss": 2.4131, + "step": 215 + }, + { + "epoch": 0.014824298372696337, + "grad_norm": 7.861046646197374, + "learning_rate": 1.4824797843665768e-07, + "loss": 2.4394, + "step": 220 + }, + { + "epoch": 0.015161214244803073, + "grad_norm": 7.562450211806467, + "learning_rate": 1.5161725067385445e-07, + "loss": 2.4057, + "step": 225 + }, + { + "epoch": 0.015498130116909807, + "grad_norm": 6.938949312004328, + "learning_rate": 1.5498652291105122e-07, + "loss": 2.4074, + "step": 230 + }, + { + "epoch": 0.01583504598901654, + "grad_norm": 6.496838158861888, + "learning_rate": 1.58355795148248e-07, + "loss": 2.3519, + "step": 235 + }, + { + "epoch": 0.01617196186112328, + "grad_norm": 5.7314057513508185, + "learning_rate": 1.6172506738544473e-07, + "loss": 2.367, + "step": 240 + }, + { + "epoch": 0.016508877733230013, + "grad_norm": 6.951340639666037, + "learning_rate": 1.650943396226415e-07, + "loss": 2.3327, + "step": 245 + }, + { + "epoch": 0.016845793605336747, + "grad_norm": 7.233019401736712, + "learning_rate": 1.6846361185983827e-07, + "loss": 2.3259, + "step": 250 + }, + { + "epoch": 0.01718270947744348, + "grad_norm": 6.120666820602053, + "learning_rate": 1.7183288409703504e-07, + "loss": 2.3448, + "step": 255 + }, + { + "epoch": 0.017519625349550216, + "grad_norm": 5.822657164591844, + "learning_rate": 1.752021563342318e-07, + "loss": 2.2525, + "step": 260 + }, + { + "epoch": 0.017856541221656953, + "grad_norm": 5.645590925567188, + "learning_rate": 1.7857142857142858e-07, + "loss": 2.3429, + "step": 265 + }, + { + "epoch": 0.018193457093763687, + "grad_norm": 5.542805631769041, + "learning_rate": 1.8194070080862535e-07, + "loss": 2.2838, + "step": 270 + }, + { + "epoch": 0.01853037296587042, + "grad_norm": 5.856964348688246, + "learning_rate": 1.853099730458221e-07, + "loss": 2.3266, + "step": 275 + }, + { + "epoch": 0.018867288837977156, + "grad_norm": 5.216722479418778, + "learning_rate": 1.8867924528301886e-07, + "loss": 2.2765, + "step": 280 + }, + { + "epoch": 0.019204204710083893, + "grad_norm": 5.608251688722653, + "learning_rate": 1.920485175202156e-07, + "loss": 2.2346, + "step": 285 + }, + { + "epoch": 0.019541120582190628, + "grad_norm": 6.214595981569079, + "learning_rate": 1.9541778975741237e-07, + "loss": 2.2486, + "step": 290 + }, + { + "epoch": 0.019878036454297362, + "grad_norm": 25.172949782744762, + "learning_rate": 1.9878706199460914e-07, + "loss": 2.2545, + "step": 295 + }, + { + "epoch": 0.020214952326404096, + "grad_norm": 6.03234667659201, + "learning_rate": 2.021563342318059e-07, + "loss": 2.3099, + "step": 300 + }, + { + "epoch": 0.02055186819851083, + "grad_norm": 5.5276661755222545, + "learning_rate": 2.0552560646900268e-07, + "loss": 2.1518, + "step": 305 + }, + { + "epoch": 0.020888784070617568, + "grad_norm": 6.148883486161354, + "learning_rate": 2.0889487870619945e-07, + "loss": 2.2243, + "step": 310 + }, + { + "epoch": 0.021225699942724302, + "grad_norm": 7.031330506249546, + "learning_rate": 2.1226415094339622e-07, + "loss": 2.2653, + "step": 315 + }, + { + "epoch": 0.021562615814831036, + "grad_norm": 5.720437592526712, + "learning_rate": 2.15633423180593e-07, + "loss": 2.2255, + "step": 320 + }, + { + "epoch": 0.02189953168693777, + "grad_norm": 6.232004013022435, + "learning_rate": 2.1900269541778973e-07, + "loss": 2.2326, + "step": 325 + }, + { + "epoch": 0.022236447559044508, + "grad_norm": 5.427884766447202, + "learning_rate": 2.223719676549865e-07, + "loss": 2.2255, + "step": 330 + }, + { + "epoch": 0.022573363431151242, + "grad_norm": 5.5418646369024875, + "learning_rate": 2.2574123989218327e-07, + "loss": 2.1378, + "step": 335 + }, + { + "epoch": 0.022910279303257976, + "grad_norm": 6.0425076069387895, + "learning_rate": 2.2911051212938004e-07, + "loss": 2.2097, + "step": 340 + }, + { + "epoch": 0.02324719517536471, + "grad_norm": 5.400950802947024, + "learning_rate": 2.324797843665768e-07, + "loss": 2.2412, + "step": 345 + }, + { + "epoch": 0.023584111047471445, + "grad_norm": 5.173515089054146, + "learning_rate": 2.3584905660377358e-07, + "loss": 2.1899, + "step": 350 + }, + { + "epoch": 0.023921026919578182, + "grad_norm": 5.240519930808722, + "learning_rate": 2.392183288409703e-07, + "loss": 2.2218, + "step": 355 + }, + { + "epoch": 0.024257942791684917, + "grad_norm": 5.732864030593667, + "learning_rate": 2.425876010781671e-07, + "loss": 2.1959, + "step": 360 + }, + { + "epoch": 0.02459485866379165, + "grad_norm": 5.986885229622732, + "learning_rate": 2.4595687331536387e-07, + "loss": 2.179, + "step": 365 + }, + { + "epoch": 0.024931774535898385, + "grad_norm": 6.367446128253974, + "learning_rate": 2.4932614555256063e-07, + "loss": 2.1408, + "step": 370 + }, + { + "epoch": 0.025268690408005123, + "grad_norm": 5.370594569811163, + "learning_rate": 2.526954177897574e-07, + "loss": 2.1142, + "step": 375 + }, + { + "epoch": 0.025605606280111857, + "grad_norm": 5.307650833779364, + "learning_rate": 2.560646900269542e-07, + "loss": 2.1089, + "step": 380 + }, + { + "epoch": 0.02594252215221859, + "grad_norm": 5.931005595781113, + "learning_rate": 2.5943396226415094e-07, + "loss": 2.0792, + "step": 385 + }, + { + "epoch": 0.026279438024325325, + "grad_norm": 5.636620139141842, + "learning_rate": 2.6280323450134766e-07, + "loss": 2.1345, + "step": 390 + }, + { + "epoch": 0.02661635389643206, + "grad_norm": 6.285950145006834, + "learning_rate": 2.661725067385445e-07, + "loss": 2.2002, + "step": 395 + }, + { + "epoch": 0.026953269768538797, + "grad_norm": 5.496040660546177, + "learning_rate": 2.695417789757412e-07, + "loss": 2.0634, + "step": 400 + }, + { + "epoch": 0.02729018564064553, + "grad_norm": 5.355777932542913, + "learning_rate": 2.72911051212938e-07, + "loss": 2.1038, + "step": 405 + }, + { + "epoch": 0.027627101512752265, + "grad_norm": 5.407560676614071, + "learning_rate": 2.7628032345013474e-07, + "loss": 2.116, + "step": 410 + }, + { + "epoch": 0.027964017384859, + "grad_norm": 5.510824187092772, + "learning_rate": 2.7964959568733156e-07, + "loss": 2.099, + "step": 415 + }, + { + "epoch": 0.028300933256965737, + "grad_norm": 5.6923669778496375, + "learning_rate": 2.830188679245283e-07, + "loss": 2.1419, + "step": 420 + }, + { + "epoch": 0.02863784912907247, + "grad_norm": 5.232069094231692, + "learning_rate": 2.863881401617251e-07, + "loss": 2.1309, + "step": 425 + }, + { + "epoch": 0.028974765001179206, + "grad_norm": 5.12385346701768, + "learning_rate": 2.897574123989218e-07, + "loss": 2.0957, + "step": 430 + }, + { + "epoch": 0.02931168087328594, + "grad_norm": 5.043874118840126, + "learning_rate": 2.9312668463611853e-07, + "loss": 2.0322, + "step": 435 + }, + { + "epoch": 0.029648596745392674, + "grad_norm": 5.5658668729227045, + "learning_rate": 2.9649595687331536e-07, + "loss": 2.136, + "step": 440 + }, + { + "epoch": 0.02998551261749941, + "grad_norm": 10.65312954737502, + "learning_rate": 2.9986522911051207e-07, + "loss": 2.0633, + "step": 445 + }, + { + "epoch": 0.030322428489606146, + "grad_norm": 5.397097363208325, + "learning_rate": 3.032345013477089e-07, + "loss": 2.0639, + "step": 450 + }, + { + "epoch": 0.03065934436171288, + "grad_norm": 5.639079239529771, + "learning_rate": 3.066037735849056e-07, + "loss": 2.1089, + "step": 455 + }, + { + "epoch": 0.030996260233819614, + "grad_norm": 6.098761612302749, + "learning_rate": 3.0997304582210244e-07, + "loss": 2.0253, + "step": 460 + }, + { + "epoch": 0.03133317610592635, + "grad_norm": 5.075909432856418, + "learning_rate": 3.1334231805929915e-07, + "loss": 2.107, + "step": 465 + }, + { + "epoch": 0.03167009197803308, + "grad_norm": 5.387955375678346, + "learning_rate": 3.16711590296496e-07, + "loss": 2.0536, + "step": 470 + }, + { + "epoch": 0.03200700785013982, + "grad_norm": 5.522928174932781, + "learning_rate": 3.200808625336927e-07, + "loss": 2.1081, + "step": 475 + }, + { + "epoch": 0.03234392372224656, + "grad_norm": 5.024688066890421, + "learning_rate": 3.2345013477088946e-07, + "loss": 2.0879, + "step": 480 + }, + { + "epoch": 0.03268083959435329, + "grad_norm": 5.2368353353349155, + "learning_rate": 3.2681940700808623e-07, + "loss": 2.0918, + "step": 485 + }, + { + "epoch": 0.033017755466460026, + "grad_norm": 5.118887611173772, + "learning_rate": 3.30188679245283e-07, + "loss": 2.025, + "step": 490 + }, + { + "epoch": 0.03335467133856676, + "grad_norm": 5.343662616511924, + "learning_rate": 3.3355795148247977e-07, + "loss": 2.1206, + "step": 495 + }, + { + "epoch": 0.033691587210673495, + "grad_norm": 5.28143943704075, + "learning_rate": 3.3692722371967654e-07, + "loss": 2.0416, + "step": 500 + }, + { + "epoch": 0.03402850308278023, + "grad_norm": 5.740167633758656, + "learning_rate": 3.402964959568733e-07, + "loss": 2.0, + "step": 505 + }, + { + "epoch": 0.03436541895488696, + "grad_norm": 6.130295525717783, + "learning_rate": 3.436657681940701e-07, + "loss": 2.044, + "step": 510 + }, + { + "epoch": 0.0347023348269937, + "grad_norm": 6.034524610430405, + "learning_rate": 3.4703504043126685e-07, + "loss": 2.0904, + "step": 515 + }, + { + "epoch": 0.03503925069910043, + "grad_norm": 5.618706136600561, + "learning_rate": 3.504043126684636e-07, + "loss": 2.1239, + "step": 520 + }, + { + "epoch": 0.03537616657120717, + "grad_norm": 5.5845268552377485, + "learning_rate": 3.5377358490566033e-07, + "loss": 2.0664, + "step": 525 + }, + { + "epoch": 0.03571308244331391, + "grad_norm": 4.924388794395622, + "learning_rate": 3.5714285714285716e-07, + "loss": 2.0291, + "step": 530 + }, + { + "epoch": 0.03604999831542064, + "grad_norm": 5.698445960129494, + "learning_rate": 3.605121293800539e-07, + "loss": 2.075, + "step": 535 + }, + { + "epoch": 0.036386914187527375, + "grad_norm": 5.7867177958187, + "learning_rate": 3.638814016172507e-07, + "loss": 2.0166, + "step": 540 + }, + { + "epoch": 0.03672383005963411, + "grad_norm": 5.377350472058835, + "learning_rate": 3.672506738544474e-07, + "loss": 2.056, + "step": 545 + }, + { + "epoch": 0.03706074593174084, + "grad_norm": 5.119390903437156, + "learning_rate": 3.706199460916442e-07, + "loss": 2.0499, + "step": 550 + }, + { + "epoch": 0.03739766180384758, + "grad_norm": 4.942708782421948, + "learning_rate": 3.7398921832884095e-07, + "loss": 2.0276, + "step": 555 + }, + { + "epoch": 0.03773457767595431, + "grad_norm": 5.248745016083541, + "learning_rate": 3.773584905660377e-07, + "loss": 2.0444, + "step": 560 + }, + { + "epoch": 0.038071493548061046, + "grad_norm": 5.055769934271325, + "learning_rate": 3.807277628032345e-07, + "loss": 1.9998, + "step": 565 + }, + { + "epoch": 0.03840840942016779, + "grad_norm": 5.432243157493703, + "learning_rate": 3.840970350404312e-07, + "loss": 2.0655, + "step": 570 + }, + { + "epoch": 0.03874532529227452, + "grad_norm": 4.9675516385155225, + "learning_rate": 3.8746630727762803e-07, + "loss": 1.9969, + "step": 575 + }, + { + "epoch": 0.039082241164381255, + "grad_norm": 5.378207728638795, + "learning_rate": 3.9083557951482475e-07, + "loss": 2.0253, + "step": 580 + }, + { + "epoch": 0.03941915703648799, + "grad_norm": 5.294707319370417, + "learning_rate": 3.9420485175202157e-07, + "loss": 2.0218, + "step": 585 + }, + { + "epoch": 0.039756072908594724, + "grad_norm": 5.715084163910885, + "learning_rate": 3.975741239892183e-07, + "loss": 2.0568, + "step": 590 + }, + { + "epoch": 0.04009298878070146, + "grad_norm": 6.898250059855762, + "learning_rate": 4.009433962264151e-07, + "loss": 2.0793, + "step": 595 + }, + { + "epoch": 0.04042990465280819, + "grad_norm": 5.468487143259513, + "learning_rate": 4.043126684636118e-07, + "loss": 2.0028, + "step": 600 + }, + { + "epoch": 0.040766820524914926, + "grad_norm": 5.730269710778106, + "learning_rate": 4.076819407008086e-07, + "loss": 2.0607, + "step": 605 + }, + { + "epoch": 0.04110373639702166, + "grad_norm": 5.79169857641335, + "learning_rate": 4.1105121293800537e-07, + "loss": 2.0516, + "step": 610 + }, + { + "epoch": 0.0414406522691284, + "grad_norm": 5.635207608283501, + "learning_rate": 4.1442048517520213e-07, + "loss": 2.0199, + "step": 615 + }, + { + "epoch": 0.041777568141235136, + "grad_norm": 5.521975548236661, + "learning_rate": 4.177897574123989e-07, + "loss": 2.0305, + "step": 620 + }, + { + "epoch": 0.04211448401334187, + "grad_norm": 5.083944440185206, + "learning_rate": 4.211590296495957e-07, + "loss": 2.0284, + "step": 625 + }, + { + "epoch": 0.042451399885448604, + "grad_norm": 5.377736048836437, + "learning_rate": 4.2452830188679244e-07, + "loss": 1.9927, + "step": 630 + }, + { + "epoch": 0.04278831575755534, + "grad_norm": 5.232868294304897, + "learning_rate": 4.278975741239892e-07, + "loss": 2.0419, + "step": 635 + }, + { + "epoch": 0.04312523162966207, + "grad_norm": 6.182464484906801, + "learning_rate": 4.31266846361186e-07, + "loss": 2.0166, + "step": 640 + }, + { + "epoch": 0.04346214750176881, + "grad_norm": 5.429049038299917, + "learning_rate": 4.3463611859838275e-07, + "loss": 1.9505, + "step": 645 + }, + { + "epoch": 0.04379906337387554, + "grad_norm": 5.350503007594097, + "learning_rate": 4.3800539083557947e-07, + "loss": 1.9925, + "step": 650 + }, + { + "epoch": 0.044135979245982275, + "grad_norm": 5.150651978378095, + "learning_rate": 4.413746630727763e-07, + "loss": 2.0446, + "step": 655 + }, + { + "epoch": 0.044472895118089016, + "grad_norm": 5.521163370130274, + "learning_rate": 4.44743935309973e-07, + "loss": 1.9818, + "step": 660 + }, + { + "epoch": 0.04480981099019575, + "grad_norm": 5.694227421727922, + "learning_rate": 4.481132075471698e-07, + "loss": 2.0312, + "step": 665 + }, + { + "epoch": 0.045146726862302484, + "grad_norm": 5.069392194962212, + "learning_rate": 4.5148247978436655e-07, + "loss": 2.0187, + "step": 670 + }, + { + "epoch": 0.04548364273440922, + "grad_norm": 5.147985295207899, + "learning_rate": 4.548517520215633e-07, + "loss": 2.0533, + "step": 675 + }, + { + "epoch": 0.04582055860651595, + "grad_norm": 5.647364286105832, + "learning_rate": 4.582210242587601e-07, + "loss": 2.0218, + "step": 680 + }, + { + "epoch": 0.04615747447862269, + "grad_norm": 5.296105092900961, + "learning_rate": 4.6159029649595686e-07, + "loss": 2.0438, + "step": 685 + }, + { + "epoch": 0.04649439035072942, + "grad_norm": 5.431070724140859, + "learning_rate": 4.649595687331536e-07, + "loss": 1.9468, + "step": 690 + }, + { + "epoch": 0.046831306222836155, + "grad_norm": 5.338564248947676, + "learning_rate": 4.6832884097035034e-07, + "loss": 1.9921, + "step": 695 + }, + { + "epoch": 0.04716822209494289, + "grad_norm": 5.467178934661103, + "learning_rate": 4.7169811320754717e-07, + "loss": 1.9582, + "step": 700 + }, + { + "epoch": 0.04750513796704963, + "grad_norm": 5.209811996274768, + "learning_rate": 4.750673854447439e-07, + "loss": 1.9878, + "step": 705 + }, + { + "epoch": 0.047842053839156365, + "grad_norm": 5.325725411751291, + "learning_rate": 4.784366576819407e-07, + "loss": 2.0557, + "step": 710 + }, + { + "epoch": 0.0481789697112631, + "grad_norm": 5.612384744469257, + "learning_rate": 4.818059299191375e-07, + "loss": 1.9881, + "step": 715 + }, + { + "epoch": 0.04851588558336983, + "grad_norm": 4.98603677284012, + "learning_rate": 4.851752021563342e-07, + "loss": 1.9988, + "step": 720 + }, + { + "epoch": 0.04885280145547657, + "grad_norm": 5.533953690217046, + "learning_rate": 4.88544474393531e-07, + "loss": 1.9702, + "step": 725 + }, + { + "epoch": 0.0491897173275833, + "grad_norm": 5.279439473276112, + "learning_rate": 4.919137466307277e-07, + "loss": 2.0248, + "step": 730 + }, + { + "epoch": 0.049526633199690036, + "grad_norm": 5.485075105507988, + "learning_rate": 4.952830188679246e-07, + "loss": 1.9812, + "step": 735 + }, + { + "epoch": 0.04986354907179677, + "grad_norm": 5.761664060674013, + "learning_rate": 4.986522911051213e-07, + "loss": 1.9241, + "step": 740 + }, + { + "epoch": 0.050200464943903504, + "grad_norm": 5.314097467258596, + "learning_rate": 5.020215633423181e-07, + "loss": 2.0161, + "step": 745 + }, + { + "epoch": 0.050537380816010245, + "grad_norm": 5.378664311376735, + "learning_rate": 5.053908355795148e-07, + "loss": 1.9518, + "step": 750 + }, + { + "epoch": 0.05087429668811698, + "grad_norm": 4.691452253547841, + "learning_rate": 5.087601078167115e-07, + "loss": 2.0194, + "step": 755 + }, + { + "epoch": 0.051211212560223714, + "grad_norm": 5.3831522238040845, + "learning_rate": 5.121293800539083e-07, + "loss": 1.999, + "step": 760 + }, + { + "epoch": 0.05154812843233045, + "grad_norm": 4.976811429217899, + "learning_rate": 5.154986522911052e-07, + "loss": 2.0192, + "step": 765 + }, + { + "epoch": 0.05188504430443718, + "grad_norm": 5.581599453910726, + "learning_rate": 5.188679245283019e-07, + "loss": 2.026, + "step": 770 + }, + { + "epoch": 0.052221960176543916, + "grad_norm": 5.243566709461788, + "learning_rate": 5.222371967654986e-07, + "loss": 1.9774, + "step": 775 + }, + { + "epoch": 0.05255887604865065, + "grad_norm": 21.240219094200093, + "learning_rate": 5.256064690026953e-07, + "loss": 1.9668, + "step": 780 + }, + { + "epoch": 0.052895791920757385, + "grad_norm": 5.554100631042519, + "learning_rate": 5.289757412398921e-07, + "loss": 2.0255, + "step": 785 + }, + { + "epoch": 0.05323270779286412, + "grad_norm": 5.40970243183288, + "learning_rate": 5.32345013477089e-07, + "loss": 1.9893, + "step": 790 + }, + { + "epoch": 0.05356962366497086, + "grad_norm": 5.0017195942526245, + "learning_rate": 5.357142857142857e-07, + "loss": 1.9773, + "step": 795 + }, + { + "epoch": 0.053906539537077594, + "grad_norm": 4.526154717874087, + "learning_rate": 5.390835579514824e-07, + "loss": 1.9305, + "step": 800 + }, + { + "epoch": 0.05424345540918433, + "grad_norm": 5.4425678388497865, + "learning_rate": 5.424528301886792e-07, + "loss": 1.9608, + "step": 805 + }, + { + "epoch": 0.05458037128129106, + "grad_norm": 4.623913713740688, + "learning_rate": 5.45822102425876e-07, + "loss": 2.0188, + "step": 810 + }, + { + "epoch": 0.0549172871533978, + "grad_norm": 5.881864605300756, + "learning_rate": 5.491913746630728e-07, + "loss": 1.9012, + "step": 815 + }, + { + "epoch": 0.05525420302550453, + "grad_norm": 5.011736843819804, + "learning_rate": 5.525606469002695e-07, + "loss": 2.019, + "step": 820 + }, + { + "epoch": 0.055591118897611265, + "grad_norm": 5.409773765863313, + "learning_rate": 5.559299191374662e-07, + "loss": 1.9922, + "step": 825 + }, + { + "epoch": 0.055928034769718, + "grad_norm": 4.969421140342336, + "learning_rate": 5.592991913746631e-07, + "loss": 1.9715, + "step": 830 + }, + { + "epoch": 0.05626495064182473, + "grad_norm": 5.4543763327476364, + "learning_rate": 5.626684636118598e-07, + "loss": 1.9817, + "step": 835 + }, + { + "epoch": 0.056601866513931474, + "grad_norm": 5.077339487080564, + "learning_rate": 5.660377358490566e-07, + "loss": 1.957, + "step": 840 + }, + { + "epoch": 0.05693878238603821, + "grad_norm": 5.078353916883788, + "learning_rate": 5.694070080862533e-07, + "loss": 1.9778, + "step": 845 + }, + { + "epoch": 0.05727569825814494, + "grad_norm": 5.007243744968121, + "learning_rate": 5.727762803234502e-07, + "loss": 2.014, + "step": 850 + }, + { + "epoch": 0.05761261413025168, + "grad_norm": 4.9317975602111925, + "learning_rate": 5.761455525606469e-07, + "loss": 2.0239, + "step": 855 + }, + { + "epoch": 0.05794953000235841, + "grad_norm": 4.761656162239346, + "learning_rate": 5.795148247978436e-07, + "loss": 2.0533, + "step": 860 + }, + { + "epoch": 0.058286445874465145, + "grad_norm": 5.385370063131065, + "learning_rate": 5.828840970350404e-07, + "loss": 1.9105, + "step": 865 + }, + { + "epoch": 0.05862336174657188, + "grad_norm": 5.30113234532175, + "learning_rate": 5.862533692722371e-07, + "loss": 1.9529, + "step": 870 + }, + { + "epoch": 0.058960277618678614, + "grad_norm": 6.9033659840878325, + "learning_rate": 5.89622641509434e-07, + "loss": 1.9739, + "step": 875 + }, + { + "epoch": 0.05929719349078535, + "grad_norm": 5.429285070930471, + "learning_rate": 5.929919137466307e-07, + "loss": 1.9436, + "step": 880 + }, + { + "epoch": 0.05963410936289209, + "grad_norm": 6.389343970685152, + "learning_rate": 5.963611859838274e-07, + "loss": 1.9751, + "step": 885 + }, + { + "epoch": 0.05997102523499882, + "grad_norm": 5.154716173144622, + "learning_rate": 5.997304582210241e-07, + "loss": 1.9629, + "step": 890 + }, + { + "epoch": 0.06030794110710556, + "grad_norm": 5.347558988304943, + "learning_rate": 6.030997304582211e-07, + "loss": 1.9282, + "step": 895 + }, + { + "epoch": 0.06064485697921229, + "grad_norm": 5.41923416719515, + "learning_rate": 6.064690026954178e-07, + "loss": 1.9376, + "step": 900 + }, + { + "epoch": 0.060981772851319026, + "grad_norm": 5.044466204349174, + "learning_rate": 6.098382749326145e-07, + "loss": 1.939, + "step": 905 + }, + { + "epoch": 0.06131868872342576, + "grad_norm": 5.186923659997595, + "learning_rate": 6.132075471698112e-07, + "loss": 1.9401, + "step": 910 + }, + { + "epoch": 0.061655604595532494, + "grad_norm": 6.170059318329579, + "learning_rate": 6.16576819407008e-07, + "loss": 1.9413, + "step": 915 + }, + { + "epoch": 0.06199252046763923, + "grad_norm": 5.228653101504448, + "learning_rate": 6.199460916442049e-07, + "loss": 1.9087, + "step": 920 + }, + { + "epoch": 0.06232943633974596, + "grad_norm": 4.741572431296043, + "learning_rate": 6.233153638814016e-07, + "loss": 1.94, + "step": 925 + }, + { + "epoch": 0.0626663522118527, + "grad_norm": 5.557072325156039, + "learning_rate": 6.266846361185983e-07, + "loss": 1.9267, + "step": 930 + }, + { + "epoch": 0.06300326808395944, + "grad_norm": 4.869727347438024, + "learning_rate": 6.300539083557951e-07, + "loss": 1.9688, + "step": 935 + }, + { + "epoch": 0.06334018395606617, + "grad_norm": 5.385355577314501, + "learning_rate": 6.33423180592992e-07, + "loss": 1.9268, + "step": 940 + }, + { + "epoch": 0.0636770998281729, + "grad_norm": 5.058388758820075, + "learning_rate": 6.367924528301887e-07, + "loss": 1.9199, + "step": 945 + }, + { + "epoch": 0.06401401570027963, + "grad_norm": 4.919811186559829, + "learning_rate": 6.401617250673854e-07, + "loss": 1.9054, + "step": 950 + }, + { + "epoch": 0.06435093157238637, + "grad_norm": 5.114782560307836, + "learning_rate": 6.435309973045822e-07, + "loss": 2.0087, + "step": 955 + }, + { + "epoch": 0.06468784744449312, + "grad_norm": 5.266029184598451, + "learning_rate": 6.469002695417789e-07, + "loss": 1.9631, + "step": 960 + }, + { + "epoch": 0.06502476331659984, + "grad_norm": 5.272345169294871, + "learning_rate": 6.502695417789757e-07, + "loss": 1.9649, + "step": 965 + }, + { + "epoch": 0.06536167918870658, + "grad_norm": 5.531522696661456, + "learning_rate": 6.536388140161725e-07, + "loss": 1.8831, + "step": 970 + }, + { + "epoch": 0.06569859506081331, + "grad_norm": 5.243060472631993, + "learning_rate": 6.570080862533693e-07, + "loss": 1.8948, + "step": 975 + }, + { + "epoch": 0.06603551093292005, + "grad_norm": 4.973394162552281, + "learning_rate": 6.60377358490566e-07, + "loss": 1.8546, + "step": 980 + }, + { + "epoch": 0.06637242680502678, + "grad_norm": 4.949589283745308, + "learning_rate": 6.637466307277628e-07, + "loss": 1.9344, + "step": 985 + }, + { + "epoch": 0.06670934267713352, + "grad_norm": 4.897598836158783, + "learning_rate": 6.671159029649595e-07, + "loss": 1.8681, + "step": 990 + }, + { + "epoch": 0.06704625854924025, + "grad_norm": 4.953043075542081, + "learning_rate": 6.704851752021563e-07, + "loss": 1.934, + "step": 995 + }, + { + "epoch": 0.06738317442134699, + "grad_norm": 5.051343081890513, + "learning_rate": 6.738544474393531e-07, + "loss": 1.9747, + "step": 1000 + }, + { + "epoch": 0.06772009029345373, + "grad_norm": 5.519843073235885, + "learning_rate": 6.772237196765498e-07, + "loss": 1.9198, + "step": 1005 + }, + { + "epoch": 0.06805700616556046, + "grad_norm": 6.92952990394133, + "learning_rate": 6.805929919137466e-07, + "loss": 1.9353, + "step": 1010 + }, + { + "epoch": 0.0683939220376672, + "grad_norm": 5.844500928830597, + "learning_rate": 6.839622641509433e-07, + "loss": 1.9603, + "step": 1015 + }, + { + "epoch": 0.06873083790977393, + "grad_norm": 4.680109015678994, + "learning_rate": 6.873315363881402e-07, + "loss": 1.9097, + "step": 1020 + }, + { + "epoch": 0.06906775378188067, + "grad_norm": 5.276507888101029, + "learning_rate": 6.907008086253369e-07, + "loss": 1.9169, + "step": 1025 + }, + { + "epoch": 0.0694046696539874, + "grad_norm": 4.597017974421258, + "learning_rate": 6.940700808625337e-07, + "loss": 1.866, + "step": 1030 + }, + { + "epoch": 0.06974158552609414, + "grad_norm": 5.248577301108423, + "learning_rate": 6.974393530997304e-07, + "loss": 1.9707, + "step": 1035 + }, + { + "epoch": 0.07007850139820086, + "grad_norm": 4.561722496535128, + "learning_rate": 7.008086253369272e-07, + "loss": 1.9717, + "step": 1040 + }, + { + "epoch": 0.0704154172703076, + "grad_norm": 5.822968995110593, + "learning_rate": 7.04177897574124e-07, + "loss": 1.9476, + "step": 1045 + }, + { + "epoch": 0.07075233314241434, + "grad_norm": 5.567490532077557, + "learning_rate": 7.075471698113207e-07, + "loss": 1.9454, + "step": 1050 + }, + { + "epoch": 0.07108924901452107, + "grad_norm": 5.073451663570337, + "learning_rate": 7.109164420485175e-07, + "loss": 1.9323, + "step": 1055 + }, + { + "epoch": 0.07142616488662781, + "grad_norm": 5.146228422818076, + "learning_rate": 7.142857142857143e-07, + "loss": 1.9075, + "step": 1060 + }, + { + "epoch": 0.07176308075873454, + "grad_norm": 4.631295463779331, + "learning_rate": 7.17654986522911e-07, + "loss": 1.8698, + "step": 1065 + }, + { + "epoch": 0.07209999663084128, + "grad_norm": 5.1593056457079225, + "learning_rate": 7.210242587601077e-07, + "loss": 1.9247, + "step": 1070 + }, + { + "epoch": 0.07243691250294801, + "grad_norm": 5.48417366428753, + "learning_rate": 7.243935309973046e-07, + "loss": 1.941, + "step": 1075 + }, + { + "epoch": 0.07277382837505475, + "grad_norm": 5.631796760221222, + "learning_rate": 7.277628032345014e-07, + "loss": 1.9156, + "step": 1080 + }, + { + "epoch": 0.07311074424716148, + "grad_norm": 4.777529076813078, + "learning_rate": 7.311320754716981e-07, + "loss": 1.927, + "step": 1085 + }, + { + "epoch": 0.07344766011926822, + "grad_norm": 4.90129939841515, + "learning_rate": 7.345013477088948e-07, + "loss": 1.9781, + "step": 1090 + }, + { + "epoch": 0.07378457599137496, + "grad_norm": 5.231366709132192, + "learning_rate": 7.378706199460915e-07, + "loss": 1.9127, + "step": 1095 + }, + { + "epoch": 0.07412149186348169, + "grad_norm": 4.723481493128406, + "learning_rate": 7.412398921832884e-07, + "loss": 1.918, + "step": 1100 + }, + { + "epoch": 0.07445840773558843, + "grad_norm": 5.201673325926327, + "learning_rate": 7.446091644204852e-07, + "loss": 1.9476, + "step": 1105 + }, + { + "epoch": 0.07479532360769515, + "grad_norm": 5.461855674554181, + "learning_rate": 7.479784366576819e-07, + "loss": 1.9227, + "step": 1110 + }, + { + "epoch": 0.0751322394798019, + "grad_norm": 5.122573096291021, + "learning_rate": 7.513477088948786e-07, + "loss": 1.9069, + "step": 1115 + }, + { + "epoch": 0.07546915535190862, + "grad_norm": 5.134161925716819, + "learning_rate": 7.547169811320754e-07, + "loss": 1.8723, + "step": 1120 + }, + { + "epoch": 0.07580607122401536, + "grad_norm": 4.912594707407192, + "learning_rate": 7.580862533692723e-07, + "loss": 1.925, + "step": 1125 + }, + { + "epoch": 0.07614298709612209, + "grad_norm": 4.833642336502445, + "learning_rate": 7.61455525606469e-07, + "loss": 1.9418, + "step": 1130 + }, + { + "epoch": 0.07647990296822883, + "grad_norm": 5.942285402534889, + "learning_rate": 7.648247978436657e-07, + "loss": 1.891, + "step": 1135 + }, + { + "epoch": 0.07681681884033557, + "grad_norm": 4.780486374896709, + "learning_rate": 7.681940700808624e-07, + "loss": 1.9281, + "step": 1140 + }, + { + "epoch": 0.0771537347124423, + "grad_norm": 5.0599976612815425, + "learning_rate": 7.715633423180593e-07, + "loss": 1.8856, + "step": 1145 + }, + { + "epoch": 0.07749065058454904, + "grad_norm": 5.447714555152886, + "learning_rate": 7.749326145552561e-07, + "loss": 1.9193, + "step": 1150 + }, + { + "epoch": 0.07782756645665577, + "grad_norm": 5.559443100359282, + "learning_rate": 7.783018867924528e-07, + "loss": 1.8755, + "step": 1155 + }, + { + "epoch": 0.07816448232876251, + "grad_norm": 9.485730862885632, + "learning_rate": 7.816711590296495e-07, + "loss": 1.8853, + "step": 1160 + }, + { + "epoch": 0.07850139820086924, + "grad_norm": 5.074665439939694, + "learning_rate": 7.850404312668463e-07, + "loss": 1.9718, + "step": 1165 + }, + { + "epoch": 0.07883831407297598, + "grad_norm": 5.35289626298854, + "learning_rate": 7.884097035040431e-07, + "loss": 1.9016, + "step": 1170 + }, + { + "epoch": 0.0791752299450827, + "grad_norm": 4.849839407058772, + "learning_rate": 7.917789757412399e-07, + "loss": 1.9096, + "step": 1175 + }, + { + "epoch": 0.07951214581718945, + "grad_norm": 5.292874310019159, + "learning_rate": 7.951482479784366e-07, + "loss": 1.8688, + "step": 1180 + }, + { + "epoch": 0.07984906168929619, + "grad_norm": 4.69518465130648, + "learning_rate": 7.985175202156334e-07, + "loss": 1.8804, + "step": 1185 + }, + { + "epoch": 0.08018597756140292, + "grad_norm": 5.330973093669295, + "learning_rate": 8.018867924528302e-07, + "loss": 1.9425, + "step": 1190 + }, + { + "epoch": 0.08052289343350966, + "grad_norm": 5.2166635094871, + "learning_rate": 8.052560646900269e-07, + "loss": 1.873, + "step": 1195 + }, + { + "epoch": 0.08085980930561638, + "grad_norm": 5.036327896074128, + "learning_rate": 8.086253369272237e-07, + "loss": 1.8749, + "step": 1200 + }, + { + "epoch": 0.08119672517772313, + "grad_norm": 4.569279981257993, + "learning_rate": 8.119946091644204e-07, + "loss": 1.908, + "step": 1205 + }, + { + "epoch": 0.08153364104982985, + "grad_norm": 5.277033519733169, + "learning_rate": 8.153638814016172e-07, + "loss": 1.9629, + "step": 1210 + }, + { + "epoch": 0.0818705569219366, + "grad_norm": 4.646480429400201, + "learning_rate": 8.18733153638814e-07, + "loss": 1.9139, + "step": 1215 + }, + { + "epoch": 0.08220747279404332, + "grad_norm": 5.355634775770506, + "learning_rate": 8.221024258760107e-07, + "loss": 1.8631, + "step": 1220 + }, + { + "epoch": 0.08254438866615006, + "grad_norm": 7.811459226424328, + "learning_rate": 8.254716981132074e-07, + "loss": 1.8857, + "step": 1225 + }, + { + "epoch": 0.0828813045382568, + "grad_norm": 10.013535553366285, + "learning_rate": 8.288409703504043e-07, + "loss": 1.9353, + "step": 1230 + }, + { + "epoch": 0.08321822041036353, + "grad_norm": 4.547839224400193, + "learning_rate": 8.322102425876011e-07, + "loss": 1.8719, + "step": 1235 + }, + { + "epoch": 0.08355513628247027, + "grad_norm": 4.994456574147338, + "learning_rate": 8.355795148247978e-07, + "loss": 1.9409, + "step": 1240 + }, + { + "epoch": 0.083892052154577, + "grad_norm": 4.320607866056009, + "learning_rate": 8.389487870619945e-07, + "loss": 1.9094, + "step": 1245 + }, + { + "epoch": 0.08422896802668374, + "grad_norm": 5.1285978622421675, + "learning_rate": 8.423180592991913e-07, + "loss": 1.8423, + "step": 1250 + }, + { + "epoch": 0.08456588389879047, + "grad_norm": 5.207593792225735, + "learning_rate": 8.456873315363881e-07, + "loss": 1.9206, + "step": 1255 + }, + { + "epoch": 0.08490279977089721, + "grad_norm": 5.16067220295262, + "learning_rate": 8.490566037735849e-07, + "loss": 1.8542, + "step": 1260 + }, + { + "epoch": 0.08523971564300394, + "grad_norm": 4.5869805182954275, + "learning_rate": 8.524258760107816e-07, + "loss": 1.8442, + "step": 1265 + }, + { + "epoch": 0.08557663151511068, + "grad_norm": 5.905122236250617, + "learning_rate": 8.557951482479784e-07, + "loss": 1.9592, + "step": 1270 + }, + { + "epoch": 0.08591354738721742, + "grad_norm": 4.850519676865327, + "learning_rate": 8.591644204851751e-07, + "loss": 1.8988, + "step": 1275 + }, + { + "epoch": 0.08625046325932414, + "grad_norm": 5.246664878234509, + "learning_rate": 8.62533692722372e-07, + "loss": 1.9069, + "step": 1280 + }, + { + "epoch": 0.08658737913143089, + "grad_norm": 4.686851588740122, + "learning_rate": 8.659029649595687e-07, + "loss": 1.9093, + "step": 1285 + }, + { + "epoch": 0.08692429500353761, + "grad_norm": 4.864848050723445, + "learning_rate": 8.692722371967655e-07, + "loss": 1.9621, + "step": 1290 + }, + { + "epoch": 0.08726121087564435, + "grad_norm": 5.245150673768001, + "learning_rate": 8.726415094339622e-07, + "loss": 1.9268, + "step": 1295 + }, + { + "epoch": 0.08759812674775108, + "grad_norm": 5.586645232579203, + "learning_rate": 8.760107816711589e-07, + "loss": 1.9637, + "step": 1300 + }, + { + "epoch": 0.08793504261985782, + "grad_norm": 5.129811614944534, + "learning_rate": 8.793800539083558e-07, + "loss": 1.8807, + "step": 1305 + }, + { + "epoch": 0.08827195849196455, + "grad_norm": 5.207910430535945, + "learning_rate": 8.827493261455526e-07, + "loss": 1.9112, + "step": 1310 + }, + { + "epoch": 0.08860887436407129, + "grad_norm": 5.148253867866013, + "learning_rate": 8.861185983827493e-07, + "loss": 1.8747, + "step": 1315 + }, + { + "epoch": 0.08894579023617803, + "grad_norm": 4.959406714040152, + "learning_rate": 8.89487870619946e-07, + "loss": 1.8511, + "step": 1320 + }, + { + "epoch": 0.08928270610828476, + "grad_norm": 5.311015482505079, + "learning_rate": 8.928571428571428e-07, + "loss": 1.9621, + "step": 1325 + }, + { + "epoch": 0.0896196219803915, + "grad_norm": 5.529873429359067, + "learning_rate": 8.962264150943396e-07, + "loss": 1.8751, + "step": 1330 + }, + { + "epoch": 0.08995653785249823, + "grad_norm": 5.019885851770948, + "learning_rate": 8.995956873315364e-07, + "loss": 1.8673, + "step": 1335 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 5.255259881875886, + "learning_rate": 9.029649595687331e-07, + "loss": 1.8594, + "step": 1340 + }, + { + "epoch": 0.0906303695967117, + "grad_norm": 4.864013243574321, + "learning_rate": 9.063342318059298e-07, + "loss": 1.8578, + "step": 1345 + }, + { + "epoch": 0.09096728546881844, + "grad_norm": 4.783979712270612, + "learning_rate": 9.097035040431266e-07, + "loss": 1.8668, + "step": 1350 + }, + { + "epoch": 0.09130420134092516, + "grad_norm": 5.288947009799633, + "learning_rate": 9.130727762803235e-07, + "loss": 1.8831, + "step": 1355 + }, + { + "epoch": 0.0916411172130319, + "grad_norm": 4.563876070584222, + "learning_rate": 9.164420485175202e-07, + "loss": 1.9362, + "step": 1360 + }, + { + "epoch": 0.09197803308513865, + "grad_norm": 4.368741024793251, + "learning_rate": 9.198113207547169e-07, + "loss": 1.8827, + "step": 1365 + }, + { + "epoch": 0.09231494895724537, + "grad_norm": 4.69647349179084, + "learning_rate": 9.231805929919137e-07, + "loss": 1.8673, + "step": 1370 + }, + { + "epoch": 0.09265186482935212, + "grad_norm": 5.410173364100209, + "learning_rate": 9.265498652291105e-07, + "loss": 1.9147, + "step": 1375 + }, + { + "epoch": 0.09298878070145884, + "grad_norm": 5.026718419465974, + "learning_rate": 9.299191374663073e-07, + "loss": 1.8939, + "step": 1380 + }, + { + "epoch": 0.09332569657356558, + "grad_norm": 5.226125492850762, + "learning_rate": 9.33288409703504e-07, + "loss": 1.8999, + "step": 1385 + }, + { + "epoch": 0.09366261244567231, + "grad_norm": 4.851233426364406, + "learning_rate": 9.366576819407007e-07, + "loss": 1.8784, + "step": 1390 + }, + { + "epoch": 0.09399952831777905, + "grad_norm": 4.31752533985363, + "learning_rate": 9.400269541778976e-07, + "loss": 1.825, + "step": 1395 + }, + { + "epoch": 0.09433644418988578, + "grad_norm": 5.805848435786427, + "learning_rate": 9.433962264150943e-07, + "loss": 1.8816, + "step": 1400 + }, + { + "epoch": 0.09467336006199252, + "grad_norm": 5.604277013042164, + "learning_rate": 9.46765498652291e-07, + "loss": 1.9501, + "step": 1405 + }, + { + "epoch": 0.09501027593409926, + "grad_norm": 4.640923447032217, + "learning_rate": 9.501347708894878e-07, + "loss": 1.8088, + "step": 1410 + }, + { + "epoch": 0.09534719180620599, + "grad_norm": 4.800198454517414, + "learning_rate": 9.535040431266847e-07, + "loss": 1.9026, + "step": 1415 + }, + { + "epoch": 0.09568410767831273, + "grad_norm": 4.778232798159787, + "learning_rate": 9.568733153638813e-07, + "loss": 1.8543, + "step": 1420 + }, + { + "epoch": 0.09602102355041946, + "grad_norm": 5.920465260156262, + "learning_rate": 9.60242587601078e-07, + "loss": 1.8315, + "step": 1425 + }, + { + "epoch": 0.0963579394225262, + "grad_norm": 5.5208201239955885, + "learning_rate": 9.63611859838275e-07, + "loss": 1.9189, + "step": 1430 + }, + { + "epoch": 0.09669485529463293, + "grad_norm": 4.465241133486137, + "learning_rate": 9.669811320754717e-07, + "loss": 1.8755, + "step": 1435 + }, + { + "epoch": 0.09703177116673967, + "grad_norm": 4.861772605861353, + "learning_rate": 9.703504043126684e-07, + "loss": 1.8424, + "step": 1440 + }, + { + "epoch": 0.0973686870388464, + "grad_norm": 4.932766715467049, + "learning_rate": 9.73719676549865e-07, + "loss": 1.8656, + "step": 1445 + }, + { + "epoch": 0.09770560291095313, + "grad_norm": 4.441378456407006, + "learning_rate": 9.77088948787062e-07, + "loss": 1.8494, + "step": 1450 + }, + { + "epoch": 0.09804251878305988, + "grad_norm": 5.275736089281324, + "learning_rate": 9.804582210242587e-07, + "loss": 1.8647, + "step": 1455 + }, + { + "epoch": 0.0983794346551666, + "grad_norm": 5.12890600254249, + "learning_rate": 9.838274932614555e-07, + "loss": 1.9004, + "step": 1460 + }, + { + "epoch": 0.09871635052727334, + "grad_norm": 5.705766103530472, + "learning_rate": 9.871967654986522e-07, + "loss": 1.9016, + "step": 1465 + }, + { + "epoch": 0.09905326639938007, + "grad_norm": 5.462295643224533, + "learning_rate": 9.90566037735849e-07, + "loss": 1.8279, + "step": 1470 + }, + { + "epoch": 0.09939018227148681, + "grad_norm": 4.847596040039102, + "learning_rate": 9.939353099730458e-07, + "loss": 1.8291, + "step": 1475 + }, + { + "epoch": 0.09972709814359354, + "grad_norm": 5.3575704163339655, + "learning_rate": 9.973045822102425e-07, + "loss": 1.8753, + "step": 1480 + }, + { + "epoch": 0.10006401401570028, + "grad_norm": 5.085814023260118, + "learning_rate": 9.999999861679377e-07, + "loss": 1.7741, + "step": 1485 + }, + { + "epoch": 0.10040092988780701, + "grad_norm": 5.539515142922222, + "learning_rate": 9.999995020458434e-07, + "loss": 1.8115, + "step": 1490 + }, + { + "epoch": 0.10073784575991375, + "grad_norm": 5.104632657913346, + "learning_rate": 9.99998326321407e-07, + "loss": 1.8537, + "step": 1495 + }, + { + "epoch": 0.10107476163202049, + "grad_norm": 5.037682554763403, + "learning_rate": 9.999964589962556e-07, + "loss": 1.8051, + "step": 1500 + }, + { + "epoch": 0.10141167750412722, + "grad_norm": 5.444043728664739, + "learning_rate": 9.999939000729715e-07, + "loss": 1.8778, + "step": 1505 + }, + { + "epoch": 0.10174859337623396, + "grad_norm": 4.447541781178455, + "learning_rate": 9.999906495550946e-07, + "loss": 1.8523, + "step": 1510 + }, + { + "epoch": 0.10208550924834069, + "grad_norm": 4.975360878532206, + "learning_rate": 9.999867074471207e-07, + "loss": 1.8719, + "step": 1515 + }, + { + "epoch": 0.10242242512044743, + "grad_norm": 4.71237216626977, + "learning_rate": 9.99982073754503e-07, + "loss": 1.8011, + "step": 1520 + }, + { + "epoch": 0.10275934099255415, + "grad_norm": 4.565133365936921, + "learning_rate": 9.999767484836502e-07, + "loss": 1.8379, + "step": 1525 + }, + { + "epoch": 0.1030962568646609, + "grad_norm": 5.110474397007126, + "learning_rate": 9.999707316419288e-07, + "loss": 1.7875, + "step": 1530 + }, + { + "epoch": 0.10343317273676762, + "grad_norm": 4.720086807903922, + "learning_rate": 9.99964023237661e-07, + "loss": 1.8251, + "step": 1535 + }, + { + "epoch": 0.10377008860887436, + "grad_norm": 5.341137351778559, + "learning_rate": 9.999566232801261e-07, + "loss": 1.8335, + "step": 1540 + }, + { + "epoch": 0.1041070044809811, + "grad_norm": 4.869144151159134, + "learning_rate": 9.9994853177956e-07, + "loss": 1.836, + "step": 1545 + }, + { + "epoch": 0.10444392035308783, + "grad_norm": 5.223975721103111, + "learning_rate": 9.999397487471543e-07, + "loss": 1.9321, + "step": 1550 + }, + { + "epoch": 0.10478083622519457, + "grad_norm": 4.685138611882665, + "learning_rate": 9.999302741950582e-07, + "loss": 1.8628, + "step": 1555 + }, + { + "epoch": 0.1051177520973013, + "grad_norm": 5.270431937376266, + "learning_rate": 9.999201081363768e-07, + "loss": 1.866, + "step": 1560 + }, + { + "epoch": 0.10545466796940804, + "grad_norm": 4.9359908760483995, + "learning_rate": 9.99909250585172e-07, + "loss": 1.8685, + "step": 1565 + }, + { + "epoch": 0.10579158384151477, + "grad_norm": 4.523129173891283, + "learning_rate": 9.998977015564617e-07, + "loss": 1.9224, + "step": 1570 + }, + { + "epoch": 0.10612849971362151, + "grad_norm": 4.72577503916574, + "learning_rate": 9.998854610662209e-07, + "loss": 1.8685, + "step": 1575 + }, + { + "epoch": 0.10646541558572824, + "grad_norm": 4.875038189097672, + "learning_rate": 9.998725291313805e-07, + "loss": 1.8772, + "step": 1580 + }, + { + "epoch": 0.10680233145783498, + "grad_norm": 8.560428339730194, + "learning_rate": 9.998589057698283e-07, + "loss": 1.8745, + "step": 1585 + }, + { + "epoch": 0.10713924732994172, + "grad_norm": 4.9717356721459565, + "learning_rate": 9.99844591000408e-07, + "loss": 1.7989, + "step": 1590 + }, + { + "epoch": 0.10747616320204845, + "grad_norm": 5.700211307942382, + "learning_rate": 9.9982958484292e-07, + "loss": 1.8694, + "step": 1595 + }, + { + "epoch": 0.10781307907415519, + "grad_norm": 4.9766939545781, + "learning_rate": 9.99813887318121e-07, + "loss": 1.8043, + "step": 1600 + }, + { + "epoch": 0.10814999494626192, + "grad_norm": 5.334106550163921, + "learning_rate": 9.997974984477236e-07, + "loss": 1.8789, + "step": 1605 + }, + { + "epoch": 0.10848691081836866, + "grad_norm": 4.762969343541741, + "learning_rate": 9.99780418254397e-07, + "loss": 1.8346, + "step": 1610 + }, + { + "epoch": 0.10882382669047538, + "grad_norm": 4.805895966179308, + "learning_rate": 9.99762646761767e-07, + "loss": 1.899, + "step": 1615 + }, + { + "epoch": 0.10916074256258212, + "grad_norm": 5.009948765551529, + "learning_rate": 9.99744183994415e-07, + "loss": 1.8119, + "step": 1620 + }, + { + "epoch": 0.10949765843468885, + "grad_norm": 4.932087499469652, + "learning_rate": 9.997250299778788e-07, + "loss": 1.8756, + "step": 1625 + }, + { + "epoch": 0.1098345743067956, + "grad_norm": 5.068638682364739, + "learning_rate": 9.997051847386524e-07, + "loss": 1.8727, + "step": 1630 + }, + { + "epoch": 0.11017149017890233, + "grad_norm": 4.592133002785248, + "learning_rate": 9.996846483041858e-07, + "loss": 1.9214, + "step": 1635 + }, + { + "epoch": 0.11050840605100906, + "grad_norm": 4.843565219513783, + "learning_rate": 9.99663420702885e-07, + "loss": 1.9096, + "step": 1640 + }, + { + "epoch": 0.1108453219231158, + "grad_norm": 4.7607329350093455, + "learning_rate": 9.996415019641124e-07, + "loss": 1.8273, + "step": 1645 + }, + { + "epoch": 0.11118223779522253, + "grad_norm": 4.760017744961109, + "learning_rate": 9.996188921181861e-07, + "loss": 1.8256, + "step": 1650 + }, + { + "epoch": 0.11151915366732927, + "grad_norm": 5.063047728653858, + "learning_rate": 9.9959559119638e-07, + "loss": 1.8168, + "step": 1655 + }, + { + "epoch": 0.111856069539436, + "grad_norm": 4.936422005296017, + "learning_rate": 9.995715992309244e-07, + "loss": 1.8618, + "step": 1660 + }, + { + "epoch": 0.11219298541154274, + "grad_norm": 5.094953809987295, + "learning_rate": 9.995469162550048e-07, + "loss": 1.8063, + "step": 1665 + }, + { + "epoch": 0.11252990128364947, + "grad_norm": 5.182573498494034, + "learning_rate": 9.99521542302763e-07, + "loss": 1.8227, + "step": 1670 + }, + { + "epoch": 0.11286681715575621, + "grad_norm": 4.743404258711493, + "learning_rate": 9.994954774092962e-07, + "loss": 1.7907, + "step": 1675 + }, + { + "epoch": 0.11320373302786295, + "grad_norm": 5.202730641786091, + "learning_rate": 9.994687216106579e-07, + "loss": 1.8331, + "step": 1680 + }, + { + "epoch": 0.11354064889996968, + "grad_norm": 5.090137020982806, + "learning_rate": 9.994412749438564e-07, + "loss": 1.7829, + "step": 1685 + }, + { + "epoch": 0.11387756477207642, + "grad_norm": 4.9893038014961295, + "learning_rate": 9.994131374468565e-07, + "loss": 1.7986, + "step": 1690 + }, + { + "epoch": 0.11421448064418314, + "grad_norm": 4.492198865372942, + "learning_rate": 9.993843091585782e-07, + "loss": 1.8536, + "step": 1695 + }, + { + "epoch": 0.11455139651628989, + "grad_norm": 8.98221193041383, + "learning_rate": 9.993547901188966e-07, + "loss": 1.8143, + "step": 1700 + }, + { + "epoch": 0.11488831238839661, + "grad_norm": 4.818273163330136, + "learning_rate": 9.993245803686426e-07, + "loss": 1.7691, + "step": 1705 + }, + { + "epoch": 0.11522522826050335, + "grad_norm": 5.20970940555951, + "learning_rate": 9.992936799496029e-07, + "loss": 1.849, + "step": 1710 + }, + { + "epoch": 0.11556214413261008, + "grad_norm": 4.787586490712673, + "learning_rate": 9.99262088904519e-07, + "loss": 1.8, + "step": 1715 + }, + { + "epoch": 0.11589906000471682, + "grad_norm": 4.991841549682482, + "learning_rate": 9.992298072770877e-07, + "loss": 1.8867, + "step": 1720 + }, + { + "epoch": 0.11623597587682356, + "grad_norm": 4.725283739157587, + "learning_rate": 9.991968351119612e-07, + "loss": 1.8404, + "step": 1725 + }, + { + "epoch": 0.11657289174893029, + "grad_norm": 5.151767048741731, + "learning_rate": 9.991631724547467e-07, + "loss": 1.8317, + "step": 1730 + }, + { + "epoch": 0.11690980762103703, + "grad_norm": 4.9360054197776115, + "learning_rate": 9.99128819352007e-07, + "loss": 1.8911, + "step": 1735 + }, + { + "epoch": 0.11724672349314376, + "grad_norm": 4.885881464702479, + "learning_rate": 9.99093775851259e-07, + "loss": 1.8769, + "step": 1740 + }, + { + "epoch": 0.1175836393652505, + "grad_norm": 4.760952548326224, + "learning_rate": 9.990580420009755e-07, + "loss": 1.8642, + "step": 1745 + }, + { + "epoch": 0.11792055523735723, + "grad_norm": 4.8133902321301045, + "learning_rate": 9.990216178505835e-07, + "loss": 1.8558, + "step": 1750 + }, + { + "epoch": 0.11825747110946397, + "grad_norm": 5.18502712536443, + "learning_rate": 9.989845034504651e-07, + "loss": 1.7965, + "step": 1755 + }, + { + "epoch": 0.1185943869815707, + "grad_norm": 5.582199965208816, + "learning_rate": 9.989466988519572e-07, + "loss": 1.8008, + "step": 1760 + }, + { + "epoch": 0.11893130285367744, + "grad_norm": 5.063498967253465, + "learning_rate": 9.989082041073517e-07, + "loss": 1.7832, + "step": 1765 + }, + { + "epoch": 0.11926821872578418, + "grad_norm": 4.983703694041557, + "learning_rate": 9.988690192698944e-07, + "loss": 1.8452, + "step": 1770 + }, + { + "epoch": 0.1196051345978909, + "grad_norm": 4.984092304046178, + "learning_rate": 9.988291443937857e-07, + "loss": 1.86, + "step": 1775 + }, + { + "epoch": 0.11994205046999765, + "grad_norm": 4.616659709966719, + "learning_rate": 9.987885795341816e-07, + "loss": 1.8214, + "step": 1780 + }, + { + "epoch": 0.12027896634210437, + "grad_norm": 4.824774887957237, + "learning_rate": 9.987473247471908e-07, + "loss": 1.8151, + "step": 1785 + }, + { + "epoch": 0.12061588221421111, + "grad_norm": 5.701725337107119, + "learning_rate": 9.98705380089878e-07, + "loss": 1.8337, + "step": 1790 + }, + { + "epoch": 0.12095279808631784, + "grad_norm": 4.981571796147867, + "learning_rate": 9.986627456202608e-07, + "loss": 1.8823, + "step": 1795 + }, + { + "epoch": 0.12128971395842458, + "grad_norm": 5.360549733357877, + "learning_rate": 9.986194213973113e-07, + "loss": 1.8113, + "step": 1800 + }, + { + "epoch": 0.12162662983053131, + "grad_norm": 4.673029000872237, + "learning_rate": 9.985754074809562e-07, + "loss": 1.774, + "step": 1805 + }, + { + "epoch": 0.12196354570263805, + "grad_norm": 4.636511070093299, + "learning_rate": 9.985307039320756e-07, + "loss": 1.8114, + "step": 1810 + }, + { + "epoch": 0.12230046157474479, + "grad_norm": 4.821261166128235, + "learning_rate": 9.98485310812504e-07, + "loss": 1.8265, + "step": 1815 + }, + { + "epoch": 0.12263737744685152, + "grad_norm": 4.475912019826286, + "learning_rate": 9.98439228185029e-07, + "loss": 1.8268, + "step": 1820 + }, + { + "epoch": 0.12297429331895826, + "grad_norm": 5.24713691341664, + "learning_rate": 9.983924561133927e-07, + "loss": 1.751, + "step": 1825 + }, + { + "epoch": 0.12331120919106499, + "grad_norm": 5.333336877544052, + "learning_rate": 9.983449946622906e-07, + "loss": 1.7927, + "step": 1830 + }, + { + "epoch": 0.12364812506317173, + "grad_norm": 4.630894115286722, + "learning_rate": 9.982968438973714e-07, + "loss": 1.8225, + "step": 1835 + }, + { + "epoch": 0.12398504093527846, + "grad_norm": 4.971319322552391, + "learning_rate": 9.982480038852375e-07, + "loss": 1.8832, + "step": 1840 + }, + { + "epoch": 0.1243219568073852, + "grad_norm": 5.041019026607787, + "learning_rate": 9.98198474693445e-07, + "loss": 1.8324, + "step": 1845 + }, + { + "epoch": 0.12465887267949192, + "grad_norm": 5.096166963207779, + "learning_rate": 9.981482563905025e-07, + "loss": 1.8775, + "step": 1850 + }, + { + "epoch": 0.12499578855159867, + "grad_norm": 4.644464767921582, + "learning_rate": 9.980973490458728e-07, + "loss": 1.8348, + "step": 1855 + }, + { + "epoch": 0.1253327044237054, + "grad_norm": 4.689654175446751, + "learning_rate": 9.980457527299708e-07, + "loss": 1.8309, + "step": 1860 + }, + { + "epoch": 0.12566962029581213, + "grad_norm": 5.007707026705937, + "learning_rate": 9.979934675141652e-07, + "loss": 1.8026, + "step": 1865 + }, + { + "epoch": 0.12600653616791888, + "grad_norm": 4.935975744739416, + "learning_rate": 9.979404934707771e-07, + "loss": 1.7989, + "step": 1870 + }, + { + "epoch": 0.12634345204002562, + "grad_norm": 5.295148949180079, + "learning_rate": 9.978868306730804e-07, + "loss": 1.814, + "step": 1875 + }, + { + "epoch": 0.12668036791213233, + "grad_norm": 6.425946944455151, + "learning_rate": 9.978324791953018e-07, + "loss": 1.808, + "step": 1880 + }, + { + "epoch": 0.12701728378423907, + "grad_norm": 4.63375202539753, + "learning_rate": 9.97777439112621e-07, + "loss": 1.8116, + "step": 1885 + }, + { + "epoch": 0.1273541996563458, + "grad_norm": 5.567189408814499, + "learning_rate": 9.977217105011693e-07, + "loss": 1.7951, + "step": 1890 + }, + { + "epoch": 0.12769111552845255, + "grad_norm": 4.967271192367276, + "learning_rate": 9.97665293438031e-07, + "loss": 1.8449, + "step": 1895 + }, + { + "epoch": 0.12802803140055927, + "grad_norm": 5.083573907507721, + "learning_rate": 9.976081880012426e-07, + "loss": 1.7821, + "step": 1900 + }, + { + "epoch": 0.128364947272666, + "grad_norm": 5.263763064210331, + "learning_rate": 9.975503942697925e-07, + "loss": 1.8511, + "step": 1905 + }, + { + "epoch": 0.12870186314477275, + "grad_norm": 4.805567854667613, + "learning_rate": 9.974919123236217e-07, + "loss": 1.818, + "step": 1910 + }, + { + "epoch": 0.1290387790168795, + "grad_norm": 5.3266090858155755, + "learning_rate": 9.974327422436223e-07, + "loss": 1.7796, + "step": 1915 + }, + { + "epoch": 0.12937569488898623, + "grad_norm": 4.317364534885699, + "learning_rate": 9.97372884111639e-07, + "loss": 1.8047, + "step": 1920 + }, + { + "epoch": 0.12971261076109294, + "grad_norm": 4.877735058995506, + "learning_rate": 9.97312338010468e-07, + "loss": 1.8578, + "step": 1925 + }, + { + "epoch": 0.13004952663319969, + "grad_norm": 4.802457765477097, + "learning_rate": 9.97251104023857e-07, + "loss": 1.8649, + "step": 1930 + }, + { + "epoch": 0.13038644250530643, + "grad_norm": 4.55290352417241, + "learning_rate": 9.971891822365048e-07, + "loss": 1.8144, + "step": 1935 + }, + { + "epoch": 0.13072335837741317, + "grad_norm": 4.808796442273753, + "learning_rate": 9.971265727340627e-07, + "loss": 1.8112, + "step": 1940 + }, + { + "epoch": 0.13106027424951988, + "grad_norm": 5.075751866145093, + "learning_rate": 9.970632756031322e-07, + "loss": 1.8588, + "step": 1945 + }, + { + "epoch": 0.13139719012162662, + "grad_norm": 5.368095057405194, + "learning_rate": 9.969992909312658e-07, + "loss": 1.8349, + "step": 1950 + }, + { + "epoch": 0.13173410599373336, + "grad_norm": 4.543717796981582, + "learning_rate": 9.969346188069684e-07, + "loss": 1.8516, + "step": 1955 + }, + { + "epoch": 0.1320710218658401, + "grad_norm": 4.810992798528759, + "learning_rate": 9.968692593196943e-07, + "loss": 1.7291, + "step": 1960 + }, + { + "epoch": 0.13240793773794685, + "grad_norm": 4.7057325097189935, + "learning_rate": 9.968032125598493e-07, + "loss": 1.9157, + "step": 1965 + }, + { + "epoch": 0.13274485361005356, + "grad_norm": 4.6279852022273635, + "learning_rate": 9.967364786187894e-07, + "loss": 1.804, + "step": 1970 + }, + { + "epoch": 0.1330817694821603, + "grad_norm": 4.797725869731827, + "learning_rate": 9.96669057588822e-07, + "loss": 1.827, + "step": 1975 + }, + { + "epoch": 0.13341868535426704, + "grad_norm": 4.63352318189097, + "learning_rate": 9.966009495632037e-07, + "loss": 1.7916, + "step": 1980 + }, + { + "epoch": 0.13375560122637378, + "grad_norm": 4.63007800091646, + "learning_rate": 9.965321546361421e-07, + "loss": 1.7955, + "step": 1985 + }, + { + "epoch": 0.1340925170984805, + "grad_norm": 4.592355530933701, + "learning_rate": 9.964626729027948e-07, + "loss": 1.8173, + "step": 1990 + }, + { + "epoch": 0.13442943297058724, + "grad_norm": 4.754656508753518, + "learning_rate": 9.963925044592695e-07, + "loss": 1.8508, + "step": 1995 + }, + { + "epoch": 0.13476634884269398, + "grad_norm": 5.036001769976464, + "learning_rate": 9.963216494026235e-07, + "loss": 1.7708, + "step": 2000 + }, + { + "epoch": 0.13510326471480072, + "grad_norm": 4.680634436660986, + "learning_rate": 9.962501078308636e-07, + "loss": 1.8545, + "step": 2005 + }, + { + "epoch": 0.13544018058690746, + "grad_norm": 5.019081719432215, + "learning_rate": 9.96177879842947e-07, + "loss": 1.8008, + "step": 2010 + }, + { + "epoch": 0.13577709645901417, + "grad_norm": 5.238373802263778, + "learning_rate": 9.961049655387799e-07, + "loss": 1.8496, + "step": 2015 + }, + { + "epoch": 0.13611401233112091, + "grad_norm": 4.796232137071534, + "learning_rate": 9.960313650192175e-07, + "loss": 1.8012, + "step": 2020 + }, + { + "epoch": 0.13645092820322766, + "grad_norm": 4.971202397273539, + "learning_rate": 9.959570783860647e-07, + "loss": 1.7943, + "step": 2025 + }, + { + "epoch": 0.1367878440753344, + "grad_norm": 4.74948378121568, + "learning_rate": 9.958821057420752e-07, + "loss": 1.84, + "step": 2030 + }, + { + "epoch": 0.1371247599474411, + "grad_norm": 4.845185928928487, + "learning_rate": 9.958064471909513e-07, + "loss": 1.7563, + "step": 2035 + }, + { + "epoch": 0.13746167581954785, + "grad_norm": 4.822222510818922, + "learning_rate": 9.95730102837345e-07, + "loss": 1.7998, + "step": 2040 + }, + { + "epoch": 0.1377985916916546, + "grad_norm": 4.219454587091505, + "learning_rate": 9.956530727868558e-07, + "loss": 1.9019, + "step": 2045 + }, + { + "epoch": 0.13813550756376133, + "grad_norm": 4.624461076983736, + "learning_rate": 9.955753571460322e-07, + "loss": 1.8316, + "step": 2050 + }, + { + "epoch": 0.13847242343586808, + "grad_norm": 4.895182906588043, + "learning_rate": 9.95496956022371e-07, + "loss": 1.822, + "step": 2055 + }, + { + "epoch": 0.1388093393079748, + "grad_norm": 5.1361871293028045, + "learning_rate": 9.95417869524317e-07, + "loss": 1.8116, + "step": 2060 + }, + { + "epoch": 0.13914625518008153, + "grad_norm": 4.598925274456651, + "learning_rate": 9.953380977612633e-07, + "loss": 1.8424, + "step": 2065 + }, + { + "epoch": 0.13948317105218827, + "grad_norm": 4.804272735140545, + "learning_rate": 9.952576408435505e-07, + "loss": 1.8828, + "step": 2070 + }, + { + "epoch": 0.139820086924295, + "grad_norm": 4.6975159711246075, + "learning_rate": 9.951764988824674e-07, + "loss": 1.8063, + "step": 2075 + }, + { + "epoch": 0.14015700279640173, + "grad_norm": 4.544302853074937, + "learning_rate": 9.950946719902498e-07, + "loss": 1.8352, + "step": 2080 + }, + { + "epoch": 0.14049391866850847, + "grad_norm": 4.3131351605485975, + "learning_rate": 9.950121602800813e-07, + "loss": 1.7853, + "step": 2085 + }, + { + "epoch": 0.1408308345406152, + "grad_norm": 4.95121376201706, + "learning_rate": 9.949289638660922e-07, + "loss": 1.7655, + "step": 2090 + }, + { + "epoch": 0.14116775041272195, + "grad_norm": 4.776858845008987, + "learning_rate": 9.948450828633608e-07, + "loss": 1.799, + "step": 2095 + }, + { + "epoch": 0.1415046662848287, + "grad_norm": 4.620614138437872, + "learning_rate": 9.947605173879115e-07, + "loss": 1.789, + "step": 2100 + }, + { + "epoch": 0.1418415821569354, + "grad_norm": 4.734440582941355, + "learning_rate": 9.94675267556716e-07, + "loss": 1.7768, + "step": 2105 + }, + { + "epoch": 0.14217849802904214, + "grad_norm": 5.165465783061854, + "learning_rate": 9.94589333487692e-07, + "loss": 1.79, + "step": 2110 + }, + { + "epoch": 0.14251541390114889, + "grad_norm": 4.910963698898701, + "learning_rate": 9.945027152997046e-07, + "loss": 1.8761, + "step": 2115 + }, + { + "epoch": 0.14285232977325563, + "grad_norm": 4.8125377943903445, + "learning_rate": 9.944154131125642e-07, + "loss": 1.8496, + "step": 2120 + }, + { + "epoch": 0.14318924564536234, + "grad_norm": 4.980214347335999, + "learning_rate": 9.94327427047028e-07, + "loss": 1.8448, + "step": 2125 + }, + { + "epoch": 0.14352616151746908, + "grad_norm": 5.2676984518019845, + "learning_rate": 9.942387572247983e-07, + "loss": 1.8144, + "step": 2130 + }, + { + "epoch": 0.14386307738957582, + "grad_norm": 4.578702371456273, + "learning_rate": 9.941494037685243e-07, + "loss": 1.8304, + "step": 2135 + }, + { + "epoch": 0.14419999326168256, + "grad_norm": 4.560416512561316, + "learning_rate": 9.940593668017998e-07, + "loss": 1.8057, + "step": 2140 + }, + { + "epoch": 0.1445369091337893, + "grad_norm": 5.543083573534103, + "learning_rate": 9.93968646449165e-07, + "loss": 1.8248, + "step": 2145 + }, + { + "epoch": 0.14487382500589602, + "grad_norm": 4.966033102521214, + "learning_rate": 9.938772428361045e-07, + "loss": 1.8475, + "step": 2150 + }, + { + "epoch": 0.14521074087800276, + "grad_norm": 5.1807178939148395, + "learning_rate": 9.937851560890484e-07, + "loss": 1.7746, + "step": 2155 + }, + { + "epoch": 0.1455476567501095, + "grad_norm": 4.75754428630411, + "learning_rate": 9.936923863353717e-07, + "loss": 1.8487, + "step": 2160 + }, + { + "epoch": 0.14588457262221624, + "grad_norm": 4.574863274091443, + "learning_rate": 9.935989337033939e-07, + "loss": 1.8253, + "step": 2165 + }, + { + "epoch": 0.14622148849432295, + "grad_norm": 4.544911337212059, + "learning_rate": 9.935047983223794e-07, + "loss": 1.7665, + "step": 2170 + }, + { + "epoch": 0.1465584043664297, + "grad_norm": 4.536591505508547, + "learning_rate": 9.934099803225367e-07, + "loss": 1.7767, + "step": 2175 + }, + { + "epoch": 0.14689532023853644, + "grad_norm": 4.588954312124332, + "learning_rate": 9.933144798350188e-07, + "loss": 1.8268, + "step": 2180 + }, + { + "epoch": 0.14723223611064318, + "grad_norm": 4.893042613752849, + "learning_rate": 9.932182969919228e-07, + "loss": 1.8961, + "step": 2185 + }, + { + "epoch": 0.14756915198274992, + "grad_norm": 4.131484985175047, + "learning_rate": 9.931214319262885e-07, + "loss": 1.7747, + "step": 2190 + }, + { + "epoch": 0.14790606785485663, + "grad_norm": 4.78203381923773, + "learning_rate": 9.930238847721013e-07, + "loss": 1.7877, + "step": 2195 + }, + { + "epoch": 0.14824298372696337, + "grad_norm": 9.076085543417564, + "learning_rate": 9.929256556642884e-07, + "loss": 1.7307, + "step": 2200 + }, + { + "epoch": 0.14857989959907011, + "grad_norm": 4.873351760425082, + "learning_rate": 9.92826744738721e-07, + "loss": 1.865, + "step": 2205 + }, + { + "epoch": 0.14891681547117686, + "grad_norm": 11.613145342066806, + "learning_rate": 9.927271521322134e-07, + "loss": 1.8322, + "step": 2210 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 4.743206734367761, + "learning_rate": 9.926268779825224e-07, + "loss": 1.8232, + "step": 2215 + }, + { + "epoch": 0.1495906472153903, + "grad_norm": 4.449486056308384, + "learning_rate": 9.925259224283484e-07, + "loss": 1.7962, + "step": 2220 + }, + { + "epoch": 0.14992756308749705, + "grad_norm": 4.712926106412736, + "learning_rate": 9.924242856093332e-07, + "loss": 1.7963, + "step": 2225 + }, + { + "epoch": 0.1502644789596038, + "grad_norm": 4.312335008664891, + "learning_rate": 9.923219676660614e-07, + "loss": 1.7713, + "step": 2230 + }, + { + "epoch": 0.15060139483171053, + "grad_norm": 4.519951384285313, + "learning_rate": 9.922189687400603e-07, + "loss": 1.7679, + "step": 2235 + }, + { + "epoch": 0.15093831070381725, + "grad_norm": 5.135990711467957, + "learning_rate": 9.921152889737984e-07, + "loss": 1.8021, + "step": 2240 + }, + { + "epoch": 0.151275226575924, + "grad_norm": 4.969576681103405, + "learning_rate": 9.92010928510686e-07, + "loss": 1.7999, + "step": 2245 + }, + { + "epoch": 0.15161214244803073, + "grad_norm": 5.093841787426275, + "learning_rate": 9.919058874950754e-07, + "loss": 1.8234, + "step": 2250 + }, + { + "epoch": 0.15194905832013747, + "grad_norm": 4.306254185807312, + "learning_rate": 9.9180016607226e-07, + "loss": 1.7889, + "step": 2255 + }, + { + "epoch": 0.15228597419224418, + "grad_norm": 5.155846839902419, + "learning_rate": 9.916937643884737e-07, + "loss": 1.8429, + "step": 2260 + }, + { + "epoch": 0.15262289006435092, + "grad_norm": 5.240572597887352, + "learning_rate": 9.915866825908927e-07, + "loss": 1.8422, + "step": 2265 + }, + { + "epoch": 0.15295980593645767, + "grad_norm": 4.27444987327846, + "learning_rate": 9.914789208276329e-07, + "loss": 1.8744, + "step": 2270 + }, + { + "epoch": 0.1532967218085644, + "grad_norm": 5.0907807471676465, + "learning_rate": 9.913704792477511e-07, + "loss": 1.8303, + "step": 2275 + }, + { + "epoch": 0.15363363768067115, + "grad_norm": 4.14235267626192, + "learning_rate": 9.91261358001244e-07, + "loss": 1.7255, + "step": 2280 + }, + { + "epoch": 0.15397055355277786, + "grad_norm": 4.74663474434792, + "learning_rate": 9.911515572390495e-07, + "loss": 1.809, + "step": 2285 + }, + { + "epoch": 0.1543074694248846, + "grad_norm": 4.984496588449255, + "learning_rate": 9.91041077113044e-07, + "loss": 1.8025, + "step": 2290 + }, + { + "epoch": 0.15464438529699134, + "grad_norm": 4.850549839234142, + "learning_rate": 9.909299177760445e-07, + "loss": 1.8208, + "step": 2295 + }, + { + "epoch": 0.15498130116909808, + "grad_norm": 4.816762254065827, + "learning_rate": 9.90818079381807e-07, + "loss": 1.8963, + "step": 2300 + }, + { + "epoch": 0.1553182170412048, + "grad_norm": 4.930040222816709, + "learning_rate": 9.907055620850277e-07, + "loss": 1.7785, + "step": 2305 + }, + { + "epoch": 0.15565513291331154, + "grad_norm": 6.8194169990152576, + "learning_rate": 9.905923660413409e-07, + "loss": 1.7386, + "step": 2310 + }, + { + "epoch": 0.15599204878541828, + "grad_norm": 5.119532349582884, + "learning_rate": 9.904784914073196e-07, + "loss": 1.8177, + "step": 2315 + }, + { + "epoch": 0.15632896465752502, + "grad_norm": 4.884587026760344, + "learning_rate": 9.903639383404765e-07, + "loss": 1.7836, + "step": 2320 + }, + { + "epoch": 0.15666588052963176, + "grad_norm": 4.790810189179543, + "learning_rate": 9.902487069992618e-07, + "loss": 1.7596, + "step": 2325 + }, + { + "epoch": 0.15700279640173848, + "grad_norm": 5.134335268782031, + "learning_rate": 9.901327975430645e-07, + "loss": 1.8548, + "step": 2330 + }, + { + "epoch": 0.15733971227384522, + "grad_norm": 5.08823307216309, + "learning_rate": 9.900162101322106e-07, + "loss": 1.7812, + "step": 2335 + }, + { + "epoch": 0.15767662814595196, + "grad_norm": 5.059545413463581, + "learning_rate": 9.898989449279653e-07, + "loss": 1.7619, + "step": 2340 + }, + { + "epoch": 0.1580135440180587, + "grad_norm": 4.680401030004896, + "learning_rate": 9.8978100209253e-07, + "loss": 1.8315, + "step": 2345 + }, + { + "epoch": 0.1583504598901654, + "grad_norm": 4.600218824615641, + "learning_rate": 9.89662381789044e-07, + "loss": 1.8484, + "step": 2350 + }, + { + "epoch": 0.15868737576227215, + "grad_norm": 4.937748124614686, + "learning_rate": 9.89543084181584e-07, + "loss": 1.7603, + "step": 2355 + }, + { + "epoch": 0.1590242916343789, + "grad_norm": 5.031865797237023, + "learning_rate": 9.894231094351628e-07, + "loss": 1.8257, + "step": 2360 + }, + { + "epoch": 0.15936120750648564, + "grad_norm": 4.489253013228351, + "learning_rate": 9.893024577157303e-07, + "loss": 1.8541, + "step": 2365 + }, + { + "epoch": 0.15969812337859238, + "grad_norm": 4.759951564369919, + "learning_rate": 9.891811291901727e-07, + "loss": 1.7888, + "step": 2370 + }, + { + "epoch": 0.1600350392506991, + "grad_norm": 4.803084737310507, + "learning_rate": 9.890591240263124e-07, + "loss": 1.7885, + "step": 2375 + }, + { + "epoch": 0.16037195512280583, + "grad_norm": 4.777856099384717, + "learning_rate": 9.889364423929075e-07, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 0.16070887099491257, + "grad_norm": 4.658918440571349, + "learning_rate": 9.888130844596524e-07, + "loss": 1.7666, + "step": 2385 + }, + { + "epoch": 0.1610457868670193, + "grad_norm": 4.643205378034103, + "learning_rate": 9.88689050397176e-07, + "loss": 1.8317, + "step": 2390 + }, + { + "epoch": 0.16138270273912603, + "grad_norm": 4.758968390841792, + "learning_rate": 9.885643403770431e-07, + "loss": 1.7614, + "step": 2395 + }, + { + "epoch": 0.16171961861123277, + "grad_norm": 5.076582216217152, + "learning_rate": 9.884389545717538e-07, + "loss": 1.7926, + "step": 2400 + }, + { + "epoch": 0.1620565344833395, + "grad_norm": 5.28772251665507, + "learning_rate": 9.88312893154742e-07, + "loss": 1.7311, + "step": 2405 + }, + { + "epoch": 0.16239345035544625, + "grad_norm": 4.759748331327468, + "learning_rate": 9.881861563003766e-07, + "loss": 1.8694, + "step": 2410 + }, + { + "epoch": 0.162730366227553, + "grad_norm": 4.5348602623665535, + "learning_rate": 9.880587441839613e-07, + "loss": 1.783, + "step": 2415 + }, + { + "epoch": 0.1630672820996597, + "grad_norm": 4.92603836017639, + "learning_rate": 9.87930656981733e-07, + "loss": 1.8056, + "step": 2420 + }, + { + "epoch": 0.16340419797176645, + "grad_norm": 5.15426511329714, + "learning_rate": 9.878018948708625e-07, + "loss": 1.6898, + "step": 2425 + }, + { + "epoch": 0.1637411138438732, + "grad_norm": 4.970467692997654, + "learning_rate": 9.876724580294546e-07, + "loss": 1.8129, + "step": 2430 + }, + { + "epoch": 0.16407802971597993, + "grad_norm": 4.584293618459513, + "learning_rate": 9.875423466365471e-07, + "loss": 1.8001, + "step": 2435 + }, + { + "epoch": 0.16441494558808664, + "grad_norm": 4.617563282597616, + "learning_rate": 9.874115608721107e-07, + "loss": 1.7551, + "step": 2440 + }, + { + "epoch": 0.16475186146019338, + "grad_norm": 4.756934248759054, + "learning_rate": 9.872801009170492e-07, + "loss": 1.8694, + "step": 2445 + }, + { + "epoch": 0.16508877733230012, + "grad_norm": 4.474706782061384, + "learning_rate": 9.871479669531988e-07, + "loss": 1.7236, + "step": 2450 + }, + { + "epoch": 0.16542569320440687, + "grad_norm": 5.216098770623675, + "learning_rate": 9.87015159163328e-07, + "loss": 1.7731, + "step": 2455 + }, + { + "epoch": 0.1657626090765136, + "grad_norm": 5.3566900141122105, + "learning_rate": 9.868816777311372e-07, + "loss": 1.8435, + "step": 2460 + }, + { + "epoch": 0.16609952494862032, + "grad_norm": 4.724843591610029, + "learning_rate": 9.867475228412592e-07, + "loss": 1.7726, + "step": 2465 + }, + { + "epoch": 0.16643644082072706, + "grad_norm": 5.500471088365371, + "learning_rate": 9.866126946792572e-07, + "loss": 1.7542, + "step": 2470 + }, + { + "epoch": 0.1667733566928338, + "grad_norm": 4.7605627099736125, + "learning_rate": 9.864771934316268e-07, + "loss": 1.7708, + "step": 2475 + }, + { + "epoch": 0.16711027256494054, + "grad_norm": 4.725097378571309, + "learning_rate": 9.863410192857938e-07, + "loss": 1.8176, + "step": 2480 + }, + { + "epoch": 0.16744718843704726, + "grad_norm": 4.5019201902072234, + "learning_rate": 9.862041724301154e-07, + "loss": 1.8011, + "step": 2485 + }, + { + "epoch": 0.167784104309154, + "grad_norm": 4.563439436344494, + "learning_rate": 9.860666530538787e-07, + "loss": 1.8648, + "step": 2490 + }, + { + "epoch": 0.16812102018126074, + "grad_norm": 4.245541125514505, + "learning_rate": 9.859284613473017e-07, + "loss": 1.7836, + "step": 2495 + }, + { + "epoch": 0.16845793605336748, + "grad_norm": 4.289654542861537, + "learning_rate": 9.857895975015318e-07, + "loss": 1.8215, + "step": 2500 + }, + { + "epoch": 0.16879485192547422, + "grad_norm": 4.680020276493947, + "learning_rate": 9.856500617086463e-07, + "loss": 1.7658, + "step": 2505 + }, + { + "epoch": 0.16913176779758093, + "grad_norm": 4.79697808623628, + "learning_rate": 9.85509854161652e-07, + "loss": 1.7638, + "step": 2510 + }, + { + "epoch": 0.16946868366968768, + "grad_norm": 4.688992073069253, + "learning_rate": 9.853689750544849e-07, + "loss": 1.8517, + "step": 2515 + }, + { + "epoch": 0.16980559954179442, + "grad_norm": 9.766438264833477, + "learning_rate": 9.852274245820095e-07, + "loss": 1.7899, + "step": 2520 + }, + { + "epoch": 0.17014251541390116, + "grad_norm": 5.374427197648961, + "learning_rate": 9.850852029400198e-07, + "loss": 1.8161, + "step": 2525 + }, + { + "epoch": 0.17047943128600787, + "grad_norm": 4.740432344703743, + "learning_rate": 9.849423103252374e-07, + "loss": 1.767, + "step": 2530 + }, + { + "epoch": 0.1708163471581146, + "grad_norm": 4.848915892043279, + "learning_rate": 9.84798746935312e-07, + "loss": 1.7695, + "step": 2535 + }, + { + "epoch": 0.17115326303022135, + "grad_norm": 4.546388184572521, + "learning_rate": 9.846545129688217e-07, + "loss": 1.731, + "step": 2540 + }, + { + "epoch": 0.1714901789023281, + "grad_norm": 4.920312702284114, + "learning_rate": 9.845096086252716e-07, + "loss": 1.7672, + "step": 2545 + }, + { + "epoch": 0.17182709477443484, + "grad_norm": 4.6089934584516135, + "learning_rate": 9.843640341050944e-07, + "loss": 1.846, + "step": 2550 + }, + { + "epoch": 0.17216401064654155, + "grad_norm": 4.56883920848349, + "learning_rate": 9.842177896096493e-07, + "loss": 1.8548, + "step": 2555 + }, + { + "epoch": 0.1725009265186483, + "grad_norm": 4.704343360713175, + "learning_rate": 9.84070875341223e-07, + "loss": 1.7832, + "step": 2560 + }, + { + "epoch": 0.17283784239075503, + "grad_norm": 4.560170584618913, + "learning_rate": 9.83923291503028e-07, + "loss": 1.8017, + "step": 2565 + }, + { + "epoch": 0.17317475826286177, + "grad_norm": 4.648787556226848, + "learning_rate": 9.837750382992033e-07, + "loss": 1.775, + "step": 2570 + }, + { + "epoch": 0.17351167413496849, + "grad_norm": 4.863410571412938, + "learning_rate": 9.836261159348135e-07, + "loss": 1.7932, + "step": 2575 + }, + { + "epoch": 0.17384859000707523, + "grad_norm": 4.952071220515145, + "learning_rate": 9.834765246158488e-07, + "loss": 1.8207, + "step": 2580 + }, + { + "epoch": 0.17418550587918197, + "grad_norm": 4.8410219655558615, + "learning_rate": 9.83326264549225e-07, + "loss": 1.818, + "step": 2585 + }, + { + "epoch": 0.1745224217512887, + "grad_norm": 4.5337820540456795, + "learning_rate": 9.83175335942783e-07, + "loss": 1.7326, + "step": 2590 + }, + { + "epoch": 0.17485933762339545, + "grad_norm": 4.7911244830213215, + "learning_rate": 9.830237390052876e-07, + "loss": 1.7431, + "step": 2595 + }, + { + "epoch": 0.17519625349550216, + "grad_norm": 5.11291379581688, + "learning_rate": 9.82871473946429e-07, + "loss": 1.8238, + "step": 2600 + }, + { + "epoch": 0.1755331693676089, + "grad_norm": 4.8347628505716145, + "learning_rate": 9.82718540976821e-07, + "loss": 1.781, + "step": 2605 + }, + { + "epoch": 0.17587008523971565, + "grad_norm": 5.1804157814006455, + "learning_rate": 9.825649403080015e-07, + "loss": 1.781, + "step": 2610 + }, + { + "epoch": 0.1762070011118224, + "grad_norm": 4.94004659492207, + "learning_rate": 9.824106721524317e-07, + "loss": 1.737, + "step": 2615 + }, + { + "epoch": 0.1765439169839291, + "grad_norm": 4.583871412666966, + "learning_rate": 9.822557367234962e-07, + "loss": 1.7941, + "step": 2620 + }, + { + "epoch": 0.17688083285603584, + "grad_norm": 4.996180207831546, + "learning_rate": 9.82100134235503e-07, + "loss": 1.8262, + "step": 2625 + }, + { + "epoch": 0.17721774872814258, + "grad_norm": 4.799695846367883, + "learning_rate": 9.819438649036823e-07, + "loss": 1.8264, + "step": 2630 + }, + { + "epoch": 0.17755466460024932, + "grad_norm": 4.91429312170365, + "learning_rate": 9.817869289441864e-07, + "loss": 1.7259, + "step": 2635 + }, + { + "epoch": 0.17789158047235606, + "grad_norm": 4.8369882287978365, + "learning_rate": 9.816293265740907e-07, + "loss": 1.8009, + "step": 2640 + }, + { + "epoch": 0.17822849634446278, + "grad_norm": 7.820380148630697, + "learning_rate": 9.81471058011391e-07, + "loss": 1.812, + "step": 2645 + }, + { + "epoch": 0.17856541221656952, + "grad_norm": 4.591956442861727, + "learning_rate": 9.81312123475006e-07, + "loss": 1.7916, + "step": 2650 + }, + { + "epoch": 0.17890232808867626, + "grad_norm": 5.029746480437414, + "learning_rate": 9.811525231847746e-07, + "loss": 1.8452, + "step": 2655 + }, + { + "epoch": 0.179239243960783, + "grad_norm": 4.811826857148872, + "learning_rate": 9.809922573614569e-07, + "loss": 1.7681, + "step": 2660 + }, + { + "epoch": 0.17957615983288971, + "grad_norm": 4.900869827689216, + "learning_rate": 9.808313262267337e-07, + "loss": 1.7657, + "step": 2665 + }, + { + "epoch": 0.17991307570499646, + "grad_norm": 4.717354952119474, + "learning_rate": 9.806697300032057e-07, + "loss": 1.7752, + "step": 2670 + }, + { + "epoch": 0.1802499915771032, + "grad_norm": 5.128437055986362, + "learning_rate": 9.805074689143938e-07, + "loss": 1.7887, + "step": 2675 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 5.150193518417055, + "learning_rate": 9.803445431847388e-07, + "loss": 1.7366, + "step": 2680 + }, + { + "epoch": 0.18092382332131668, + "grad_norm": 4.763856294704906, + "learning_rate": 9.801809530396003e-07, + "loss": 1.8641, + "step": 2685 + }, + { + "epoch": 0.1812607391934234, + "grad_norm": 4.425971270262882, + "learning_rate": 9.800166987052572e-07, + "loss": 1.7468, + "step": 2690 + }, + { + "epoch": 0.18159765506553013, + "grad_norm": 4.638375651468754, + "learning_rate": 9.798517804089072e-07, + "loss": 1.9051, + "step": 2695 + }, + { + "epoch": 0.18193457093763687, + "grad_norm": 4.2910226154650335, + "learning_rate": 9.796861983786661e-07, + "loss": 1.729, + "step": 2700 + }, + { + "epoch": 0.18227148680974362, + "grad_norm": 4.529098176187787, + "learning_rate": 9.795199528435682e-07, + "loss": 1.7729, + "step": 2705 + }, + { + "epoch": 0.18260840268185033, + "grad_norm": 4.832329865493561, + "learning_rate": 9.793530440335654e-07, + "loss": 1.7572, + "step": 2710 + }, + { + "epoch": 0.18294531855395707, + "grad_norm": 4.810104590212989, + "learning_rate": 9.791854721795264e-07, + "loss": 1.7605, + "step": 2715 + }, + { + "epoch": 0.1832822344260638, + "grad_norm": 4.820817246781472, + "learning_rate": 9.790172375132385e-07, + "loss": 1.7599, + "step": 2720 + }, + { + "epoch": 0.18361915029817055, + "grad_norm": 4.605366486448338, + "learning_rate": 9.788483402674041e-07, + "loss": 1.8545, + "step": 2725 + }, + { + "epoch": 0.1839560661702773, + "grad_norm": 4.748751803024309, + "learning_rate": 9.786787806756434e-07, + "loss": 1.7956, + "step": 2730 + }, + { + "epoch": 0.184292982042384, + "grad_norm": 5.117426579850719, + "learning_rate": 9.78508558972492e-07, + "loss": 1.8082, + "step": 2735 + }, + { + "epoch": 0.18462989791449075, + "grad_norm": 4.569683293912544, + "learning_rate": 9.783376753934015e-07, + "loss": 1.8217, + "step": 2740 + }, + { + "epoch": 0.1849668137865975, + "grad_norm": 4.834824323691271, + "learning_rate": 9.781661301747393e-07, + "loss": 1.7359, + "step": 2745 + }, + { + "epoch": 0.18530372965870423, + "grad_norm": 4.728916185918625, + "learning_rate": 9.779939235537879e-07, + "loss": 1.8533, + "step": 2750 + }, + { + "epoch": 0.18564064553081094, + "grad_norm": 4.883042265853466, + "learning_rate": 9.778210557687443e-07, + "loss": 1.8023, + "step": 2755 + }, + { + "epoch": 0.18597756140291768, + "grad_norm": 4.652974648533559, + "learning_rate": 9.776475270587205e-07, + "loss": 1.7946, + "step": 2760 + }, + { + "epoch": 0.18631447727502443, + "grad_norm": 4.739243517296054, + "learning_rate": 9.774733376637421e-07, + "loss": 1.7767, + "step": 2765 + }, + { + "epoch": 0.18665139314713117, + "grad_norm": 5.11851445798632, + "learning_rate": 9.772984878247493e-07, + "loss": 1.7976, + "step": 2770 + }, + { + "epoch": 0.1869883090192379, + "grad_norm": 5.074930861110047, + "learning_rate": 9.771229777835952e-07, + "loss": 1.7576, + "step": 2775 + }, + { + "epoch": 0.18732522489134462, + "grad_norm": 4.520407367212375, + "learning_rate": 9.769468077830466e-07, + "loss": 1.7678, + "step": 2780 + }, + { + "epoch": 0.18766214076345136, + "grad_norm": 4.81402917555932, + "learning_rate": 9.767699780667827e-07, + "loss": 1.8164, + "step": 2785 + }, + { + "epoch": 0.1879990566355581, + "grad_norm": 4.964921504879331, + "learning_rate": 9.765924888793955e-07, + "loss": 1.7382, + "step": 2790 + }, + { + "epoch": 0.18833597250766484, + "grad_norm": 4.347692369389646, + "learning_rate": 9.76414340466389e-07, + "loss": 1.8065, + "step": 2795 + }, + { + "epoch": 0.18867288837977156, + "grad_norm": 5.298426455276457, + "learning_rate": 9.762355330741794e-07, + "loss": 1.8219, + "step": 2800 + }, + { + "epoch": 0.1890098042518783, + "grad_norm": 4.5164283817941975, + "learning_rate": 9.760560669500941e-07, + "loss": 1.7944, + "step": 2805 + }, + { + "epoch": 0.18934672012398504, + "grad_norm": 4.715610201482097, + "learning_rate": 9.758759423423716e-07, + "loss": 1.7729, + "step": 2810 + }, + { + "epoch": 0.18968363599609178, + "grad_norm": 4.716260025180655, + "learning_rate": 9.756951595001617e-07, + "loss": 1.7253, + "step": 2815 + }, + { + "epoch": 0.19002055186819852, + "grad_norm": 5.3440331043854075, + "learning_rate": 9.755137186735238e-07, + "loss": 1.7707, + "step": 2820 + }, + { + "epoch": 0.19035746774030524, + "grad_norm": 5.02041013091292, + "learning_rate": 9.753316201134282e-07, + "loss": 1.7937, + "step": 2825 + }, + { + "epoch": 0.19069438361241198, + "grad_norm": 4.555395708579455, + "learning_rate": 9.75148864071755e-07, + "loss": 1.7525, + "step": 2830 + }, + { + "epoch": 0.19103129948451872, + "grad_norm": 5.174506656565936, + "learning_rate": 9.74965450801293e-07, + "loss": 1.7621, + "step": 2835 + }, + { + "epoch": 0.19136821535662546, + "grad_norm": 4.931304293894075, + "learning_rate": 9.747813805557408e-07, + "loss": 1.7584, + "step": 2840 + }, + { + "epoch": 0.19170513122873217, + "grad_norm": 4.7073358538658185, + "learning_rate": 9.745966535897054e-07, + "loss": 1.7606, + "step": 2845 + }, + { + "epoch": 0.19204204710083891, + "grad_norm": 4.582053551568521, + "learning_rate": 9.744112701587024e-07, + "loss": 1.7712, + "step": 2850 + }, + { + "epoch": 0.19237896297294566, + "grad_norm": 4.767250149002942, + "learning_rate": 9.742252305191551e-07, + "loss": 1.7563, + "step": 2855 + }, + { + "epoch": 0.1927158788450524, + "grad_norm": 4.709027679259578, + "learning_rate": 9.740385349283946e-07, + "loss": 1.793, + "step": 2860 + }, + { + "epoch": 0.19305279471715914, + "grad_norm": 4.956158981373378, + "learning_rate": 9.738511836446596e-07, + "loss": 1.8203, + "step": 2865 + }, + { + "epoch": 0.19338971058926585, + "grad_norm": 4.474867003420559, + "learning_rate": 9.736631769270957e-07, + "loss": 1.7836, + "step": 2870 + }, + { + "epoch": 0.1937266264613726, + "grad_norm": 4.843134161039688, + "learning_rate": 9.734745150357544e-07, + "loss": 1.7853, + "step": 2875 + }, + { + "epoch": 0.19406354233347933, + "grad_norm": 4.655571844872947, + "learning_rate": 9.732851982315944e-07, + "loss": 1.8458, + "step": 2880 + }, + { + "epoch": 0.19440045820558607, + "grad_norm": 4.856240480203855, + "learning_rate": 9.730952267764796e-07, + "loss": 1.761, + "step": 2885 + }, + { + "epoch": 0.1947373740776928, + "grad_norm": 4.826308380318007, + "learning_rate": 9.729046009331798e-07, + "loss": 1.7442, + "step": 2890 + }, + { + "epoch": 0.19507428994979953, + "grad_norm": 4.632076537884298, + "learning_rate": 9.727133209653696e-07, + "loss": 1.7747, + "step": 2895 + }, + { + "epoch": 0.19541120582190627, + "grad_norm": 4.493737956408813, + "learning_rate": 9.72521387137629e-07, + "loss": 1.8317, + "step": 2900 + }, + { + "epoch": 0.195748121694013, + "grad_norm": 4.802050440898538, + "learning_rate": 9.723287997154419e-07, + "loss": 1.8332, + "step": 2905 + }, + { + "epoch": 0.19608503756611975, + "grad_norm": 4.480297291602395, + "learning_rate": 9.72135558965196e-07, + "loss": 1.7894, + "step": 2910 + }, + { + "epoch": 0.19642195343822647, + "grad_norm": 4.563719923514326, + "learning_rate": 9.719416651541837e-07, + "loss": 1.7317, + "step": 2915 + }, + { + "epoch": 0.1967588693103332, + "grad_norm": 4.315766072130242, + "learning_rate": 9.717471185505996e-07, + "loss": 1.7465, + "step": 2920 + }, + { + "epoch": 0.19709578518243995, + "grad_norm": 4.769139822472816, + "learning_rate": 9.715519194235422e-07, + "loss": 1.7184, + "step": 2925 + }, + { + "epoch": 0.1974327010545467, + "grad_norm": 4.814639381264535, + "learning_rate": 9.713560680430117e-07, + "loss": 1.801, + "step": 2930 + }, + { + "epoch": 0.1977696169266534, + "grad_norm": 4.894385656327445, + "learning_rate": 9.71159564679911e-07, + "loss": 1.7801, + "step": 2935 + }, + { + "epoch": 0.19810653279876014, + "grad_norm": 4.96835121805525, + "learning_rate": 9.709624096060449e-07, + "loss": 1.7787, + "step": 2940 + }, + { + "epoch": 0.19844344867086688, + "grad_norm": 4.395255870875639, + "learning_rate": 9.707646030941192e-07, + "loss": 1.7774, + "step": 2945 + }, + { + "epoch": 0.19878036454297363, + "grad_norm": 4.596421154913451, + "learning_rate": 9.705661454177416e-07, + "loss": 1.7741, + "step": 2950 + }, + { + "epoch": 0.19911728041508037, + "grad_norm": 4.904942759236581, + "learning_rate": 9.703670368514192e-07, + "loss": 1.7855, + "step": 2955 + }, + { + "epoch": 0.19945419628718708, + "grad_norm": 4.6289143433439035, + "learning_rate": 9.701672776705609e-07, + "loss": 1.7948, + "step": 2960 + }, + { + "epoch": 0.19979111215929382, + "grad_norm": 4.703689873640086, + "learning_rate": 9.699668681514746e-07, + "loss": 1.8122, + "step": 2965 + }, + { + "epoch": 0.20012802803140056, + "grad_norm": 4.322341146884426, + "learning_rate": 9.697658085713676e-07, + "loss": 1.8207, + "step": 2970 + }, + { + "epoch": 0.2004649439035073, + "grad_norm": 5.107208490116794, + "learning_rate": 9.695640992083471e-07, + "loss": 1.732, + "step": 2975 + }, + { + "epoch": 0.20080185977561402, + "grad_norm": 4.618863119770153, + "learning_rate": 9.693617403414188e-07, + "loss": 1.7793, + "step": 2980 + }, + { + "epoch": 0.20113877564772076, + "grad_norm": 5.063764438355028, + "learning_rate": 9.691587322504865e-07, + "loss": 1.7599, + "step": 2985 + }, + { + "epoch": 0.2014756915198275, + "grad_norm": 4.919214411010044, + "learning_rate": 9.68955075216352e-07, + "loss": 1.844, + "step": 2990 + }, + { + "epoch": 0.20181260739193424, + "grad_norm": 7.731079171027298, + "learning_rate": 9.687507695207154e-07, + "loss": 1.7809, + "step": 2995 + }, + { + "epoch": 0.20214952326404098, + "grad_norm": 4.88502046387205, + "learning_rate": 9.685458154461731e-07, + "loss": 1.7439, + "step": 3000 + }, + { + "epoch": 0.2024864391361477, + "grad_norm": 4.325672374975772, + "learning_rate": 9.683402132762193e-07, + "loss": 1.7976, + "step": 3005 + }, + { + "epoch": 0.20282335500825444, + "grad_norm": 4.696787912223931, + "learning_rate": 9.68133963295244e-07, + "loss": 1.8553, + "step": 3010 + }, + { + "epoch": 0.20316027088036118, + "grad_norm": 4.807613388751499, + "learning_rate": 9.679270657885334e-07, + "loss": 1.6998, + "step": 3015 + }, + { + "epoch": 0.20349718675246792, + "grad_norm": 4.563573042960349, + "learning_rate": 9.677195210422693e-07, + "loss": 1.8247, + "step": 3020 + }, + { + "epoch": 0.20383410262457463, + "grad_norm": 4.6931253642053115, + "learning_rate": 9.675113293435288e-07, + "loss": 1.7735, + "step": 3025 + }, + { + "epoch": 0.20417101849668137, + "grad_norm": 4.478719320401698, + "learning_rate": 9.673024909802841e-07, + "loss": 1.745, + "step": 3030 + }, + { + "epoch": 0.2045079343687881, + "grad_norm": 4.447484104577244, + "learning_rate": 9.670930062414017e-07, + "loss": 1.7063, + "step": 3035 + }, + { + "epoch": 0.20484485024089485, + "grad_norm": 4.8927883667051955, + "learning_rate": 9.66882875416642e-07, + "loss": 1.8208, + "step": 3040 + }, + { + "epoch": 0.2051817661130016, + "grad_norm": 4.682209855532116, + "learning_rate": 9.666720987966595e-07, + "loss": 1.7681, + "step": 3045 + }, + { + "epoch": 0.2055186819851083, + "grad_norm": 4.916006159522313, + "learning_rate": 9.664606766730012e-07, + "loss": 1.7754, + "step": 3050 + }, + { + "epoch": 0.20585559785721505, + "grad_norm": 4.684215933574209, + "learning_rate": 9.662486093381082e-07, + "loss": 1.7737, + "step": 3055 + }, + { + "epoch": 0.2061925137293218, + "grad_norm": 4.813741584691905, + "learning_rate": 9.660358970853126e-07, + "loss": 1.7012, + "step": 3060 + }, + { + "epoch": 0.20652942960142853, + "grad_norm": 4.382040182047615, + "learning_rate": 9.658225402088395e-07, + "loss": 1.7321, + "step": 3065 + }, + { + "epoch": 0.20686634547353525, + "grad_norm": 4.458843911617041, + "learning_rate": 9.656085390038058e-07, + "loss": 1.7993, + "step": 3070 + }, + { + "epoch": 0.207203261345642, + "grad_norm": 4.55896931783871, + "learning_rate": 9.653938937662187e-07, + "loss": 1.7798, + "step": 3075 + }, + { + "epoch": 0.20754017721774873, + "grad_norm": 4.292995630768965, + "learning_rate": 9.651786047929772e-07, + "loss": 1.768, + "step": 3080 + }, + { + "epoch": 0.20787709308985547, + "grad_norm": 4.741061046681705, + "learning_rate": 9.649626723818702e-07, + "loss": 1.7983, + "step": 3085 + }, + { + "epoch": 0.2082140089619622, + "grad_norm": 4.914730551973815, + "learning_rate": 9.647460968315767e-07, + "loss": 1.8451, + "step": 3090 + }, + { + "epoch": 0.20855092483406892, + "grad_norm": 4.748415205041859, + "learning_rate": 9.645288784416652e-07, + "loss": 1.7624, + "step": 3095 + }, + { + "epoch": 0.20888784070617566, + "grad_norm": 4.615780661397535, + "learning_rate": 9.643110175125935e-07, + "loss": 1.8054, + "step": 3100 + }, + { + "epoch": 0.2092247565782824, + "grad_norm": 4.533202597147934, + "learning_rate": 9.640925143457084e-07, + "loss": 1.7423, + "step": 3105 + }, + { + "epoch": 0.20956167245038915, + "grad_norm": 5.095719194842791, + "learning_rate": 9.638733692432448e-07, + "loss": 1.7796, + "step": 3110 + }, + { + "epoch": 0.20989858832249586, + "grad_norm": 4.785132671207265, + "learning_rate": 9.636535825083252e-07, + "loss": 1.8073, + "step": 3115 + }, + { + "epoch": 0.2102355041946026, + "grad_norm": 4.462607949552112, + "learning_rate": 9.634331544449601e-07, + "loss": 1.8014, + "step": 3120 + }, + { + "epoch": 0.21057242006670934, + "grad_norm": 4.62396497723698, + "learning_rate": 9.632120853580472e-07, + "loss": 1.8046, + "step": 3125 + }, + { + "epoch": 0.21090933593881608, + "grad_norm": 4.440303646153727, + "learning_rate": 9.6299037555337e-07, + "loss": 1.73, + "step": 3130 + }, + { + "epoch": 0.21124625181092282, + "grad_norm": 4.684820392427264, + "learning_rate": 9.627680253375997e-07, + "loss": 1.7287, + "step": 3135 + }, + { + "epoch": 0.21158316768302954, + "grad_norm": 4.699281477495129, + "learning_rate": 9.625450350182918e-07, + "loss": 1.7721, + "step": 3140 + }, + { + "epoch": 0.21192008355513628, + "grad_norm": 4.404129361919743, + "learning_rate": 9.62321404903888e-07, + "loss": 1.7183, + "step": 3145 + }, + { + "epoch": 0.21225699942724302, + "grad_norm": 4.46828072383975, + "learning_rate": 9.620971353037148e-07, + "loss": 1.7568, + "step": 3150 + }, + { + "epoch": 0.21259391529934976, + "grad_norm": 5.0456383160573015, + "learning_rate": 9.618722265279835e-07, + "loss": 1.7834, + "step": 3155 + }, + { + "epoch": 0.21293083117145647, + "grad_norm": 5.447392712626318, + "learning_rate": 9.61646678887789e-07, + "loss": 1.7902, + "step": 3160 + }, + { + "epoch": 0.21326774704356322, + "grad_norm": 5.16918774828708, + "learning_rate": 9.614204926951102e-07, + "loss": 1.8112, + "step": 3165 + }, + { + "epoch": 0.21360466291566996, + "grad_norm": 4.4757864244714005, + "learning_rate": 9.611936682628095e-07, + "loss": 1.7717, + "step": 3170 + }, + { + "epoch": 0.2139415787877767, + "grad_norm": 4.598126050187275, + "learning_rate": 9.609662059046315e-07, + "loss": 1.7461, + "step": 3175 + }, + { + "epoch": 0.21427849465988344, + "grad_norm": 4.604780314975367, + "learning_rate": 9.607381059352038e-07, + "loss": 1.7711, + "step": 3180 + }, + { + "epoch": 0.21461541053199015, + "grad_norm": 4.64864748054051, + "learning_rate": 9.605093686700353e-07, + "loss": 1.8126, + "step": 3185 + }, + { + "epoch": 0.2149523264040969, + "grad_norm": 4.781745831526942, + "learning_rate": 9.602799944255172e-07, + "loss": 1.7298, + "step": 3190 + }, + { + "epoch": 0.21528924227620364, + "grad_norm": 4.743722422243644, + "learning_rate": 9.60049983518921e-07, + "loss": 1.8324, + "step": 3195 + }, + { + "epoch": 0.21562615814831038, + "grad_norm": 4.440157266990944, + "learning_rate": 9.598193362683995e-07, + "loss": 1.7269, + "step": 3200 + }, + { + "epoch": 0.2159630740204171, + "grad_norm": 5.245985893785636, + "learning_rate": 9.59588052992985e-07, + "loss": 1.7371, + "step": 3205 + }, + { + "epoch": 0.21629998989252383, + "grad_norm": 4.593649997820267, + "learning_rate": 9.5935613401259e-07, + "loss": 1.8086, + "step": 3210 + }, + { + "epoch": 0.21663690576463057, + "grad_norm": 4.490308962417356, + "learning_rate": 9.591235796480064e-07, + "loss": 1.7968, + "step": 3215 + }, + { + "epoch": 0.2169738216367373, + "grad_norm": 4.803010238652498, + "learning_rate": 9.588903902209048e-07, + "loss": 1.7773, + "step": 3220 + }, + { + "epoch": 0.21731073750884405, + "grad_norm": 4.646133140777154, + "learning_rate": 9.586565660538343e-07, + "loss": 1.7521, + "step": 3225 + }, + { + "epoch": 0.21764765338095077, + "grad_norm": 4.473035021116014, + "learning_rate": 9.584221074702217e-07, + "loss": 1.856, + "step": 3230 + }, + { + "epoch": 0.2179845692530575, + "grad_norm": 5.055380502337204, + "learning_rate": 9.581870147943715e-07, + "loss": 1.776, + "step": 3235 + }, + { + "epoch": 0.21832148512516425, + "grad_norm": 4.594212929510007, + "learning_rate": 9.579512883514656e-07, + "loss": 1.8087, + "step": 3240 + }, + { + "epoch": 0.218658400997271, + "grad_norm": 4.858762880181832, + "learning_rate": 9.577149284675619e-07, + "loss": 1.791, + "step": 3245 + }, + { + "epoch": 0.2189953168693777, + "grad_norm": 4.3746877376150595, + "learning_rate": 9.574779354695951e-07, + "loss": 1.7996, + "step": 3250 + }, + { + "epoch": 0.21933223274148445, + "grad_norm": 4.603856398111249, + "learning_rate": 9.572403096853754e-07, + "loss": 1.7624, + "step": 3255 + }, + { + "epoch": 0.2196691486135912, + "grad_norm": 4.5083144954047825, + "learning_rate": 9.570020514435878e-07, + "loss": 1.785, + "step": 3260 + }, + { + "epoch": 0.22000606448569793, + "grad_norm": 5.316421851324029, + "learning_rate": 9.567631610737929e-07, + "loss": 1.7005, + "step": 3265 + }, + { + "epoch": 0.22034298035780467, + "grad_norm": 4.881523181211674, + "learning_rate": 9.565236389064255e-07, + "loss": 1.8059, + "step": 3270 + }, + { + "epoch": 0.22067989622991138, + "grad_norm": 4.474918888361089, + "learning_rate": 9.562834852727935e-07, + "loss": 1.7975, + "step": 3275 + }, + { + "epoch": 0.22101681210201812, + "grad_norm": 4.423307719931098, + "learning_rate": 9.560427005050793e-07, + "loss": 1.7793, + "step": 3280 + }, + { + "epoch": 0.22135372797412486, + "grad_norm": 4.413118229643694, + "learning_rate": 9.55801284936338e-07, + "loss": 1.7529, + "step": 3285 + }, + { + "epoch": 0.2216906438462316, + "grad_norm": 4.933721416291058, + "learning_rate": 9.555592389004966e-07, + "loss": 1.7266, + "step": 3290 + }, + { + "epoch": 0.22202755971833832, + "grad_norm": 4.731100188336562, + "learning_rate": 9.553165627323548e-07, + "loss": 1.7748, + "step": 3295 + }, + { + "epoch": 0.22236447559044506, + "grad_norm": 4.994696255584328, + "learning_rate": 9.55073256767584e-07, + "loss": 1.7628, + "step": 3300 + }, + { + "epoch": 0.2227013914625518, + "grad_norm": 4.733796536914988, + "learning_rate": 9.548293213427262e-07, + "loss": 1.7621, + "step": 3305 + }, + { + "epoch": 0.22303830733465854, + "grad_norm": 4.6191701476158835, + "learning_rate": 9.545847567951944e-07, + "loss": 1.7833, + "step": 3310 + }, + { + "epoch": 0.22337522320676528, + "grad_norm": 4.655742693958854, + "learning_rate": 9.543395634632721e-07, + "loss": 1.793, + "step": 3315 + }, + { + "epoch": 0.223712139078872, + "grad_norm": 4.447754347858813, + "learning_rate": 9.540937416861117e-07, + "loss": 1.7705, + "step": 3320 + }, + { + "epoch": 0.22404905495097874, + "grad_norm": 5.29641811601241, + "learning_rate": 9.538472918037356e-07, + "loss": 1.8197, + "step": 3325 + }, + { + "epoch": 0.22438597082308548, + "grad_norm": 5.925090397904477, + "learning_rate": 9.536002141570348e-07, + "loss": 1.8162, + "step": 3330 + }, + { + "epoch": 0.22472288669519222, + "grad_norm": 4.732129210837873, + "learning_rate": 9.533525090877688e-07, + "loss": 1.8002, + "step": 3335 + }, + { + "epoch": 0.22505980256729893, + "grad_norm": 4.827993433570109, + "learning_rate": 9.531041769385641e-07, + "loss": 1.7571, + "step": 3340 + }, + { + "epoch": 0.22539671843940567, + "grad_norm": 4.967863123114332, + "learning_rate": 9.528552180529161e-07, + "loss": 1.8661, + "step": 3345 + }, + { + "epoch": 0.22573363431151242, + "grad_norm": 6.94281139383591, + "learning_rate": 9.526056327751856e-07, + "loss": 1.7828, + "step": 3350 + }, + { + "epoch": 0.22607055018361916, + "grad_norm": 4.9852835606633406, + "learning_rate": 9.523554214506006e-07, + "loss": 1.8015, + "step": 3355 + }, + { + "epoch": 0.2264074660557259, + "grad_norm": 5.415149724311492, + "learning_rate": 9.521045844252551e-07, + "loss": 1.8005, + "step": 3360 + }, + { + "epoch": 0.2267443819278326, + "grad_norm": 4.803260723539399, + "learning_rate": 9.518531220461084e-07, + "loss": 1.7508, + "step": 3365 + }, + { + "epoch": 0.22708129779993935, + "grad_norm": 4.0781328070330956, + "learning_rate": 9.516010346609845e-07, + "loss": 1.7467, + "step": 3370 + }, + { + "epoch": 0.2274182136720461, + "grad_norm": 4.638943670268183, + "learning_rate": 9.513483226185723e-07, + "loss": 1.8606, + "step": 3375 + }, + { + "epoch": 0.22775512954415283, + "grad_norm": 4.7751639719223435, + "learning_rate": 9.510949862684248e-07, + "loss": 1.7434, + "step": 3380 + }, + { + "epoch": 0.22809204541625955, + "grad_norm": 4.485643625780828, + "learning_rate": 9.508410259609583e-07, + "loss": 1.7891, + "step": 3385 + }, + { + "epoch": 0.2284289612883663, + "grad_norm": 4.787103456250361, + "learning_rate": 9.505864420474522e-07, + "loss": 1.7787, + "step": 3390 + }, + { + "epoch": 0.22876587716047303, + "grad_norm": 4.746721577697795, + "learning_rate": 9.503312348800485e-07, + "loss": 1.737, + "step": 3395 + }, + { + "epoch": 0.22910279303257977, + "grad_norm": 4.484589801607281, + "learning_rate": 9.500754048117514e-07, + "loss": 1.7541, + "step": 3400 + }, + { + "epoch": 0.2294397089046865, + "grad_norm": 4.502077592044502, + "learning_rate": 9.498189521964263e-07, + "loss": 1.7034, + "step": 3405 + }, + { + "epoch": 0.22977662477679323, + "grad_norm": 4.690815487498, + "learning_rate": 9.495618773888006e-07, + "loss": 1.6797, + "step": 3410 + }, + { + "epoch": 0.23011354064889997, + "grad_norm": 4.762323506148509, + "learning_rate": 9.49304180744461e-07, + "loss": 1.7822, + "step": 3415 + }, + { + "epoch": 0.2304504565210067, + "grad_norm": 4.387162226702979, + "learning_rate": 9.490458626198556e-07, + "loss": 1.7564, + "step": 3420 + }, + { + "epoch": 0.23078737239311345, + "grad_norm": 5.108928623435626, + "learning_rate": 9.487869233722915e-07, + "loss": 1.752, + "step": 3425 + }, + { + "epoch": 0.23112428826522016, + "grad_norm": 4.696698057501602, + "learning_rate": 9.485273633599348e-07, + "loss": 1.754, + "step": 3430 + }, + { + "epoch": 0.2314612041373269, + "grad_norm": 4.664626924419995, + "learning_rate": 9.482671829418107e-07, + "loss": 1.7531, + "step": 3435 + }, + { + "epoch": 0.23179812000943364, + "grad_norm": 4.5031239498515, + "learning_rate": 9.480063824778024e-07, + "loss": 1.7475, + "step": 3440 + }, + { + "epoch": 0.23213503588154039, + "grad_norm": 6.209457625066587, + "learning_rate": 9.477449623286505e-07, + "loss": 1.6885, + "step": 3445 + }, + { + "epoch": 0.23247195175364713, + "grad_norm": 4.361227999396086, + "learning_rate": 9.474829228559529e-07, + "loss": 1.8086, + "step": 3450 + }, + { + "epoch": 0.23280886762575384, + "grad_norm": 4.969357599319759, + "learning_rate": 9.472202644221643e-07, + "loss": 1.839, + "step": 3455 + }, + { + "epoch": 0.23314578349786058, + "grad_norm": 4.872284734828359, + "learning_rate": 9.469569873905955e-07, + "loss": 1.7324, + "step": 3460 + }, + { + "epoch": 0.23348269936996732, + "grad_norm": 4.601976719361907, + "learning_rate": 9.466930921254128e-07, + "loss": 1.7301, + "step": 3465 + }, + { + "epoch": 0.23381961524207406, + "grad_norm": 4.904552543549206, + "learning_rate": 9.464285789916376e-07, + "loss": 1.777, + "step": 3470 + }, + { + "epoch": 0.23415653111418078, + "grad_norm": 4.83831726667101, + "learning_rate": 9.461634483551464e-07, + "loss": 1.7213, + "step": 3475 + }, + { + "epoch": 0.23449344698628752, + "grad_norm": 4.658287483564059, + "learning_rate": 9.458977005826691e-07, + "loss": 1.733, + "step": 3480 + }, + { + "epoch": 0.23483036285839426, + "grad_norm": 4.591347678384667, + "learning_rate": 9.456313360417899e-07, + "loss": 1.7999, + "step": 3485 + }, + { + "epoch": 0.235167278730501, + "grad_norm": 4.411330890134835, + "learning_rate": 9.453643551009459e-07, + "loss": 1.7489, + "step": 3490 + }, + { + "epoch": 0.23550419460260774, + "grad_norm": 4.8367941401303645, + "learning_rate": 9.450967581294265e-07, + "loss": 1.8119, + "step": 3495 + }, + { + "epoch": 0.23584111047471445, + "grad_norm": 4.344443850260619, + "learning_rate": 9.448285454973737e-07, + "loss": 1.7566, + "step": 3500 + }, + { + "epoch": 0.2361780263468212, + "grad_norm": 5.118104466239838, + "learning_rate": 9.445597175757806e-07, + "loss": 1.7038, + "step": 3505 + }, + { + "epoch": 0.23651494221892794, + "grad_norm": 4.453125272027257, + "learning_rate": 9.442902747364918e-07, + "loss": 1.7057, + "step": 3510 + }, + { + "epoch": 0.23685185809103468, + "grad_norm": 4.610512909539744, + "learning_rate": 9.440202173522022e-07, + "loss": 1.7718, + "step": 3515 + }, + { + "epoch": 0.2371887739631414, + "grad_norm": 4.661575193646813, + "learning_rate": 9.437495457964568e-07, + "loss": 1.762, + "step": 3520 + }, + { + "epoch": 0.23752568983524813, + "grad_norm": 4.4907095983362595, + "learning_rate": 9.434782604436502e-07, + "loss": 1.7386, + "step": 3525 + }, + { + "epoch": 0.23786260570735487, + "grad_norm": 4.405418469834188, + "learning_rate": 9.432063616690258e-07, + "loss": 1.7547, + "step": 3530 + }, + { + "epoch": 0.23819952157946161, + "grad_norm": 5.532433958681073, + "learning_rate": 9.429338498486758e-07, + "loss": 1.7206, + "step": 3535 + }, + { + "epoch": 0.23853643745156836, + "grad_norm": 4.468416192322411, + "learning_rate": 9.426607253595402e-07, + "loss": 1.7891, + "step": 3540 + }, + { + "epoch": 0.23887335332367507, + "grad_norm": 4.6161099223279525, + "learning_rate": 9.423869885794063e-07, + "loss": 1.7656, + "step": 3545 + }, + { + "epoch": 0.2392102691957818, + "grad_norm": 5.047487557295027, + "learning_rate": 9.421126398869086e-07, + "loss": 1.8102, + "step": 3550 + }, + { + "epoch": 0.23954718506788855, + "grad_norm": 4.922422180793085, + "learning_rate": 9.418376796615279e-07, + "loss": 1.7425, + "step": 3555 + }, + { + "epoch": 0.2398841009399953, + "grad_norm": 4.660146502713987, + "learning_rate": 9.415621082835908e-07, + "loss": 1.7607, + "step": 3560 + }, + { + "epoch": 0.240221016812102, + "grad_norm": 4.433734686963888, + "learning_rate": 9.412859261342691e-07, + "loss": 1.7146, + "step": 3565 + }, + { + "epoch": 0.24055793268420875, + "grad_norm": 4.960946433754072, + "learning_rate": 9.410091335955798e-07, + "loss": 1.7406, + "step": 3570 + }, + { + "epoch": 0.2408948485563155, + "grad_norm": 5.0094940207061125, + "learning_rate": 9.407317310503841e-07, + "loss": 1.7632, + "step": 3575 + }, + { + "epoch": 0.24123176442842223, + "grad_norm": 5.031730983676167, + "learning_rate": 9.404537188823869e-07, + "loss": 1.813, + "step": 3580 + }, + { + "epoch": 0.24156868030052897, + "grad_norm": 4.494391366636777, + "learning_rate": 9.40175097476136e-07, + "loss": 1.7598, + "step": 3585 + }, + { + "epoch": 0.24190559617263568, + "grad_norm": 5.229951556761748, + "learning_rate": 9.398958672170225e-07, + "loss": 1.7348, + "step": 3590 + }, + { + "epoch": 0.24224251204474243, + "grad_norm": 4.599923341117953, + "learning_rate": 9.396160284912795e-07, + "loss": 1.7758, + "step": 3595 + }, + { + "epoch": 0.24257942791684917, + "grad_norm": 4.664723816998234, + "learning_rate": 9.393355816859813e-07, + "loss": 1.7659, + "step": 3600 + }, + { + "epoch": 0.2429163437889559, + "grad_norm": 5.234803010614216, + "learning_rate": 9.390545271890437e-07, + "loss": 1.7718, + "step": 3605 + }, + { + "epoch": 0.24325325966106262, + "grad_norm": 4.8706966068665825, + "learning_rate": 9.387728653892233e-07, + "loss": 1.7528, + "step": 3610 + }, + { + "epoch": 0.24359017553316936, + "grad_norm": 4.705019456003629, + "learning_rate": 9.384905966761159e-07, + "loss": 1.7693, + "step": 3615 + }, + { + "epoch": 0.2439270914052761, + "grad_norm": 5.494231676301332, + "learning_rate": 9.382077214401576e-07, + "loss": 1.7571, + "step": 3620 + }, + { + "epoch": 0.24426400727738284, + "grad_norm": 5.008941519368937, + "learning_rate": 9.379242400726232e-07, + "loss": 1.772, + "step": 3625 + }, + { + "epoch": 0.24460092314948959, + "grad_norm": 5.222723460942831, + "learning_rate": 9.376401529656257e-07, + "loss": 1.7858, + "step": 3630 + }, + { + "epoch": 0.2449378390215963, + "grad_norm": 4.47696532920893, + "learning_rate": 9.373554605121161e-07, + "loss": 1.6848, + "step": 3635 + }, + { + "epoch": 0.24527475489370304, + "grad_norm": 4.877984037043299, + "learning_rate": 9.370701631058828e-07, + "loss": 1.7502, + "step": 3640 + }, + { + "epoch": 0.24561167076580978, + "grad_norm": 5.048132174905781, + "learning_rate": 9.367842611415508e-07, + "loss": 1.7651, + "step": 3645 + }, + { + "epoch": 0.24594858663791652, + "grad_norm": 4.855385590365053, + "learning_rate": 9.364977550145816e-07, + "loss": 1.7338, + "step": 3650 + }, + { + "epoch": 0.24628550251002324, + "grad_norm": 4.5235322260702535, + "learning_rate": 9.362106451212721e-07, + "loss": 1.8073, + "step": 3655 + }, + { + "epoch": 0.24662241838212998, + "grad_norm": 5.083006810411546, + "learning_rate": 9.359229318587545e-07, + "loss": 1.7719, + "step": 3660 + }, + { + "epoch": 0.24695933425423672, + "grad_norm": 4.72527339625108, + "learning_rate": 9.356346156249954e-07, + "loss": 1.7813, + "step": 3665 + }, + { + "epoch": 0.24729625012634346, + "grad_norm": 4.6472501747489625, + "learning_rate": 9.353456968187958e-07, + "loss": 1.799, + "step": 3670 + }, + { + "epoch": 0.2476331659984502, + "grad_norm": 4.272567937938117, + "learning_rate": 9.350561758397897e-07, + "loss": 1.769, + "step": 3675 + }, + { + "epoch": 0.2479700818705569, + "grad_norm": 5.471616163784983, + "learning_rate": 9.347660530884442e-07, + "loss": 1.7788, + "step": 3680 + }, + { + "epoch": 0.24830699774266365, + "grad_norm": 4.558443147025116, + "learning_rate": 9.344753289660592e-07, + "loss": 1.7638, + "step": 3685 + }, + { + "epoch": 0.2486439136147704, + "grad_norm": 5.069595483982285, + "learning_rate": 9.34184003874766e-07, + "loss": 1.7975, + "step": 3690 + }, + { + "epoch": 0.24898082948687714, + "grad_norm": 4.760449032055557, + "learning_rate": 9.338920782175269e-07, + "loss": 1.7857, + "step": 3695 + }, + { + "epoch": 0.24931774535898385, + "grad_norm": 5.0621587885567205, + "learning_rate": 9.335995523981355e-07, + "loss": 1.7693, + "step": 3700 + }, + { + "epoch": 0.2496546612310906, + "grad_norm": 5.071621445026028, + "learning_rate": 9.333064268212153e-07, + "loss": 1.699, + "step": 3705 + }, + { + "epoch": 0.24999157710319733, + "grad_norm": 5.508852897124171, + "learning_rate": 9.330127018922193e-07, + "loss": 1.7654, + "step": 3710 + }, + { + "epoch": 0.2503284929753041, + "grad_norm": 4.415117796587092, + "learning_rate": 9.327183780174296e-07, + "loss": 1.7226, + "step": 3715 + }, + { + "epoch": 0.2506654088474108, + "grad_norm": 4.6970538069049095, + "learning_rate": 9.324234556039567e-07, + "loss": 1.7406, + "step": 3720 + }, + { + "epoch": 0.25100232471951756, + "grad_norm": 4.582266018979021, + "learning_rate": 9.321279350597393e-07, + "loss": 1.7339, + "step": 3725 + }, + { + "epoch": 0.25133924059162427, + "grad_norm": 4.240943943779937, + "learning_rate": 9.31831816793543e-07, + "loss": 1.7072, + "step": 3730 + }, + { + "epoch": 0.251676156463731, + "grad_norm": 4.502592636166306, + "learning_rate": 9.315351012149605e-07, + "loss": 1.7358, + "step": 3735 + }, + { + "epoch": 0.25201307233583775, + "grad_norm": 4.506192024283769, + "learning_rate": 9.312377887344105e-07, + "loss": 1.6949, + "step": 3740 + }, + { + "epoch": 0.25234998820794446, + "grad_norm": 4.595924942826764, + "learning_rate": 9.309398797631374e-07, + "loss": 1.7713, + "step": 3745 + }, + { + "epoch": 0.25268690408005123, + "grad_norm": 4.646056480790715, + "learning_rate": 9.306413747132108e-07, + "loss": 1.7086, + "step": 3750 + }, + { + "epoch": 0.25302381995215795, + "grad_norm": 5.667337214028473, + "learning_rate": 9.303422739975246e-07, + "loss": 1.8206, + "step": 3755 + }, + { + "epoch": 0.25336073582426466, + "grad_norm": 4.852279815852281, + "learning_rate": 9.300425780297968e-07, + "loss": 1.7577, + "step": 3760 + }, + { + "epoch": 0.25369765169637143, + "grad_norm": 4.908151755388344, + "learning_rate": 9.297422872245686e-07, + "loss": 1.7385, + "step": 3765 + }, + { + "epoch": 0.25403456756847814, + "grad_norm": 4.764781340681072, + "learning_rate": 9.294414019972043e-07, + "loss": 1.7649, + "step": 3770 + }, + { + "epoch": 0.2543714834405849, + "grad_norm": 5.077139413996835, + "learning_rate": 9.291399227638898e-07, + "loss": 1.7791, + "step": 3775 + }, + { + "epoch": 0.2547083993126916, + "grad_norm": 4.822220559468089, + "learning_rate": 9.288378499416332e-07, + "loss": 1.7173, + "step": 3780 + }, + { + "epoch": 0.25504531518479834, + "grad_norm": 4.829738862421561, + "learning_rate": 9.285351839482634e-07, + "loss": 1.7591, + "step": 3785 + }, + { + "epoch": 0.2553822310569051, + "grad_norm": 4.7186128991399094, + "learning_rate": 9.2823192520243e-07, + "loss": 1.7636, + "step": 3790 + }, + { + "epoch": 0.2557191469290118, + "grad_norm": 5.155182474504667, + "learning_rate": 9.27928074123602e-07, + "loss": 1.7268, + "step": 3795 + }, + { + "epoch": 0.25605606280111853, + "grad_norm": 4.627076461643508, + "learning_rate": 9.276236311320684e-07, + "loss": 1.7572, + "step": 3800 + }, + { + "epoch": 0.2563929786732253, + "grad_norm": 4.7149669408466, + "learning_rate": 9.273185966489365e-07, + "loss": 1.7925, + "step": 3805 + }, + { + "epoch": 0.256729894545332, + "grad_norm": 4.654856525421342, + "learning_rate": 9.270129710961318e-07, + "loss": 1.6885, + "step": 3810 + }, + { + "epoch": 0.2570668104174388, + "grad_norm": 4.272444207688499, + "learning_rate": 9.267067548963974e-07, + "loss": 1.7254, + "step": 3815 + }, + { + "epoch": 0.2574037262895455, + "grad_norm": 5.00860276979963, + "learning_rate": 9.263999484732934e-07, + "loss": 1.7835, + "step": 3820 + }, + { + "epoch": 0.2577406421616522, + "grad_norm": 4.624510433408824, + "learning_rate": 9.260925522511962e-07, + "loss": 1.7666, + "step": 3825 + }, + { + "epoch": 0.258077558033759, + "grad_norm": 4.466190636554243, + "learning_rate": 9.257845666552984e-07, + "loss": 1.7831, + "step": 3830 + }, + { + "epoch": 0.2584144739058657, + "grad_norm": 4.650752529976314, + "learning_rate": 9.254759921116073e-07, + "loss": 1.7767, + "step": 3835 + }, + { + "epoch": 0.25875138977797246, + "grad_norm": 4.947586647901737, + "learning_rate": 9.251668290469452e-07, + "loss": 1.7696, + "step": 3840 + }, + { + "epoch": 0.2590883056500792, + "grad_norm": 4.3973160545600365, + "learning_rate": 9.248570778889484e-07, + "loss": 1.7347, + "step": 3845 + }, + { + "epoch": 0.2594252215221859, + "grad_norm": 4.971082227222947, + "learning_rate": 9.245467390660664e-07, + "loss": 1.6983, + "step": 3850 + }, + { + "epoch": 0.25976213739429266, + "grad_norm": 4.770499620243425, + "learning_rate": 9.242358130075618e-07, + "loss": 1.7691, + "step": 3855 + }, + { + "epoch": 0.26009905326639937, + "grad_norm": 4.5826918853514735, + "learning_rate": 9.239243001435093e-07, + "loss": 1.709, + "step": 3860 + }, + { + "epoch": 0.26043596913850614, + "grad_norm": 4.714242048087625, + "learning_rate": 9.236122009047957e-07, + "loss": 1.6842, + "step": 3865 + }, + { + "epoch": 0.26077288501061285, + "grad_norm": 5.01574436470716, + "learning_rate": 9.232995157231182e-07, + "loss": 1.7489, + "step": 3870 + }, + { + "epoch": 0.26110980088271957, + "grad_norm": 4.443135356323211, + "learning_rate": 9.229862450309851e-07, + "loss": 1.7254, + "step": 3875 + }, + { + "epoch": 0.26144671675482634, + "grad_norm": 4.552093105227505, + "learning_rate": 9.226723892617141e-07, + "loss": 1.7634, + "step": 3880 + }, + { + "epoch": 0.26178363262693305, + "grad_norm": 3.9443617773456405, + "learning_rate": 9.223579488494327e-07, + "loss": 1.7155, + "step": 3885 + }, + { + "epoch": 0.26212054849903976, + "grad_norm": 4.5681734596339965, + "learning_rate": 9.220429242290763e-07, + "loss": 1.7267, + "step": 3890 + }, + { + "epoch": 0.26245746437114653, + "grad_norm": 4.716056869546119, + "learning_rate": 9.217273158363894e-07, + "loss": 1.6855, + "step": 3895 + }, + { + "epoch": 0.26279438024325324, + "grad_norm": 4.24968952182465, + "learning_rate": 9.214111241079232e-07, + "loss": 1.7645, + "step": 3900 + }, + { + "epoch": 0.26313129611536, + "grad_norm": 4.424195580222474, + "learning_rate": 9.21094349481036e-07, + "loss": 1.7845, + "step": 3905 + }, + { + "epoch": 0.2634682119874667, + "grad_norm": 4.7637558094248265, + "learning_rate": 9.207769923938924e-07, + "loss": 1.7245, + "step": 3910 + }, + { + "epoch": 0.26380512785957344, + "grad_norm": 5.284391995626364, + "learning_rate": 9.204590532854627e-07, + "loss": 1.662, + "step": 3915 + }, + { + "epoch": 0.2641420437316802, + "grad_norm": 4.7254299136588855, + "learning_rate": 9.20140532595522e-07, + "loss": 1.8177, + "step": 3920 + }, + { + "epoch": 0.2644789596037869, + "grad_norm": 4.555412680215885, + "learning_rate": 9.198214307646504e-07, + "loss": 1.7283, + "step": 3925 + }, + { + "epoch": 0.2648158754758937, + "grad_norm": 4.676744050611532, + "learning_rate": 9.195017482342313e-07, + "loss": 1.7688, + "step": 3930 + }, + { + "epoch": 0.2651527913480004, + "grad_norm": 4.923776597293947, + "learning_rate": 9.191814854464514e-07, + "loss": 1.743, + "step": 3935 + }, + { + "epoch": 0.2654897072201071, + "grad_norm": 4.624858906489115, + "learning_rate": 9.188606428443002e-07, + "loss": 1.727, + "step": 3940 + }, + { + "epoch": 0.2658266230922139, + "grad_norm": 4.50535143830333, + "learning_rate": 9.185392208715692e-07, + "loss": 1.6601, + "step": 3945 + }, + { + "epoch": 0.2661635389643206, + "grad_norm": 4.715817481252642, + "learning_rate": 9.182172199728513e-07, + "loss": 1.7566, + "step": 3950 + }, + { + "epoch": 0.26650045483642737, + "grad_norm": 5.076272945810852, + "learning_rate": 9.178946405935398e-07, + "loss": 1.7813, + "step": 3955 + }, + { + "epoch": 0.2668373707085341, + "grad_norm": 4.903244847025362, + "learning_rate": 9.175714831798287e-07, + "loss": 1.764, + "step": 3960 + }, + { + "epoch": 0.2671742865806408, + "grad_norm": 4.819745696387984, + "learning_rate": 9.172477481787113e-07, + "loss": 1.6826, + "step": 3965 + }, + { + "epoch": 0.26751120245274757, + "grad_norm": 4.686633159763879, + "learning_rate": 9.169234360379796e-07, + "loss": 1.7017, + "step": 3970 + }, + { + "epoch": 0.2678481183248543, + "grad_norm": 5.155877126897368, + "learning_rate": 9.165985472062244e-07, + "loss": 1.7647, + "step": 3975 + }, + { + "epoch": 0.268185034196961, + "grad_norm": 4.385164684257074, + "learning_rate": 9.162730821328337e-07, + "loss": 1.7524, + "step": 3980 + }, + { + "epoch": 0.26852195006906776, + "grad_norm": 4.881753774963838, + "learning_rate": 9.159470412679928e-07, + "loss": 1.7793, + "step": 3985 + }, + { + "epoch": 0.2688588659411745, + "grad_norm": 5.161223622188654, + "learning_rate": 9.156204250626836e-07, + "loss": 1.7525, + "step": 3990 + }, + { + "epoch": 0.26919578181328124, + "grad_norm": 5.334488964309807, + "learning_rate": 9.152932339686833e-07, + "loss": 1.784, + "step": 3995 + }, + { + "epoch": 0.26953269768538796, + "grad_norm": 4.5325612354681155, + "learning_rate": 9.149654684385647e-07, + "loss": 1.7371, + "step": 4000 + }, + { + "epoch": 0.26986961355749467, + "grad_norm": 5.240484693296526, + "learning_rate": 9.146371289256952e-07, + "loss": 1.6598, + "step": 4005 + }, + { + "epoch": 0.27020652942960144, + "grad_norm": 4.377746387816449, + "learning_rate": 9.143082158842359e-07, + "loss": 1.7966, + "step": 4010 + }, + { + "epoch": 0.27054344530170815, + "grad_norm": 4.593458441322993, + "learning_rate": 9.139787297691413e-07, + "loss": 1.7707, + "step": 4015 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 4.6933419598528445, + "learning_rate": 9.136486710361586e-07, + "loss": 1.8005, + "step": 4020 + }, + { + "epoch": 0.27121727704592163, + "grad_norm": 4.423648259837012, + "learning_rate": 9.13318040141827e-07, + "loss": 1.7397, + "step": 4025 + }, + { + "epoch": 0.27155419291802835, + "grad_norm": 4.58381209779022, + "learning_rate": 9.129868375434774e-07, + "loss": 1.7167, + "step": 4030 + }, + { + "epoch": 0.2718911087901351, + "grad_norm": 5.591361058814464, + "learning_rate": 9.12655063699231e-07, + "loss": 1.7489, + "step": 4035 + }, + { + "epoch": 0.27222802466224183, + "grad_norm": 4.826560920094557, + "learning_rate": 9.123227190679994e-07, + "loss": 1.6668, + "step": 4040 + }, + { + "epoch": 0.2725649405343486, + "grad_norm": 4.91009253142952, + "learning_rate": 9.119898041094838e-07, + "loss": 1.7787, + "step": 4045 + }, + { + "epoch": 0.2729018564064553, + "grad_norm": 4.752985829366886, + "learning_rate": 9.116563192841741e-07, + "loss": 1.7492, + "step": 4050 + }, + { + "epoch": 0.273238772278562, + "grad_norm": 4.934952686039615, + "learning_rate": 9.113222650533486e-07, + "loss": 1.7972, + "step": 4055 + }, + { + "epoch": 0.2735756881506688, + "grad_norm": 4.940692386574961, + "learning_rate": 9.109876418790731e-07, + "loss": 1.7277, + "step": 4060 + }, + { + "epoch": 0.2739126040227755, + "grad_norm": 4.917215340655875, + "learning_rate": 9.106524502242004e-07, + "loss": 1.7261, + "step": 4065 + }, + { + "epoch": 0.2742495198948822, + "grad_norm": 4.369817949456342, + "learning_rate": 9.103166905523699e-07, + "loss": 1.7005, + "step": 4070 + }, + { + "epoch": 0.274586435766989, + "grad_norm": 4.654979197581099, + "learning_rate": 9.099803633280059e-07, + "loss": 1.8421, + "step": 4075 + }, + { + "epoch": 0.2749233516390957, + "grad_norm": 4.987317284899288, + "learning_rate": 9.096434690163184e-07, + "loss": 1.7533, + "step": 4080 + }, + { + "epoch": 0.27526026751120247, + "grad_norm": 4.679060945617795, + "learning_rate": 9.093060080833019e-07, + "loss": 1.7128, + "step": 4085 + }, + { + "epoch": 0.2755971833833092, + "grad_norm": 4.934341742568025, + "learning_rate": 9.089679809957343e-07, + "loss": 1.7116, + "step": 4090 + }, + { + "epoch": 0.2759340992554159, + "grad_norm": 5.535564494067952, + "learning_rate": 9.086293882211768e-07, + "loss": 1.795, + "step": 4095 + }, + { + "epoch": 0.27627101512752267, + "grad_norm": 4.59416034924243, + "learning_rate": 9.082902302279726e-07, + "loss": 1.7278, + "step": 4100 + }, + { + "epoch": 0.2766079309996294, + "grad_norm": 4.81347731163381, + "learning_rate": 9.079505074852476e-07, + "loss": 1.7593, + "step": 4105 + }, + { + "epoch": 0.27694484687173615, + "grad_norm": 4.9285174913634915, + "learning_rate": 9.076102204629082e-07, + "loss": 1.7159, + "step": 4110 + }, + { + "epoch": 0.27728176274384286, + "grad_norm": 4.3498627752306716, + "learning_rate": 9.072693696316411e-07, + "loss": 1.7344, + "step": 4115 + }, + { + "epoch": 0.2776186786159496, + "grad_norm": 4.656514571023645, + "learning_rate": 9.069279554629137e-07, + "loss": 1.7313, + "step": 4120 + }, + { + "epoch": 0.27795559448805635, + "grad_norm": 4.755094094103996, + "learning_rate": 9.06585978428972e-07, + "loss": 1.7245, + "step": 4125 + }, + { + "epoch": 0.27829251036016306, + "grad_norm": 4.70651566158845, + "learning_rate": 9.062434390028407e-07, + "loss": 1.7576, + "step": 4130 + }, + { + "epoch": 0.27862942623226983, + "grad_norm": 4.5157833239482486, + "learning_rate": 9.059003376583223e-07, + "loss": 1.7557, + "step": 4135 + }, + { + "epoch": 0.27896634210437654, + "grad_norm": 4.3165534493294455, + "learning_rate": 9.055566748699968e-07, + "loss": 1.7643, + "step": 4140 + }, + { + "epoch": 0.27930325797648325, + "grad_norm": 4.511559044415423, + "learning_rate": 9.052124511132204e-07, + "loss": 1.7322, + "step": 4145 + }, + { + "epoch": 0.27964017384859, + "grad_norm": 4.491252006627144, + "learning_rate": 9.04867666864126e-07, + "loss": 1.7048, + "step": 4150 + }, + { + "epoch": 0.27997708972069674, + "grad_norm": 5.162440504545584, + "learning_rate": 9.045223225996207e-07, + "loss": 1.7349, + "step": 4155 + }, + { + "epoch": 0.28031400559280345, + "grad_norm": 4.837794801527783, + "learning_rate": 9.041764187973871e-07, + "loss": 1.7243, + "step": 4160 + }, + { + "epoch": 0.2806509214649102, + "grad_norm": 4.86864277556133, + "learning_rate": 9.038299559358815e-07, + "loss": 1.7357, + "step": 4165 + }, + { + "epoch": 0.28098783733701693, + "grad_norm": 4.675098805367262, + "learning_rate": 9.034829344943331e-07, + "loss": 1.7346, + "step": 4170 + }, + { + "epoch": 0.2813247532091237, + "grad_norm": 4.675719749121135, + "learning_rate": 9.031353549527444e-07, + "loss": 1.755, + "step": 4175 + }, + { + "epoch": 0.2816616690812304, + "grad_norm": 4.656544713004674, + "learning_rate": 9.027872177918894e-07, + "loss": 1.6781, + "step": 4180 + }, + { + "epoch": 0.28199858495333713, + "grad_norm": 4.093237425850474, + "learning_rate": 9.024385234933134e-07, + "loss": 1.7954, + "step": 4185 + }, + { + "epoch": 0.2823355008254439, + "grad_norm": 4.392579552620834, + "learning_rate": 9.020892725393326e-07, + "loss": 1.6816, + "step": 4190 + }, + { + "epoch": 0.2826724166975506, + "grad_norm": 4.79325848165355, + "learning_rate": 9.017394654130332e-07, + "loss": 1.7271, + "step": 4195 + }, + { + "epoch": 0.2830093325696574, + "grad_norm": 4.486480700182664, + "learning_rate": 9.013891025982703e-07, + "loss": 1.7621, + "step": 4200 + }, + { + "epoch": 0.2833462484417641, + "grad_norm": 4.742598461547793, + "learning_rate": 9.010381845796677e-07, + "loss": 1.7021, + "step": 4205 + }, + { + "epoch": 0.2836831643138708, + "grad_norm": 4.910310236706784, + "learning_rate": 9.006867118426178e-07, + "loss": 1.7157, + "step": 4210 + }, + { + "epoch": 0.2840200801859776, + "grad_norm": 5.0356189419192425, + "learning_rate": 9.003346848732793e-07, + "loss": 1.7219, + "step": 4215 + }, + { + "epoch": 0.2843569960580843, + "grad_norm": 4.1734770119817, + "learning_rate": 8.999821041585787e-07, + "loss": 1.6756, + "step": 4220 + }, + { + "epoch": 0.28469391193019106, + "grad_norm": 4.596840625940221, + "learning_rate": 8.996289701862072e-07, + "loss": 1.8047, + "step": 4225 + }, + { + "epoch": 0.28503082780229777, + "grad_norm": 4.6224423547224385, + "learning_rate": 8.99275283444622e-07, + "loss": 1.7589, + "step": 4230 + }, + { + "epoch": 0.2853677436744045, + "grad_norm": 4.491183430135914, + "learning_rate": 8.989210444230449e-07, + "loss": 1.7477, + "step": 4235 + }, + { + "epoch": 0.28570465954651125, + "grad_norm": 4.685179009285273, + "learning_rate": 8.985662536114612e-07, + "loss": 1.7634, + "step": 4240 + }, + { + "epoch": 0.28604157541861797, + "grad_norm": 4.750370079829212, + "learning_rate": 8.9821091150062e-07, + "loss": 1.7168, + "step": 4245 + }, + { + "epoch": 0.2863784912907247, + "grad_norm": 5.005679815546969, + "learning_rate": 8.978550185820323e-07, + "loss": 1.7179, + "step": 4250 + }, + { + "epoch": 0.28671540716283145, + "grad_norm": 4.483443011591814, + "learning_rate": 8.974985753479718e-07, + "loss": 1.7561, + "step": 4255 + }, + { + "epoch": 0.28705232303493816, + "grad_norm": 4.514312535623764, + "learning_rate": 8.971415822914726e-07, + "loss": 1.744, + "step": 4260 + }, + { + "epoch": 0.28738923890704493, + "grad_norm": 4.70665221972189, + "learning_rate": 8.967840399063298e-07, + "loss": 1.7066, + "step": 4265 + }, + { + "epoch": 0.28772615477915164, + "grad_norm": 5.139115959292068, + "learning_rate": 8.964259486870982e-07, + "loss": 1.7072, + "step": 4270 + }, + { + "epoch": 0.28806307065125836, + "grad_norm": 4.664997783932267, + "learning_rate": 8.960673091290916e-07, + "loss": 1.7119, + "step": 4275 + }, + { + "epoch": 0.2883999865233651, + "grad_norm": 4.892565476825081, + "learning_rate": 8.957081217283825e-07, + "loss": 1.7438, + "step": 4280 + }, + { + "epoch": 0.28873690239547184, + "grad_norm": 4.559694716957321, + "learning_rate": 8.953483869818013e-07, + "loss": 1.7316, + "step": 4285 + }, + { + "epoch": 0.2890738182675786, + "grad_norm": 4.698321844576039, + "learning_rate": 8.949881053869348e-07, + "loss": 1.7336, + "step": 4290 + }, + { + "epoch": 0.2894107341396853, + "grad_norm": 4.765465874545801, + "learning_rate": 8.946272774421271e-07, + "loss": 1.7119, + "step": 4295 + }, + { + "epoch": 0.28974765001179204, + "grad_norm": 4.753263661353172, + "learning_rate": 8.942659036464775e-07, + "loss": 1.74, + "step": 4300 + }, + { + "epoch": 0.2900845658838988, + "grad_norm": 4.495261380126989, + "learning_rate": 8.939039844998403e-07, + "loss": 1.7512, + "step": 4305 + }, + { + "epoch": 0.2904214817560055, + "grad_norm": 4.557559595948686, + "learning_rate": 8.935415205028243e-07, + "loss": 1.7104, + "step": 4310 + }, + { + "epoch": 0.2907583976281123, + "grad_norm": 4.202594698597534, + "learning_rate": 8.931785121567921e-07, + "loss": 1.6826, + "step": 4315 + }, + { + "epoch": 0.291095313500219, + "grad_norm": 4.729706790893457, + "learning_rate": 8.928149599638588e-07, + "loss": 1.7567, + "step": 4320 + }, + { + "epoch": 0.2914322293723257, + "grad_norm": 4.812500510407962, + "learning_rate": 8.924508644268921e-07, + "loss": 1.8317, + "step": 4325 + }, + { + "epoch": 0.2917691452444325, + "grad_norm": 4.5072102814438395, + "learning_rate": 8.920862260495111e-07, + "loss": 1.7463, + "step": 4330 + }, + { + "epoch": 0.2921060611165392, + "grad_norm": 4.815210343847766, + "learning_rate": 8.917210453360859e-07, + "loss": 1.7456, + "step": 4335 + }, + { + "epoch": 0.2924429769886459, + "grad_norm": 5.3841241146256, + "learning_rate": 8.913553227917365e-07, + "loss": 1.679, + "step": 4340 + }, + { + "epoch": 0.2927798928607527, + "grad_norm": 4.844902712349313, + "learning_rate": 8.909890589223329e-07, + "loss": 1.729, + "step": 4345 + }, + { + "epoch": 0.2931168087328594, + "grad_norm": 4.5901739139881075, + "learning_rate": 8.906222542344932e-07, + "loss": 1.7552, + "step": 4350 + }, + { + "epoch": 0.29345372460496616, + "grad_norm": 4.538804261257026, + "learning_rate": 8.902549092355839e-07, + "loss": 1.7488, + "step": 4355 + }, + { + "epoch": 0.2937906404770729, + "grad_norm": 5.2062038276879825, + "learning_rate": 8.898870244337189e-07, + "loss": 1.8083, + "step": 4360 + }, + { + "epoch": 0.2941275563491796, + "grad_norm": 4.726134277199526, + "learning_rate": 8.895186003377586e-07, + "loss": 1.7307, + "step": 4365 + }, + { + "epoch": 0.29446447222128636, + "grad_norm": 5.516228137928749, + "learning_rate": 8.891496374573095e-07, + "loss": 1.8018, + "step": 4370 + }, + { + "epoch": 0.29480138809339307, + "grad_norm": 4.799889103520739, + "learning_rate": 8.887801363027233e-07, + "loss": 1.7538, + "step": 4375 + }, + { + "epoch": 0.29513830396549984, + "grad_norm": 4.379041171560821, + "learning_rate": 8.884100973850962e-07, + "loss": 1.7804, + "step": 4380 + }, + { + "epoch": 0.29547521983760655, + "grad_norm": 4.357662889414455, + "learning_rate": 8.880395212162684e-07, + "loss": 1.7696, + "step": 4385 + }, + { + "epoch": 0.29581213570971326, + "grad_norm": 4.917230947405535, + "learning_rate": 8.87668408308823e-07, + "loss": 1.6723, + "step": 4390 + }, + { + "epoch": 0.29614905158182003, + "grad_norm": 4.543552014294589, + "learning_rate": 8.872967591760856e-07, + "loss": 1.7145, + "step": 4395 + }, + { + "epoch": 0.29648596745392675, + "grad_norm": 4.472875036743257, + "learning_rate": 8.869245743321234e-07, + "loss": 1.7572, + "step": 4400 + }, + { + "epoch": 0.2968228833260335, + "grad_norm": 5.049587485702916, + "learning_rate": 8.865518542917452e-07, + "loss": 1.6626, + "step": 4405 + }, + { + "epoch": 0.29715979919814023, + "grad_norm": 4.672040595743891, + "learning_rate": 8.861785995704991e-07, + "loss": 1.764, + "step": 4410 + }, + { + "epoch": 0.29749671507024694, + "grad_norm": 4.495170230013994, + "learning_rate": 8.858048106846735e-07, + "loss": 1.7532, + "step": 4415 + }, + { + "epoch": 0.2978336309423537, + "grad_norm": 4.823875109397055, + "learning_rate": 8.854304881512955e-07, + "loss": 1.6923, + "step": 4420 + }, + { + "epoch": 0.2981705468144604, + "grad_norm": 4.821873035905844, + "learning_rate": 8.850556324881302e-07, + "loss": 1.7809, + "step": 4425 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 4.410412924960029, + "learning_rate": 8.846802442136804e-07, + "loss": 1.7013, + "step": 4430 + }, + { + "epoch": 0.2988443785586739, + "grad_norm": 4.390679279180808, + "learning_rate": 8.843043238471853e-07, + "loss": 1.7589, + "step": 4435 + }, + { + "epoch": 0.2991812944307806, + "grad_norm": 4.734392692475669, + "learning_rate": 8.839278719086201e-07, + "loss": 1.7504, + "step": 4440 + }, + { + "epoch": 0.2995182103028874, + "grad_norm": 4.403725418684815, + "learning_rate": 8.835508889186956e-07, + "loss": 1.7657, + "step": 4445 + }, + { + "epoch": 0.2998551261749941, + "grad_norm": 5.210710942210437, + "learning_rate": 8.83173375398857e-07, + "loss": 1.7567, + "step": 4450 + }, + { + "epoch": 0.3001920420471008, + "grad_norm": 4.66405783513022, + "learning_rate": 8.827953318712831e-07, + "loss": 1.7397, + "step": 4455 + }, + { + "epoch": 0.3005289579192076, + "grad_norm": 5.118422608455292, + "learning_rate": 8.824167588588861e-07, + "loss": 1.7122, + "step": 4460 + }, + { + "epoch": 0.3008658737913143, + "grad_norm": 4.680432440576183, + "learning_rate": 8.820376568853105e-07, + "loss": 1.6974, + "step": 4465 + }, + { + "epoch": 0.30120278966342107, + "grad_norm": 4.800049981676269, + "learning_rate": 8.816580264749325e-07, + "loss": 1.735, + "step": 4470 + }, + { + "epoch": 0.3015397055355278, + "grad_norm": 4.143241744072462, + "learning_rate": 8.81277868152859e-07, + "loss": 1.7497, + "step": 4475 + }, + { + "epoch": 0.3018766214076345, + "grad_norm": 4.739632425074766, + "learning_rate": 8.808971824449274e-07, + "loss": 1.7366, + "step": 4480 + }, + { + "epoch": 0.30221353727974126, + "grad_norm": 4.389277440452169, + "learning_rate": 8.805159698777045e-07, + "loss": 1.7156, + "step": 4485 + }, + { + "epoch": 0.302550453151848, + "grad_norm": 4.837990330467177, + "learning_rate": 8.801342309784858e-07, + "loss": 1.7185, + "step": 4490 + }, + { + "epoch": 0.30288736902395474, + "grad_norm": 4.185487779722894, + "learning_rate": 8.79751966275295e-07, + "loss": 1.6727, + "step": 4495 + }, + { + "epoch": 0.30322428489606146, + "grad_norm": 5.036912125810241, + "learning_rate": 8.793691762968827e-07, + "loss": 1.7088, + "step": 4500 + }, + { + "epoch": 0.30356120076816817, + "grad_norm": 4.863760552977038, + "learning_rate": 8.789858615727264e-07, + "loss": 1.7243, + "step": 4505 + }, + { + "epoch": 0.30389811664027494, + "grad_norm": 4.464862420363283, + "learning_rate": 8.786020226330295e-07, + "loss": 1.7092, + "step": 4510 + }, + { + "epoch": 0.30423503251238165, + "grad_norm": 4.631321680323668, + "learning_rate": 8.782176600087203e-07, + "loss": 1.6525, + "step": 4515 + }, + { + "epoch": 0.30457194838448837, + "grad_norm": 4.493210645127753, + "learning_rate": 8.778327742314513e-07, + "loss": 1.7311, + "step": 4520 + }, + { + "epoch": 0.30490886425659514, + "grad_norm": 4.721685250690692, + "learning_rate": 8.77447365833599e-07, + "loss": 1.7066, + "step": 4525 + }, + { + "epoch": 0.30524578012870185, + "grad_norm": 4.731921578899134, + "learning_rate": 8.770614353482628e-07, + "loss": 1.7536, + "step": 4530 + }, + { + "epoch": 0.3055826960008086, + "grad_norm": 4.423861680399267, + "learning_rate": 8.766749833092638e-07, + "loss": 1.7985, + "step": 4535 + }, + { + "epoch": 0.30591961187291533, + "grad_norm": 4.2791878376307295, + "learning_rate": 8.76288010251145e-07, + "loss": 1.7224, + "step": 4540 + }, + { + "epoch": 0.30625652774502204, + "grad_norm": 4.903125873267787, + "learning_rate": 8.759005167091697e-07, + "loss": 1.7735, + "step": 4545 + }, + { + "epoch": 0.3065934436171288, + "grad_norm": 4.8237777891217375, + "learning_rate": 8.755125032193214e-07, + "loss": 1.656, + "step": 4550 + }, + { + "epoch": 0.3069303594892355, + "grad_norm": 4.823939629101453, + "learning_rate": 8.751239703183026e-07, + "loss": 1.7152, + "step": 4555 + }, + { + "epoch": 0.3072672753613423, + "grad_norm": 4.566940546243774, + "learning_rate": 8.747349185435348e-07, + "loss": 1.716, + "step": 4560 + }, + { + "epoch": 0.307604191233449, + "grad_norm": 4.732548527043846, + "learning_rate": 8.743453484331562e-07, + "loss": 1.7865, + "step": 4565 + }, + { + "epoch": 0.3079411071055557, + "grad_norm": 4.623294109058839, + "learning_rate": 8.73955260526023e-07, + "loss": 1.7017, + "step": 4570 + }, + { + "epoch": 0.3082780229776625, + "grad_norm": 4.485493004343804, + "learning_rate": 8.735646553617069e-07, + "loss": 1.8001, + "step": 4575 + }, + { + "epoch": 0.3086149388497692, + "grad_norm": 4.492019215877914, + "learning_rate": 8.731735334804953e-07, + "loss": 1.7515, + "step": 4580 + }, + { + "epoch": 0.308951854721876, + "grad_norm": 4.7049240303578275, + "learning_rate": 8.727818954233904e-07, + "loss": 1.7318, + "step": 4585 + }, + { + "epoch": 0.3092887705939827, + "grad_norm": 4.938502374647742, + "learning_rate": 8.723897417321084e-07, + "loss": 1.725, + "step": 4590 + }, + { + "epoch": 0.3096256864660894, + "grad_norm": 5.022532343934432, + "learning_rate": 8.719970729490788e-07, + "loss": 1.7443, + "step": 4595 + }, + { + "epoch": 0.30996260233819617, + "grad_norm": 4.738767482257275, + "learning_rate": 8.716038896174432e-07, + "loss": 1.755, + "step": 4600 + }, + { + "epoch": 0.3102995182103029, + "grad_norm": 4.7114300519860475, + "learning_rate": 8.712101922810551e-07, + "loss": 1.75, + "step": 4605 + }, + { + "epoch": 0.3106364340824096, + "grad_norm": 4.571041536354113, + "learning_rate": 8.708159814844793e-07, + "loss": 1.7098, + "step": 4610 + }, + { + "epoch": 0.31097334995451636, + "grad_norm": 4.407045547143283, + "learning_rate": 8.704212577729905e-07, + "loss": 1.7793, + "step": 4615 + }, + { + "epoch": 0.3113102658266231, + "grad_norm": 4.728634153355158, + "learning_rate": 8.700260216925728e-07, + "loss": 1.6889, + "step": 4620 + }, + { + "epoch": 0.31164718169872985, + "grad_norm": 4.563219061062255, + "learning_rate": 8.696302737899192e-07, + "loss": 1.7611, + "step": 4625 + }, + { + "epoch": 0.31198409757083656, + "grad_norm": 4.783988230321884, + "learning_rate": 8.692340146124308e-07, + "loss": 1.7104, + "step": 4630 + }, + { + "epoch": 0.3123210134429433, + "grad_norm": 4.847737749788932, + "learning_rate": 8.688372447082153e-07, + "loss": 1.7511, + "step": 4635 + }, + { + "epoch": 0.31265792931505004, + "grad_norm": 4.907400664885099, + "learning_rate": 8.684399646260876e-07, + "loss": 1.7158, + "step": 4640 + }, + { + "epoch": 0.31299484518715676, + "grad_norm": 4.876331767645437, + "learning_rate": 8.680421749155677e-07, + "loss": 1.6995, + "step": 4645 + }, + { + "epoch": 0.3133317610592635, + "grad_norm": 4.894058583174596, + "learning_rate": 8.676438761268808e-07, + "loss": 1.7764, + "step": 4650 + }, + { + "epoch": 0.31366867693137024, + "grad_norm": 4.672221745908909, + "learning_rate": 8.672450688109563e-07, + "loss": 1.7844, + "step": 4655 + }, + { + "epoch": 0.31400559280347695, + "grad_norm": 5.147054583280883, + "learning_rate": 8.668457535194267e-07, + "loss": 1.8186, + "step": 4660 + }, + { + "epoch": 0.3143425086755837, + "grad_norm": 4.51560485470913, + "learning_rate": 8.664459308046274e-07, + "loss": 1.7676, + "step": 4665 + }, + { + "epoch": 0.31467942454769043, + "grad_norm": 4.767998872271937, + "learning_rate": 8.660456012195957e-07, + "loss": 1.7302, + "step": 4670 + }, + { + "epoch": 0.3150163404197972, + "grad_norm": 4.765982682989915, + "learning_rate": 8.656447653180699e-07, + "loss": 1.7017, + "step": 4675 + }, + { + "epoch": 0.3153532562919039, + "grad_norm": 4.6837995867112285, + "learning_rate": 8.652434236544886e-07, + "loss": 1.7546, + "step": 4680 + }, + { + "epoch": 0.31569017216401063, + "grad_norm": 4.823408524687895, + "learning_rate": 8.648415767839899e-07, + "loss": 1.7132, + "step": 4685 + }, + { + "epoch": 0.3160270880361174, + "grad_norm": 4.730875315239921, + "learning_rate": 8.644392252624108e-07, + "loss": 1.6749, + "step": 4690 + }, + { + "epoch": 0.3163640039082241, + "grad_norm": 4.282912686826423, + "learning_rate": 8.640363696462869e-07, + "loss": 1.7665, + "step": 4695 + }, + { + "epoch": 0.3167009197803308, + "grad_norm": 4.436758751754393, + "learning_rate": 8.636330104928499e-07, + "loss": 1.7438, + "step": 4700 + }, + { + "epoch": 0.3170378356524376, + "grad_norm": 4.964885054397722, + "learning_rate": 8.632291483600289e-07, + "loss": 1.7099, + "step": 4705 + }, + { + "epoch": 0.3173747515245443, + "grad_norm": 4.675853565648876, + "learning_rate": 8.628247838064485e-07, + "loss": 1.7111, + "step": 4710 + }, + { + "epoch": 0.3177116673966511, + "grad_norm": 4.6934367206975365, + "learning_rate": 8.624199173914279e-07, + "loss": 1.7971, + "step": 4715 + }, + { + "epoch": 0.3180485832687578, + "grad_norm": 4.512080853138636, + "learning_rate": 8.620145496749811e-07, + "loss": 1.7319, + "step": 4720 + }, + { + "epoch": 0.3183854991408645, + "grad_norm": 5.079728958055047, + "learning_rate": 8.616086812178151e-07, + "loss": 1.737, + "step": 4725 + }, + { + "epoch": 0.31872241501297127, + "grad_norm": 4.640827385755181, + "learning_rate": 8.612023125813296e-07, + "loss": 1.7648, + "step": 4730 + }, + { + "epoch": 0.319059330885078, + "grad_norm": 4.276778816417824, + "learning_rate": 8.607954443276162e-07, + "loss": 1.7802, + "step": 4735 + }, + { + "epoch": 0.31939624675718475, + "grad_norm": 4.481763322120392, + "learning_rate": 8.603880770194574e-07, + "loss": 1.7305, + "step": 4740 + }, + { + "epoch": 0.31973316262929147, + "grad_norm": 4.615513878941768, + "learning_rate": 8.59980211220326e-07, + "loss": 1.6987, + "step": 4745 + }, + { + "epoch": 0.3200700785013982, + "grad_norm": 4.7455902906252465, + "learning_rate": 8.595718474943849e-07, + "loss": 1.7027, + "step": 4750 + }, + { + "epoch": 0.32040699437350495, + "grad_norm": 4.68398292422727, + "learning_rate": 8.591629864064851e-07, + "loss": 1.7455, + "step": 4755 + }, + { + "epoch": 0.32074391024561166, + "grad_norm": 4.9222045458553225, + "learning_rate": 8.587536285221655e-07, + "loss": 1.8028, + "step": 4760 + }, + { + "epoch": 0.32108082611771843, + "grad_norm": 4.502320783272846, + "learning_rate": 8.583437744076527e-07, + "loss": 1.7402, + "step": 4765 + }, + { + "epoch": 0.32141774198982515, + "grad_norm": 5.144779588736784, + "learning_rate": 8.579334246298592e-07, + "loss": 1.7628, + "step": 4770 + }, + { + "epoch": 0.32175465786193186, + "grad_norm": 4.504761407192705, + "learning_rate": 8.575225797563834e-07, + "loss": 1.7915, + "step": 4775 + }, + { + "epoch": 0.3220915737340386, + "grad_norm": 5.735894915412811, + "learning_rate": 8.571112403555083e-07, + "loss": 1.7218, + "step": 4780 + }, + { + "epoch": 0.32242848960614534, + "grad_norm": 4.7323785258509785, + "learning_rate": 8.566994069962012e-07, + "loss": 1.7429, + "step": 4785 + }, + { + "epoch": 0.32276540547825205, + "grad_norm": 4.487702277209751, + "learning_rate": 8.562870802481126e-07, + "loss": 1.7489, + "step": 4790 + }, + { + "epoch": 0.3231023213503588, + "grad_norm": 4.928490583624491, + "learning_rate": 8.55874260681575e-07, + "loss": 1.7347, + "step": 4795 + }, + { + "epoch": 0.32343923722246554, + "grad_norm": 4.877761123390889, + "learning_rate": 8.554609488676032e-07, + "loss": 1.7209, + "step": 4800 + }, + { + "epoch": 0.3237761530945723, + "grad_norm": 4.764011730790988, + "learning_rate": 8.550471453778925e-07, + "loss": 1.717, + "step": 4805 + }, + { + "epoch": 0.324113068966679, + "grad_norm": 4.799601103958323, + "learning_rate": 8.546328507848184e-07, + "loss": 1.7163, + "step": 4810 + }, + { + "epoch": 0.32444998483878573, + "grad_norm": 4.504239537326746, + "learning_rate": 8.542180656614358e-07, + "loss": 1.6741, + "step": 4815 + }, + { + "epoch": 0.3247869007108925, + "grad_norm": 4.770834350267517, + "learning_rate": 8.538027905814778e-07, + "loss": 1.6956, + "step": 4820 + }, + { + "epoch": 0.3251238165829992, + "grad_norm": 7.096648611390167, + "learning_rate": 8.533870261193556e-07, + "loss": 1.6588, + "step": 4825 + }, + { + "epoch": 0.325460732455106, + "grad_norm": 4.864466677890027, + "learning_rate": 8.529707728501571e-07, + "loss": 1.6915, + "step": 4830 + }, + { + "epoch": 0.3257976483272127, + "grad_norm": 4.744648508382125, + "learning_rate": 8.525540313496462e-07, + "loss": 1.7164, + "step": 4835 + }, + { + "epoch": 0.3261345641993194, + "grad_norm": 4.529035511663301, + "learning_rate": 8.521368021942623e-07, + "loss": 1.7921, + "step": 4840 + }, + { + "epoch": 0.3264714800714262, + "grad_norm": 4.9316185678104025, + "learning_rate": 8.517190859611195e-07, + "loss": 1.6849, + "step": 4845 + }, + { + "epoch": 0.3268083959435329, + "grad_norm": 5.14026374289455, + "learning_rate": 8.513008832280053e-07, + "loss": 1.8128, + "step": 4850 + }, + { + "epoch": 0.32714531181563966, + "grad_norm": 4.985258213485664, + "learning_rate": 8.508821945733802e-07, + "loss": 1.7436, + "step": 4855 + }, + { + "epoch": 0.3274822276877464, + "grad_norm": 4.869434253185966, + "learning_rate": 8.504630205763768e-07, + "loss": 1.7338, + "step": 4860 + }, + { + "epoch": 0.3278191435598531, + "grad_norm": 4.569578875489374, + "learning_rate": 8.500433618167992e-07, + "loss": 1.7813, + "step": 4865 + }, + { + "epoch": 0.32815605943195986, + "grad_norm": 4.72390686940452, + "learning_rate": 8.496232188751222e-07, + "loss": 1.7272, + "step": 4870 + }, + { + "epoch": 0.32849297530406657, + "grad_norm": 4.337728617203676, + "learning_rate": 8.492025923324897e-07, + "loss": 1.711, + "step": 4875 + }, + { + "epoch": 0.3288298911761733, + "grad_norm": 4.665189493399873, + "learning_rate": 8.487814827707152e-07, + "loss": 1.7424, + "step": 4880 + }, + { + "epoch": 0.32916680704828005, + "grad_norm": 4.764469314965838, + "learning_rate": 8.483598907722795e-07, + "loss": 1.7619, + "step": 4885 + }, + { + "epoch": 0.32950372292038677, + "grad_norm": 4.415938675590576, + "learning_rate": 8.479378169203317e-07, + "loss": 1.6712, + "step": 4890 + }, + { + "epoch": 0.32984063879249353, + "grad_norm": 4.798044411903519, + "learning_rate": 8.475152617986869e-07, + "loss": 1.698, + "step": 4895 + }, + { + "epoch": 0.33017755466460025, + "grad_norm": 4.675109222425787, + "learning_rate": 8.470922259918254e-07, + "loss": 1.6894, + "step": 4900 + }, + { + "epoch": 0.33051447053670696, + "grad_norm": 4.481553305526358, + "learning_rate": 8.466687100848935e-07, + "loss": 1.7655, + "step": 4905 + }, + { + "epoch": 0.33085138640881373, + "grad_norm": 4.93030004046697, + "learning_rate": 8.462447146637006e-07, + "loss": 1.7206, + "step": 4910 + }, + { + "epoch": 0.33118830228092044, + "grad_norm": 6.135512951857502, + "learning_rate": 8.458202403147199e-07, + "loss": 1.725, + "step": 4915 + }, + { + "epoch": 0.3315252181530272, + "grad_norm": 4.855174964493657, + "learning_rate": 8.453952876250867e-07, + "loss": 1.6884, + "step": 4920 + }, + { + "epoch": 0.3318621340251339, + "grad_norm": 4.830245687846346, + "learning_rate": 8.449698571825984e-07, + "loss": 1.777, + "step": 4925 + }, + { + "epoch": 0.33219904989724064, + "grad_norm": 4.81643428731798, + "learning_rate": 8.445439495757127e-07, + "loss": 1.7166, + "step": 4930 + }, + { + "epoch": 0.3325359657693474, + "grad_norm": 4.766650179919701, + "learning_rate": 8.44117565393548e-07, + "loss": 1.6928, + "step": 4935 + }, + { + "epoch": 0.3328728816414541, + "grad_norm": 4.504805907009001, + "learning_rate": 8.436907052258808e-07, + "loss": 1.7757, + "step": 4940 + }, + { + "epoch": 0.3332097975135609, + "grad_norm": 4.871702033132882, + "learning_rate": 8.432633696631473e-07, + "loss": 1.8047, + "step": 4945 + }, + { + "epoch": 0.3335467133856676, + "grad_norm": 4.360999298002896, + "learning_rate": 8.428355592964405e-07, + "loss": 1.7514, + "step": 4950 + }, + { + "epoch": 0.3338836292577743, + "grad_norm": 4.40738849972985, + "learning_rate": 8.424072747175102e-07, + "loss": 1.7638, + "step": 4955 + }, + { + "epoch": 0.3342205451298811, + "grad_norm": 4.402646732849628, + "learning_rate": 8.419785165187621e-07, + "loss": 1.6341, + "step": 4960 + }, + { + "epoch": 0.3345574610019878, + "grad_norm": 4.641381273642867, + "learning_rate": 8.415492852932573e-07, + "loss": 1.6812, + "step": 4965 + }, + { + "epoch": 0.3348943768740945, + "grad_norm": 4.92094782072377, + "learning_rate": 8.41119581634711e-07, + "loss": 1.8236, + "step": 4970 + }, + { + "epoch": 0.3352312927462013, + "grad_norm": 4.592892836304687, + "learning_rate": 8.406894061374918e-07, + "loss": 1.8073, + "step": 4975 + }, + { + "epoch": 0.335568208618308, + "grad_norm": 4.612862661847665, + "learning_rate": 8.402587593966213e-07, + "loss": 1.6964, + "step": 4980 + }, + { + "epoch": 0.33590512449041476, + "grad_norm": 4.537376816334139, + "learning_rate": 8.398276420077726e-07, + "loss": 1.7642, + "step": 4985 + }, + { + "epoch": 0.3362420403625215, + "grad_norm": 4.712056499426122, + "learning_rate": 8.393960545672698e-07, + "loss": 1.7134, + "step": 4990 + }, + { + "epoch": 0.3365789562346282, + "grad_norm": 4.564828506764209, + "learning_rate": 8.389639976720873e-07, + "loss": 1.7204, + "step": 4995 + }, + { + "epoch": 0.33691587210673496, + "grad_norm": 4.914182336061254, + "learning_rate": 8.385314719198487e-07, + "loss": 1.7386, + "step": 5000 + }, + { + "epoch": 0.3372527879788417, + "grad_norm": 4.484842012048959, + "learning_rate": 8.380984779088264e-07, + "loss": 1.7416, + "step": 5005 + }, + { + "epoch": 0.33758970385094844, + "grad_norm": 5.139741044975532, + "learning_rate": 8.376650162379404e-07, + "loss": 1.7514, + "step": 5010 + }, + { + "epoch": 0.33792661972305515, + "grad_norm": 4.47276838902141, + "learning_rate": 8.372310875067572e-07, + "loss": 1.7606, + "step": 5015 + }, + { + "epoch": 0.33826353559516187, + "grad_norm": 4.531427897815532, + "learning_rate": 8.367966923154899e-07, + "loss": 1.6455, + "step": 5020 + }, + { + "epoch": 0.33860045146726864, + "grad_norm": 4.661435301070422, + "learning_rate": 8.363618312649967e-07, + "loss": 1.7284, + "step": 5025 + }, + { + "epoch": 0.33893736733937535, + "grad_norm": 4.678787625067841, + "learning_rate": 8.359265049567796e-07, + "loss": 1.719, + "step": 5030 + }, + { + "epoch": 0.3392742832114821, + "grad_norm": 4.110099175210853, + "learning_rate": 8.35490713992985e-07, + "loss": 1.7321, + "step": 5035 + }, + { + "epoch": 0.33961119908358883, + "grad_norm": 4.7168890532127135, + "learning_rate": 8.350544589764015e-07, + "loss": 1.7626, + "step": 5040 + }, + { + "epoch": 0.33994811495569555, + "grad_norm": 4.5142169100739205, + "learning_rate": 8.346177405104595e-07, + "loss": 1.7402, + "step": 5045 + }, + { + "epoch": 0.3402850308278023, + "grad_norm": 4.844467413578708, + "learning_rate": 8.341805591992308e-07, + "loss": 1.7593, + "step": 5050 + }, + { + "epoch": 0.34062194669990903, + "grad_norm": 4.517706615153855, + "learning_rate": 8.337429156474272e-07, + "loss": 1.6679, + "step": 5055 + }, + { + "epoch": 0.34095886257201574, + "grad_norm": 4.377813395323879, + "learning_rate": 8.333048104603999e-07, + "loss": 1.6646, + "step": 5060 + }, + { + "epoch": 0.3412957784441225, + "grad_norm": 4.482064436846207, + "learning_rate": 8.328662442441388e-07, + "loss": 1.6389, + "step": 5065 + }, + { + "epoch": 0.3416326943162292, + "grad_norm": 4.400411146015806, + "learning_rate": 8.32427217605271e-07, + "loss": 1.7307, + "step": 5070 + }, + { + "epoch": 0.341969610188336, + "grad_norm": 4.528585841227934, + "learning_rate": 8.319877311510612e-07, + "loss": 1.7653, + "step": 5075 + }, + { + "epoch": 0.3423065260604427, + "grad_norm": 4.697407386040308, + "learning_rate": 8.315477854894095e-07, + "loss": 1.7074, + "step": 5080 + }, + { + "epoch": 0.3426434419325494, + "grad_norm": 4.59364070633189, + "learning_rate": 8.311073812288513e-07, + "loss": 1.7516, + "step": 5085 + }, + { + "epoch": 0.3429803578046562, + "grad_norm": 4.618861246582405, + "learning_rate": 8.306665189785567e-07, + "loss": 1.723, + "step": 5090 + }, + { + "epoch": 0.3433172736767629, + "grad_norm": 4.3958818021585655, + "learning_rate": 8.302251993483289e-07, + "loss": 1.7322, + "step": 5095 + }, + { + "epoch": 0.34365418954886967, + "grad_norm": 5.09198950014467, + "learning_rate": 8.297834229486039e-07, + "loss": 1.7217, + "step": 5100 + }, + { + "epoch": 0.3439911054209764, + "grad_norm": 4.498521150001597, + "learning_rate": 8.293411903904496e-07, + "loss": 1.6854, + "step": 5105 + }, + { + "epoch": 0.3443280212930831, + "grad_norm": 4.456639889055303, + "learning_rate": 8.288985022855645e-07, + "loss": 1.7743, + "step": 5110 + }, + { + "epoch": 0.34466493716518987, + "grad_norm": 4.612852747434726, + "learning_rate": 8.284553592462778e-07, + "loss": 1.7581, + "step": 5115 + }, + { + "epoch": 0.3450018530372966, + "grad_norm": 4.313561479912604, + "learning_rate": 8.280117618855475e-07, + "loss": 1.75, + "step": 5120 + }, + { + "epoch": 0.34533876890940335, + "grad_norm": 4.774021466238921, + "learning_rate": 8.2756771081696e-07, + "loss": 1.7794, + "step": 5125 + }, + { + "epoch": 0.34567568478151006, + "grad_norm": 4.532488619269362, + "learning_rate": 8.271232066547296e-07, + "loss": 1.7032, + "step": 5130 + }, + { + "epoch": 0.3460126006536168, + "grad_norm": 4.210061067829154, + "learning_rate": 8.266782500136971e-07, + "loss": 1.7748, + "step": 5135 + }, + { + "epoch": 0.34634951652572354, + "grad_norm": 4.882640898960227, + "learning_rate": 8.262328415093293e-07, + "loss": 1.7544, + "step": 5140 + }, + { + "epoch": 0.34668643239783026, + "grad_norm": 4.676380957510316, + "learning_rate": 8.257869817577179e-07, + "loss": 1.7428, + "step": 5145 + }, + { + "epoch": 0.34702334826993697, + "grad_norm": 4.494144685289505, + "learning_rate": 8.253406713755786e-07, + "loss": 1.7142, + "step": 5150 + }, + { + "epoch": 0.34736026414204374, + "grad_norm": 4.81974015891061, + "learning_rate": 8.24893910980251e-07, + "loss": 1.6675, + "step": 5155 + }, + { + "epoch": 0.34769718001415045, + "grad_norm": 4.66644833407102, + "learning_rate": 8.244467011896965e-07, + "loss": 1.66, + "step": 5160 + }, + { + "epoch": 0.3480340958862572, + "grad_norm": 4.265031234744358, + "learning_rate": 8.239990426224986e-07, + "loss": 1.7472, + "step": 5165 + }, + { + "epoch": 0.34837101175836394, + "grad_norm": 4.46755983851988, + "learning_rate": 8.235509358978611e-07, + "loss": 1.7599, + "step": 5170 + }, + { + "epoch": 0.34870792763047065, + "grad_norm": 4.846201035405282, + "learning_rate": 8.231023816356081e-07, + "loss": 1.6988, + "step": 5175 + }, + { + "epoch": 0.3490448435025774, + "grad_norm": 5.068632343934939, + "learning_rate": 8.226533804561826e-07, + "loss": 1.7098, + "step": 5180 + }, + { + "epoch": 0.34938175937468413, + "grad_norm": 4.780165677711516, + "learning_rate": 8.222039329806456e-07, + "loss": 1.7011, + "step": 5185 + }, + { + "epoch": 0.3497186752467909, + "grad_norm": 4.621335846200827, + "learning_rate": 8.217540398306757e-07, + "loss": 1.6735, + "step": 5190 + }, + { + "epoch": 0.3500555911188976, + "grad_norm": 4.513458917850163, + "learning_rate": 8.213037016285679e-07, + "loss": 1.7434, + "step": 5195 + }, + { + "epoch": 0.3503925069910043, + "grad_norm": 4.9286192435302185, + "learning_rate": 8.208529189972325e-07, + "loss": 1.7284, + "step": 5200 + }, + { + "epoch": 0.3507294228631111, + "grad_norm": 4.934007063420536, + "learning_rate": 8.204016925601951e-07, + "loss": 1.7539, + "step": 5205 + }, + { + "epoch": 0.3510663387352178, + "grad_norm": 4.855237741263649, + "learning_rate": 8.199500229415945e-07, + "loss": 1.7085, + "step": 5210 + }, + { + "epoch": 0.3514032546073246, + "grad_norm": 4.615249141184611, + "learning_rate": 8.19497910766183e-07, + "loss": 1.7298, + "step": 5215 + }, + { + "epoch": 0.3517401704794313, + "grad_norm": 4.495205667452235, + "learning_rate": 8.19045356659325e-07, + "loss": 1.7362, + "step": 5220 + }, + { + "epoch": 0.352077086351538, + "grad_norm": 4.829057942023865, + "learning_rate": 8.185923612469958e-07, + "loss": 1.6869, + "step": 5225 + }, + { + "epoch": 0.3524140022236448, + "grad_norm": 4.725434418643222, + "learning_rate": 8.181389251557817e-07, + "loss": 1.7194, + "step": 5230 + }, + { + "epoch": 0.3527509180957515, + "grad_norm": 4.58503870736107, + "learning_rate": 8.176850490128782e-07, + "loss": 1.7013, + "step": 5235 + }, + { + "epoch": 0.3530878339678582, + "grad_norm": 4.782838738330386, + "learning_rate": 8.172307334460892e-07, + "loss": 1.7747, + "step": 5240 + }, + { + "epoch": 0.35342474983996497, + "grad_norm": 5.451500093580164, + "learning_rate": 8.167759790838273e-07, + "loss": 1.7207, + "step": 5245 + }, + { + "epoch": 0.3537616657120717, + "grad_norm": 4.695952472643797, + "learning_rate": 8.163207865551111e-07, + "loss": 1.6691, + "step": 5250 + }, + { + "epoch": 0.35409858158417845, + "grad_norm": 4.953931315209139, + "learning_rate": 8.158651564895657e-07, + "loss": 1.68, + "step": 5255 + }, + { + "epoch": 0.35443549745628516, + "grad_norm": 4.731523917919795, + "learning_rate": 8.154090895174215e-07, + "loss": 1.7114, + "step": 5260 + }, + { + "epoch": 0.3547724133283919, + "grad_norm": 4.665813772982818, + "learning_rate": 8.149525862695131e-07, + "loss": 1.7682, + "step": 5265 + }, + { + "epoch": 0.35510932920049865, + "grad_norm": 4.2650023278644005, + "learning_rate": 8.144956473772784e-07, + "loss": 1.7071, + "step": 5270 + }, + { + "epoch": 0.35544624507260536, + "grad_norm": 4.583863138977418, + "learning_rate": 8.140382734727581e-07, + "loss": 1.7393, + "step": 5275 + }, + { + "epoch": 0.35578316094471213, + "grad_norm": 4.401047996458969, + "learning_rate": 8.135804651885946e-07, + "loss": 1.7556, + "step": 5280 + }, + { + "epoch": 0.35612007681681884, + "grad_norm": 4.948765436864181, + "learning_rate": 8.131222231580313e-07, + "loss": 1.6948, + "step": 5285 + }, + { + "epoch": 0.35645699268892556, + "grad_norm": 4.647264244397854, + "learning_rate": 8.126635480149107e-07, + "loss": 1.7475, + "step": 5290 + }, + { + "epoch": 0.3567939085610323, + "grad_norm": 4.6376077560112545, + "learning_rate": 8.122044403936759e-07, + "loss": 1.7301, + "step": 5295 + }, + { + "epoch": 0.35713082443313904, + "grad_norm": 4.792132621017928, + "learning_rate": 8.117449009293668e-07, + "loss": 1.7136, + "step": 5300 + }, + { + "epoch": 0.3574677403052458, + "grad_norm": 4.6050017365222535, + "learning_rate": 8.112849302576212e-07, + "loss": 1.7232, + "step": 5305 + }, + { + "epoch": 0.3578046561773525, + "grad_norm": 4.333389497999406, + "learning_rate": 8.108245290146735e-07, + "loss": 1.6384, + "step": 5310 + }, + { + "epoch": 0.35814157204945923, + "grad_norm": 4.78399376828893, + "learning_rate": 8.103636978373534e-07, + "loss": 1.6869, + "step": 5315 + }, + { + "epoch": 0.358478487921566, + "grad_norm": 4.895830217254766, + "learning_rate": 8.099024373630854e-07, + "loss": 1.67, + "step": 5320 + }, + { + "epoch": 0.3588154037936727, + "grad_norm": 4.518244253442206, + "learning_rate": 8.094407482298877e-07, + "loss": 1.7102, + "step": 5325 + }, + { + "epoch": 0.35915231966577943, + "grad_norm": 4.402545137279259, + "learning_rate": 8.089786310763716e-07, + "loss": 1.7257, + "step": 5330 + }, + { + "epoch": 0.3594892355378862, + "grad_norm": 4.404324471348795, + "learning_rate": 8.085160865417403e-07, + "loss": 1.6728, + "step": 5335 + }, + { + "epoch": 0.3598261514099929, + "grad_norm": 4.619273776898724, + "learning_rate": 8.080531152657884e-07, + "loss": 1.6772, + "step": 5340 + }, + { + "epoch": 0.3601630672820997, + "grad_norm": 5.085968829325327, + "learning_rate": 8.075897178889002e-07, + "loss": 1.7058, + "step": 5345 + }, + { + "epoch": 0.3604999831542064, + "grad_norm": 4.681702471986685, + "learning_rate": 8.071258950520501e-07, + "loss": 1.7616, + "step": 5350 + }, + { + "epoch": 0.3608368990263131, + "grad_norm": 4.542808993087876, + "learning_rate": 8.066616473968005e-07, + "loss": 1.7601, + "step": 5355 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 4.7748260170083245, + "learning_rate": 8.061969755653013e-07, + "loss": 1.7815, + "step": 5360 + }, + { + "epoch": 0.3615107307705266, + "grad_norm": 4.829390136039957, + "learning_rate": 8.0573188020029e-07, + "loss": 1.7522, + "step": 5365 + }, + { + "epoch": 0.36184764664263336, + "grad_norm": 4.538492090991936, + "learning_rate": 8.052663619450889e-07, + "loss": 1.6557, + "step": 5370 + }, + { + "epoch": 0.36218456251474007, + "grad_norm": 4.7861584328042674, + "learning_rate": 8.048004214436058e-07, + "loss": 1.7294, + "step": 5375 + }, + { + "epoch": 0.3625214783868468, + "grad_norm": 4.833761542360912, + "learning_rate": 8.043340593403325e-07, + "loss": 1.8014, + "step": 5380 + }, + { + "epoch": 0.36285839425895355, + "grad_norm": 4.986294508483786, + "learning_rate": 8.038672762803437e-07, + "loss": 1.7386, + "step": 5385 + }, + { + "epoch": 0.36319531013106027, + "grad_norm": 4.369718973973582, + "learning_rate": 8.034000729092967e-07, + "loss": 1.7509, + "step": 5390 + }, + { + "epoch": 0.36353222600316704, + "grad_norm": 5.256644811129503, + "learning_rate": 8.029324498734299e-07, + "loss": 1.6701, + "step": 5395 + }, + { + "epoch": 0.36386914187527375, + "grad_norm": 4.6274278308145105, + "learning_rate": 8.024644078195625e-07, + "loss": 1.7283, + "step": 5400 + }, + { + "epoch": 0.36420605774738046, + "grad_norm": 4.652898587715458, + "learning_rate": 8.01995947395093e-07, + "loss": 1.7084, + "step": 5405 + }, + { + "epoch": 0.36454297361948723, + "grad_norm": 4.469834686326115, + "learning_rate": 8.015270692479988e-07, + "loss": 1.7923, + "step": 5410 + }, + { + "epoch": 0.36487988949159395, + "grad_norm": 4.904048624023098, + "learning_rate": 8.010577740268347e-07, + "loss": 1.7368, + "step": 5415 + }, + { + "epoch": 0.36521680536370066, + "grad_norm": 4.77922639432497, + "learning_rate": 8.005880623807331e-07, + "loss": 1.636, + "step": 5420 + }, + { + "epoch": 0.3655537212358074, + "grad_norm": 5.2469243350698065, + "learning_rate": 8.001179349594016e-07, + "loss": 1.78, + "step": 5425 + }, + { + "epoch": 0.36589063710791414, + "grad_norm": 4.696982829348919, + "learning_rate": 7.996473924131236e-07, + "loss": 1.7995, + "step": 5430 + }, + { + "epoch": 0.3662275529800209, + "grad_norm": 4.754900374319797, + "learning_rate": 7.991764353927562e-07, + "loss": 1.7253, + "step": 5435 + }, + { + "epoch": 0.3665644688521276, + "grad_norm": 4.766043957422087, + "learning_rate": 7.987050645497302e-07, + "loss": 1.7251, + "step": 5440 + }, + { + "epoch": 0.36690138472423434, + "grad_norm": 5.495055814974811, + "learning_rate": 7.982332805360486e-07, + "loss": 1.721, + "step": 5445 + }, + { + "epoch": 0.3672383005963411, + "grad_norm": 5.185117084314556, + "learning_rate": 7.977610840042856e-07, + "loss": 1.7162, + "step": 5450 + }, + { + "epoch": 0.3675752164684478, + "grad_norm": 4.326014002095558, + "learning_rate": 7.972884756075867e-07, + "loss": 1.8196, + "step": 5455 + }, + { + "epoch": 0.3679121323405546, + "grad_norm": 4.957005167673788, + "learning_rate": 7.968154559996665e-07, + "loss": 1.7502, + "step": 5460 + }, + { + "epoch": 0.3682490482126613, + "grad_norm": 4.551368177373224, + "learning_rate": 7.963420258348086e-07, + "loss": 1.6959, + "step": 5465 + }, + { + "epoch": 0.368585964084768, + "grad_norm": 4.234515423236089, + "learning_rate": 7.958681857678645e-07, + "loss": 1.7148, + "step": 5470 + }, + { + "epoch": 0.3689228799568748, + "grad_norm": 4.230681426026155, + "learning_rate": 7.953939364542523e-07, + "loss": 1.6616, + "step": 5475 + }, + { + "epoch": 0.3692597958289815, + "grad_norm": 4.696843792805101, + "learning_rate": 7.949192785499573e-07, + "loss": 1.6891, + "step": 5480 + }, + { + "epoch": 0.36959671170108827, + "grad_norm": 4.732075643282494, + "learning_rate": 7.944442127115285e-07, + "loss": 1.6966, + "step": 5485 + }, + { + "epoch": 0.369933627573195, + "grad_norm": 4.746063635976779, + "learning_rate": 7.939687395960802e-07, + "loss": 1.6941, + "step": 5490 + }, + { + "epoch": 0.3702705434453017, + "grad_norm": 4.850456587535771, + "learning_rate": 7.934928598612895e-07, + "loss": 1.6475, + "step": 5495 + }, + { + "epoch": 0.37060745931740846, + "grad_norm": 4.415645353505913, + "learning_rate": 7.930165741653964e-07, + "loss": 1.7229, + "step": 5500 + }, + { + "epoch": 0.3709443751895152, + "grad_norm": 4.669235129577563, + "learning_rate": 7.925398831672018e-07, + "loss": 1.6973, + "step": 5505 + }, + { + "epoch": 0.3712812910616219, + "grad_norm": 4.404915667914949, + "learning_rate": 7.920627875260679e-07, + "loss": 1.8221, + "step": 5510 + }, + { + "epoch": 0.37161820693372866, + "grad_norm": 4.560927372059081, + "learning_rate": 7.91585287901916e-07, + "loss": 1.7294, + "step": 5515 + }, + { + "epoch": 0.37195512280583537, + "grad_norm": 4.552193802277202, + "learning_rate": 7.911073849552267e-07, + "loss": 1.7327, + "step": 5520 + }, + { + "epoch": 0.37229203867794214, + "grad_norm": 4.6558433797374255, + "learning_rate": 7.906290793470382e-07, + "loss": 1.7607, + "step": 5525 + }, + { + "epoch": 0.37262895455004885, + "grad_norm": 4.894533129056544, + "learning_rate": 7.901503717389458e-07, + "loss": 1.7314, + "step": 5530 + }, + { + "epoch": 0.37296587042215557, + "grad_norm": 4.8296195430988975, + "learning_rate": 7.896712627931004e-07, + "loss": 1.7676, + "step": 5535 + }, + { + "epoch": 0.37330278629426233, + "grad_norm": 4.68120650909417, + "learning_rate": 7.891917531722087e-07, + "loss": 1.728, + "step": 5540 + }, + { + "epoch": 0.37363970216636905, + "grad_norm": 4.377869202077806, + "learning_rate": 7.887118435395314e-07, + "loss": 1.7314, + "step": 5545 + }, + { + "epoch": 0.3739766180384758, + "grad_norm": 4.522942860706774, + "learning_rate": 7.882315345588823e-07, + "loss": 1.722, + "step": 5550 + }, + { + "epoch": 0.37431353391058253, + "grad_norm": 4.296050621560291, + "learning_rate": 7.877508268946275e-07, + "loss": 1.6951, + "step": 5555 + }, + { + "epoch": 0.37465044978268924, + "grad_norm": 4.295345819522486, + "learning_rate": 7.87269721211685e-07, + "loss": 1.7105, + "step": 5560 + }, + { + "epoch": 0.374987365654796, + "grad_norm": 4.613573892628392, + "learning_rate": 7.86788218175523e-07, + "loss": 1.7668, + "step": 5565 + }, + { + "epoch": 0.3753242815269027, + "grad_norm": 4.4369340983936665, + "learning_rate": 7.863063184521595e-07, + "loss": 1.6742, + "step": 5570 + }, + { + "epoch": 0.3756611973990095, + "grad_norm": 4.378969798272695, + "learning_rate": 7.858240227081611e-07, + "loss": 1.6726, + "step": 5575 + }, + { + "epoch": 0.3759981132711162, + "grad_norm": 4.696000087330842, + "learning_rate": 7.85341331610642e-07, + "loss": 1.7498, + "step": 5580 + }, + { + "epoch": 0.3763350291432229, + "grad_norm": 4.399452146909051, + "learning_rate": 7.848582458272637e-07, + "loss": 1.7782, + "step": 5585 + }, + { + "epoch": 0.3766719450153297, + "grad_norm": 5.4100168587899455, + "learning_rate": 7.843747660262333e-07, + "loss": 1.6216, + "step": 5590 + }, + { + "epoch": 0.3770088608874364, + "grad_norm": 4.542837235889031, + "learning_rate": 7.838908928763028e-07, + "loss": 1.711, + "step": 5595 + }, + { + "epoch": 0.3773457767595431, + "grad_norm": 4.628730063837452, + "learning_rate": 7.834066270467689e-07, + "loss": 1.7258, + "step": 5600 + }, + { + "epoch": 0.3776826926316499, + "grad_norm": 4.599442184002633, + "learning_rate": 7.829219692074707e-07, + "loss": 1.7791, + "step": 5605 + }, + { + "epoch": 0.3780196085037566, + "grad_norm": 4.616484916394916, + "learning_rate": 7.824369200287899e-07, + "loss": 1.7359, + "step": 5610 + }, + { + "epoch": 0.37835652437586337, + "grad_norm": 4.447771634117519, + "learning_rate": 7.819514801816496e-07, + "loss": 1.7386, + "step": 5615 + }, + { + "epoch": 0.3786934402479701, + "grad_norm": 4.521321589840539, + "learning_rate": 7.814656503375128e-07, + "loss": 1.6908, + "step": 5620 + }, + { + "epoch": 0.3790303561200768, + "grad_norm": 4.903527067915301, + "learning_rate": 7.809794311683828e-07, + "loss": 1.7085, + "step": 5625 + }, + { + "epoch": 0.37936727199218356, + "grad_norm": 4.183363887244186, + "learning_rate": 7.804928233468006e-07, + "loss": 1.7351, + "step": 5630 + }, + { + "epoch": 0.3797041878642903, + "grad_norm": 4.413163244248408, + "learning_rate": 7.80005827545845e-07, + "loss": 1.7733, + "step": 5635 + }, + { + "epoch": 0.38004110373639705, + "grad_norm": 4.907578526330726, + "learning_rate": 7.795184444391318e-07, + "loss": 1.7099, + "step": 5640 + }, + { + "epoch": 0.38037801960850376, + "grad_norm": 4.351667924698213, + "learning_rate": 7.790306747008119e-07, + "loss": 1.6799, + "step": 5645 + }, + { + "epoch": 0.3807149354806105, + "grad_norm": 4.370233455711544, + "learning_rate": 7.785425190055719e-07, + "loss": 1.7144, + "step": 5650 + }, + { + "epoch": 0.38105185135271724, + "grad_norm": 5.18211571994226, + "learning_rate": 7.780539780286312e-07, + "loss": 1.6515, + "step": 5655 + }, + { + "epoch": 0.38138876722482395, + "grad_norm": 4.682021085687706, + "learning_rate": 7.77565052445743e-07, + "loss": 1.7568, + "step": 5660 + }, + { + "epoch": 0.3817256830969307, + "grad_norm": 4.383186975516879, + "learning_rate": 7.770757429331919e-07, + "loss": 1.7622, + "step": 5665 + }, + { + "epoch": 0.38206259896903744, + "grad_norm": 4.8086361127286255, + "learning_rate": 7.765860501677939e-07, + "loss": 1.7293, + "step": 5670 + }, + { + "epoch": 0.38239951484114415, + "grad_norm": 4.268414299928615, + "learning_rate": 7.760959748268949e-07, + "loss": 1.668, + "step": 5675 + }, + { + "epoch": 0.3827364307132509, + "grad_norm": 4.499269064286495, + "learning_rate": 7.756055175883701e-07, + "loss": 1.7434, + "step": 5680 + }, + { + "epoch": 0.38307334658535763, + "grad_norm": 4.662888573636476, + "learning_rate": 7.751146791306231e-07, + "loss": 1.7432, + "step": 5685 + }, + { + "epoch": 0.38341026245746435, + "grad_norm": 4.369518419175261, + "learning_rate": 7.746234601325843e-07, + "loss": 1.6939, + "step": 5690 + }, + { + "epoch": 0.3837471783295711, + "grad_norm": 4.659617437659864, + "learning_rate": 7.741318612737111e-07, + "loss": 1.6964, + "step": 5695 + }, + { + "epoch": 0.38408409420167783, + "grad_norm": 4.743139224013069, + "learning_rate": 7.73639883233986e-07, + "loss": 1.6675, + "step": 5700 + }, + { + "epoch": 0.3844210100737846, + "grad_norm": 4.438211753386986, + "learning_rate": 7.731475266939158e-07, + "loss": 1.6696, + "step": 5705 + }, + { + "epoch": 0.3847579259458913, + "grad_norm": 6.7396570413557235, + "learning_rate": 7.726547923345313e-07, + "loss": 1.7069, + "step": 5710 + }, + { + "epoch": 0.385094841817998, + "grad_norm": 4.752177328606303, + "learning_rate": 7.721616808373855e-07, + "loss": 1.7465, + "step": 5715 + }, + { + "epoch": 0.3854317576901048, + "grad_norm": 4.5518253954744035, + "learning_rate": 7.716681928845532e-07, + "loss": 1.6894, + "step": 5720 + }, + { + "epoch": 0.3857686735622115, + "grad_norm": 4.567439842382045, + "learning_rate": 7.711743291586298e-07, + "loss": 1.7141, + "step": 5725 + }, + { + "epoch": 0.3861055894343183, + "grad_norm": 4.459533696503045, + "learning_rate": 7.706800903427309e-07, + "loss": 1.6393, + "step": 5730 + }, + { + "epoch": 0.386442505306425, + "grad_norm": 4.585967312052329, + "learning_rate": 7.701854771204905e-07, + "loss": 1.7233, + "step": 5735 + }, + { + "epoch": 0.3867794211785317, + "grad_norm": 4.572181355645383, + "learning_rate": 7.696904901760606e-07, + "loss": 1.688, + "step": 5740 + }, + { + "epoch": 0.38711633705063847, + "grad_norm": 4.78962813492679, + "learning_rate": 7.691951301941102e-07, + "loss": 1.7155, + "step": 5745 + }, + { + "epoch": 0.3874532529227452, + "grad_norm": 5.055611062311816, + "learning_rate": 7.68699397859824e-07, + "loss": 1.7816, + "step": 5750 + }, + { + "epoch": 0.38779016879485195, + "grad_norm": 4.4705802156633, + "learning_rate": 7.682032938589023e-07, + "loss": 1.7335, + "step": 5755 + }, + { + "epoch": 0.38812708466695867, + "grad_norm": 4.920709221588469, + "learning_rate": 7.677068188775589e-07, + "loss": 1.6501, + "step": 5760 + }, + { + "epoch": 0.3884640005390654, + "grad_norm": 4.36634733581777, + "learning_rate": 7.67209973602521e-07, + "loss": 1.7318, + "step": 5765 + }, + { + "epoch": 0.38880091641117215, + "grad_norm": 4.501042191329174, + "learning_rate": 7.667127587210282e-07, + "loss": 1.7573, + "step": 5770 + }, + { + "epoch": 0.38913783228327886, + "grad_norm": 4.750610976620214, + "learning_rate": 7.66215174920831e-07, + "loss": 1.735, + "step": 5775 + }, + { + "epoch": 0.3894747481553856, + "grad_norm": 4.518360253175234, + "learning_rate": 7.657172228901905e-07, + "loss": 1.6926, + "step": 5780 + }, + { + "epoch": 0.38981166402749234, + "grad_norm": 4.656074322809511, + "learning_rate": 7.652189033178766e-07, + "loss": 1.7086, + "step": 5785 + }, + { + "epoch": 0.39014857989959906, + "grad_norm": 4.808862642475306, + "learning_rate": 7.647202168931683e-07, + "loss": 1.6493, + "step": 5790 + }, + { + "epoch": 0.3904854957717058, + "grad_norm": 4.652819924014255, + "learning_rate": 7.642211643058516e-07, + "loss": 1.6836, + "step": 5795 + }, + { + "epoch": 0.39082241164381254, + "grad_norm": 4.461568567691955, + "learning_rate": 7.637217462462189e-07, + "loss": 1.7185, + "step": 5800 + }, + { + "epoch": 0.39115932751591925, + "grad_norm": 4.605998238250465, + "learning_rate": 7.632219634050685e-07, + "loss": 1.6989, + "step": 5805 + }, + { + "epoch": 0.391496243388026, + "grad_norm": 4.592693407010997, + "learning_rate": 7.62721816473703e-07, + "loss": 1.7713, + "step": 5810 + }, + { + "epoch": 0.39183315926013274, + "grad_norm": 5.078801838131806, + "learning_rate": 7.622213061439287e-07, + "loss": 1.6707, + "step": 5815 + }, + { + "epoch": 0.3921700751322395, + "grad_norm": 5.172560564048931, + "learning_rate": 7.617204331080544e-07, + "loss": 1.6874, + "step": 5820 + }, + { + "epoch": 0.3925069910043462, + "grad_norm": 4.616475259743771, + "learning_rate": 7.612191980588907e-07, + "loss": 1.6954, + "step": 5825 + }, + { + "epoch": 0.39284390687645293, + "grad_norm": 4.547897998439806, + "learning_rate": 7.60717601689749e-07, + "loss": 1.7337, + "step": 5830 + }, + { + "epoch": 0.3931808227485597, + "grad_norm": 4.60176135143651, + "learning_rate": 7.602156446944405e-07, + "loss": 1.7093, + "step": 5835 + }, + { + "epoch": 0.3935177386206664, + "grad_norm": 4.695085433755888, + "learning_rate": 7.597133277672751e-07, + "loss": 1.6814, + "step": 5840 + }, + { + "epoch": 0.3938546544927732, + "grad_norm": 5.077084145839541, + "learning_rate": 7.592106516030607e-07, + "loss": 1.6684, + "step": 5845 + }, + { + "epoch": 0.3941915703648799, + "grad_norm": 4.969131663429761, + "learning_rate": 7.587076168971022e-07, + "loss": 1.6603, + "step": 5850 + }, + { + "epoch": 0.3945284862369866, + "grad_norm": 4.578673498032634, + "learning_rate": 7.582042243451998e-07, + "loss": 1.7228, + "step": 5855 + }, + { + "epoch": 0.3948654021090934, + "grad_norm": 4.866217603578348, + "learning_rate": 7.577004746436494e-07, + "loss": 1.7278, + "step": 5860 + }, + { + "epoch": 0.3952023179812001, + "grad_norm": 4.706464482205939, + "learning_rate": 7.571963684892404e-07, + "loss": 1.7142, + "step": 5865 + }, + { + "epoch": 0.3955392338533068, + "grad_norm": 4.308379972805312, + "learning_rate": 7.566919065792558e-07, + "loss": 1.7033, + "step": 5870 + }, + { + "epoch": 0.3958761497254136, + "grad_norm": 4.853147236613105, + "learning_rate": 7.561870896114704e-07, + "loss": 1.6539, + "step": 5875 + }, + { + "epoch": 0.3962130655975203, + "grad_norm": 4.690507382976398, + "learning_rate": 7.556819182841498e-07, + "loss": 1.6581, + "step": 5880 + }, + { + "epoch": 0.39654998146962706, + "grad_norm": 4.519726029042387, + "learning_rate": 7.551763932960502e-07, + "loss": 1.7522, + "step": 5885 + }, + { + "epoch": 0.39688689734173377, + "grad_norm": 4.533553164902254, + "learning_rate": 7.546705153464168e-07, + "loss": 1.7346, + "step": 5890 + }, + { + "epoch": 0.3972238132138405, + "grad_norm": 4.610001390974794, + "learning_rate": 7.54164285134983e-07, + "loss": 1.6907, + "step": 5895 + }, + { + "epoch": 0.39756072908594725, + "grad_norm": 4.760381077129122, + "learning_rate": 7.536577033619696e-07, + "loss": 1.7315, + "step": 5900 + }, + { + "epoch": 0.39789764495805396, + "grad_norm": 4.629241226901361, + "learning_rate": 7.531507707280836e-07, + "loss": 1.6959, + "step": 5905 + }, + { + "epoch": 0.39823456083016073, + "grad_norm": 4.446115728404233, + "learning_rate": 7.526434879345171e-07, + "loss": 1.7361, + "step": 5910 + }, + { + "epoch": 0.39857147670226745, + "grad_norm": 4.633205448295902, + "learning_rate": 7.521358556829469e-07, + "loss": 1.7537, + "step": 5915 + }, + { + "epoch": 0.39890839257437416, + "grad_norm": 4.670072836193434, + "learning_rate": 7.51627874675533e-07, + "loss": 1.769, + "step": 5920 + }, + { + "epoch": 0.39924530844648093, + "grad_norm": 4.631995984236844, + "learning_rate": 7.511195456149177e-07, + "loss": 1.7361, + "step": 5925 + }, + { + "epoch": 0.39958222431858764, + "grad_norm": 4.396825301474403, + "learning_rate": 7.50610869204225e-07, + "loss": 1.7077, + "step": 5930 + }, + { + "epoch": 0.3999191401906944, + "grad_norm": 4.668120397333451, + "learning_rate": 7.50101846147059e-07, + "loss": 1.8077, + "step": 5935 + }, + { + "epoch": 0.4002560560628011, + "grad_norm": 4.610399362003563, + "learning_rate": 7.495924771475037e-07, + "loss": 1.7378, + "step": 5940 + }, + { + "epoch": 0.40059297193490784, + "grad_norm": 4.662838108941898, + "learning_rate": 7.490827629101211e-07, + "loss": 1.7173, + "step": 5945 + }, + { + "epoch": 0.4009298878070146, + "grad_norm": 4.490648340857639, + "learning_rate": 7.485727041399513e-07, + "loss": 1.671, + "step": 5950 + }, + { + "epoch": 0.4012668036791213, + "grad_norm": 4.506223348196042, + "learning_rate": 7.480623015425105e-07, + "loss": 1.6699, + "step": 5955 + }, + { + "epoch": 0.40160371955122803, + "grad_norm": 4.2928272510154635, + "learning_rate": 7.475515558237909e-07, + "loss": 1.7237, + "step": 5960 + }, + { + "epoch": 0.4019406354233348, + "grad_norm": 4.6387104283175855, + "learning_rate": 7.470404676902587e-07, + "loss": 1.7306, + "step": 5965 + }, + { + "epoch": 0.4022775512954415, + "grad_norm": 4.4761023884421745, + "learning_rate": 7.465290378488544e-07, + "loss": 1.7759, + "step": 5970 + }, + { + "epoch": 0.4026144671675483, + "grad_norm": 4.732318729645672, + "learning_rate": 7.460172670069909e-07, + "loss": 1.739, + "step": 5975 + }, + { + "epoch": 0.402951383039655, + "grad_norm": 4.457319075535821, + "learning_rate": 7.455051558725524e-07, + "loss": 1.7418, + "step": 5980 + }, + { + "epoch": 0.4032882989117617, + "grad_norm": 4.478093757793555, + "learning_rate": 7.449927051538944e-07, + "loss": 1.6646, + "step": 5985 + }, + { + "epoch": 0.4036252147838685, + "grad_norm": 4.71073854533818, + "learning_rate": 7.444799155598419e-07, + "loss": 1.7251, + "step": 5990 + }, + { + "epoch": 0.4039621306559752, + "grad_norm": 4.612275010632581, + "learning_rate": 7.439667877996884e-07, + "loss": 1.6614, + "step": 5995 + }, + { + "epoch": 0.40429904652808196, + "grad_norm": 4.166735332378841, + "learning_rate": 7.434533225831951e-07, + "loss": 1.7487, + "step": 6000 + }, + { + "epoch": 0.4046359624001887, + "grad_norm": 4.437607194642227, + "learning_rate": 7.429395206205908e-07, + "loss": 1.6909, + "step": 6005 + }, + { + "epoch": 0.4049728782722954, + "grad_norm": 4.928632105168086, + "learning_rate": 7.424253826225689e-07, + "loss": 1.6897, + "step": 6010 + }, + { + "epoch": 0.40530979414440216, + "grad_norm": 4.623826941528856, + "learning_rate": 7.419109093002887e-07, + "loss": 1.6674, + "step": 6015 + }, + { + "epoch": 0.40564671001650887, + "grad_norm": 4.754454500032497, + "learning_rate": 7.413961013653725e-07, + "loss": 1.7288, + "step": 6020 + }, + { + "epoch": 0.40598362588861564, + "grad_norm": 4.848985166114699, + "learning_rate": 7.408809595299057e-07, + "loss": 1.6862, + "step": 6025 + }, + { + "epoch": 0.40632054176072235, + "grad_norm": 4.441849353174314, + "learning_rate": 7.403654845064358e-07, + "loss": 1.7385, + "step": 6030 + }, + { + "epoch": 0.40665745763282907, + "grad_norm": 5.058126999436806, + "learning_rate": 7.398496770079709e-07, + "loss": 1.7187, + "step": 6035 + }, + { + "epoch": 0.40699437350493584, + "grad_norm": 4.42595695672934, + "learning_rate": 7.393335377479792e-07, + "loss": 1.7185, + "step": 6040 + }, + { + "epoch": 0.40733128937704255, + "grad_norm": 4.560697874187131, + "learning_rate": 7.388170674403872e-07, + "loss": 1.6642, + "step": 6045 + }, + { + "epoch": 0.40766820524914926, + "grad_norm": 4.931680925309431, + "learning_rate": 7.383002667995804e-07, + "loss": 1.7109, + "step": 6050 + }, + { + "epoch": 0.40800512112125603, + "grad_norm": 4.604546109504864, + "learning_rate": 7.377831365404001e-07, + "loss": 1.6834, + "step": 6055 + }, + { + "epoch": 0.40834203699336274, + "grad_norm": 4.576988553550718, + "learning_rate": 7.372656773781442e-07, + "loss": 1.7549, + "step": 6060 + }, + { + "epoch": 0.4086789528654695, + "grad_norm": 4.46737050763205, + "learning_rate": 7.367478900285654e-07, + "loss": 1.7219, + "step": 6065 + }, + { + "epoch": 0.4090158687375762, + "grad_norm": 4.664858766858329, + "learning_rate": 7.362297752078702e-07, + "loss": 1.6568, + "step": 6070 + }, + { + "epoch": 0.40935278460968294, + "grad_norm": 4.710657927303359, + "learning_rate": 7.357113336327181e-07, + "loss": 1.7278, + "step": 6075 + }, + { + "epoch": 0.4096897004817897, + "grad_norm": 4.597622314315028, + "learning_rate": 7.351925660202207e-07, + "loss": 1.6781, + "step": 6080 + }, + { + "epoch": 0.4100266163538964, + "grad_norm": 4.731937253448, + "learning_rate": 7.346734730879407e-07, + "loss": 1.7386, + "step": 6085 + }, + { + "epoch": 0.4103635322260032, + "grad_norm": 4.532128690172484, + "learning_rate": 7.341540555538902e-07, + "loss": 1.7474, + "step": 6090 + }, + { + "epoch": 0.4107004480981099, + "grad_norm": 5.063100649227562, + "learning_rate": 7.33634314136531e-07, + "loss": 1.7513, + "step": 6095 + }, + { + "epoch": 0.4110373639702166, + "grad_norm": 4.861672431801468, + "learning_rate": 7.331142495547724e-07, + "loss": 1.7346, + "step": 6100 + }, + { + "epoch": 0.4113742798423234, + "grad_norm": 4.760654858558883, + "learning_rate": 7.325938625279709e-07, + "loss": 1.6565, + "step": 6105 + }, + { + "epoch": 0.4117111957144301, + "grad_norm": 4.591183402492795, + "learning_rate": 7.320731537759293e-07, + "loss": 1.7999, + "step": 6110 + }, + { + "epoch": 0.41204811158653687, + "grad_norm": 4.8148792163856, + "learning_rate": 7.315521240188944e-07, + "loss": 1.7689, + "step": 6115 + }, + { + "epoch": 0.4123850274586436, + "grad_norm": 5.225593664936883, + "learning_rate": 7.310307739775585e-07, + "loss": 1.7211, + "step": 6120 + }, + { + "epoch": 0.4127219433307503, + "grad_norm": 4.873654095668805, + "learning_rate": 7.305091043730557e-07, + "loss": 1.761, + "step": 6125 + }, + { + "epoch": 0.41305885920285706, + "grad_norm": 4.790346021865599, + "learning_rate": 7.299871159269626e-07, + "loss": 1.7007, + "step": 6130 + }, + { + "epoch": 0.4133957750749638, + "grad_norm": 4.52525638545263, + "learning_rate": 7.294648093612968e-07, + "loss": 1.7322, + "step": 6135 + }, + { + "epoch": 0.4137326909470705, + "grad_norm": 4.465403456484435, + "learning_rate": 7.28942185398516e-07, + "loss": 1.6736, + "step": 6140 + }, + { + "epoch": 0.41406960681917726, + "grad_norm": 4.68397753694206, + "learning_rate": 7.284192447615168e-07, + "loss": 1.689, + "step": 6145 + }, + { + "epoch": 0.414406522691284, + "grad_norm": 4.5810644765337365, + "learning_rate": 7.278959881736338e-07, + "loss": 1.7383, + "step": 6150 + }, + { + "epoch": 0.41474343856339074, + "grad_norm": 5.0164765638110635, + "learning_rate": 7.273724163586387e-07, + "loss": 1.7011, + "step": 6155 + }, + { + "epoch": 0.41508035443549746, + "grad_norm": 5.086309499155851, + "learning_rate": 7.268485300407394e-07, + "loss": 1.7185, + "step": 6160 + }, + { + "epoch": 0.41541727030760417, + "grad_norm": 3.905456024402334, + "learning_rate": 7.263243299445783e-07, + "loss": 1.7467, + "step": 6165 + }, + { + "epoch": 0.41575418617971094, + "grad_norm": 4.934234757320097, + "learning_rate": 7.257998167952322e-07, + "loss": 1.6398, + "step": 6170 + }, + { + "epoch": 0.41609110205181765, + "grad_norm": 4.438570362494904, + "learning_rate": 7.25274991318211e-07, + "loss": 1.6563, + "step": 6175 + }, + { + "epoch": 0.4164280179239244, + "grad_norm": 4.752967431338472, + "learning_rate": 7.247498542394566e-07, + "loss": 1.7653, + "step": 6180 + }, + { + "epoch": 0.41676493379603113, + "grad_norm": 4.518082197921104, + "learning_rate": 7.242244062853416e-07, + "loss": 1.7131, + "step": 6185 + }, + { + "epoch": 0.41710184966813785, + "grad_norm": 4.581616402189418, + "learning_rate": 7.236986481826688e-07, + "loss": 1.7834, + "step": 6190 + }, + { + "epoch": 0.4174387655402446, + "grad_norm": 4.220211283032085, + "learning_rate": 7.231725806586699e-07, + "loss": 1.6772, + "step": 6195 + }, + { + "epoch": 0.41777568141235133, + "grad_norm": 5.213439810668329, + "learning_rate": 7.22646204441005e-07, + "loss": 1.7462, + "step": 6200 + }, + { + "epoch": 0.4181125972844581, + "grad_norm": 4.329098312504522, + "learning_rate": 7.221195202577606e-07, + "loss": 1.713, + "step": 6205 + }, + { + "epoch": 0.4184495131565648, + "grad_norm": 4.80682214677469, + "learning_rate": 7.215925288374496e-07, + "loss": 1.7204, + "step": 6210 + }, + { + "epoch": 0.4187864290286715, + "grad_norm": 4.914051282059273, + "learning_rate": 7.210652309090098e-07, + "loss": 1.7357, + "step": 6215 + }, + { + "epoch": 0.4191233449007783, + "grad_norm": 4.845584202761493, + "learning_rate": 7.205376272018025e-07, + "loss": 1.7637, + "step": 6220 + }, + { + "epoch": 0.419460260772885, + "grad_norm": 4.643682497538772, + "learning_rate": 7.200097184456128e-07, + "loss": 1.7463, + "step": 6225 + }, + { + "epoch": 0.4197971766449917, + "grad_norm": 4.54581096465623, + "learning_rate": 7.19481505370647e-07, + "loss": 1.7802, + "step": 6230 + }, + { + "epoch": 0.4201340925170985, + "grad_norm": 4.745316630456448, + "learning_rate": 7.189529887075327e-07, + "loss": 1.7233, + "step": 6235 + }, + { + "epoch": 0.4204710083892052, + "grad_norm": 4.574264194845761, + "learning_rate": 7.184241691873174e-07, + "loss": 1.7399, + "step": 6240 + }, + { + "epoch": 0.42080792426131197, + "grad_norm": 4.473298847625997, + "learning_rate": 7.178950475414675e-07, + "loss": 1.7663, + "step": 6245 + }, + { + "epoch": 0.4211448401334187, + "grad_norm": 4.791887801785815, + "learning_rate": 7.173656245018671e-07, + "loss": 1.7203, + "step": 6250 + }, + { + "epoch": 0.4214817560055254, + "grad_norm": 4.4061237952087415, + "learning_rate": 7.168359008008177e-07, + "loss": 1.7423, + "step": 6255 + }, + { + "epoch": 0.42181867187763217, + "grad_norm": 4.862344084483968, + "learning_rate": 7.163058771710358e-07, + "loss": 1.6863, + "step": 6260 + }, + { + "epoch": 0.4221555877497389, + "grad_norm": 4.4568155120703326, + "learning_rate": 7.157755543456539e-07, + "loss": 1.7336, + "step": 6265 + }, + { + "epoch": 0.42249250362184565, + "grad_norm": 4.681291018047077, + "learning_rate": 7.152449330582173e-07, + "loss": 1.7215, + "step": 6270 + }, + { + "epoch": 0.42282941949395236, + "grad_norm": 4.665256916999848, + "learning_rate": 7.147140140426848e-07, + "loss": 1.7557, + "step": 6275 + }, + { + "epoch": 0.4231663353660591, + "grad_norm": 4.961134710940176, + "learning_rate": 7.141827980334265e-07, + "loss": 1.6411, + "step": 6280 + }, + { + "epoch": 0.42350325123816585, + "grad_norm": 4.371043870195574, + "learning_rate": 7.136512857652239e-07, + "loss": 1.7018, + "step": 6285 + }, + { + "epoch": 0.42384016711027256, + "grad_norm": 4.8217954723863174, + "learning_rate": 7.131194779732681e-07, + "loss": 1.7602, + "step": 6290 + }, + { + "epoch": 0.4241770829823793, + "grad_norm": 4.613353819208612, + "learning_rate": 7.125873753931586e-07, + "loss": 1.6274, + "step": 6295 + }, + { + "epoch": 0.42451399885448604, + "grad_norm": 4.945775372506359, + "learning_rate": 7.120549787609029e-07, + "loss": 1.6926, + "step": 6300 + }, + { + "epoch": 0.42485091472659275, + "grad_norm": 4.535099056814416, + "learning_rate": 7.115222888129156e-07, + "loss": 1.7193, + "step": 6305 + }, + { + "epoch": 0.4251878305986995, + "grad_norm": 4.593782020802655, + "learning_rate": 7.109893062860161e-07, + "loss": 1.6644, + "step": 6310 + }, + { + "epoch": 0.42552474647080624, + "grad_norm": 4.674544728983331, + "learning_rate": 7.104560319174296e-07, + "loss": 1.7831, + "step": 6315 + }, + { + "epoch": 0.42586166234291295, + "grad_norm": 4.335763702766309, + "learning_rate": 7.099224664447841e-07, + "loss": 1.7124, + "step": 6320 + }, + { + "epoch": 0.4261985782150197, + "grad_norm": 5.061683568780474, + "learning_rate": 7.093886106061106e-07, + "loss": 1.7773, + "step": 6325 + }, + { + "epoch": 0.42653549408712643, + "grad_norm": 4.67914727019353, + "learning_rate": 7.088544651398421e-07, + "loss": 1.7213, + "step": 6330 + }, + { + "epoch": 0.4268724099592332, + "grad_norm": 4.722921833002874, + "learning_rate": 7.083200307848115e-07, + "loss": 1.7383, + "step": 6335 + }, + { + "epoch": 0.4272093258313399, + "grad_norm": 4.585454080536829, + "learning_rate": 7.077853082802516e-07, + "loss": 1.755, + "step": 6340 + }, + { + "epoch": 0.42754624170344663, + "grad_norm": 4.68463042835788, + "learning_rate": 7.072502983657939e-07, + "loss": 1.6551, + "step": 6345 + }, + { + "epoch": 0.4278831575755534, + "grad_norm": 4.6391044127251995, + "learning_rate": 7.067150017814676e-07, + "loss": 1.7024, + "step": 6350 + }, + { + "epoch": 0.4282200734476601, + "grad_norm": 4.824270442615483, + "learning_rate": 7.061794192676979e-07, + "loss": 1.7594, + "step": 6355 + }, + { + "epoch": 0.4285569893197669, + "grad_norm": 4.745912639857761, + "learning_rate": 7.056435515653058e-07, + "loss": 1.7587, + "step": 6360 + }, + { + "epoch": 0.4288939051918736, + "grad_norm": 4.5638877301431, + "learning_rate": 7.051073994155068e-07, + "loss": 1.7336, + "step": 6365 + }, + { + "epoch": 0.4292308210639803, + "grad_norm": 4.842666081654743, + "learning_rate": 7.045709635599098e-07, + "loss": 1.7183, + "step": 6370 + }, + { + "epoch": 0.4295677369360871, + "grad_norm": 5.268210461664882, + "learning_rate": 7.040342447405161e-07, + "loss": 1.6839, + "step": 6375 + }, + { + "epoch": 0.4299046528081938, + "grad_norm": 4.663730097112814, + "learning_rate": 7.034972436997184e-07, + "loss": 1.6709, + "step": 6380 + }, + { + "epoch": 0.4302415686803005, + "grad_norm": 4.389744444342654, + "learning_rate": 7.029599611803e-07, + "loss": 1.7155, + "step": 6385 + }, + { + "epoch": 0.43057848455240727, + "grad_norm": 4.698100060875378, + "learning_rate": 7.024223979254331e-07, + "loss": 1.7587, + "step": 6390 + }, + { + "epoch": 0.430915400424514, + "grad_norm": 4.846149276102287, + "learning_rate": 7.018845546786787e-07, + "loss": 1.6894, + "step": 6395 + }, + { + "epoch": 0.43125231629662075, + "grad_norm": 4.697432621029227, + "learning_rate": 7.013464321839845e-07, + "loss": 1.6051, + "step": 6400 + }, + { + "epoch": 0.43158923216872747, + "grad_norm": 4.405268539181798, + "learning_rate": 7.00808031185685e-07, + "loss": 1.6693, + "step": 6405 + }, + { + "epoch": 0.4319261480408342, + "grad_norm": 4.647272804848751, + "learning_rate": 7.002693524284997e-07, + "loss": 1.6855, + "step": 6410 + }, + { + "epoch": 0.43226306391294095, + "grad_norm": 4.751884151380426, + "learning_rate": 6.997303966575322e-07, + "loss": 1.7463, + "step": 6415 + }, + { + "epoch": 0.43259997978504766, + "grad_norm": 4.829801676870609, + "learning_rate": 6.991911646182696e-07, + "loss": 1.7059, + "step": 6420 + }, + { + "epoch": 0.43293689565715443, + "grad_norm": 5.165031597531048, + "learning_rate": 6.986516570565809e-07, + "loss": 1.8163, + "step": 6425 + }, + { + "epoch": 0.43327381152926114, + "grad_norm": 4.400940338706048, + "learning_rate": 6.981118747187163e-07, + "loss": 1.6662, + "step": 6430 + }, + { + "epoch": 0.43361072740136786, + "grad_norm": 4.232144300389157, + "learning_rate": 6.975718183513056e-07, + "loss": 1.6838, + "step": 6435 + }, + { + "epoch": 0.4339476432734746, + "grad_norm": 4.88653823297036, + "learning_rate": 6.970314887013585e-07, + "loss": 1.6594, + "step": 6440 + }, + { + "epoch": 0.43428455914558134, + "grad_norm": 4.673060912673942, + "learning_rate": 6.964908865162617e-07, + "loss": 1.7143, + "step": 6445 + }, + { + "epoch": 0.4346214750176881, + "grad_norm": 5.075954008879407, + "learning_rate": 6.959500125437801e-07, + "loss": 1.7682, + "step": 6450 + }, + { + "epoch": 0.4349583908897948, + "grad_norm": 4.652966290889153, + "learning_rate": 6.954088675320534e-07, + "loss": 1.6783, + "step": 6455 + }, + { + "epoch": 0.43529530676190153, + "grad_norm": 4.460818120238903, + "learning_rate": 6.948674522295969e-07, + "loss": 1.7432, + "step": 6460 + }, + { + "epoch": 0.4356322226340083, + "grad_norm": 4.5582353433289695, + "learning_rate": 6.943257673852993e-07, + "loss": 1.6726, + "step": 6465 + }, + { + "epoch": 0.435969138506115, + "grad_norm": 5.079974355852745, + "learning_rate": 6.937838137484225e-07, + "loss": 1.7775, + "step": 6470 + }, + { + "epoch": 0.43630605437822173, + "grad_norm": 5.003592553951793, + "learning_rate": 6.932415920686001e-07, + "loss": 1.6158, + "step": 6475 + }, + { + "epoch": 0.4366429702503285, + "grad_norm": 4.383664043122529, + "learning_rate": 6.926991030958362e-07, + "loss": 1.7012, + "step": 6480 + }, + { + "epoch": 0.4369798861224352, + "grad_norm": 4.937261191482905, + "learning_rate": 6.921563475805051e-07, + "loss": 1.6558, + "step": 6485 + }, + { + "epoch": 0.437316801994542, + "grad_norm": 4.544148828349001, + "learning_rate": 6.916133262733493e-07, + "loss": 1.6468, + "step": 6490 + }, + { + "epoch": 0.4376537178666487, + "grad_norm": 4.984524384019845, + "learning_rate": 6.910700399254793e-07, + "loss": 1.7185, + "step": 6495 + }, + { + "epoch": 0.4379906337387554, + "grad_norm": 4.292729528464248, + "learning_rate": 6.905264892883721e-07, + "loss": 1.6787, + "step": 6500 + }, + { + "epoch": 0.4383275496108622, + "grad_norm": 4.11769249289178, + "learning_rate": 6.899826751138701e-07, + "loss": 1.6883, + "step": 6505 + }, + { + "epoch": 0.4386644654829689, + "grad_norm": 5.018149594762948, + "learning_rate": 6.894385981541804e-07, + "loss": 1.7168, + "step": 6510 + }, + { + "epoch": 0.43900138135507566, + "grad_norm": 4.676645625070917, + "learning_rate": 6.888942591618736e-07, + "loss": 1.6919, + "step": 6515 + }, + { + "epoch": 0.4393382972271824, + "grad_norm": 4.921357277560915, + "learning_rate": 6.883496588898827e-07, + "loss": 1.6816, + "step": 6520 + }, + { + "epoch": 0.4396752130992891, + "grad_norm": 4.9134861344297605, + "learning_rate": 6.87804798091502e-07, + "loss": 1.7108, + "step": 6525 + }, + { + "epoch": 0.44001212897139586, + "grad_norm": 4.708196322941507, + "learning_rate": 6.872596775203864e-07, + "loss": 1.7465, + "step": 6530 + }, + { + "epoch": 0.44034904484350257, + "grad_norm": 4.510797543770323, + "learning_rate": 6.867142979305498e-07, + "loss": 1.6689, + "step": 6535 + }, + { + "epoch": 0.44068596071560934, + "grad_norm": 4.522970178745911, + "learning_rate": 6.861686600763648e-07, + "loss": 1.7117, + "step": 6540 + }, + { + "epoch": 0.44102287658771605, + "grad_norm": 4.812495045573041, + "learning_rate": 6.856227647125607e-07, + "loss": 1.7273, + "step": 6545 + }, + { + "epoch": 0.44135979245982276, + "grad_norm": 4.846794241054585, + "learning_rate": 6.850766125942235e-07, + "loss": 1.7438, + "step": 6550 + }, + { + "epoch": 0.44169670833192953, + "grad_norm": 4.8551636572445265, + "learning_rate": 6.84530204476794e-07, + "loss": 1.7021, + "step": 6555 + }, + { + "epoch": 0.44203362420403625, + "grad_norm": 4.651962161849585, + "learning_rate": 6.839835411160673e-07, + "loss": 1.743, + "step": 6560 + }, + { + "epoch": 0.44237054007614296, + "grad_norm": 4.39048228251673, + "learning_rate": 6.834366232681915e-07, + "loss": 1.6973, + "step": 6565 + }, + { + "epoch": 0.44270745594824973, + "grad_norm": 4.530452505592518, + "learning_rate": 6.828894516896664e-07, + "loss": 1.6399, + "step": 6570 + }, + { + "epoch": 0.44304437182035644, + "grad_norm": 4.824517867034706, + "learning_rate": 6.823420271373433e-07, + "loss": 1.7254, + "step": 6575 + }, + { + "epoch": 0.4433812876924632, + "grad_norm": 5.0237910952503775, + "learning_rate": 6.817943503684232e-07, + "loss": 1.7502, + "step": 6580 + }, + { + "epoch": 0.4437182035645699, + "grad_norm": 4.262745340509699, + "learning_rate": 6.812464221404558e-07, + "loss": 1.6935, + "step": 6585 + }, + { + "epoch": 0.44405511943667664, + "grad_norm": 4.321290314072325, + "learning_rate": 6.806982432113388e-07, + "loss": 1.7191, + "step": 6590 + }, + { + "epoch": 0.4443920353087834, + "grad_norm": 4.424079719502189, + "learning_rate": 6.801498143393168e-07, + "loss": 1.719, + "step": 6595 + }, + { + "epoch": 0.4447289511808901, + "grad_norm": 4.7796436067671575, + "learning_rate": 6.796011362829794e-07, + "loss": 1.7639, + "step": 6600 + }, + { + "epoch": 0.4450658670529969, + "grad_norm": 4.834184053342531, + "learning_rate": 6.790522098012621e-07, + "loss": 1.6925, + "step": 6605 + }, + { + "epoch": 0.4454027829251036, + "grad_norm": 4.8753621292969305, + "learning_rate": 6.785030356534428e-07, + "loss": 1.7009, + "step": 6610 + }, + { + "epoch": 0.4457396987972103, + "grad_norm": 5.00324934803504, + "learning_rate": 6.779536145991427e-07, + "loss": 1.7248, + "step": 6615 + }, + { + "epoch": 0.4460766146693171, + "grad_norm": 4.888331632690152, + "learning_rate": 6.774039473983243e-07, + "loss": 1.6402, + "step": 6620 + }, + { + "epoch": 0.4464135305414238, + "grad_norm": 4.648432475886737, + "learning_rate": 6.768540348112906e-07, + "loss": 1.7037, + "step": 6625 + }, + { + "epoch": 0.44675044641353057, + "grad_norm": 4.413897273595553, + "learning_rate": 6.763038775986842e-07, + "loss": 1.7358, + "step": 6630 + }, + { + "epoch": 0.4470873622856373, + "grad_norm": 4.429203314804429, + "learning_rate": 6.757534765214858e-07, + "loss": 1.7404, + "step": 6635 + }, + { + "epoch": 0.447424278157744, + "grad_norm": 4.931142537284251, + "learning_rate": 6.752028323410134e-07, + "loss": 1.652, + "step": 6640 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 4.500035301333365, + "learning_rate": 6.746519458189214e-07, + "loss": 1.7007, + "step": 6645 + }, + { + "epoch": 0.4480981099019575, + "grad_norm": 4.260996915055294, + "learning_rate": 6.741008177171993e-07, + "loss": 1.7038, + "step": 6650 + }, + { + "epoch": 0.4484350257740642, + "grad_norm": 4.390040709591359, + "learning_rate": 6.735494487981711e-07, + "loss": 1.6483, + "step": 6655 + }, + { + "epoch": 0.44877194164617096, + "grad_norm": 4.197183115180016, + "learning_rate": 6.729978398244935e-07, + "loss": 1.6631, + "step": 6660 + }, + { + "epoch": 0.44910885751827767, + "grad_norm": 4.554047787891987, + "learning_rate": 6.724459915591551e-07, + "loss": 1.7313, + "step": 6665 + }, + { + "epoch": 0.44944577339038444, + "grad_norm": 4.359809841144607, + "learning_rate": 6.718939047654763e-07, + "loss": 1.7046, + "step": 6670 + }, + { + "epoch": 0.44978268926249115, + "grad_norm": 4.529598264112517, + "learning_rate": 6.713415802071064e-07, + "loss": 1.7339, + "step": 6675 + }, + { + "epoch": 0.45011960513459787, + "grad_norm": 4.422320100652781, + "learning_rate": 6.707890186480244e-07, + "loss": 1.7212, + "step": 6680 + }, + { + "epoch": 0.45045652100670464, + "grad_norm": 4.461507269286107, + "learning_rate": 6.702362208525366e-07, + "loss": 1.6548, + "step": 6685 + }, + { + "epoch": 0.45079343687881135, + "grad_norm": 4.240905367444598, + "learning_rate": 6.696831875852763e-07, + "loss": 1.6188, + "step": 6690 + }, + { + "epoch": 0.4511303527509181, + "grad_norm": 4.2509022444509545, + "learning_rate": 6.691299196112025e-07, + "loss": 1.7538, + "step": 6695 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 4.572894242799344, + "learning_rate": 6.685764176955991e-07, + "loss": 1.6576, + "step": 6700 + }, + { + "epoch": 0.45180418449513154, + "grad_norm": 4.6272307826667, + "learning_rate": 6.680226826040727e-07, + "loss": 1.7617, + "step": 6705 + }, + { + "epoch": 0.4521411003672383, + "grad_norm": 4.766793229835689, + "learning_rate": 6.674687151025535e-07, + "loss": 1.6255, + "step": 6710 + }, + { + "epoch": 0.452478016239345, + "grad_norm": 4.144650326322135, + "learning_rate": 6.669145159572924e-07, + "loss": 1.699, + "step": 6715 + }, + { + "epoch": 0.4528149321114518, + "grad_norm": 4.15141194465061, + "learning_rate": 6.663600859348615e-07, + "loss": 1.6959, + "step": 6720 + }, + { + "epoch": 0.4531518479835585, + "grad_norm": 4.311310984212423, + "learning_rate": 6.658054258021513e-07, + "loss": 1.7403, + "step": 6725 + }, + { + "epoch": 0.4534887638556652, + "grad_norm": 4.10292672099467, + "learning_rate": 6.652505363263712e-07, + "loss": 1.7093, + "step": 6730 + }, + { + "epoch": 0.453825679727772, + "grad_norm": 4.669724380678364, + "learning_rate": 6.646954182750478e-07, + "loss": 1.6632, + "step": 6735 + }, + { + "epoch": 0.4541625955998787, + "grad_norm": 4.460778912261257, + "learning_rate": 6.641400724160234e-07, + "loss": 1.6495, + "step": 6740 + }, + { + "epoch": 0.4544995114719854, + "grad_norm": 4.2960984447404025, + "learning_rate": 6.635844995174561e-07, + "loss": 1.7223, + "step": 6745 + }, + { + "epoch": 0.4548364273440922, + "grad_norm": 4.740934736927874, + "learning_rate": 6.630287003478176e-07, + "loss": 1.7362, + "step": 6750 + }, + { + "epoch": 0.4551733432161989, + "grad_norm": 4.5375663036291, + "learning_rate": 6.624726756758927e-07, + "loss": 1.625, + "step": 6755 + }, + { + "epoch": 0.45551025908830567, + "grad_norm": 4.837729915756388, + "learning_rate": 6.619164262707782e-07, + "loss": 1.7317, + "step": 6760 + }, + { + "epoch": 0.4558471749604124, + "grad_norm": 4.47076219383477, + "learning_rate": 6.613599529018815e-07, + "loss": 1.6642, + "step": 6765 + }, + { + "epoch": 0.4561840908325191, + "grad_norm": 4.5442838582563025, + "learning_rate": 6.608032563389198e-07, + "loss": 1.6893, + "step": 6770 + }, + { + "epoch": 0.45652100670462586, + "grad_norm": 4.480466187001192, + "learning_rate": 6.602463373519196e-07, + "loss": 1.6984, + "step": 6775 + }, + { + "epoch": 0.4568579225767326, + "grad_norm": 4.401722232385289, + "learning_rate": 6.596891967112143e-07, + "loss": 1.67, + "step": 6780 + }, + { + "epoch": 0.45719483844883935, + "grad_norm": 4.122189683257488, + "learning_rate": 6.59131835187444e-07, + "loss": 1.6179, + "step": 6785 + }, + { + "epoch": 0.45753175432094606, + "grad_norm": 4.893513098192096, + "learning_rate": 6.58574253551555e-07, + "loss": 1.7089, + "step": 6790 + }, + { + "epoch": 0.4578686701930528, + "grad_norm": 5.100913785880151, + "learning_rate": 6.580164525747973e-07, + "loss": 1.6715, + "step": 6795 + }, + { + "epoch": 0.45820558606515954, + "grad_norm": 4.756979979355757, + "learning_rate": 6.574584330287247e-07, + "loss": 1.7278, + "step": 6800 + }, + { + "epoch": 0.45854250193726626, + "grad_norm": 4.3372744318168115, + "learning_rate": 6.569001956851932e-07, + "loss": 1.734, + "step": 6805 + }, + { + "epoch": 0.458879417809373, + "grad_norm": 4.170241900109172, + "learning_rate": 6.563417413163601e-07, + "loss": 1.7156, + "step": 6810 + }, + { + "epoch": 0.45921633368147974, + "grad_norm": 4.411221588815247, + "learning_rate": 6.55783070694683e-07, + "loss": 1.7191, + "step": 6815 + }, + { + "epoch": 0.45955324955358645, + "grad_norm": 4.373715138284619, + "learning_rate": 6.55224184592918e-07, + "loss": 1.7069, + "step": 6820 + }, + { + "epoch": 0.4598901654256932, + "grad_norm": 4.464547670300906, + "learning_rate": 6.546650837841203e-07, + "loss": 1.7035, + "step": 6825 + }, + { + "epoch": 0.46022708129779993, + "grad_norm": 4.717468251452631, + "learning_rate": 6.541057690416414e-07, + "loss": 1.6196, + "step": 6830 + }, + { + "epoch": 0.46056399716990665, + "grad_norm": 4.841886356211968, + "learning_rate": 6.535462411391284e-07, + "loss": 1.6812, + "step": 6835 + }, + { + "epoch": 0.4609009130420134, + "grad_norm": 4.34564642283158, + "learning_rate": 6.529865008505244e-07, + "loss": 1.6883, + "step": 6840 + }, + { + "epoch": 0.46123782891412013, + "grad_norm": 5.414010936464202, + "learning_rate": 6.524265489500651e-07, + "loss": 1.7524, + "step": 6845 + }, + { + "epoch": 0.4615747447862269, + "grad_norm": 4.5914727683812995, + "learning_rate": 6.518663862122794e-07, + "loss": 1.6202, + "step": 6850 + }, + { + "epoch": 0.4619116606583336, + "grad_norm": 4.375868168460969, + "learning_rate": 6.513060134119878e-07, + "loss": 1.6169, + "step": 6855 + }, + { + "epoch": 0.4622485765304403, + "grad_norm": 4.441564082721185, + "learning_rate": 6.507454313243015e-07, + "loss": 1.6837, + "step": 6860 + }, + { + "epoch": 0.4625854924025471, + "grad_norm": 4.472804618626952, + "learning_rate": 6.50184640724621e-07, + "loss": 1.7191, + "step": 6865 + }, + { + "epoch": 0.4629224082746538, + "grad_norm": 4.819571378275609, + "learning_rate": 6.496236423886351e-07, + "loss": 1.7436, + "step": 6870 + }, + { + "epoch": 0.4632593241467606, + "grad_norm": 4.511185185898374, + "learning_rate": 6.490624370923201e-07, + "loss": 1.6813, + "step": 6875 + }, + { + "epoch": 0.4635962400188673, + "grad_norm": 4.474284380549234, + "learning_rate": 6.485010256119388e-07, + "loss": 1.6681, + "step": 6880 + }, + { + "epoch": 0.463933155890974, + "grad_norm": 4.46092530350263, + "learning_rate": 6.479394087240389e-07, + "loss": 1.7347, + "step": 6885 + }, + { + "epoch": 0.46427007176308077, + "grad_norm": 5.22492163790687, + "learning_rate": 6.473775872054521e-07, + "loss": 1.763, + "step": 6890 + }, + { + "epoch": 0.4646069876351875, + "grad_norm": 4.797720542641283, + "learning_rate": 6.468155618332936e-07, + "loss": 1.6729, + "step": 6895 + }, + { + "epoch": 0.46494390350729425, + "grad_norm": 4.562185869700717, + "learning_rate": 6.462533333849599e-07, + "loss": 1.7049, + "step": 6900 + }, + { + "epoch": 0.46528081937940097, + "grad_norm": 4.29159762801429, + "learning_rate": 6.456909026381292e-07, + "loss": 1.668, + "step": 6905 + }, + { + "epoch": 0.4656177352515077, + "grad_norm": 4.92003769196269, + "learning_rate": 6.451282703707591e-07, + "loss": 1.6669, + "step": 6910 + }, + { + "epoch": 0.46595465112361445, + "grad_norm": 5.059190111464898, + "learning_rate": 6.445654373610854e-07, + "loss": 1.7021, + "step": 6915 + }, + { + "epoch": 0.46629156699572116, + "grad_norm": 4.653309079648875, + "learning_rate": 6.440024043876229e-07, + "loss": 1.7259, + "step": 6920 + }, + { + "epoch": 0.4666284828678279, + "grad_norm": 4.274946723216935, + "learning_rate": 6.434391722291618e-07, + "loss": 1.7479, + "step": 6925 + }, + { + "epoch": 0.46696539873993465, + "grad_norm": 4.808800818933682, + "learning_rate": 6.428757416647683e-07, + "loss": 1.6954, + "step": 6930 + }, + { + "epoch": 0.46730231461204136, + "grad_norm": 5.013223950151504, + "learning_rate": 6.42312113473783e-07, + "loss": 1.6911, + "step": 6935 + }, + { + "epoch": 0.4676392304841481, + "grad_norm": 4.476533094171933, + "learning_rate": 6.417482884358196e-07, + "loss": 1.6955, + "step": 6940 + }, + { + "epoch": 0.46797614635625484, + "grad_norm": 4.28394035440652, + "learning_rate": 6.411842673307648e-07, + "loss": 1.7042, + "step": 6945 + }, + { + "epoch": 0.46831306222836155, + "grad_norm": 4.698352832884206, + "learning_rate": 6.406200509387756e-07, + "loss": 1.6375, + "step": 6950 + }, + { + "epoch": 0.4686499781004683, + "grad_norm": 4.794921943207182, + "learning_rate": 6.400556400402796e-07, + "loss": 1.7587, + "step": 6955 + }, + { + "epoch": 0.46898689397257504, + "grad_norm": 4.662930706736691, + "learning_rate": 6.394910354159736e-07, + "loss": 1.6615, + "step": 6960 + }, + { + "epoch": 0.4693238098446818, + "grad_norm": 4.693125552744149, + "learning_rate": 6.389262378468219e-07, + "loss": 1.7346, + "step": 6965 + }, + { + "epoch": 0.4696607257167885, + "grad_norm": 4.3614045657280744, + "learning_rate": 6.38361248114056e-07, + "loss": 1.6847, + "step": 6970 + }, + { + "epoch": 0.46999764158889523, + "grad_norm": 4.471600545769862, + "learning_rate": 6.377960669991733e-07, + "loss": 1.8017, + "step": 6975 + }, + { + "epoch": 0.470334557461002, + "grad_norm": 4.964624731566448, + "learning_rate": 6.372306952839353e-07, + "loss": 1.7063, + "step": 6980 + }, + { + "epoch": 0.4706714733331087, + "grad_norm": 4.616530701520827, + "learning_rate": 6.36665133750368e-07, + "loss": 1.7691, + "step": 6985 + }, + { + "epoch": 0.4710083892052155, + "grad_norm": 4.712769110974502, + "learning_rate": 6.360993831807593e-07, + "loss": 1.6286, + "step": 6990 + }, + { + "epoch": 0.4713453050773222, + "grad_norm": 4.601223725894037, + "learning_rate": 6.355334443576589e-07, + "loss": 1.7151, + "step": 6995 + }, + { + "epoch": 0.4716822209494289, + "grad_norm": 4.451414543137239, + "learning_rate": 6.349673180638769e-07, + "loss": 1.6681, + "step": 7000 + }, + { + "epoch": 0.4720191368215357, + "grad_norm": 5.152496315584951, + "learning_rate": 6.344010050824824e-07, + "loss": 1.7217, + "step": 7005 + }, + { + "epoch": 0.4723560526936424, + "grad_norm": 4.3986744071785004, + "learning_rate": 6.338345061968032e-07, + "loss": 1.7014, + "step": 7010 + }, + { + "epoch": 0.4726929685657491, + "grad_norm": 4.3217833852883345, + "learning_rate": 6.33267822190424e-07, + "loss": 1.6387, + "step": 7015 + }, + { + "epoch": 0.4730298844378559, + "grad_norm": 4.400063987546612, + "learning_rate": 6.327009538471853e-07, + "loss": 1.7177, + "step": 7020 + }, + { + "epoch": 0.4733668003099626, + "grad_norm": 4.616172815467751, + "learning_rate": 6.321339019511828e-07, + "loss": 1.6774, + "step": 7025 + }, + { + "epoch": 0.47370371618206936, + "grad_norm": 4.541432828621094, + "learning_rate": 6.315666672867664e-07, + "loss": 1.726, + "step": 7030 + }, + { + "epoch": 0.47404063205417607, + "grad_norm": 4.688208120332176, + "learning_rate": 6.309992506385385e-07, + "loss": 1.6291, + "step": 7035 + }, + { + "epoch": 0.4743775479262828, + "grad_norm": 4.6366612851099, + "learning_rate": 6.304316527913531e-07, + "loss": 1.6624, + "step": 7040 + }, + { + "epoch": 0.47471446379838955, + "grad_norm": 4.691085653627631, + "learning_rate": 6.29863874530315e-07, + "loss": 1.6532, + "step": 7045 + }, + { + "epoch": 0.47505137967049627, + "grad_norm": 4.7136046478678715, + "learning_rate": 6.292959166407785e-07, + "loss": 1.7466, + "step": 7050 + }, + { + "epoch": 0.47538829554260303, + "grad_norm": 4.9212482928274, + "learning_rate": 6.287277799083466e-07, + "loss": 1.687, + "step": 7055 + }, + { + "epoch": 0.47572521141470975, + "grad_norm": 4.526217309277262, + "learning_rate": 6.281594651188693e-07, + "loss": 1.6252, + "step": 7060 + }, + { + "epoch": 0.47606212728681646, + "grad_norm": 4.396351782230909, + "learning_rate": 6.275909730584431e-07, + "loss": 1.6776, + "step": 7065 + }, + { + "epoch": 0.47639904315892323, + "grad_norm": 4.51842155556115, + "learning_rate": 6.270223045134095e-07, + "loss": 1.7123, + "step": 7070 + }, + { + "epoch": 0.47673595903102994, + "grad_norm": 4.759606117812422, + "learning_rate": 6.264534602703546e-07, + "loss": 1.6613, + "step": 7075 + }, + { + "epoch": 0.4770728749031367, + "grad_norm": 4.707001993687859, + "learning_rate": 6.25884441116107e-07, + "loss": 1.733, + "step": 7080 + }, + { + "epoch": 0.4774097907752434, + "grad_norm": 4.467447574118067, + "learning_rate": 6.253152478377375e-07, + "loss": 1.6931, + "step": 7085 + }, + { + "epoch": 0.47774670664735014, + "grad_norm": 4.611033578404069, + "learning_rate": 6.247458812225576e-07, + "loss": 1.7302, + "step": 7090 + }, + { + "epoch": 0.4780836225194569, + "grad_norm": 4.4569214323453155, + "learning_rate": 6.241763420581188e-07, + "loss": 1.7401, + "step": 7095 + }, + { + "epoch": 0.4784205383915636, + "grad_norm": 4.982762182500195, + "learning_rate": 6.23606631132211e-07, + "loss": 1.6746, + "step": 7100 + }, + { + "epoch": 0.47875745426367033, + "grad_norm": 4.607941073155509, + "learning_rate": 6.23036749232862e-07, + "loss": 1.7207, + "step": 7105 + }, + { + "epoch": 0.4790943701357771, + "grad_norm": 4.887796210429232, + "learning_rate": 6.224666971483355e-07, + "loss": 1.6298, + "step": 7110 + }, + { + "epoch": 0.4794312860078838, + "grad_norm": 4.712956172096251, + "learning_rate": 6.218964756671315e-07, + "loss": 1.6563, + "step": 7115 + }, + { + "epoch": 0.4797682018799906, + "grad_norm": 4.778565383140019, + "learning_rate": 6.213260855779834e-07, + "loss": 1.6974, + "step": 7120 + }, + { + "epoch": 0.4801051177520973, + "grad_norm": 4.40756947233122, + "learning_rate": 6.207555276698584e-07, + "loss": 1.7371, + "step": 7125 + }, + { + "epoch": 0.480442033624204, + "grad_norm": 4.631757813103534, + "learning_rate": 6.201848027319556e-07, + "loss": 1.7037, + "step": 7130 + }, + { + "epoch": 0.4807789494963108, + "grad_norm": 4.520106006973044, + "learning_rate": 6.196139115537054e-07, + "loss": 1.6721, + "step": 7135 + }, + { + "epoch": 0.4811158653684175, + "grad_norm": 4.778967002753012, + "learning_rate": 6.190428549247677e-07, + "loss": 1.7327, + "step": 7140 + }, + { + "epoch": 0.48145278124052426, + "grad_norm": 4.366124855672215, + "learning_rate": 6.184716336350316e-07, + "loss": 1.7767, + "step": 7145 + }, + { + "epoch": 0.481789697112631, + "grad_norm": 4.938934702989457, + "learning_rate": 6.179002484746137e-07, + "loss": 1.7204, + "step": 7150 + }, + { + "epoch": 0.4821266129847377, + "grad_norm": 4.570886894075659, + "learning_rate": 6.173287002338577e-07, + "loss": 1.6922, + "step": 7155 + }, + { + "epoch": 0.48246352885684446, + "grad_norm": 4.584354407905234, + "learning_rate": 6.167569897033322e-07, + "loss": 1.6977, + "step": 7160 + }, + { + "epoch": 0.4828004447289512, + "grad_norm": 4.478905810314622, + "learning_rate": 6.16185117673831e-07, + "loss": 1.7657, + "step": 7165 + }, + { + "epoch": 0.48313736060105794, + "grad_norm": 4.468921511502001, + "learning_rate": 6.15613084936371e-07, + "loss": 1.7682, + "step": 7170 + }, + { + "epoch": 0.48347427647316465, + "grad_norm": 4.40177007449443, + "learning_rate": 6.150408922821911e-07, + "loss": 1.7354, + "step": 7175 + }, + { + "epoch": 0.48381119234527137, + "grad_norm": 4.6505765708076625, + "learning_rate": 6.144685405027518e-07, + "loss": 1.6936, + "step": 7180 + }, + { + "epoch": 0.48414810821737814, + "grad_norm": 4.939569186419913, + "learning_rate": 6.138960303897335e-07, + "loss": 1.7379, + "step": 7185 + }, + { + "epoch": 0.48448502408948485, + "grad_norm": 4.679413425948785, + "learning_rate": 6.133233627350355e-07, + "loss": 1.6535, + "step": 7190 + }, + { + "epoch": 0.48482193996159156, + "grad_norm": 4.325726315028096, + "learning_rate": 6.127505383307754e-07, + "loss": 1.6835, + "step": 7195 + }, + { + "epoch": 0.48515885583369833, + "grad_norm": 4.42673458969308, + "learning_rate": 6.121775579692873e-07, + "loss": 1.7601, + "step": 7200 + }, + { + "epoch": 0.48549577170580505, + "grad_norm": 4.845073129236577, + "learning_rate": 6.116044224431212e-07, + "loss": 1.7168, + "step": 7205 + }, + { + "epoch": 0.4858326875779118, + "grad_norm": 4.546143580258222, + "learning_rate": 6.110311325450416e-07, + "loss": 1.6114, + "step": 7210 + }, + { + "epoch": 0.48616960345001853, + "grad_norm": 4.77554325019675, + "learning_rate": 6.104576890680263e-07, + "loss": 1.69, + "step": 7215 + }, + { + "epoch": 0.48650651932212524, + "grad_norm": 4.71098928853645, + "learning_rate": 6.098840928052663e-07, + "loss": 1.7593, + "step": 7220 + }, + { + "epoch": 0.486843435194232, + "grad_norm": 4.750660306122936, + "learning_rate": 6.093103445501629e-07, + "loss": 1.6479, + "step": 7225 + }, + { + "epoch": 0.4871803510663387, + "grad_norm": 4.739154005718471, + "learning_rate": 6.087364450963286e-07, + "loss": 1.6892, + "step": 7230 + }, + { + "epoch": 0.4875172669384455, + "grad_norm": 4.530538648921853, + "learning_rate": 6.081623952375843e-07, + "loss": 1.7401, + "step": 7235 + }, + { + "epoch": 0.4878541828105522, + "grad_norm": 4.649623744583918, + "learning_rate": 6.075881957679593e-07, + "loss": 1.704, + "step": 7240 + }, + { + "epoch": 0.4881910986826589, + "grad_norm": 5.577179816173861, + "learning_rate": 6.0701384748169e-07, + "loss": 1.7216, + "step": 7245 + }, + { + "epoch": 0.4885280145547657, + "grad_norm": 4.912479991065959, + "learning_rate": 6.064393511732181e-07, + "loss": 1.7308, + "step": 7250 + }, + { + "epoch": 0.4888649304268724, + "grad_norm": 4.539142069414185, + "learning_rate": 6.058647076371906e-07, + "loss": 1.7037, + "step": 7255 + }, + { + "epoch": 0.48920184629897917, + "grad_norm": 4.557851134005118, + "learning_rate": 6.052899176684579e-07, + "loss": 1.6553, + "step": 7260 + }, + { + "epoch": 0.4895387621710859, + "grad_norm": 4.739125290233805, + "learning_rate": 6.047149820620729e-07, + "loss": 1.6802, + "step": 7265 + }, + { + "epoch": 0.4898756780431926, + "grad_norm": 4.791864662336917, + "learning_rate": 6.0413990161329e-07, + "loss": 1.6821, + "step": 7270 + }, + { + "epoch": 0.49021259391529937, + "grad_norm": 4.4891631106081284, + "learning_rate": 6.035646771175642e-07, + "loss": 1.7102, + "step": 7275 + }, + { + "epoch": 0.4905495097874061, + "grad_norm": 4.7430657098261495, + "learning_rate": 6.029893093705491e-07, + "loss": 1.6943, + "step": 7280 + }, + { + "epoch": 0.4908864256595128, + "grad_norm": 4.156109932566331, + "learning_rate": 6.024137991680973e-07, + "loss": 1.6247, + "step": 7285 + }, + { + "epoch": 0.49122334153161956, + "grad_norm": 4.328377079553324, + "learning_rate": 6.018381473062575e-07, + "loss": 1.7405, + "step": 7290 + }, + { + "epoch": 0.4915602574037263, + "grad_norm": 4.822065127779098, + "learning_rate": 6.012623545812754e-07, + "loss": 1.6312, + "step": 7295 + }, + { + "epoch": 0.49189717327583304, + "grad_norm": 4.6088445308051265, + "learning_rate": 6.006864217895906e-07, + "loss": 1.7427, + "step": 7300 + }, + { + "epoch": 0.49223408914793976, + "grad_norm": 4.379384111050929, + "learning_rate": 6.001103497278369e-07, + "loss": 1.7152, + "step": 7305 + }, + { + "epoch": 0.49257100502004647, + "grad_norm": 5.148479140033514, + "learning_rate": 5.995341391928408e-07, + "loss": 1.7061, + "step": 7310 + }, + { + "epoch": 0.49290792089215324, + "grad_norm": 4.437063736650041, + "learning_rate": 5.9895779098162e-07, + "loss": 1.7114, + "step": 7315 + }, + { + "epoch": 0.49324483676425995, + "grad_norm": 4.3579861769268335, + "learning_rate": 5.983813058913829e-07, + "loss": 1.6559, + "step": 7320 + }, + { + "epoch": 0.4935817526363667, + "grad_norm": 4.415788857014313, + "learning_rate": 5.978046847195272e-07, + "loss": 1.732, + "step": 7325 + }, + { + "epoch": 0.49391866850847344, + "grad_norm": 4.855952082704319, + "learning_rate": 5.97227928263639e-07, + "loss": 1.7554, + "step": 7330 + }, + { + "epoch": 0.49425558438058015, + "grad_norm": 4.370984522374044, + "learning_rate": 5.96651037321491e-07, + "loss": 1.7363, + "step": 7335 + }, + { + "epoch": 0.4945925002526869, + "grad_norm": 4.38389535228623, + "learning_rate": 5.960740126910425e-07, + "loss": 1.6725, + "step": 7340 + }, + { + "epoch": 0.49492941612479363, + "grad_norm": 4.6681190283608265, + "learning_rate": 5.954968551704373e-07, + "loss": 1.7207, + "step": 7345 + }, + { + "epoch": 0.4952663319969004, + "grad_norm": 4.697238368356332, + "learning_rate": 5.949195655580032e-07, + "loss": 1.6947, + "step": 7350 + }, + { + "epoch": 0.4956032478690071, + "grad_norm": 4.868515081677355, + "learning_rate": 5.943421446522509e-07, + "loss": 1.7416, + "step": 7355 + }, + { + "epoch": 0.4959401637411138, + "grad_norm": 4.611727186234703, + "learning_rate": 5.93764593251872e-07, + "loss": 1.6861, + "step": 7360 + }, + { + "epoch": 0.4962770796132206, + "grad_norm": 5.139350862969239, + "learning_rate": 5.931869121557397e-07, + "loss": 1.7155, + "step": 7365 + }, + { + "epoch": 0.4966139954853273, + "grad_norm": 4.446635040666518, + "learning_rate": 5.926091021629055e-07, + "loss": 1.6639, + "step": 7370 + }, + { + "epoch": 0.496950911357434, + "grad_norm": 4.559284920143756, + "learning_rate": 5.920311640726e-07, + "loss": 1.6769, + "step": 7375 + }, + { + "epoch": 0.4972878272295408, + "grad_norm": 4.80113478282041, + "learning_rate": 5.914530986842307e-07, + "loss": 1.7342, + "step": 7380 + }, + { + "epoch": 0.4976247431016475, + "grad_norm": 4.784159804294562, + "learning_rate": 5.908749067973809e-07, + "loss": 1.7198, + "step": 7385 + }, + { + "epoch": 0.4979616589737543, + "grad_norm": 4.570174722848811, + "learning_rate": 5.902965892118093e-07, + "loss": 1.7289, + "step": 7390 + }, + { + "epoch": 0.498298574845861, + "grad_norm": 4.374276853118066, + "learning_rate": 5.89718146727448e-07, + "loss": 1.7007, + "step": 7395 + }, + { + "epoch": 0.4986354907179677, + "grad_norm": 4.580820249984145, + "learning_rate": 5.891395801444026e-07, + "loss": 1.6943, + "step": 7400 + }, + { + "epoch": 0.49897240659007447, + "grad_norm": 4.883342042123659, + "learning_rate": 5.885608902629496e-07, + "loss": 1.6594, + "step": 7405 + }, + { + "epoch": 0.4993093224621812, + "grad_norm": 4.621394877055568, + "learning_rate": 5.879820778835364e-07, + "loss": 1.7269, + "step": 7410 + }, + { + "epoch": 0.49964623833428795, + "grad_norm": 5.160480894377575, + "learning_rate": 5.874031438067799e-07, + "loss": 1.6898, + "step": 7415 + }, + { + "epoch": 0.49998315420639466, + "grad_norm": 4.6208075920844305, + "learning_rate": 5.868240888334652e-07, + "loss": 1.694, + "step": 7420 + }, + { + "epoch": 0.5003200700785014, + "grad_norm": 4.736793560219323, + "learning_rate": 5.862449137645444e-07, + "loss": 1.7042, + "step": 7425 + }, + { + "epoch": 0.5006569859506081, + "grad_norm": 4.597792580937797, + "learning_rate": 5.856656194011365e-07, + "loss": 1.6511, + "step": 7430 + }, + { + "epoch": 0.5009939018227149, + "grad_norm": 4.866240037242486, + "learning_rate": 5.850862065445243e-07, + "loss": 1.6591, + "step": 7435 + }, + { + "epoch": 0.5013308176948216, + "grad_norm": 4.610706051955656, + "learning_rate": 5.845066759961557e-07, + "loss": 1.7021, + "step": 7440 + }, + { + "epoch": 0.5016677335669283, + "grad_norm": 4.425839858456235, + "learning_rate": 5.839270285576407e-07, + "loss": 1.7601, + "step": 7445 + }, + { + "epoch": 0.5020046494390351, + "grad_norm": 4.322417058164186, + "learning_rate": 5.833472650307509e-07, + "loss": 1.6423, + "step": 7450 + }, + { + "epoch": 0.5023415653111418, + "grad_norm": 4.505490220698428, + "learning_rate": 5.827673862174192e-07, + "loss": 1.7577, + "step": 7455 + }, + { + "epoch": 0.5026784811832485, + "grad_norm": 4.496510986517032, + "learning_rate": 5.821873929197371e-07, + "loss": 1.7129, + "step": 7460 + }, + { + "epoch": 0.5030153970553553, + "grad_norm": 5.187889955890472, + "learning_rate": 5.81607285939955e-07, + "loss": 1.6744, + "step": 7465 + }, + { + "epoch": 0.503352312927462, + "grad_norm": 5.006933034613527, + "learning_rate": 5.810270660804805e-07, + "loss": 1.7044, + "step": 7470 + }, + { + "epoch": 0.5036892287995688, + "grad_norm": 4.355856888149377, + "learning_rate": 5.80446734143877e-07, + "loss": 1.68, + "step": 7475 + }, + { + "epoch": 0.5040261446716755, + "grad_norm": 4.923953800176147, + "learning_rate": 5.798662909328633e-07, + "loss": 1.67, + "step": 7480 + }, + { + "epoch": 0.5043630605437822, + "grad_norm": 4.69561635565534, + "learning_rate": 5.792857372503119e-07, + "loss": 1.6919, + "step": 7485 + }, + { + "epoch": 0.5046999764158889, + "grad_norm": 4.77170903782858, + "learning_rate": 5.787050738992481e-07, + "loss": 1.6912, + "step": 7490 + }, + { + "epoch": 0.5050368922879956, + "grad_norm": 4.869923711817952, + "learning_rate": 5.781243016828492e-07, + "loss": 1.7032, + "step": 7495 + }, + { + "epoch": 0.5053738081601025, + "grad_norm": 4.585049776668319, + "learning_rate": 5.775434214044427e-07, + "loss": 1.7285, + "step": 7500 + }, + { + "epoch": 0.5057107240322092, + "grad_norm": 4.874001250553335, + "learning_rate": 5.769624338675057e-07, + "loss": 1.7505, + "step": 7505 + }, + { + "epoch": 0.5060476399043159, + "grad_norm": 4.590180315940176, + "learning_rate": 5.763813398756637e-07, + "loss": 1.7429, + "step": 7510 + }, + { + "epoch": 0.5063845557764226, + "grad_norm": 4.4868477944766925, + "learning_rate": 5.758001402326895e-07, + "loss": 1.7306, + "step": 7515 + }, + { + "epoch": 0.5067214716485293, + "grad_norm": 4.81873671171168, + "learning_rate": 5.752188357425019e-07, + "loss": 1.7365, + "step": 7520 + }, + { + "epoch": 0.5070583875206361, + "grad_norm": 4.484325181883257, + "learning_rate": 5.746374272091648e-07, + "loss": 1.7848, + "step": 7525 + }, + { + "epoch": 0.5073953033927429, + "grad_norm": 4.58168391806788, + "learning_rate": 5.74055915436886e-07, + "loss": 1.7356, + "step": 7530 + }, + { + "epoch": 0.5077322192648496, + "grad_norm": 4.6016598603627274, + "learning_rate": 5.734743012300162e-07, + "loss": 1.658, + "step": 7535 + }, + { + "epoch": 0.5080691351369563, + "grad_norm": 4.663029396984945, + "learning_rate": 5.728925853930475e-07, + "loss": 1.6654, + "step": 7540 + }, + { + "epoch": 0.508406051009063, + "grad_norm": 4.708893231070334, + "learning_rate": 5.72310768730613e-07, + "loss": 1.694, + "step": 7545 + }, + { + "epoch": 0.5087429668811698, + "grad_norm": 4.928385739239798, + "learning_rate": 5.717288520474849e-07, + "loss": 1.7709, + "step": 7550 + }, + { + "epoch": 0.5090798827532765, + "grad_norm": 4.420335047530358, + "learning_rate": 5.711468361485739e-07, + "loss": 1.687, + "step": 7555 + }, + { + "epoch": 0.5094167986253832, + "grad_norm": 4.471384860102386, + "learning_rate": 5.70564721838928e-07, + "loss": 1.7015, + "step": 7560 + }, + { + "epoch": 0.50975371449749, + "grad_norm": 4.865562364958896, + "learning_rate": 5.69982509923731e-07, + "loss": 1.7155, + "step": 7565 + }, + { + "epoch": 0.5100906303695967, + "grad_norm": 4.66852339522664, + "learning_rate": 5.694002012083022e-07, + "loss": 1.7676, + "step": 7570 + }, + { + "epoch": 0.5104275462417035, + "grad_norm": 4.242499891955819, + "learning_rate": 5.688177964980946e-07, + "loss": 1.6495, + "step": 7575 + }, + { + "epoch": 0.5107644621138102, + "grad_norm": 4.528198372867546, + "learning_rate": 5.682352965986935e-07, + "loss": 1.6835, + "step": 7580 + }, + { + "epoch": 0.5111013779859169, + "grad_norm": 5.2865806513696665, + "learning_rate": 5.676527023158169e-07, + "loss": 1.7656, + "step": 7585 + }, + { + "epoch": 0.5114382938580236, + "grad_norm": 5.006350707748134, + "learning_rate": 5.670700144553122e-07, + "loss": 1.7122, + "step": 7590 + }, + { + "epoch": 0.5117752097301304, + "grad_norm": 4.890176475676627, + "learning_rate": 5.664872338231571e-07, + "loss": 1.718, + "step": 7595 + }, + { + "epoch": 0.5121121256022371, + "grad_norm": 4.283085242905683, + "learning_rate": 5.659043612254573e-07, + "loss": 1.7049, + "step": 7600 + }, + { + "epoch": 0.5124490414743439, + "grad_norm": 4.671644584097648, + "learning_rate": 5.653213974684455e-07, + "loss": 1.6345, + "step": 7605 + }, + { + "epoch": 0.5127859573464506, + "grad_norm": 4.497378025154527, + "learning_rate": 5.647383433584807e-07, + "loss": 1.6532, + "step": 7610 + }, + { + "epoch": 0.5131228732185573, + "grad_norm": 4.770390373926907, + "learning_rate": 5.641551997020472e-07, + "loss": 1.7167, + "step": 7615 + }, + { + "epoch": 0.513459789090664, + "grad_norm": 4.644610367955537, + "learning_rate": 5.635719673057524e-07, + "loss": 1.7123, + "step": 7620 + }, + { + "epoch": 0.5137967049627707, + "grad_norm": 4.453064547486986, + "learning_rate": 5.629886469763273e-07, + "loss": 1.6907, + "step": 7625 + }, + { + "epoch": 0.5141336208348776, + "grad_norm": 4.341259433438874, + "learning_rate": 5.624052395206239e-07, + "loss": 1.7277, + "step": 7630 + }, + { + "epoch": 0.5144705367069843, + "grad_norm": 4.713591617595884, + "learning_rate": 5.618217457456151e-07, + "loss": 1.675, + "step": 7635 + }, + { + "epoch": 0.514807452579091, + "grad_norm": 4.5922560977258975, + "learning_rate": 5.612381664583928e-07, + "loss": 1.7212, + "step": 7640 + }, + { + "epoch": 0.5151443684511977, + "grad_norm": 4.508581513278687, + "learning_rate": 5.606545024661674e-07, + "loss": 1.7216, + "step": 7645 + }, + { + "epoch": 0.5154812843233044, + "grad_norm": 4.291503221118531, + "learning_rate": 5.600707545762667e-07, + "loss": 1.6839, + "step": 7650 + }, + { + "epoch": 0.5158182001954112, + "grad_norm": 4.549053737765561, + "learning_rate": 5.594869235961342e-07, + "loss": 1.7465, + "step": 7655 + }, + { + "epoch": 0.516155116067518, + "grad_norm": 4.281551741122212, + "learning_rate": 5.589030103333282e-07, + "loss": 1.7556, + "step": 7660 + }, + { + "epoch": 0.5164920319396247, + "grad_norm": 4.6183154550647165, + "learning_rate": 5.583190155955215e-07, + "loss": 1.6429, + "step": 7665 + }, + { + "epoch": 0.5168289478117314, + "grad_norm": 4.713779318598783, + "learning_rate": 5.57734940190499e-07, + "loss": 1.6639, + "step": 7670 + }, + { + "epoch": 0.5171658636838381, + "grad_norm": 4.568884925715355, + "learning_rate": 5.571507849261572e-07, + "loss": 1.6695, + "step": 7675 + }, + { + "epoch": 0.5175027795559449, + "grad_norm": 4.726799714559216, + "learning_rate": 5.565665506105035e-07, + "loss": 1.6608, + "step": 7680 + }, + { + "epoch": 0.5178396954280516, + "grad_norm": 4.747440214485794, + "learning_rate": 5.559822380516539e-07, + "loss": 1.7234, + "step": 7685 + }, + { + "epoch": 0.5181766113001584, + "grad_norm": 5.041416882602425, + "learning_rate": 5.553978480578335e-07, + "loss": 1.6882, + "step": 7690 + }, + { + "epoch": 0.5185135271722651, + "grad_norm": 4.270518135256821, + "learning_rate": 5.548133814373738e-07, + "loss": 1.6844, + "step": 7695 + }, + { + "epoch": 0.5188504430443718, + "grad_norm": 4.713569188055113, + "learning_rate": 5.542288389987128e-07, + "loss": 1.7434, + "step": 7700 + }, + { + "epoch": 0.5191873589164786, + "grad_norm": 4.424402536805608, + "learning_rate": 5.536442215503929e-07, + "loss": 1.7149, + "step": 7705 + }, + { + "epoch": 0.5195242747885853, + "grad_norm": 4.691497277737614, + "learning_rate": 5.530595299010606e-07, + "loss": 1.6705, + "step": 7710 + }, + { + "epoch": 0.519861190660692, + "grad_norm": 4.968253526258946, + "learning_rate": 5.524747648594651e-07, + "loss": 1.7873, + "step": 7715 + }, + { + "epoch": 0.5201981065327987, + "grad_norm": 4.504586292746603, + "learning_rate": 5.518899272344568e-07, + "loss": 1.7035, + "step": 7720 + }, + { + "epoch": 0.5205350224049055, + "grad_norm": 4.391879900734021, + "learning_rate": 5.513050178349866e-07, + "loss": 1.7332, + "step": 7725 + }, + { + "epoch": 0.5208719382770123, + "grad_norm": 4.961582186100007, + "learning_rate": 5.507200374701048e-07, + "loss": 1.7472, + "step": 7730 + }, + { + "epoch": 0.521208854149119, + "grad_norm": 4.274600688997273, + "learning_rate": 5.501349869489596e-07, + "loss": 1.7823, + "step": 7735 + }, + { + "epoch": 0.5215457700212257, + "grad_norm": 4.608506610910215, + "learning_rate": 5.495498670807967e-07, + "loss": 1.69, + "step": 7740 + }, + { + "epoch": 0.5218826858933324, + "grad_norm": 4.657102507065289, + "learning_rate": 5.489646786749574e-07, + "loss": 1.6813, + "step": 7745 + }, + { + "epoch": 0.5222196017654391, + "grad_norm": 4.6290567932911735, + "learning_rate": 5.483794225408777e-07, + "loss": 1.7001, + "step": 7750 + }, + { + "epoch": 0.522556517637546, + "grad_norm": 4.582264959213173, + "learning_rate": 5.477940994880877e-07, + "loss": 1.5716, + "step": 7755 + }, + { + "epoch": 0.5228934335096527, + "grad_norm": 4.457243577063691, + "learning_rate": 5.472087103262094e-07, + "loss": 1.7249, + "step": 7760 + }, + { + "epoch": 0.5232303493817594, + "grad_norm": 4.391587991132564, + "learning_rate": 5.46623255864957e-07, + "loss": 1.7002, + "step": 7765 + }, + { + "epoch": 0.5235672652538661, + "grad_norm": 4.251022723910024, + "learning_rate": 5.460377369141345e-07, + "loss": 1.7482, + "step": 7770 + }, + { + "epoch": 0.5239041811259728, + "grad_norm": 4.796986297299885, + "learning_rate": 5.454521542836351e-07, + "loss": 1.6758, + "step": 7775 + }, + { + "epoch": 0.5242410969980795, + "grad_norm": 4.9104458316986355, + "learning_rate": 5.448665087834405e-07, + "loss": 1.6565, + "step": 7780 + }, + { + "epoch": 0.5245780128701863, + "grad_norm": 4.873279146811863, + "learning_rate": 5.442808012236192e-07, + "loss": 1.6767, + "step": 7785 + }, + { + "epoch": 0.5249149287422931, + "grad_norm": 4.731997781914198, + "learning_rate": 5.436950324143251e-07, + "loss": 1.7282, + "step": 7790 + }, + { + "epoch": 0.5252518446143998, + "grad_norm": 4.5914824351135435, + "learning_rate": 5.431092031657973e-07, + "loss": 1.6881, + "step": 7795 + }, + { + "epoch": 0.5255887604865065, + "grad_norm": 4.370668534964779, + "learning_rate": 5.425233142883585e-07, + "loss": 1.7491, + "step": 7800 + }, + { + "epoch": 0.5259256763586132, + "grad_norm": 4.46181656752716, + "learning_rate": 5.419373665924136e-07, + "loss": 1.6698, + "step": 7805 + }, + { + "epoch": 0.52626259223072, + "grad_norm": 4.164236490813964, + "learning_rate": 5.413513608884491e-07, + "loss": 1.728, + "step": 7810 + }, + { + "epoch": 0.5265995081028267, + "grad_norm": 4.507873534552062, + "learning_rate": 5.407652979870315e-07, + "loss": 1.6963, + "step": 7815 + }, + { + "epoch": 0.5269364239749335, + "grad_norm": 4.677386986638094, + "learning_rate": 5.401791786988068e-07, + "loss": 1.7004, + "step": 7820 + }, + { + "epoch": 0.5272733398470402, + "grad_norm": 4.284763421389914, + "learning_rate": 5.395930038344986e-07, + "loss": 1.7088, + "step": 7825 + }, + { + "epoch": 0.5276102557191469, + "grad_norm": 4.470418550681959, + "learning_rate": 5.390067742049073e-07, + "loss": 1.6565, + "step": 7830 + }, + { + "epoch": 0.5279471715912537, + "grad_norm": 4.89005652035299, + "learning_rate": 5.384204906209097e-07, + "loss": 1.633, + "step": 7835 + }, + { + "epoch": 0.5282840874633604, + "grad_norm": 4.277901423918713, + "learning_rate": 5.378341538934566e-07, + "loss": 1.6048, + "step": 7840 + }, + { + "epoch": 0.5286210033354671, + "grad_norm": 4.935713002066088, + "learning_rate": 5.372477648335725e-07, + "loss": 1.7168, + "step": 7845 + }, + { + "epoch": 0.5289579192075738, + "grad_norm": 4.594470771134848, + "learning_rate": 5.366613242523544e-07, + "loss": 1.6782, + "step": 7850 + }, + { + "epoch": 0.5292948350796806, + "grad_norm": 4.470432848589744, + "learning_rate": 5.360748329609702e-07, + "loss": 1.7082, + "step": 7855 + }, + { + "epoch": 0.5296317509517874, + "grad_norm": 4.58838789888362, + "learning_rate": 5.354882917706586e-07, + "loss": 1.74, + "step": 7860 + }, + { + "epoch": 0.5299686668238941, + "grad_norm": 4.931385031949068, + "learning_rate": 5.349017014927267e-07, + "loss": 1.76, + "step": 7865 + }, + { + "epoch": 0.5303055826960008, + "grad_norm": 4.316385158985576, + "learning_rate": 5.343150629385496e-07, + "loss": 1.6984, + "step": 7870 + }, + { + "epoch": 0.5306424985681075, + "grad_norm": 4.95603801804168, + "learning_rate": 5.337283769195696e-07, + "loss": 1.5816, + "step": 7875 + }, + { + "epoch": 0.5309794144402142, + "grad_norm": 4.63138953237527, + "learning_rate": 5.331416442472941e-07, + "loss": 1.6088, + "step": 7880 + }, + { + "epoch": 0.5313163303123211, + "grad_norm": 5.136600945477254, + "learning_rate": 5.325548657332956e-07, + "loss": 1.6416, + "step": 7885 + }, + { + "epoch": 0.5316532461844278, + "grad_norm": 4.597913163170623, + "learning_rate": 5.319680421892095e-07, + "loss": 1.629, + "step": 7890 + }, + { + "epoch": 0.5319901620565345, + "grad_norm": 4.912203001671273, + "learning_rate": 5.313811744267336e-07, + "loss": 1.6955, + "step": 7895 + }, + { + "epoch": 0.5323270779286412, + "grad_norm": 4.898778418458837, + "learning_rate": 5.30794263257627e-07, + "loss": 1.6931, + "step": 7900 + }, + { + "epoch": 0.5326639938007479, + "grad_norm": 4.712869973378353, + "learning_rate": 5.302073094937089e-07, + "loss": 1.6538, + "step": 7905 + }, + { + "epoch": 0.5330009096728547, + "grad_norm": 4.631259210071004, + "learning_rate": 5.296203139468571e-07, + "loss": 1.6861, + "step": 7910 + }, + { + "epoch": 0.5333378255449615, + "grad_norm": 4.547471371920815, + "learning_rate": 5.290332774290077e-07, + "loss": 1.6822, + "step": 7915 + }, + { + "epoch": 0.5336747414170682, + "grad_norm": 4.628705770790463, + "learning_rate": 5.284462007521528e-07, + "loss": 1.7231, + "step": 7920 + }, + { + "epoch": 0.5340116572891749, + "grad_norm": 4.378755980314644, + "learning_rate": 5.278590847283407e-07, + "loss": 1.6724, + "step": 7925 + }, + { + "epoch": 0.5343485731612816, + "grad_norm": 4.620965174090904, + "learning_rate": 5.27271930169674e-07, + "loss": 1.6702, + "step": 7930 + }, + { + "epoch": 0.5346854890333884, + "grad_norm": 4.666805516823848, + "learning_rate": 5.266847378883079e-07, + "loss": 1.6935, + "step": 7935 + }, + { + "epoch": 0.5350224049054951, + "grad_norm": 4.800429109529865, + "learning_rate": 5.260975086964507e-07, + "loss": 1.7228, + "step": 7940 + }, + { + "epoch": 0.5353593207776018, + "grad_norm": 4.732114405330755, + "learning_rate": 5.255102434063612e-07, + "loss": 1.6925, + "step": 7945 + }, + { + "epoch": 0.5356962366497086, + "grad_norm": 4.934511610082521, + "learning_rate": 5.249229428303486e-07, + "loss": 1.7302, + "step": 7950 + }, + { + "epoch": 0.5360331525218153, + "grad_norm": 4.691119337420431, + "learning_rate": 5.243356077807704e-07, + "loss": 1.7188, + "step": 7955 + }, + { + "epoch": 0.536370068393922, + "grad_norm": 4.210217220737946, + "learning_rate": 5.237482390700319e-07, + "loss": 1.7517, + "step": 7960 + }, + { + "epoch": 0.5367069842660288, + "grad_norm": 4.463884209035557, + "learning_rate": 5.231608375105852e-07, + "loss": 1.6262, + "step": 7965 + }, + { + "epoch": 0.5370439001381355, + "grad_norm": 4.718179447099045, + "learning_rate": 5.225734039149277e-07, + "loss": 1.6842, + "step": 7970 + }, + { + "epoch": 0.5373808160102422, + "grad_norm": 4.269434927425726, + "learning_rate": 5.219859390956012e-07, + "loss": 1.6166, + "step": 7975 + }, + { + "epoch": 0.537717731882349, + "grad_norm": 4.675892807792239, + "learning_rate": 5.213984438651904e-07, + "loss": 1.671, + "step": 7980 + }, + { + "epoch": 0.5380546477544557, + "grad_norm": 5.157607246722674, + "learning_rate": 5.208109190363222e-07, + "loss": 1.7264, + "step": 7985 + }, + { + "epoch": 0.5383915636265625, + "grad_norm": 4.74183668799578, + "learning_rate": 5.202233654216649e-07, + "loss": 1.6967, + "step": 7990 + }, + { + "epoch": 0.5387284794986692, + "grad_norm": 5.059820409448004, + "learning_rate": 5.196357838339259e-07, + "loss": 1.6226, + "step": 7995 + }, + { + "epoch": 0.5390653953707759, + "grad_norm": 4.353630566773812, + "learning_rate": 5.190481750858516e-07, + "loss": 1.6884, + "step": 8000 + }, + { + "epoch": 0.5394023112428826, + "grad_norm": 4.820195518964003, + "learning_rate": 5.184605399902262e-07, + "loss": 1.6845, + "step": 8005 + }, + { + "epoch": 0.5397392271149893, + "grad_norm": 4.450052549010921, + "learning_rate": 5.178728793598699e-07, + "loss": 1.7251, + "step": 8010 + }, + { + "epoch": 0.5400761429870962, + "grad_norm": 4.7654340761853575, + "learning_rate": 5.172851940076387e-07, + "loss": 1.6985, + "step": 8015 + }, + { + "epoch": 0.5404130588592029, + "grad_norm": 4.683470373788527, + "learning_rate": 5.166974847464223e-07, + "loss": 1.7706, + "step": 8020 + }, + { + "epoch": 0.5407499747313096, + "grad_norm": 4.824119687454772, + "learning_rate": 5.161097523891437e-07, + "loss": 1.7028, + "step": 8025 + }, + { + "epoch": 0.5410868906034163, + "grad_norm": 4.516681596644095, + "learning_rate": 5.15521997748758e-07, + "loss": 1.7156, + "step": 8030 + }, + { + "epoch": 0.541423806475523, + "grad_norm": 4.349696412018392, + "learning_rate": 5.149342216382511e-07, + "loss": 1.6188, + "step": 8035 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 4.442839169291913, + "learning_rate": 5.143464248706381e-07, + "loss": 1.6777, + "step": 8040 + }, + { + "epoch": 0.5420976382197366, + "grad_norm": 4.842887030499734, + "learning_rate": 5.137586082589633e-07, + "loss": 1.6884, + "step": 8045 + }, + { + "epoch": 0.5424345540918433, + "grad_norm": 5.260125717485399, + "learning_rate": 5.131707726162983e-07, + "loss": 1.7046, + "step": 8050 + }, + { + "epoch": 0.54277146996395, + "grad_norm": 4.4292959246006305, + "learning_rate": 5.125829187557406e-07, + "loss": 1.6895, + "step": 8055 + }, + { + "epoch": 0.5431083858360567, + "grad_norm": 4.398986244582595, + "learning_rate": 5.119950474904137e-07, + "loss": 1.7533, + "step": 8060 + }, + { + "epoch": 0.5434453017081635, + "grad_norm": 4.294749638256764, + "learning_rate": 5.114071596334642e-07, + "loss": 1.6337, + "step": 8065 + }, + { + "epoch": 0.5437822175802702, + "grad_norm": 4.678822056409349, + "learning_rate": 5.108192559980623e-07, + "loss": 1.7385, + "step": 8070 + }, + { + "epoch": 0.544119133452377, + "grad_norm": 4.549614415148617, + "learning_rate": 5.102313373974e-07, + "loss": 1.7601, + "step": 8075 + }, + { + "epoch": 0.5444560493244837, + "grad_norm": 4.64005770931557, + "learning_rate": 5.096434046446898e-07, + "loss": 1.6658, + "step": 8080 + }, + { + "epoch": 0.5447929651965904, + "grad_norm": 4.0286388311888, + "learning_rate": 5.090554585531639e-07, + "loss": 1.6677, + "step": 8085 + }, + { + "epoch": 0.5451298810686972, + "grad_norm": 4.634030770095132, + "learning_rate": 5.084674999360729e-07, + "loss": 1.6842, + "step": 8090 + }, + { + "epoch": 0.5454667969408039, + "grad_norm": 4.507452689503554, + "learning_rate": 5.078795296066846e-07, + "loss": 1.7786, + "step": 8095 + }, + { + "epoch": 0.5458037128129106, + "grad_norm": 4.50967203132743, + "learning_rate": 5.072915483782833e-07, + "loss": 1.5791, + "step": 8100 + }, + { + "epoch": 0.5461406286850173, + "grad_norm": 4.401152112417684, + "learning_rate": 5.067035570641678e-07, + "loss": 1.7083, + "step": 8105 + }, + { + "epoch": 0.546477544557124, + "grad_norm": 4.680901428735062, + "learning_rate": 5.061155564776517e-07, + "loss": 1.6894, + "step": 8110 + }, + { + "epoch": 0.5468144604292309, + "grad_norm": 4.297623735657957, + "learning_rate": 5.055275474320609e-07, + "loss": 1.7282, + "step": 8115 + }, + { + "epoch": 0.5471513763013376, + "grad_norm": 4.652741505309037, + "learning_rate": 5.049395307407328e-07, + "loss": 1.7205, + "step": 8120 + }, + { + "epoch": 0.5474882921734443, + "grad_norm": 4.396575176637696, + "learning_rate": 5.04351507217016e-07, + "loss": 1.6343, + "step": 8125 + }, + { + "epoch": 0.547825208045551, + "grad_norm": 5.067335330992528, + "learning_rate": 5.03763477674268e-07, + "loss": 1.66, + "step": 8130 + }, + { + "epoch": 0.5481621239176577, + "grad_norm": 4.764149280006885, + "learning_rate": 5.031754429258549e-07, + "loss": 1.7649, + "step": 8135 + }, + { + "epoch": 0.5484990397897644, + "grad_norm": 4.689365821442324, + "learning_rate": 5.025874037851499e-07, + "loss": 1.6931, + "step": 8140 + }, + { + "epoch": 0.5488359556618713, + "grad_norm": 4.6085486128563184, + "learning_rate": 5.019993610655322e-07, + "loss": 1.6921, + "step": 8145 + }, + { + "epoch": 0.549172871533978, + "grad_norm": 4.419580714168477, + "learning_rate": 5.014113155803863e-07, + "loss": 1.7069, + "step": 8150 + }, + { + "epoch": 0.5495097874060847, + "grad_norm": 4.479763083233425, + "learning_rate": 5.008232681430999e-07, + "loss": 1.7082, + "step": 8155 + }, + { + "epoch": 0.5498467032781914, + "grad_norm": 4.729296163432992, + "learning_rate": 5.002352195670643e-07, + "loss": 1.5793, + "step": 8160 + }, + { + "epoch": 0.5501836191502981, + "grad_norm": 4.920600762682583, + "learning_rate": 4.996471706656715e-07, + "loss": 1.6859, + "step": 8165 + }, + { + "epoch": 0.5505205350224049, + "grad_norm": 4.949329604420274, + "learning_rate": 4.990591222523142e-07, + "loss": 1.7081, + "step": 8170 + }, + { + "epoch": 0.5508574508945117, + "grad_norm": 4.1980818218131555, + "learning_rate": 4.984710751403849e-07, + "loss": 1.6864, + "step": 8175 + }, + { + "epoch": 0.5511943667666184, + "grad_norm": 4.418385051349536, + "learning_rate": 4.978830301432738e-07, + "loss": 1.7467, + "step": 8180 + }, + { + "epoch": 0.5515312826387251, + "grad_norm": 4.73152418477475, + "learning_rate": 4.97294988074368e-07, + "loss": 1.7329, + "step": 8185 + }, + { + "epoch": 0.5518681985108318, + "grad_norm": 4.71362442505742, + "learning_rate": 4.96706949747051e-07, + "loss": 1.6406, + "step": 8190 + }, + { + "epoch": 0.5522051143829386, + "grad_norm": 4.487306736267114, + "learning_rate": 4.961189159747015e-07, + "loss": 1.6963, + "step": 8195 + }, + { + "epoch": 0.5525420302550453, + "grad_norm": 4.8689064749222375, + "learning_rate": 4.955308875706905e-07, + "loss": 1.7162, + "step": 8200 + }, + { + "epoch": 0.552878946127152, + "grad_norm": 4.954032629608422, + "learning_rate": 4.94942865348383e-07, + "loss": 1.6799, + "step": 8205 + }, + { + "epoch": 0.5532158619992588, + "grad_norm": 4.609550126338583, + "learning_rate": 4.943548501211351e-07, + "loss": 1.6984, + "step": 8210 + }, + { + "epoch": 0.5535527778713655, + "grad_norm": 4.592731793929496, + "learning_rate": 4.937668427022924e-07, + "loss": 1.6618, + "step": 8215 + }, + { + "epoch": 0.5538896937434723, + "grad_norm": 4.612954517966647, + "learning_rate": 4.931788439051909e-07, + "loss": 1.6494, + "step": 8220 + }, + { + "epoch": 0.554226609615579, + "grad_norm": 4.781552723452898, + "learning_rate": 4.925908545431537e-07, + "loss": 1.6531, + "step": 8225 + }, + { + "epoch": 0.5545635254876857, + "grad_norm": 4.96982515502249, + "learning_rate": 4.920028754294915e-07, + "loss": 1.6884, + "step": 8230 + }, + { + "epoch": 0.5549004413597924, + "grad_norm": 4.909103884057576, + "learning_rate": 4.914149073775003e-07, + "loss": 1.7128, + "step": 8235 + }, + { + "epoch": 0.5552373572318992, + "grad_norm": 4.559122268469247, + "learning_rate": 4.908269512004613e-07, + "loss": 1.6685, + "step": 8240 + }, + { + "epoch": 0.555574273104006, + "grad_norm": 4.7935399767154925, + "learning_rate": 4.902390077116392e-07, + "loss": 1.6834, + "step": 8245 + }, + { + "epoch": 0.5559111889761127, + "grad_norm": 4.675577266111019, + "learning_rate": 4.896510777242805e-07, + "loss": 1.6316, + "step": 8250 + }, + { + "epoch": 0.5562481048482194, + "grad_norm": 4.75778670778229, + "learning_rate": 4.890631620516141e-07, + "loss": 1.6933, + "step": 8255 + }, + { + "epoch": 0.5565850207203261, + "grad_norm": 4.88551208195117, + "learning_rate": 4.88475261506848e-07, + "loss": 1.6561, + "step": 8260 + }, + { + "epoch": 0.5569219365924328, + "grad_norm": 4.661130950299994, + "learning_rate": 4.878873769031702e-07, + "loss": 1.7364, + "step": 8265 + }, + { + "epoch": 0.5572588524645397, + "grad_norm": 4.422533661936322, + "learning_rate": 4.872995090537459e-07, + "loss": 1.6691, + "step": 8270 + }, + { + "epoch": 0.5575957683366464, + "grad_norm": 4.661440808148939, + "learning_rate": 4.867116587717179e-07, + "loss": 1.7266, + "step": 8275 + }, + { + "epoch": 0.5579326842087531, + "grad_norm": 4.239501455891196, + "learning_rate": 4.861238268702039e-07, + "loss": 1.6937, + "step": 8280 + }, + { + "epoch": 0.5582696000808598, + "grad_norm": 5.038883363121014, + "learning_rate": 4.855360141622965e-07, + "loss": 1.723, + "step": 8285 + }, + { + "epoch": 0.5586065159529665, + "grad_norm": 4.262685657903031, + "learning_rate": 4.849482214610623e-07, + "loss": 1.6939, + "step": 8290 + }, + { + "epoch": 0.5589434318250733, + "grad_norm": 4.399744779227197, + "learning_rate": 4.843604495795392e-07, + "loss": 1.7151, + "step": 8295 + }, + { + "epoch": 0.55928034769718, + "grad_norm": 4.703118750408812, + "learning_rate": 4.83772699330737e-07, + "loss": 1.6589, + "step": 8300 + }, + { + "epoch": 0.5596172635692868, + "grad_norm": 4.650840967246696, + "learning_rate": 4.831849715276355e-07, + "loss": 1.6736, + "step": 8305 + }, + { + "epoch": 0.5599541794413935, + "grad_norm": 4.992067739030684, + "learning_rate": 4.825972669831834e-07, + "loss": 1.6443, + "step": 8310 + }, + { + "epoch": 0.5602910953135002, + "grad_norm": 4.656959268996734, + "learning_rate": 4.82009586510297e-07, + "loss": 1.6643, + "step": 8315 + }, + { + "epoch": 0.5606280111856069, + "grad_norm": 4.581554977226479, + "learning_rate": 4.814219309218594e-07, + "loss": 1.6773, + "step": 8320 + }, + { + "epoch": 0.5609649270577137, + "grad_norm": 4.290945719182593, + "learning_rate": 4.808343010307199e-07, + "loss": 1.7589, + "step": 8325 + }, + { + "epoch": 0.5613018429298204, + "grad_norm": 4.7021818808486255, + "learning_rate": 4.802466976496911e-07, + "loss": 1.7013, + "step": 8330 + }, + { + "epoch": 0.5616387588019272, + "grad_norm": 4.853582184853659, + "learning_rate": 4.796591215915498e-07, + "loss": 1.7118, + "step": 8335 + }, + { + "epoch": 0.5619756746740339, + "grad_norm": 4.591796496584383, + "learning_rate": 4.79071573669035e-07, + "loss": 1.7245, + "step": 8340 + }, + { + "epoch": 0.5623125905461406, + "grad_norm": 4.312408889428106, + "learning_rate": 4.784840546948463e-07, + "loss": 1.6718, + "step": 8345 + }, + { + "epoch": 0.5626495064182474, + "grad_norm": 4.678194520576363, + "learning_rate": 4.778965654816435e-07, + "loss": 1.6318, + "step": 8350 + }, + { + "epoch": 0.5629864222903541, + "grad_norm": 4.630467160745489, + "learning_rate": 4.773091068420455e-07, + "loss": 1.7436, + "step": 8355 + }, + { + "epoch": 0.5633233381624608, + "grad_norm": 4.541776089598689, + "learning_rate": 4.767216795886281e-07, + "loss": 1.7184, + "step": 8360 + }, + { + "epoch": 0.5636602540345675, + "grad_norm": 4.467363162187823, + "learning_rate": 4.761342845339246e-07, + "loss": 1.6953, + "step": 8365 + }, + { + "epoch": 0.5639971699066743, + "grad_norm": 5.089902203731988, + "learning_rate": 4.7554692249042345e-07, + "loss": 1.7457, + "step": 8370 + }, + { + "epoch": 0.5643340857787811, + "grad_norm": 4.488762450890132, + "learning_rate": 4.7495959427056754e-07, + "loss": 1.7263, + "step": 8375 + }, + { + "epoch": 0.5646710016508878, + "grad_norm": 4.267196572613947, + "learning_rate": 4.743723006867523e-07, + "loss": 1.6892, + "step": 8380 + }, + { + "epoch": 0.5650079175229945, + "grad_norm": 4.471832534053066, + "learning_rate": 4.737850425513263e-07, + "loss": 1.699, + "step": 8385 + }, + { + "epoch": 0.5653448333951012, + "grad_norm": 4.32195057372782, + "learning_rate": 4.731978206765884e-07, + "loss": 1.69, + "step": 8390 + }, + { + "epoch": 0.5656817492672079, + "grad_norm": 4.772982759500117, + "learning_rate": 4.726106358747871e-07, + "loss": 1.7185, + "step": 8395 + }, + { + "epoch": 0.5660186651393148, + "grad_norm": 4.506780870023458, + "learning_rate": 4.720234889581203e-07, + "loss": 1.7407, + "step": 8400 + }, + { + "epoch": 0.5663555810114215, + "grad_norm": 4.775965979267713, + "learning_rate": 4.714363807387333e-07, + "loss": 1.7065, + "step": 8405 + }, + { + "epoch": 0.5666924968835282, + "grad_norm": 4.543574977359184, + "learning_rate": 4.708493120287175e-07, + "loss": 1.715, + "step": 8410 + }, + { + "epoch": 0.5670294127556349, + "grad_norm": 4.216614068049089, + "learning_rate": 4.7026228364010984e-07, + "loss": 1.6877, + "step": 8415 + }, + { + "epoch": 0.5673663286277416, + "grad_norm": 4.571120024348289, + "learning_rate": 4.69675296384892e-07, + "loss": 1.7409, + "step": 8420 + }, + { + "epoch": 0.5677032444998484, + "grad_norm": 4.452168437386814, + "learning_rate": 4.6908835107498775e-07, + "loss": 1.7086, + "step": 8425 + }, + { + "epoch": 0.5680401603719551, + "grad_norm": 4.496686032131901, + "learning_rate": 4.685014485222637e-07, + "loss": 1.6897, + "step": 8430 + }, + { + "epoch": 0.5683770762440619, + "grad_norm": 4.513536089655021, + "learning_rate": 4.679145895385269e-07, + "loss": 1.6941, + "step": 8435 + }, + { + "epoch": 0.5687139921161686, + "grad_norm": 4.919282730735273, + "learning_rate": 4.673277749355245e-07, + "loss": 1.6058, + "step": 8440 + }, + { + "epoch": 0.5690509079882753, + "grad_norm": 5.0241144585906685, + "learning_rate": 4.667410055249417e-07, + "loss": 1.6192, + "step": 8445 + }, + { + "epoch": 0.5693878238603821, + "grad_norm": 4.825190139086496, + "learning_rate": 4.6615428211840154e-07, + "loss": 1.6666, + "step": 8450 + }, + { + "epoch": 0.5697247397324888, + "grad_norm": 4.567783140676474, + "learning_rate": 4.655676055274637e-07, + "loss": 1.6983, + "step": 8455 + }, + { + "epoch": 0.5700616556045955, + "grad_norm": 4.82863687523337, + "learning_rate": 4.6498097656362247e-07, + "loss": 1.7003, + "step": 8460 + }, + { + "epoch": 0.5703985714767023, + "grad_norm": 4.7561387266728, + "learning_rate": 4.643943960383067e-07, + "loss": 1.5966, + "step": 8465 + }, + { + "epoch": 0.570735487348809, + "grad_norm": 4.680484457748234, + "learning_rate": 4.638078647628782e-07, + "loss": 1.802, + "step": 8470 + }, + { + "epoch": 0.5710724032209158, + "grad_norm": 4.6946401693960125, + "learning_rate": 4.632213835486305e-07, + "loss": 1.7394, + "step": 8475 + }, + { + "epoch": 0.5714093190930225, + "grad_norm": 4.540704185339018, + "learning_rate": 4.626349532067879e-07, + "loss": 1.7039, + "step": 8480 + }, + { + "epoch": 0.5717462349651292, + "grad_norm": 4.396382603367253, + "learning_rate": 4.620485745485046e-07, + "loss": 1.6342, + "step": 8485 + }, + { + "epoch": 0.5720831508372359, + "grad_norm": 4.650468869736771, + "learning_rate": 4.6146224838486287e-07, + "loss": 1.7561, + "step": 8490 + }, + { + "epoch": 0.5724200667093426, + "grad_norm": 4.3725778710452845, + "learning_rate": 4.6087597552687275e-07, + "loss": 1.7229, + "step": 8495 + }, + { + "epoch": 0.5727569825814494, + "grad_norm": 7.127703602208947, + "learning_rate": 4.602897567854705e-07, + "loss": 1.6767, + "step": 8500 + }, + { + "epoch": 0.5730938984535562, + "grad_norm": 4.749927884410711, + "learning_rate": 4.5970359297151733e-07, + "loss": 1.6543, + "step": 8505 + }, + { + "epoch": 0.5734308143256629, + "grad_norm": 4.559570945610894, + "learning_rate": 4.591174848957986e-07, + "loss": 1.7265, + "step": 8510 + }, + { + "epoch": 0.5737677301977696, + "grad_norm": 4.5939906488538895, + "learning_rate": 4.585314333690224e-07, + "loss": 1.7119, + "step": 8515 + }, + { + "epoch": 0.5741046460698763, + "grad_norm": 4.249421774835835, + "learning_rate": 4.579454392018192e-07, + "loss": 1.6535, + "step": 8520 + }, + { + "epoch": 0.574441561941983, + "grad_norm": 4.109291213056605, + "learning_rate": 4.5735950320473915e-07, + "loss": 1.6659, + "step": 8525 + }, + { + "epoch": 0.5747784778140899, + "grad_norm": 4.618222010138749, + "learning_rate": 4.5677362618825265e-07, + "loss": 1.6468, + "step": 8530 + }, + { + "epoch": 0.5751153936861966, + "grad_norm": 4.612200159477145, + "learning_rate": 4.5618780896274866e-07, + "loss": 1.6851, + "step": 8535 + }, + { + "epoch": 0.5754523095583033, + "grad_norm": 5.003053852009229, + "learning_rate": 4.556020523385326e-07, + "loss": 1.6119, + "step": 8540 + }, + { + "epoch": 0.57578922543041, + "grad_norm": 4.6683042791197575, + "learning_rate": 4.55016357125827e-07, + "loss": 1.6521, + "step": 8545 + }, + { + "epoch": 0.5761261413025167, + "grad_norm": 4.494709104565061, + "learning_rate": 4.5443072413476877e-07, + "loss": 1.6743, + "step": 8550 + }, + { + "epoch": 0.5764630571746235, + "grad_norm": 4.874792839672999, + "learning_rate": 4.5384515417540914e-07, + "loss": 1.6915, + "step": 8555 + }, + { + "epoch": 0.5767999730467303, + "grad_norm": 4.481805781018169, + "learning_rate": 4.5325964805771187e-07, + "loss": 1.685, + "step": 8560 + }, + { + "epoch": 0.577136888918837, + "grad_norm": 4.5394444499066715, + "learning_rate": 4.526742065915528e-07, + "loss": 1.6167, + "step": 8565 + }, + { + "epoch": 0.5774738047909437, + "grad_norm": 4.3024721598181594, + "learning_rate": 4.520888305867181e-07, + "loss": 1.647, + "step": 8570 + }, + { + "epoch": 0.5778107206630504, + "grad_norm": 4.93852787215156, + "learning_rate": 4.5150352085290315e-07, + "loss": 1.7097, + "step": 8575 + }, + { + "epoch": 0.5781476365351572, + "grad_norm": 4.604154090081095, + "learning_rate": 4.5091827819971207e-07, + "loss": 1.7154, + "step": 8580 + }, + { + "epoch": 0.5784845524072639, + "grad_norm": 4.635206348927524, + "learning_rate": 4.503331034366563e-07, + "loss": 1.6914, + "step": 8585 + }, + { + "epoch": 0.5788214682793706, + "grad_norm": 4.462169939610559, + "learning_rate": 4.4974799737315274e-07, + "loss": 1.6473, + "step": 8590 + }, + { + "epoch": 0.5791583841514774, + "grad_norm": 4.404248007791692, + "learning_rate": 4.491629608185237e-07, + "loss": 1.6687, + "step": 8595 + }, + { + "epoch": 0.5794953000235841, + "grad_norm": 4.417246419975575, + "learning_rate": 4.485779945819956e-07, + "loss": 1.6734, + "step": 8600 + }, + { + "epoch": 0.5798322158956909, + "grad_norm": 4.587835613701123, + "learning_rate": 4.479930994726968e-07, + "loss": 1.6319, + "step": 8605 + }, + { + "epoch": 0.5801691317677976, + "grad_norm": 4.543488439399577, + "learning_rate": 4.474082762996581e-07, + "loss": 1.7323, + "step": 8610 + }, + { + "epoch": 0.5805060476399043, + "grad_norm": 4.543906159067551, + "learning_rate": 4.468235258718105e-07, + "loss": 1.7344, + "step": 8615 + }, + { + "epoch": 0.580842963512011, + "grad_norm": 4.396940012204263, + "learning_rate": 4.4623884899798397e-07, + "loss": 1.6825, + "step": 8620 + }, + { + "epoch": 0.5811798793841177, + "grad_norm": 5.062533344351063, + "learning_rate": 4.4565424648690743e-07, + "loss": 1.6386, + "step": 8625 + }, + { + "epoch": 0.5815167952562246, + "grad_norm": 4.736125482927736, + "learning_rate": 4.450697191472067e-07, + "loss": 1.6547, + "step": 8630 + }, + { + "epoch": 0.5818537111283313, + "grad_norm": 4.425455347464461, + "learning_rate": 4.4448526778740327e-07, + "loss": 1.6442, + "step": 8635 + }, + { + "epoch": 0.582190627000438, + "grad_norm": 4.84707850623792, + "learning_rate": 4.439008932159138e-07, + "loss": 1.7129, + "step": 8640 + }, + { + "epoch": 0.5825275428725447, + "grad_norm": 4.67296322499853, + "learning_rate": 4.4331659624104876e-07, + "loss": 1.7726, + "step": 8645 + }, + { + "epoch": 0.5828644587446514, + "grad_norm": 4.778465340510698, + "learning_rate": 4.427323776710117e-07, + "loss": 1.7425, + "step": 8650 + }, + { + "epoch": 0.5832013746167583, + "grad_norm": 4.711550493290235, + "learning_rate": 4.4214823831389663e-07, + "loss": 1.6981, + "step": 8655 + }, + { + "epoch": 0.583538290488865, + "grad_norm": 4.512779690254907, + "learning_rate": 4.41564178977689e-07, + "loss": 1.6096, + "step": 8660 + }, + { + "epoch": 0.5838752063609717, + "grad_norm": 4.6872911357761184, + "learning_rate": 4.4098020047026343e-07, + "loss": 1.6494, + "step": 8665 + }, + { + "epoch": 0.5842121222330784, + "grad_norm": 4.547380845970744, + "learning_rate": 4.4039630359938194e-07, + "loss": 1.6356, + "step": 8670 + }, + { + "epoch": 0.5845490381051851, + "grad_norm": 4.4410239787703265, + "learning_rate": 4.3981248917269477e-07, + "loss": 1.7121, + "step": 8675 + }, + { + "epoch": 0.5848859539772918, + "grad_norm": 4.447517319316789, + "learning_rate": 4.3922875799773735e-07, + "loss": 1.6612, + "step": 8680 + }, + { + "epoch": 0.5852228698493986, + "grad_norm": 4.4385798637549, + "learning_rate": 4.386451108819302e-07, + "loss": 1.6545, + "step": 8685 + }, + { + "epoch": 0.5855597857215054, + "grad_norm": 4.926861478661365, + "learning_rate": 4.380615486325774e-07, + "loss": 1.6946, + "step": 8690 + }, + { + "epoch": 0.5858967015936121, + "grad_norm": 4.119226148156626, + "learning_rate": 4.3747807205686616e-07, + "loss": 1.6452, + "step": 8695 + }, + { + "epoch": 0.5862336174657188, + "grad_norm": 3.9868492954091796, + "learning_rate": 4.3689468196186433e-07, + "loss": 1.6384, + "step": 8700 + }, + { + "epoch": 0.5865705333378255, + "grad_norm": 4.26822813610702, + "learning_rate": 4.36311379154521e-07, + "loss": 1.7479, + "step": 8705 + }, + { + "epoch": 0.5869074492099323, + "grad_norm": 4.618293683972388, + "learning_rate": 4.3572816444166406e-07, + "loss": 1.7083, + "step": 8710 + }, + { + "epoch": 0.587244365082039, + "grad_norm": 5.183493055108485, + "learning_rate": 4.351450386299996e-07, + "loss": 1.6114, + "step": 8715 + }, + { + "epoch": 0.5875812809541457, + "grad_norm": 4.6830635864687755, + "learning_rate": 4.3456200252611075e-07, + "loss": 1.6875, + "step": 8720 + }, + { + "epoch": 0.5879181968262525, + "grad_norm": 4.48502779266541, + "learning_rate": 4.3397905693645653e-07, + "loss": 1.721, + "step": 8725 + }, + { + "epoch": 0.5882551126983592, + "grad_norm": 4.91326546399509, + "learning_rate": 4.3339620266737116e-07, + "loss": 1.6608, + "step": 8730 + }, + { + "epoch": 0.588592028570466, + "grad_norm": 4.653039821741647, + "learning_rate": 4.328134405250617e-07, + "loss": 1.6222, + "step": 8735 + }, + { + "epoch": 0.5889289444425727, + "grad_norm": 4.919960637616408, + "learning_rate": 4.322307713156085e-07, + "loss": 1.6197, + "step": 8740 + }, + { + "epoch": 0.5892658603146794, + "grad_norm": 4.813487307948052, + "learning_rate": 4.316481958449634e-07, + "loss": 1.6323, + "step": 8745 + }, + { + "epoch": 0.5896027761867861, + "grad_norm": 4.390267109347444, + "learning_rate": 4.310657149189478e-07, + "loss": 1.7268, + "step": 8750 + }, + { + "epoch": 0.5899396920588929, + "grad_norm": 4.6590548821479665, + "learning_rate": 4.3048332934325325e-07, + "loss": 1.7155, + "step": 8755 + }, + { + "epoch": 0.5902766079309997, + "grad_norm": 5.0419072706091645, + "learning_rate": 4.2990103992343893e-07, + "loss": 1.6464, + "step": 8760 + }, + { + "epoch": 0.5906135238031064, + "grad_norm": 4.331131220822677, + "learning_rate": 4.2931884746493107e-07, + "loss": 1.7478, + "step": 8765 + }, + { + "epoch": 0.5909504396752131, + "grad_norm": 4.689736914803806, + "learning_rate": 4.287367527730216e-07, + "loss": 1.6688, + "step": 8770 + }, + { + "epoch": 0.5912873555473198, + "grad_norm": 4.618094275967223, + "learning_rate": 4.2815475665286766e-07, + "loss": 1.7149, + "step": 8775 + }, + { + "epoch": 0.5916242714194265, + "grad_norm": 5.835306696527706, + "learning_rate": 4.2757285990948993e-07, + "loss": 1.6927, + "step": 8780 + }, + { + "epoch": 0.5919611872915334, + "grad_norm": 4.7646671532063465, + "learning_rate": 4.269910633477711e-07, + "loss": 1.6413, + "step": 8785 + }, + { + "epoch": 0.5922981031636401, + "grad_norm": 4.451157033474741, + "learning_rate": 4.264093677724561e-07, + "loss": 1.6888, + "step": 8790 + }, + { + "epoch": 0.5926350190357468, + "grad_norm": 4.365302012587232, + "learning_rate": 4.2582777398814966e-07, + "loss": 1.6304, + "step": 8795 + }, + { + "epoch": 0.5929719349078535, + "grad_norm": 4.360400169057322, + "learning_rate": 4.252462827993158e-07, + "loss": 1.6725, + "step": 8800 + }, + { + "epoch": 0.5933088507799602, + "grad_norm": 4.847639710618478, + "learning_rate": 4.246648950102765e-07, + "loss": 1.7565, + "step": 8805 + }, + { + "epoch": 0.593645766652067, + "grad_norm": 4.531584210029071, + "learning_rate": 4.240836114252112e-07, + "loss": 1.6825, + "step": 8810 + }, + { + "epoch": 0.5939826825241737, + "grad_norm": 4.446728074727535, + "learning_rate": 4.2350243284815445e-07, + "loss": 1.6699, + "step": 8815 + }, + { + "epoch": 0.5943195983962805, + "grad_norm": 4.557072399740796, + "learning_rate": 4.229213600829963e-07, + "loss": 1.7023, + "step": 8820 + }, + { + "epoch": 0.5946565142683872, + "grad_norm": 4.320685633324107, + "learning_rate": 4.223403939334802e-07, + "loss": 1.7022, + "step": 8825 + }, + { + "epoch": 0.5949934301404939, + "grad_norm": 5.091185808385919, + "learning_rate": 4.217595352032017e-07, + "loss": 1.717, + "step": 8830 + }, + { + "epoch": 0.5953303460126007, + "grad_norm": 4.689383125941451, + "learning_rate": 4.2117878469560834e-07, + "loss": 1.6521, + "step": 8835 + }, + { + "epoch": 0.5956672618847074, + "grad_norm": 4.2042503348182585, + "learning_rate": 4.205981432139978e-07, + "loss": 1.6621, + "step": 8840 + }, + { + "epoch": 0.5960041777568141, + "grad_norm": 4.5052957339457835, + "learning_rate": 4.200176115615169e-07, + "loss": 1.6988, + "step": 8845 + }, + { + "epoch": 0.5963410936289208, + "grad_norm": 4.6778001572913075, + "learning_rate": 4.1943719054116027e-07, + "loss": 1.647, + "step": 8850 + }, + { + "epoch": 0.5966780095010276, + "grad_norm": 4.868220790218576, + "learning_rate": 4.1885688095577e-07, + "loss": 1.6256, + "step": 8855 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 4.329463664628172, + "learning_rate": 4.182766836080339e-07, + "loss": 1.6238, + "step": 8860 + }, + { + "epoch": 0.5973518412452411, + "grad_norm": 4.70703333345465, + "learning_rate": 4.176965993004842e-07, + "loss": 1.6875, + "step": 8865 + }, + { + "epoch": 0.5976887571173478, + "grad_norm": 4.904360263585603, + "learning_rate": 4.171166288354971e-07, + "loss": 1.6849, + "step": 8870 + }, + { + "epoch": 0.5980256729894545, + "grad_norm": 4.32012699596512, + "learning_rate": 4.165367730152917e-07, + "loss": 1.6848, + "step": 8875 + }, + { + "epoch": 0.5983625888615612, + "grad_norm": 4.7755314086339, + "learning_rate": 4.1595703264192737e-07, + "loss": 1.6969, + "step": 8880 + }, + { + "epoch": 0.598699504733668, + "grad_norm": 4.404946565600434, + "learning_rate": 4.15377408517305e-07, + "loss": 1.6712, + "step": 8885 + }, + { + "epoch": 0.5990364206057748, + "grad_norm": 4.62456766911183, + "learning_rate": 4.147979014431642e-07, + "loss": 1.6782, + "step": 8890 + }, + { + "epoch": 0.5993733364778815, + "grad_norm": 5.036178958688704, + "learning_rate": 4.142185122210823e-07, + "loss": 1.6734, + "step": 8895 + }, + { + "epoch": 0.5997102523499882, + "grad_norm": 4.478025050021537, + "learning_rate": 4.136392416524742e-07, + "loss": 1.6853, + "step": 8900 + }, + { + "epoch": 0.6000471682220949, + "grad_norm": 4.802397490056363, + "learning_rate": 4.1306009053859043e-07, + "loss": 1.6727, + "step": 8905 + }, + { + "epoch": 0.6003840840942016, + "grad_norm": 4.824037986443692, + "learning_rate": 4.124810596805166e-07, + "loss": 1.6883, + "step": 8910 + }, + { + "epoch": 0.6007209999663085, + "grad_norm": 4.49654239932616, + "learning_rate": 4.119021498791712e-07, + "loss": 1.7514, + "step": 8915 + }, + { + "epoch": 0.6010579158384152, + "grad_norm": 4.118394632761788, + "learning_rate": 4.113233619353062e-07, + "loss": 1.6592, + "step": 8920 + }, + { + "epoch": 0.6013948317105219, + "grad_norm": 4.8909611172651415, + "learning_rate": 4.107446966495044e-07, + "loss": 1.7011, + "step": 8925 + }, + { + "epoch": 0.6017317475826286, + "grad_norm": 4.801420603074635, + "learning_rate": 4.101661548221792e-07, + "loss": 1.7113, + "step": 8930 + }, + { + "epoch": 0.6020686634547353, + "grad_norm": 4.893714227439976, + "learning_rate": 4.0958773725357297e-07, + "loss": 1.6746, + "step": 8935 + }, + { + "epoch": 0.6024055793268421, + "grad_norm": 4.6529537641686085, + "learning_rate": 4.0900944474375674e-07, + "loss": 1.6973, + "step": 8940 + }, + { + "epoch": 0.6027424951989488, + "grad_norm": 4.672275429926195, + "learning_rate": 4.084312780926279e-07, + "loss": 1.6491, + "step": 8945 + }, + { + "epoch": 0.6030794110710556, + "grad_norm": 4.679615356530205, + "learning_rate": 4.0785323809991006e-07, + "loss": 1.7096, + "step": 8950 + }, + { + "epoch": 0.6034163269431623, + "grad_norm": 4.81123110598509, + "learning_rate": 4.072753255651521e-07, + "loss": 1.6589, + "step": 8955 + }, + { + "epoch": 0.603753242815269, + "grad_norm": 4.916986795492, + "learning_rate": 4.066975412877255e-07, + "loss": 1.6249, + "step": 8960 + }, + { + "epoch": 0.6040901586873758, + "grad_norm": 5.411200068581318, + "learning_rate": 4.0611988606682544e-07, + "loss": 1.6395, + "step": 8965 + }, + { + "epoch": 0.6044270745594825, + "grad_norm": 4.884173846637528, + "learning_rate": 4.0554236070146785e-07, + "loss": 1.698, + "step": 8970 + }, + { + "epoch": 0.6047639904315892, + "grad_norm": 4.846861187549292, + "learning_rate": 4.0496496599048963e-07, + "loss": 1.6821, + "step": 8975 + }, + { + "epoch": 0.605100906303696, + "grad_norm": 4.879563415748026, + "learning_rate": 4.0438770273254624e-07, + "loss": 1.6807, + "step": 8980 + }, + { + "epoch": 0.6054378221758027, + "grad_norm": 4.725484457298363, + "learning_rate": 4.038105717261119e-07, + "loss": 1.6332, + "step": 8985 + }, + { + "epoch": 0.6057747380479095, + "grad_norm": 4.649479102245512, + "learning_rate": 4.03233573769478e-07, + "loss": 1.71, + "step": 8990 + }, + { + "epoch": 0.6061116539200162, + "grad_norm": 4.945730805267605, + "learning_rate": 4.026567096607511e-07, + "loss": 1.6461, + "step": 8995 + }, + { + "epoch": 0.6064485697921229, + "grad_norm": 4.501885990649764, + "learning_rate": 4.020799801978535e-07, + "loss": 1.7412, + "step": 9000 + }, + { + "epoch": 0.6067854856642296, + "grad_norm": 4.707450309124685, + "learning_rate": 4.015033861785208e-07, + "loss": 1.6886, + "step": 9005 + }, + { + "epoch": 0.6071224015363363, + "grad_norm": 4.549425438204147, + "learning_rate": 4.0092692840030126e-07, + "loss": 1.7204, + "step": 9010 + }, + { + "epoch": 0.6074593174084432, + "grad_norm": 4.412648886343521, + "learning_rate": 4.003506076605547e-07, + "loss": 1.6506, + "step": 9015 + }, + { + "epoch": 0.6077962332805499, + "grad_norm": 4.728839488949062, + "learning_rate": 3.997744247564519e-07, + "loss": 1.5739, + "step": 9020 + }, + { + "epoch": 0.6081331491526566, + "grad_norm": 4.469095502288587, + "learning_rate": 3.9919838048497197e-07, + "loss": 1.7551, + "step": 9025 + }, + { + "epoch": 0.6084700650247633, + "grad_norm": 4.439886130777974, + "learning_rate": 3.98622475642903e-07, + "loss": 1.6805, + "step": 9030 + }, + { + "epoch": 0.60880698089687, + "grad_norm": 5.252294964963045, + "learning_rate": 3.980467110268405e-07, + "loss": 1.6273, + "step": 9035 + }, + { + "epoch": 0.6091438967689767, + "grad_norm": 4.822617039300253, + "learning_rate": 3.9747108743318493e-07, + "loss": 1.7041, + "step": 9040 + }, + { + "epoch": 0.6094808126410836, + "grad_norm": 4.619438478538796, + "learning_rate": 3.968956056581428e-07, + "loss": 1.6624, + "step": 9045 + }, + { + "epoch": 0.6098177285131903, + "grad_norm": 4.6117011288419505, + "learning_rate": 3.9632026649772366e-07, + "loss": 1.6139, + "step": 9050 + }, + { + "epoch": 0.610154644385297, + "grad_norm": 4.911659054403481, + "learning_rate": 3.9574507074774056e-07, + "loss": 1.6501, + "step": 9055 + }, + { + "epoch": 0.6104915602574037, + "grad_norm": 4.6589618898907, + "learning_rate": 3.951700192038072e-07, + "loss": 1.6172, + "step": 9060 + }, + { + "epoch": 0.6108284761295104, + "grad_norm": 4.311424914105757, + "learning_rate": 3.945951126613387e-07, + "loss": 1.7492, + "step": 9065 + }, + { + "epoch": 0.6111653920016172, + "grad_norm": 5.116936063115301, + "learning_rate": 3.9402035191554937e-07, + "loss": 1.6774, + "step": 9070 + }, + { + "epoch": 0.611502307873724, + "grad_norm": 4.611695226643144, + "learning_rate": 3.934457377614514e-07, + "loss": 1.6253, + "step": 9075 + }, + { + "epoch": 0.6118392237458307, + "grad_norm": 4.345447725618097, + "learning_rate": 3.9287127099385483e-07, + "loss": 1.7269, + "step": 9080 + }, + { + "epoch": 0.6121761396179374, + "grad_norm": 5.182706851476699, + "learning_rate": 3.9229695240736567e-07, + "loss": 1.6901, + "step": 9085 + }, + { + "epoch": 0.6125130554900441, + "grad_norm": 4.751801305761913, + "learning_rate": 3.917227827963846e-07, + "loss": 1.662, + "step": 9090 + }, + { + "epoch": 0.6128499713621509, + "grad_norm": 5.101440856558443, + "learning_rate": 3.9114876295510653e-07, + "loss": 1.6569, + "step": 9095 + }, + { + "epoch": 0.6131868872342576, + "grad_norm": 4.7541345176185095, + "learning_rate": 3.9057489367751947e-07, + "loss": 1.6445, + "step": 9100 + }, + { + "epoch": 0.6135238031063643, + "grad_norm": 4.459634190814962, + "learning_rate": 3.900011757574024e-07, + "loss": 1.7521, + "step": 9105 + }, + { + "epoch": 0.613860718978471, + "grad_norm": 4.852675298666461, + "learning_rate": 3.894276099883258e-07, + "loss": 1.6383, + "step": 9110 + }, + { + "epoch": 0.6141976348505778, + "grad_norm": 4.8681803338120435, + "learning_rate": 3.888541971636492e-07, + "loss": 1.6305, + "step": 9115 + }, + { + "epoch": 0.6145345507226846, + "grad_norm": 5.104377471505262, + "learning_rate": 3.8828093807652095e-07, + "loss": 1.7823, + "step": 9120 + }, + { + "epoch": 0.6148714665947913, + "grad_norm": 4.711200384568286, + "learning_rate": 3.8770783351987605e-07, + "loss": 1.6415, + "step": 9125 + }, + { + "epoch": 0.615208382466898, + "grad_norm": 4.986953684029128, + "learning_rate": 3.8713488428643656e-07, + "loss": 1.6783, + "step": 9130 + }, + { + "epoch": 0.6155452983390047, + "grad_norm": 4.464817870891317, + "learning_rate": 3.8656209116870906e-07, + "loss": 1.6513, + "step": 9135 + }, + { + "epoch": 0.6158822142111114, + "grad_norm": 4.602491329149551, + "learning_rate": 3.859894549589847e-07, + "loss": 1.6982, + "step": 9140 + }, + { + "epoch": 0.6162191300832183, + "grad_norm": 5.174034990707735, + "learning_rate": 3.854169764493371e-07, + "loss": 1.6953, + "step": 9145 + }, + { + "epoch": 0.616556045955325, + "grad_norm": 4.567308775862828, + "learning_rate": 3.848446564316223e-07, + "loss": 1.6572, + "step": 9150 + }, + { + "epoch": 0.6168929618274317, + "grad_norm": 4.619582257540075, + "learning_rate": 3.8427249569747656e-07, + "loss": 1.6788, + "step": 9155 + }, + { + "epoch": 0.6172298776995384, + "grad_norm": 4.615649431114516, + "learning_rate": 3.8370049503831614e-07, + "loss": 1.6735, + "step": 9160 + }, + { + "epoch": 0.6175667935716451, + "grad_norm": 4.421585714223809, + "learning_rate": 3.8312865524533606e-07, + "loss": 1.6711, + "step": 9165 + }, + { + "epoch": 0.617903709443752, + "grad_norm": 4.826993098533326, + "learning_rate": 3.825569771095082e-07, + "loss": 1.7108, + "step": 9170 + }, + { + "epoch": 0.6182406253158587, + "grad_norm": 4.613669265520506, + "learning_rate": 3.819854614215814e-07, + "loss": 1.7083, + "step": 9175 + }, + { + "epoch": 0.6185775411879654, + "grad_norm": 4.94162415452775, + "learning_rate": 3.814141089720796e-07, + "loss": 1.7266, + "step": 9180 + }, + { + "epoch": 0.6189144570600721, + "grad_norm": 4.8380036054530215, + "learning_rate": 3.8084292055130126e-07, + "loss": 1.736, + "step": 9185 + }, + { + "epoch": 0.6192513729321788, + "grad_norm": 4.846548864779536, + "learning_rate": 3.8027189694931715e-07, + "loss": 1.6704, + "step": 9190 + }, + { + "epoch": 0.6195882888042856, + "grad_norm": 4.51327248172737, + "learning_rate": 3.797010389559708e-07, + "loss": 1.6785, + "step": 9195 + }, + { + "epoch": 0.6199252046763923, + "grad_norm": 5.070239294852287, + "learning_rate": 3.7913034736087677e-07, + "loss": 1.7457, + "step": 9200 + }, + { + "epoch": 0.620262120548499, + "grad_norm": 4.755063973244972, + "learning_rate": 3.785598229534186e-07, + "loss": 1.6858, + "step": 9205 + }, + { + "epoch": 0.6205990364206058, + "grad_norm": 4.561460179584289, + "learning_rate": 3.7798946652274943e-07, + "loss": 1.6691, + "step": 9210 + }, + { + "epoch": 0.6209359522927125, + "grad_norm": 4.412726574679065, + "learning_rate": 3.7741927885778966e-07, + "loss": 1.5899, + "step": 9215 + }, + { + "epoch": 0.6212728681648192, + "grad_norm": 4.3476674942315, + "learning_rate": 3.768492607472263e-07, + "loss": 1.7019, + "step": 9220 + }, + { + "epoch": 0.621609784036926, + "grad_norm": 4.390399887814367, + "learning_rate": 3.7627941297951183e-07, + "loss": 1.7051, + "step": 9225 + }, + { + "epoch": 0.6219466999090327, + "grad_norm": 4.35192116787193, + "learning_rate": 3.7570973634286334e-07, + "loss": 1.6842, + "step": 9230 + }, + { + "epoch": 0.6222836157811394, + "grad_norm": 4.431659041359632, + "learning_rate": 3.7514023162526066e-07, + "loss": 1.6323, + "step": 9235 + }, + { + "epoch": 0.6226205316532462, + "grad_norm": 4.346065772652617, + "learning_rate": 3.745708996144463e-07, + "loss": 1.7121, + "step": 9240 + }, + { + "epoch": 0.6229574475253529, + "grad_norm": 4.983399406107984, + "learning_rate": 3.740017410979239e-07, + "loss": 1.6523, + "step": 9245 + }, + { + "epoch": 0.6232943633974597, + "grad_norm": 4.70323233287576, + "learning_rate": 3.734327568629569e-07, + "loss": 1.6911, + "step": 9250 + }, + { + "epoch": 0.6236312792695664, + "grad_norm": 4.170760438250213, + "learning_rate": 3.728639476965678e-07, + "loss": 1.6334, + "step": 9255 + }, + { + "epoch": 0.6239681951416731, + "grad_norm": 5.020288150663386, + "learning_rate": 3.7229531438553664e-07, + "loss": 1.6802, + "step": 9260 + }, + { + "epoch": 0.6243051110137798, + "grad_norm": 4.723953618912066, + "learning_rate": 3.7172685771640076e-07, + "loss": 1.7039, + "step": 9265 + }, + { + "epoch": 0.6246420268858865, + "grad_norm": 4.824681211683872, + "learning_rate": 3.7115857847545264e-07, + "loss": 1.6453, + "step": 9270 + }, + { + "epoch": 0.6249789427579934, + "grad_norm": 4.396461053658457, + "learning_rate": 3.7059047744873955e-07, + "loss": 1.6911, + "step": 9275 + }, + { + "epoch": 0.6253158586301001, + "grad_norm": 4.131448856801017, + "learning_rate": 3.700225554220626e-07, + "loss": 1.6903, + "step": 9280 + }, + { + "epoch": 0.6256527745022068, + "grad_norm": 4.375253047858914, + "learning_rate": 3.694548131809747e-07, + "loss": 1.6685, + "step": 9285 + }, + { + "epoch": 0.6259896903743135, + "grad_norm": 4.418083233974953, + "learning_rate": 3.6888725151078024e-07, + "loss": 1.6877, + "step": 9290 + }, + { + "epoch": 0.6263266062464202, + "grad_norm": 4.3329316853506565, + "learning_rate": 3.683198711965345e-07, + "loss": 1.6335, + "step": 9295 + }, + { + "epoch": 0.626663522118527, + "grad_norm": 4.504174343408066, + "learning_rate": 3.677526730230408e-07, + "loss": 1.6475, + "step": 9300 + }, + { + "epoch": 0.6270004379906338, + "grad_norm": 5.38608186396199, + "learning_rate": 3.671856577748512e-07, + "loss": 1.6343, + "step": 9305 + }, + { + "epoch": 0.6273373538627405, + "grad_norm": 5.39440651463409, + "learning_rate": 3.666188262362648e-07, + "loss": 1.6907, + "step": 9310 + }, + { + "epoch": 0.6276742697348472, + "grad_norm": 4.676904103854555, + "learning_rate": 3.660521791913265e-07, + "loss": 1.6992, + "step": 9315 + }, + { + "epoch": 0.6280111856069539, + "grad_norm": 4.559801782962305, + "learning_rate": 3.654857174238256e-07, + "loss": 1.6791, + "step": 9320 + }, + { + "epoch": 0.6283481014790607, + "grad_norm": 4.136109394326187, + "learning_rate": 3.649194417172957e-07, + "loss": 1.7617, + "step": 9325 + }, + { + "epoch": 0.6286850173511674, + "grad_norm": 4.590955654084948, + "learning_rate": 3.6435335285501283e-07, + "loss": 1.5974, + "step": 9330 + }, + { + "epoch": 0.6290219332232742, + "grad_norm": 4.253775067725555, + "learning_rate": 3.6378745161999426e-07, + "loss": 1.6966, + "step": 9335 + }, + { + "epoch": 0.6293588490953809, + "grad_norm": 4.671874023298497, + "learning_rate": 3.632217387949983e-07, + "loss": 1.6913, + "step": 9340 + }, + { + "epoch": 0.6296957649674876, + "grad_norm": 4.672636100483996, + "learning_rate": 3.626562151625223e-07, + "loss": 1.6015, + "step": 9345 + }, + { + "epoch": 0.6300326808395944, + "grad_norm": 4.572731068631272, + "learning_rate": 3.6209088150480173e-07, + "loss": 1.626, + "step": 9350 + }, + { + "epoch": 0.6303695967117011, + "grad_norm": 5.048291419272467, + "learning_rate": 3.6152573860380964e-07, + "loss": 1.6399, + "step": 9355 + }, + { + "epoch": 0.6307065125838078, + "grad_norm": 4.66081177092855, + "learning_rate": 3.6096078724125544e-07, + "loss": 1.7308, + "step": 9360 + }, + { + "epoch": 0.6310434284559145, + "grad_norm": 4.917944557473536, + "learning_rate": 3.603960281985828e-07, + "loss": 1.7118, + "step": 9365 + }, + { + "epoch": 0.6313803443280213, + "grad_norm": 4.639561844571507, + "learning_rate": 3.5983146225697007e-07, + "loss": 1.6598, + "step": 9370 + }, + { + "epoch": 0.6317172602001281, + "grad_norm": 4.809402432690832, + "learning_rate": 3.5926709019732855e-07, + "loss": 1.7416, + "step": 9375 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 4.481316598432365, + "learning_rate": 3.587029128003006e-07, + "loss": 1.6618, + "step": 9380 + }, + { + "epoch": 0.6323910919443415, + "grad_norm": 4.878186037519498, + "learning_rate": 3.581389308462601e-07, + "loss": 1.6539, + "step": 9385 + }, + { + "epoch": 0.6327280078164482, + "grad_norm": 4.691055116278031, + "learning_rate": 3.5757514511531016e-07, + "loss": 1.7012, + "step": 9390 + }, + { + "epoch": 0.6330649236885549, + "grad_norm": 4.596865110399032, + "learning_rate": 3.5701155638728297e-07, + "loss": 1.6794, + "step": 9395 + }, + { + "epoch": 0.6334018395606617, + "grad_norm": 4.358380598019165, + "learning_rate": 3.564481654417374e-07, + "loss": 1.661, + "step": 9400 + }, + { + "epoch": 0.6337387554327685, + "grad_norm": 4.412280666871581, + "learning_rate": 3.558849730579594e-07, + "loss": 1.6904, + "step": 9405 + }, + { + "epoch": 0.6340756713048752, + "grad_norm": 4.595888399957703, + "learning_rate": 3.553219800149603e-07, + "loss": 1.6599, + "step": 9410 + }, + { + "epoch": 0.6344125871769819, + "grad_norm": 4.504509215702589, + "learning_rate": 3.547591870914752e-07, + "loss": 1.6241, + "step": 9415 + }, + { + "epoch": 0.6347495030490886, + "grad_norm": 4.5321538447474286, + "learning_rate": 3.5419659506596287e-07, + "loss": 1.6667, + "step": 9420 + }, + { + "epoch": 0.6350864189211953, + "grad_norm": 4.770870718979196, + "learning_rate": 3.536342047166039e-07, + "loss": 1.6589, + "step": 9425 + }, + { + "epoch": 0.6354233347933022, + "grad_norm": 4.599599013949839, + "learning_rate": 3.530720168213001e-07, + "loss": 1.6545, + "step": 9430 + }, + { + "epoch": 0.6357602506654089, + "grad_norm": 4.39991418260652, + "learning_rate": 3.5251003215767305e-07, + "loss": 1.6523, + "step": 9435 + }, + { + "epoch": 0.6360971665375156, + "grad_norm": 4.722938780137947, + "learning_rate": 3.519482515030636e-07, + "loss": 1.6727, + "step": 9440 + }, + { + "epoch": 0.6364340824096223, + "grad_norm": 4.298009738528461, + "learning_rate": 3.5138667563452983e-07, + "loss": 1.7054, + "step": 9445 + }, + { + "epoch": 0.636770998281729, + "grad_norm": 5.373001197803553, + "learning_rate": 3.5082530532884703e-07, + "loss": 1.6143, + "step": 9450 + }, + { + "epoch": 0.6371079141538358, + "grad_norm": 4.362506764538609, + "learning_rate": 3.5026414136250607e-07, + "loss": 1.6585, + "step": 9455 + }, + { + "epoch": 0.6374448300259425, + "grad_norm": 3.956197636663764, + "learning_rate": 3.497031845117124e-07, + "loss": 1.7065, + "step": 9460 + }, + { + "epoch": 0.6377817458980493, + "grad_norm": 4.968824829024163, + "learning_rate": 3.4914243555238476e-07, + "loss": 1.7093, + "step": 9465 + }, + { + "epoch": 0.638118661770156, + "grad_norm": 4.654729665807823, + "learning_rate": 3.4858189526015453e-07, + "loss": 1.6439, + "step": 9470 + }, + { + "epoch": 0.6384555776422627, + "grad_norm": 4.7396445196150605, + "learning_rate": 3.4802156441036467e-07, + "loss": 1.6802, + "step": 9475 + }, + { + "epoch": 0.6387924935143695, + "grad_norm": 4.324066565036445, + "learning_rate": 3.4746144377806785e-07, + "loss": 1.6634, + "step": 9480 + }, + { + "epoch": 0.6391294093864762, + "grad_norm": 4.746429983809976, + "learning_rate": 3.4690153413802653e-07, + "loss": 1.7065, + "step": 9485 + }, + { + "epoch": 0.6394663252585829, + "grad_norm": 4.714466290466101, + "learning_rate": 3.4634183626471125e-07, + "loss": 1.7503, + "step": 9490 + }, + { + "epoch": 0.6398032411306896, + "grad_norm": 4.702082264829062, + "learning_rate": 3.457823509322992e-07, + "loss": 1.744, + "step": 9495 + }, + { + "epoch": 0.6401401570027964, + "grad_norm": 4.871189714313392, + "learning_rate": 3.452230789146741e-07, + "loss": 1.7177, + "step": 9500 + }, + { + "epoch": 0.6404770728749032, + "grad_norm": 4.972530681728673, + "learning_rate": 3.4466402098542435e-07, + "loss": 1.6523, + "step": 9505 + }, + { + "epoch": 0.6408139887470099, + "grad_norm": 4.753078175132621, + "learning_rate": 3.441051779178422e-07, + "loss": 1.6754, + "step": 9510 + }, + { + "epoch": 0.6411509046191166, + "grad_norm": 4.491634183568621, + "learning_rate": 3.4354655048492277e-07, + "loss": 1.6013, + "step": 9515 + }, + { + "epoch": 0.6414878204912233, + "grad_norm": 4.225886018919661, + "learning_rate": 3.429881394593629e-07, + "loss": 1.6376, + "step": 9520 + }, + { + "epoch": 0.64182473636333, + "grad_norm": 4.679624121585962, + "learning_rate": 3.4242994561356043e-07, + "loss": 1.6383, + "step": 9525 + }, + { + "epoch": 0.6421616522354369, + "grad_norm": 4.679224868456952, + "learning_rate": 3.4187196971961185e-07, + "loss": 1.7326, + "step": 9530 + }, + { + "epoch": 0.6424985681075436, + "grad_norm": 4.809163806682364, + "learning_rate": 3.4131421254931326e-07, + "loss": 1.704, + "step": 9535 + }, + { + "epoch": 0.6428354839796503, + "grad_norm": 4.6325336560221615, + "learning_rate": 3.4075667487415785e-07, + "loss": 1.7466, + "step": 9540 + }, + { + "epoch": 0.643172399851757, + "grad_norm": 4.436009749711743, + "learning_rate": 3.4019935746533474e-07, + "loss": 1.6869, + "step": 9545 + }, + { + "epoch": 0.6435093157238637, + "grad_norm": 4.864438953089616, + "learning_rate": 3.3964226109372884e-07, + "loss": 1.77, + "step": 9550 + }, + { + "epoch": 0.6438462315959704, + "grad_norm": 4.522632462926502, + "learning_rate": 3.390853865299195e-07, + "loss": 1.6717, + "step": 9555 + }, + { + "epoch": 0.6441831474680773, + "grad_norm": 4.744488425863059, + "learning_rate": 3.385287345441786e-07, + "loss": 1.7493, + "step": 9560 + }, + { + "epoch": 0.644520063340184, + "grad_norm": 4.833057581829567, + "learning_rate": 3.3797230590647073e-07, + "loss": 1.6675, + "step": 9565 + }, + { + "epoch": 0.6448569792122907, + "grad_norm": 4.492473060812444, + "learning_rate": 3.374161013864515e-07, + "loss": 1.6674, + "step": 9570 + }, + { + "epoch": 0.6451938950843974, + "grad_norm": 4.5854657066579785, + "learning_rate": 3.368601217534661e-07, + "loss": 1.7302, + "step": 9575 + }, + { + "epoch": 0.6455308109565041, + "grad_norm": 4.794003300008669, + "learning_rate": 3.3630436777654903e-07, + "loss": 1.7257, + "step": 9580 + }, + { + "epoch": 0.6458677268286109, + "grad_norm": 4.582700632423808, + "learning_rate": 3.357488402244227e-07, + "loss": 1.7046, + "step": 9585 + }, + { + "epoch": 0.6462046427007176, + "grad_norm": 5.099103738821147, + "learning_rate": 3.3519353986549604e-07, + "loss": 1.5777, + "step": 9590 + }, + { + "epoch": 0.6465415585728244, + "grad_norm": 4.861722997661979, + "learning_rate": 3.346384674678639e-07, + "loss": 1.7682, + "step": 9595 + }, + { + "epoch": 0.6468784744449311, + "grad_norm": 4.8580449100967, + "learning_rate": 3.3408362379930576e-07, + "loss": 1.6197, + "step": 9600 + }, + { + "epoch": 0.6472153903170378, + "grad_norm": 4.314387420963666, + "learning_rate": 3.335290096272849e-07, + "loss": 1.6476, + "step": 9605 + }, + { + "epoch": 0.6475523061891446, + "grad_norm": 4.8963693523258485, + "learning_rate": 3.3297462571894673e-07, + "loss": 1.6491, + "step": 9610 + }, + { + "epoch": 0.6478892220612513, + "grad_norm": 4.406712709507465, + "learning_rate": 3.3242047284111857e-07, + "loss": 1.6501, + "step": 9615 + }, + { + "epoch": 0.648226137933358, + "grad_norm": 4.610325002229822, + "learning_rate": 3.3186655176030826e-07, + "loss": 1.733, + "step": 9620 + }, + { + "epoch": 0.6485630538054648, + "grad_norm": 4.313952984941844, + "learning_rate": 3.3131286324270234e-07, + "loss": 1.5754, + "step": 9625 + }, + { + "epoch": 0.6488999696775715, + "grad_norm": 4.2904274217804845, + "learning_rate": 3.3075940805416654e-07, + "loss": 1.6616, + "step": 9630 + }, + { + "epoch": 0.6492368855496783, + "grad_norm": 4.33447125046999, + "learning_rate": 3.3020618696024316e-07, + "loss": 1.707, + "step": 9635 + }, + { + "epoch": 0.649573801421785, + "grad_norm": 4.51287714996691, + "learning_rate": 3.2965320072615113e-07, + "loss": 1.722, + "step": 9640 + }, + { + "epoch": 0.6499107172938917, + "grad_norm": 4.615478632755968, + "learning_rate": 3.2910045011678424e-07, + "loss": 1.6432, + "step": 9645 + }, + { + "epoch": 0.6502476331659984, + "grad_norm": 4.413389662630831, + "learning_rate": 3.2854793589671046e-07, + "loss": 1.739, + "step": 9650 + }, + { + "epoch": 0.6505845490381051, + "grad_norm": 4.421718857382131, + "learning_rate": 3.279956588301712e-07, + "loss": 1.665, + "step": 9655 + }, + { + "epoch": 0.650921464910212, + "grad_norm": 4.566051335669866, + "learning_rate": 3.274436196810789e-07, + "loss": 1.6526, + "step": 9660 + }, + { + "epoch": 0.6512583807823187, + "grad_norm": 4.772530503841975, + "learning_rate": 3.268918192130178e-07, + "loss": 1.6811, + "step": 9665 + }, + { + "epoch": 0.6515952966544254, + "grad_norm": 4.8099020061260624, + "learning_rate": 3.263402581892415e-07, + "loss": 1.628, + "step": 9670 + }, + { + "epoch": 0.6519322125265321, + "grad_norm": 4.617221914497451, + "learning_rate": 3.257889373726726e-07, + "loss": 1.7204, + "step": 9675 + }, + { + "epoch": 0.6522691283986388, + "grad_norm": 4.488349346785449, + "learning_rate": 3.252378575259013e-07, + "loss": 1.6725, + "step": 9680 + }, + { + "epoch": 0.6526060442707456, + "grad_norm": 4.567181671797062, + "learning_rate": 3.246870194111849e-07, + "loss": 1.68, + "step": 9685 + }, + { + "epoch": 0.6529429601428524, + "grad_norm": 4.893953362104648, + "learning_rate": 3.2413642379044557e-07, + "loss": 1.656, + "step": 9690 + }, + { + "epoch": 0.6532798760149591, + "grad_norm": 4.642415265648212, + "learning_rate": 3.235860714252708e-07, + "loss": 1.707, + "step": 9695 + }, + { + "epoch": 0.6536167918870658, + "grad_norm": 4.619809492061879, + "learning_rate": 3.2303596307691137e-07, + "loss": 1.6804, + "step": 9700 + }, + { + "epoch": 0.6539537077591725, + "grad_norm": 4.518938554948811, + "learning_rate": 3.2248609950628023e-07, + "loss": 1.7232, + "step": 9705 + }, + { + "epoch": 0.6542906236312793, + "grad_norm": 5.0305921574382175, + "learning_rate": 3.219364814739522e-07, + "loss": 1.6637, + "step": 9710 + }, + { + "epoch": 0.654627539503386, + "grad_norm": 4.958018906140687, + "learning_rate": 3.2138710974016226e-07, + "loss": 1.6516, + "step": 9715 + }, + { + "epoch": 0.6549644553754927, + "grad_norm": 4.411380116115451, + "learning_rate": 3.208379850648046e-07, + "loss": 1.7346, + "step": 9720 + }, + { + "epoch": 0.6553013712475995, + "grad_norm": 4.41765731158713, + "learning_rate": 3.202891082074318e-07, + "loss": 1.6891, + "step": 9725 + }, + { + "epoch": 0.6556382871197062, + "grad_norm": 4.41496004100469, + "learning_rate": 3.197404799272537e-07, + "loss": 1.69, + "step": 9730 + }, + { + "epoch": 0.6559752029918129, + "grad_norm": 5.0894961933664495, + "learning_rate": 3.191921009831365e-07, + "loss": 1.6848, + "step": 9735 + }, + { + "epoch": 0.6563121188639197, + "grad_norm": 4.773661036376162, + "learning_rate": 3.1864397213360093e-07, + "loss": 1.6932, + "step": 9740 + }, + { + "epoch": 0.6566490347360264, + "grad_norm": 4.564536291163815, + "learning_rate": 3.180960941368223e-07, + "loss": 1.6904, + "step": 9745 + }, + { + "epoch": 0.6569859506081331, + "grad_norm": 4.480751994541764, + "learning_rate": 3.175484677506288e-07, + "loss": 1.6445, + "step": 9750 + }, + { + "epoch": 0.6573228664802399, + "grad_norm": 4.814133616042183, + "learning_rate": 3.1700109373250063e-07, + "loss": 1.6457, + "step": 9755 + }, + { + "epoch": 0.6576597823523466, + "grad_norm": 4.939708091385734, + "learning_rate": 3.1645397283956843e-07, + "loss": 1.6636, + "step": 9760 + }, + { + "epoch": 0.6579966982244534, + "grad_norm": 4.437001055650553, + "learning_rate": 3.159071058286138e-07, + "loss": 1.6724, + "step": 9765 + }, + { + "epoch": 0.6583336140965601, + "grad_norm": 4.375092204411314, + "learning_rate": 3.1536049345606586e-07, + "loss": 1.7786, + "step": 9770 + }, + { + "epoch": 0.6586705299686668, + "grad_norm": 4.495874315937704, + "learning_rate": 3.1481413647800247e-07, + "loss": 1.5484, + "step": 9775 + }, + { + "epoch": 0.6590074458407735, + "grad_norm": 4.545858285177803, + "learning_rate": 3.14268035650148e-07, + "loss": 1.7075, + "step": 9780 + }, + { + "epoch": 0.6593443617128802, + "grad_norm": 4.455537213297519, + "learning_rate": 3.137221917278723e-07, + "loss": 1.7664, + "step": 9785 + }, + { + "epoch": 0.6596812775849871, + "grad_norm": 4.599843359938901, + "learning_rate": 3.1317660546618986e-07, + "loss": 1.5995, + "step": 9790 + }, + { + "epoch": 0.6600181934570938, + "grad_norm": 4.651667753009877, + "learning_rate": 3.1263127761975917e-07, + "loss": 1.6857, + "step": 9795 + }, + { + "epoch": 0.6603551093292005, + "grad_norm": 4.935378017068128, + "learning_rate": 3.12086208942881e-07, + "loss": 1.722, + "step": 9800 + }, + { + "epoch": 0.6606920252013072, + "grad_norm": 4.434594377115127, + "learning_rate": 3.1154140018949736e-07, + "loss": 1.6791, + "step": 9805 + }, + { + "epoch": 0.6610289410734139, + "grad_norm": 4.898749340528809, + "learning_rate": 3.1099685211319116e-07, + "loss": 1.6559, + "step": 9810 + }, + { + "epoch": 0.6613658569455207, + "grad_norm": 4.364968079409384, + "learning_rate": 3.104525654671849e-07, + "loss": 1.656, + "step": 9815 + }, + { + "epoch": 0.6617027728176275, + "grad_norm": 4.756136283646096, + "learning_rate": 3.099085410043386e-07, + "loss": 1.6067, + "step": 9820 + }, + { + "epoch": 0.6620396886897342, + "grad_norm": 4.232110056496074, + "learning_rate": 3.0936477947715064e-07, + "loss": 1.6345, + "step": 9825 + }, + { + "epoch": 0.6623766045618409, + "grad_norm": 4.5660844839178525, + "learning_rate": 3.088212816377552e-07, + "loss": 1.6549, + "step": 9830 + }, + { + "epoch": 0.6627135204339476, + "grad_norm": 4.580686875748608, + "learning_rate": 3.0827804823792157e-07, + "loss": 1.6828, + "step": 9835 + }, + { + "epoch": 0.6630504363060544, + "grad_norm": 4.541892340707929, + "learning_rate": 3.077350800290537e-07, + "loss": 1.6968, + "step": 9840 + }, + { + "epoch": 0.6633873521781611, + "grad_norm": 4.383080748763885, + "learning_rate": 3.071923777621885e-07, + "loss": 1.6321, + "step": 9845 + }, + { + "epoch": 0.6637242680502679, + "grad_norm": 4.6197233210843045, + "learning_rate": 3.066499421879948e-07, + "loss": 1.7752, + "step": 9850 + }, + { + "epoch": 0.6640611839223746, + "grad_norm": 4.571457917714042, + "learning_rate": 3.0610777405677286e-07, + "loss": 1.683, + "step": 9855 + }, + { + "epoch": 0.6643980997944813, + "grad_norm": 4.209426918741808, + "learning_rate": 3.05565874118453e-07, + "loss": 1.6486, + "step": 9860 + }, + { + "epoch": 0.6647350156665881, + "grad_norm": 4.81882087804017, + "learning_rate": 3.050242431225948e-07, + "loss": 1.6834, + "step": 9865 + }, + { + "epoch": 0.6650719315386948, + "grad_norm": 4.7481008632017065, + "learning_rate": 3.0448288181838487e-07, + "loss": 1.6682, + "step": 9870 + }, + { + "epoch": 0.6654088474108015, + "grad_norm": 4.476031844564653, + "learning_rate": 3.0394179095463804e-07, + "loss": 1.7116, + "step": 9875 + }, + { + "epoch": 0.6657457632829082, + "grad_norm": 4.791042749509363, + "learning_rate": 3.0340097127979426e-07, + "loss": 1.7137, + "step": 9880 + }, + { + "epoch": 0.666082679155015, + "grad_norm": 4.725157551146566, + "learning_rate": 3.0286042354191844e-07, + "loss": 1.6872, + "step": 9885 + }, + { + "epoch": 0.6664195950271218, + "grad_norm": 4.493288785483739, + "learning_rate": 3.0232014848869955e-07, + "loss": 1.6942, + "step": 9890 + }, + { + "epoch": 0.6667565108992285, + "grad_norm": 4.855990739567336, + "learning_rate": 3.0178014686744966e-07, + "loss": 1.6451, + "step": 9895 + }, + { + "epoch": 0.6670934267713352, + "grad_norm": 4.034583816812956, + "learning_rate": 3.0124041942510175e-07, + "loss": 1.689, + "step": 9900 + }, + { + "epoch": 0.6674303426434419, + "grad_norm": 5.4785906083317455, + "learning_rate": 3.007009669082103e-07, + "loss": 1.6689, + "step": 9905 + }, + { + "epoch": 0.6677672585155486, + "grad_norm": 4.546350851107137, + "learning_rate": 3.001617900629496e-07, + "loss": 1.7015, + "step": 9910 + }, + { + "epoch": 0.6681041743876553, + "grad_norm": 4.615618260513504, + "learning_rate": 2.996228896351119e-07, + "loss": 1.6844, + "step": 9915 + }, + { + "epoch": 0.6684410902597622, + "grad_norm": 4.372487354691214, + "learning_rate": 2.9908426637010773e-07, + "loss": 1.7324, + "step": 9920 + }, + { + "epoch": 0.6687780061318689, + "grad_norm": 4.693262050453, + "learning_rate": 2.98545921012964e-07, + "loss": 1.6777, + "step": 9925 + }, + { + "epoch": 0.6691149220039756, + "grad_norm": 4.827171097205222, + "learning_rate": 2.9800785430832354e-07, + "loss": 1.6361, + "step": 9930 + }, + { + "epoch": 0.6694518378760823, + "grad_norm": 4.4256333493258255, + "learning_rate": 2.9747006700044295e-07, + "loss": 1.6837, + "step": 9935 + }, + { + "epoch": 0.669788753748189, + "grad_norm": 4.330977095836329, + "learning_rate": 2.969325598331932e-07, + "loss": 1.6356, + "step": 9940 + }, + { + "epoch": 0.6701256696202958, + "grad_norm": 4.530930969038274, + "learning_rate": 2.9639533355005773e-07, + "loss": 1.666, + "step": 9945 + }, + { + "epoch": 0.6704625854924026, + "grad_norm": 4.304911554637418, + "learning_rate": 2.958583888941306e-07, + "loss": 1.6104, + "step": 9950 + }, + { + "epoch": 0.6707995013645093, + "grad_norm": 4.810772596336117, + "learning_rate": 2.9532172660811745e-07, + "loss": 1.6974, + "step": 9955 + }, + { + "epoch": 0.671136417236616, + "grad_norm": 4.560408079183627, + "learning_rate": 2.9478534743433247e-07, + "loss": 1.6618, + "step": 9960 + }, + { + "epoch": 0.6714733331087227, + "grad_norm": 3.9849964262716044, + "learning_rate": 2.9424925211469876e-07, + "loss": 1.658, + "step": 9965 + }, + { + "epoch": 0.6718102489808295, + "grad_norm": 4.9512408667329355, + "learning_rate": 2.9371344139074645e-07, + "loss": 1.6443, + "step": 9970 + }, + { + "epoch": 0.6721471648529362, + "grad_norm": 4.808192781712395, + "learning_rate": 2.9317791600361243e-07, + "loss": 1.5463, + "step": 9975 + }, + { + "epoch": 0.672484080725043, + "grad_norm": 4.265183883192022, + "learning_rate": 2.9264267669403833e-07, + "loss": 1.6875, + "step": 9980 + }, + { + "epoch": 0.6728209965971497, + "grad_norm": 4.724613844375058, + "learning_rate": 2.921077242023706e-07, + "loss": 1.6789, + "step": 9985 + }, + { + "epoch": 0.6731579124692564, + "grad_norm": 4.932628162130658, + "learning_rate": 2.9157305926855893e-07, + "loss": 1.6627, + "step": 9990 + }, + { + "epoch": 0.6734948283413632, + "grad_norm": 4.5177985804674545, + "learning_rate": 2.910386826321549e-07, + "loss": 1.6965, + "step": 9995 + }, + { + "epoch": 0.6738317442134699, + "grad_norm": 4.774452767288286, + "learning_rate": 2.905045950323114e-07, + "loss": 1.6736, + "step": 10000 + }, + { + "epoch": 0.6741686600855766, + "grad_norm": 4.831788795618858, + "learning_rate": 2.899707972077817e-07, + "loss": 1.6555, + "step": 10005 + }, + { + "epoch": 0.6745055759576833, + "grad_norm": 5.235512873128096, + "learning_rate": 2.8943728989691857e-07, + "loss": 1.7077, + "step": 10010 + }, + { + "epoch": 0.6748424918297901, + "grad_norm": 4.913331508484025, + "learning_rate": 2.88904073837672e-07, + "loss": 1.6784, + "step": 10015 + }, + { + "epoch": 0.6751794077018969, + "grad_norm": 4.516721095762328, + "learning_rate": 2.883711497675899e-07, + "loss": 1.6134, + "step": 10020 + }, + { + "epoch": 0.6755163235740036, + "grad_norm": 4.337341745892736, + "learning_rate": 2.878385184238163e-07, + "loss": 1.6352, + "step": 10025 + }, + { + "epoch": 0.6758532394461103, + "grad_norm": 4.56965178511602, + "learning_rate": 2.8730618054308964e-07, + "loss": 1.608, + "step": 10030 + }, + { + "epoch": 0.676190155318217, + "grad_norm": 4.322189613617754, + "learning_rate": 2.8677413686174325e-07, + "loss": 1.7271, + "step": 10035 + }, + { + "epoch": 0.6765270711903237, + "grad_norm": 5.232513125407505, + "learning_rate": 2.8624238811570325e-07, + "loss": 1.6729, + "step": 10040 + }, + { + "epoch": 0.6768639870624306, + "grad_norm": 4.4509227801927205, + "learning_rate": 2.8571093504048737e-07, + "loss": 1.6776, + "step": 10045 + }, + { + "epoch": 0.6772009029345373, + "grad_norm": 4.394021508706804, + "learning_rate": 2.851797783712049e-07, + "loss": 1.5934, + "step": 10050 + }, + { + "epoch": 0.677537818806644, + "grad_norm": 4.60912000955251, + "learning_rate": 2.8464891884255515e-07, + "loss": 1.7002, + "step": 10055 + }, + { + "epoch": 0.6778747346787507, + "grad_norm": 4.764954353498366, + "learning_rate": 2.8411835718882593e-07, + "loss": 1.6836, + "step": 10060 + }, + { + "epoch": 0.6782116505508574, + "grad_norm": 4.414848267445215, + "learning_rate": 2.835880941438936e-07, + "loss": 1.6317, + "step": 10065 + }, + { + "epoch": 0.6785485664229642, + "grad_norm": 4.601263605609601, + "learning_rate": 2.8305813044122093e-07, + "loss": 1.6786, + "step": 10070 + }, + { + "epoch": 0.678885482295071, + "grad_norm": 5.073682142084875, + "learning_rate": 2.8252846681385734e-07, + "loss": 1.755, + "step": 10075 + }, + { + "epoch": 0.6792223981671777, + "grad_norm": 4.657590732042178, + "learning_rate": 2.8199910399443625e-07, + "loss": 1.706, + "step": 10080 + }, + { + "epoch": 0.6795593140392844, + "grad_norm": 4.806472860391699, + "learning_rate": 2.8147004271517584e-07, + "loss": 1.6204, + "step": 10085 + }, + { + "epoch": 0.6798962299113911, + "grad_norm": 4.6660240490616705, + "learning_rate": 2.8094128370787694e-07, + "loss": 1.7356, + "step": 10090 + }, + { + "epoch": 0.6802331457834978, + "grad_norm": 5.228044220619245, + "learning_rate": 2.8041282770392196e-07, + "loss": 1.6482, + "step": 10095 + }, + { + "epoch": 0.6805700616556046, + "grad_norm": 4.918634141984381, + "learning_rate": 2.7988467543427454e-07, + "loss": 1.6087, + "step": 10100 + }, + { + "epoch": 0.6809069775277113, + "grad_norm": 5.041088919289125, + "learning_rate": 2.7935682762947837e-07, + "loss": 1.7085, + "step": 10105 + }, + { + "epoch": 0.6812438933998181, + "grad_norm": 4.557815685478516, + "learning_rate": 2.788292850196553e-07, + "loss": 1.6837, + "step": 10110 + }, + { + "epoch": 0.6815808092719248, + "grad_norm": 4.49296018717802, + "learning_rate": 2.783020483345057e-07, + "loss": 1.6294, + "step": 10115 + }, + { + "epoch": 0.6819177251440315, + "grad_norm": 4.344023180239478, + "learning_rate": 2.777751183033067e-07, + "loss": 1.7425, + "step": 10120 + }, + { + "epoch": 0.6822546410161383, + "grad_norm": 4.352887650061733, + "learning_rate": 2.772484956549107e-07, + "loss": 1.6764, + "step": 10125 + }, + { + "epoch": 0.682591556888245, + "grad_norm": 4.701595075307773, + "learning_rate": 2.7672218111774566e-07, + "loss": 1.7387, + "step": 10130 + }, + { + "epoch": 0.6829284727603517, + "grad_norm": 4.220584191142634, + "learning_rate": 2.7619617541981287e-07, + "loss": 1.6475, + "step": 10135 + }, + { + "epoch": 0.6832653886324584, + "grad_norm": 4.72955412832327, + "learning_rate": 2.756704792886869e-07, + "loss": 1.6484, + "step": 10140 + }, + { + "epoch": 0.6836023045045652, + "grad_norm": 4.6550694748926515, + "learning_rate": 2.7514509345151347e-07, + "loss": 1.6406, + "step": 10145 + }, + { + "epoch": 0.683939220376672, + "grad_norm": 4.782860902895655, + "learning_rate": 2.746200186350097e-07, + "loss": 1.7218, + "step": 10150 + }, + { + "epoch": 0.6842761362487787, + "grad_norm": 4.376748815020733, + "learning_rate": 2.740952555654622e-07, + "loss": 1.6845, + "step": 10155 + }, + { + "epoch": 0.6846130521208854, + "grad_norm": 4.783809039044787, + "learning_rate": 2.735708049687262e-07, + "loss": 1.6067, + "step": 10160 + }, + { + "epoch": 0.6849499679929921, + "grad_norm": 4.511631761949158, + "learning_rate": 2.730466675702251e-07, + "loss": 1.7025, + "step": 10165 + }, + { + "epoch": 0.6852868838650988, + "grad_norm": 5.087600336235812, + "learning_rate": 2.7252284409494906e-07, + "loss": 1.6463, + "step": 10170 + }, + { + "epoch": 0.6856237997372057, + "grad_norm": 4.2007227960927835, + "learning_rate": 2.7199933526745364e-07, + "loss": 1.7036, + "step": 10175 + }, + { + "epoch": 0.6859607156093124, + "grad_norm": 5.117839466222247, + "learning_rate": 2.714761418118596e-07, + "loss": 1.6092, + "step": 10180 + }, + { + "epoch": 0.6862976314814191, + "grad_norm": 5.356330190705277, + "learning_rate": 2.7095326445185143e-07, + "loss": 1.7043, + "step": 10185 + }, + { + "epoch": 0.6866345473535258, + "grad_norm": 4.564267086174341, + "learning_rate": 2.704307039106759e-07, + "loss": 1.6383, + "step": 10190 + }, + { + "epoch": 0.6869714632256325, + "grad_norm": 4.164845663181458, + "learning_rate": 2.6990846091114205e-07, + "loss": 1.7422, + "step": 10195 + }, + { + "epoch": 0.6873083790977393, + "grad_norm": 4.131763760270544, + "learning_rate": 2.6938653617561967e-07, + "loss": 1.7038, + "step": 10200 + }, + { + "epoch": 0.687645294969846, + "grad_norm": 4.312564061995232, + "learning_rate": 2.688649304260383e-07, + "loss": 1.6819, + "step": 10205 + }, + { + "epoch": 0.6879822108419528, + "grad_norm": 4.61153702729294, + "learning_rate": 2.683436443838859e-07, + "loss": 1.6896, + "step": 10210 + }, + { + "epoch": 0.6883191267140595, + "grad_norm": 4.703111722307184, + "learning_rate": 2.678226787702086e-07, + "loss": 1.7162, + "step": 10215 + }, + { + "epoch": 0.6886560425861662, + "grad_norm": 4.635701429196508, + "learning_rate": 2.673020343056094e-07, + "loss": 1.7633, + "step": 10220 + }, + { + "epoch": 0.688992958458273, + "grad_norm": 4.7344644151009465, + "learning_rate": 2.6678171171024657e-07, + "loss": 1.7299, + "step": 10225 + }, + { + "epoch": 0.6893298743303797, + "grad_norm": 4.486499868121771, + "learning_rate": 2.6626171170383373e-07, + "loss": 1.6064, + "step": 10230 + }, + { + "epoch": 0.6896667902024864, + "grad_norm": 4.534823258749324, + "learning_rate": 2.6574203500563776e-07, + "loss": 1.6902, + "step": 10235 + }, + { + "epoch": 0.6900037060745932, + "grad_norm": 4.439413468251779, + "learning_rate": 2.6522268233447894e-07, + "loss": 1.7037, + "step": 10240 + }, + { + "epoch": 0.6903406219466999, + "grad_norm": 4.552475395505495, + "learning_rate": 2.6470365440872866e-07, + "loss": 1.6477, + "step": 10245 + }, + { + "epoch": 0.6906775378188067, + "grad_norm": 5.036103615948694, + "learning_rate": 2.641849519463099e-07, + "loss": 1.7112, + "step": 10250 + }, + { + "epoch": 0.6910144536909134, + "grad_norm": 4.445684686793016, + "learning_rate": 2.6366657566469465e-07, + "loss": 1.6505, + "step": 10255 + }, + { + "epoch": 0.6913513695630201, + "grad_norm": 4.4173485045453, + "learning_rate": 2.631485262809043e-07, + "loss": 1.6391, + "step": 10260 + }, + { + "epoch": 0.6916882854351268, + "grad_norm": 4.317080090850698, + "learning_rate": 2.6263080451150797e-07, + "loss": 1.6771, + "step": 10265 + }, + { + "epoch": 0.6920252013072336, + "grad_norm": 4.64941590077001, + "learning_rate": 2.621134110726217e-07, + "loss": 1.7569, + "step": 10270 + }, + { + "epoch": 0.6923621171793403, + "grad_norm": 4.590236812787074, + "learning_rate": 2.6159634667990683e-07, + "loss": 1.7185, + "step": 10275 + }, + { + "epoch": 0.6926990330514471, + "grad_norm": 4.9812699111640795, + "learning_rate": 2.610796120485701e-07, + "loss": 1.6939, + "step": 10280 + }, + { + "epoch": 0.6930359489235538, + "grad_norm": 4.375038058888252, + "learning_rate": 2.605632078933623e-07, + "loss": 1.6136, + "step": 10285 + }, + { + "epoch": 0.6933728647956605, + "grad_norm": 4.897729962824886, + "learning_rate": 2.600471349285763e-07, + "loss": 1.7073, + "step": 10290 + }, + { + "epoch": 0.6937097806677672, + "grad_norm": 4.565936668798104, + "learning_rate": 2.5953139386804764e-07, + "loss": 1.679, + "step": 10295 + }, + { + "epoch": 0.6940466965398739, + "grad_norm": 5.169593685108845, + "learning_rate": 2.5901598542515256e-07, + "loss": 1.6327, + "step": 10300 + }, + { + "epoch": 0.6943836124119808, + "grad_norm": 4.373417565376957, + "learning_rate": 2.5850091031280684e-07, + "loss": 1.6841, + "step": 10305 + }, + { + "epoch": 0.6947205282840875, + "grad_norm": 4.559109121588478, + "learning_rate": 2.579861692434658e-07, + "loss": 1.6803, + "step": 10310 + }, + { + "epoch": 0.6950574441561942, + "grad_norm": 4.571150904547719, + "learning_rate": 2.574717629291222e-07, + "loss": 1.6552, + "step": 10315 + }, + { + "epoch": 0.6953943600283009, + "grad_norm": 4.4202180568282, + "learning_rate": 2.5695769208130615e-07, + "loss": 1.6534, + "step": 10320 + }, + { + "epoch": 0.6957312759004076, + "grad_norm": 4.343547857486007, + "learning_rate": 2.564439574110833e-07, + "loss": 1.6725, + "step": 10325 + }, + { + "epoch": 0.6960681917725144, + "grad_norm": 4.568973039945001, + "learning_rate": 2.559305596290547e-07, + "loss": 1.7006, + "step": 10330 + }, + { + "epoch": 0.6964051076446212, + "grad_norm": 4.772607107022066, + "learning_rate": 2.554174994453555e-07, + "loss": 1.6212, + "step": 10335 + }, + { + "epoch": 0.6967420235167279, + "grad_norm": 4.558184718510563, + "learning_rate": 2.549047775696532e-07, + "loss": 1.65, + "step": 10340 + }, + { + "epoch": 0.6970789393888346, + "grad_norm": 4.684970264407261, + "learning_rate": 2.543923947111481e-07, + "loss": 1.6709, + "step": 10345 + }, + { + "epoch": 0.6974158552609413, + "grad_norm": 4.531575026071747, + "learning_rate": 2.538803515785714e-07, + "loss": 1.7056, + "step": 10350 + }, + { + "epoch": 0.6977527711330481, + "grad_norm": 4.527121588986311, + "learning_rate": 2.5336864888018393e-07, + "loss": 1.641, + "step": 10355 + }, + { + "epoch": 0.6980896870051548, + "grad_norm": 4.644098459060704, + "learning_rate": 2.528572873237761e-07, + "loss": 1.6763, + "step": 10360 + }, + { + "epoch": 0.6984266028772615, + "grad_norm": 4.5149636521464425, + "learning_rate": 2.5234626761666647e-07, + "loss": 1.7769, + "step": 10365 + }, + { + "epoch": 0.6987635187493683, + "grad_norm": 4.273525129862513, + "learning_rate": 2.5183559046570036e-07, + "loss": 1.7503, + "step": 10370 + }, + { + "epoch": 0.699100434621475, + "grad_norm": 4.438322061742733, + "learning_rate": 2.513252565772496e-07, + "loss": 1.6898, + "step": 10375 + }, + { + "epoch": 0.6994373504935818, + "grad_norm": 4.534576711817759, + "learning_rate": 2.5081526665721133e-07, + "loss": 1.6594, + "step": 10380 + }, + { + "epoch": 0.6997742663656885, + "grad_norm": 4.441597881770876, + "learning_rate": 2.503056214110062e-07, + "loss": 1.6447, + "step": 10385 + }, + { + "epoch": 0.7001111822377952, + "grad_norm": 4.525682977168778, + "learning_rate": 2.497963215435789e-07, + "loss": 1.6098, + "step": 10390 + }, + { + "epoch": 0.7004480981099019, + "grad_norm": 4.983403352698664, + "learning_rate": 2.492873677593964e-07, + "loss": 1.6142, + "step": 10395 + }, + { + "epoch": 0.7007850139820087, + "grad_norm": 4.6617257994767884, + "learning_rate": 2.4877876076244626e-07, + "loss": 1.7664, + "step": 10400 + }, + { + "epoch": 0.7011219298541155, + "grad_norm": 4.9910938176979105, + "learning_rate": 2.482705012562367e-07, + "loss": 1.6845, + "step": 10405 + }, + { + "epoch": 0.7014588457262222, + "grad_norm": 4.5161646653470795, + "learning_rate": 2.4776258994379546e-07, + "loss": 1.6812, + "step": 10410 + }, + { + "epoch": 0.7017957615983289, + "grad_norm": 4.645809092572502, + "learning_rate": 2.4725502752766883e-07, + "loss": 1.6761, + "step": 10415 + }, + { + "epoch": 0.7021326774704356, + "grad_norm": 4.762960103801375, + "learning_rate": 2.4674781470991967e-07, + "loss": 1.5992, + "step": 10420 + }, + { + "epoch": 0.7024695933425423, + "grad_norm": 4.868038772852577, + "learning_rate": 2.462409521921282e-07, + "loss": 1.6409, + "step": 10425 + }, + { + "epoch": 0.7028065092146492, + "grad_norm": 4.489504746004885, + "learning_rate": 2.4573444067538985e-07, + "loss": 1.7111, + "step": 10430 + }, + { + "epoch": 0.7031434250867559, + "grad_norm": 4.853823090419402, + "learning_rate": 2.4522828086031404e-07, + "loss": 1.752, + "step": 10435 + }, + { + "epoch": 0.7034803409588626, + "grad_norm": 4.867281068587823, + "learning_rate": 2.4472247344702424e-07, + "loss": 1.6618, + "step": 10440 + }, + { + "epoch": 0.7038172568309693, + "grad_norm": 4.832483270849909, + "learning_rate": 2.442170191351566e-07, + "loss": 1.7637, + "step": 10445 + }, + { + "epoch": 0.704154172703076, + "grad_norm": 4.850938617706512, + "learning_rate": 2.4371191862385816e-07, + "loss": 1.6092, + "step": 10450 + }, + { + "epoch": 0.7044910885751827, + "grad_norm": 4.641938087798896, + "learning_rate": 2.4320717261178715e-07, + "loss": 1.6935, + "step": 10455 + }, + { + "epoch": 0.7048280044472895, + "grad_norm": 4.740015041009832, + "learning_rate": 2.4270278179711163e-07, + "loss": 1.655, + "step": 10460 + }, + { + "epoch": 0.7051649203193963, + "grad_norm": 4.939551223908603, + "learning_rate": 2.4219874687750754e-07, + "loss": 1.6852, + "step": 10465 + }, + { + "epoch": 0.705501836191503, + "grad_norm": 4.711384090241523, + "learning_rate": 2.4169506855015923e-07, + "loss": 1.741, + "step": 10470 + }, + { + "epoch": 0.7058387520636097, + "grad_norm": 4.5166802497839535, + "learning_rate": 2.4119174751175787e-07, + "loss": 1.6763, + "step": 10475 + }, + { + "epoch": 0.7061756679357164, + "grad_norm": 4.663782473551683, + "learning_rate": 2.406887844584998e-07, + "loss": 1.6727, + "step": 10480 + }, + { + "epoch": 0.7065125838078232, + "grad_norm": 4.6119383856125875, + "learning_rate": 2.401861800860868e-07, + "loss": 1.651, + "step": 10485 + }, + { + "epoch": 0.7068494996799299, + "grad_norm": 4.251793594934676, + "learning_rate": 2.396839350897241e-07, + "loss": 1.7073, + "step": 10490 + }, + { + "epoch": 0.7071864155520367, + "grad_norm": 4.832759698580428, + "learning_rate": 2.391820501641203e-07, + "loss": 1.7365, + "step": 10495 + }, + { + "epoch": 0.7075233314241434, + "grad_norm": 4.917880501129237, + "learning_rate": 2.3868052600348524e-07, + "loss": 1.6904, + "step": 10500 + }, + { + "epoch": 0.7078602472962501, + "grad_norm": 4.330584540056101, + "learning_rate": 2.381793633015305e-07, + "loss": 1.6593, + "step": 10505 + }, + { + "epoch": 0.7081971631683569, + "grad_norm": 5.002212666859194, + "learning_rate": 2.3767856275146748e-07, + "loss": 1.6624, + "step": 10510 + }, + { + "epoch": 0.7085340790404636, + "grad_norm": 4.736950520168997, + "learning_rate": 2.3717812504600616e-07, + "loss": 1.6459, + "step": 10515 + }, + { + "epoch": 0.7088709949125703, + "grad_norm": 4.418489255619871, + "learning_rate": 2.3667805087735516e-07, + "loss": 1.6726, + "step": 10520 + }, + { + "epoch": 0.709207910784677, + "grad_norm": 4.6673695105380135, + "learning_rate": 2.3617834093722033e-07, + "loss": 1.637, + "step": 10525 + }, + { + "epoch": 0.7095448266567838, + "grad_norm": 4.7641420953866005, + "learning_rate": 2.3567899591680317e-07, + "loss": 1.679, + "step": 10530 + }, + { + "epoch": 0.7098817425288906, + "grad_norm": 4.505140740962068, + "learning_rate": 2.351800165068008e-07, + "loss": 1.6526, + "step": 10535 + }, + { + "epoch": 0.7102186584009973, + "grad_norm": 4.365411249482726, + "learning_rate": 2.346814033974047e-07, + "loss": 1.686, + "step": 10540 + }, + { + "epoch": 0.710555574273104, + "grad_norm": 4.693059176177268, + "learning_rate": 2.3418315727829962e-07, + "loss": 1.723, + "step": 10545 + }, + { + "epoch": 0.7108924901452107, + "grad_norm": 4.619080120396149, + "learning_rate": 2.336852788386623e-07, + "loss": 1.7133, + "step": 10550 + }, + { + "epoch": 0.7112294060173174, + "grad_norm": 4.939014950379131, + "learning_rate": 2.331877687671614e-07, + "loss": 1.6821, + "step": 10555 + }, + { + "epoch": 0.7115663218894243, + "grad_norm": 4.685813885840334, + "learning_rate": 2.3269062775195596e-07, + "loss": 1.6593, + "step": 10560 + }, + { + "epoch": 0.711903237761531, + "grad_norm": 4.524468772976415, + "learning_rate": 2.321938564806944e-07, + "loss": 1.7253, + "step": 10565 + }, + { + "epoch": 0.7122401536336377, + "grad_norm": 4.892348621641314, + "learning_rate": 2.3169745564051353e-07, + "loss": 1.6513, + "step": 10570 + }, + { + "epoch": 0.7125770695057444, + "grad_norm": 4.6502041358305, + "learning_rate": 2.3120142591803825e-07, + "loss": 1.7138, + "step": 10575 + }, + { + "epoch": 0.7129139853778511, + "grad_norm": 4.838257165305869, + "learning_rate": 2.307057679993797e-07, + "loss": 1.6281, + "step": 10580 + }, + { + "epoch": 0.7132509012499579, + "grad_norm": 4.64288900947958, + "learning_rate": 2.30210482570135e-07, + "loss": 1.7001, + "step": 10585 + }, + { + "epoch": 0.7135878171220646, + "grad_norm": 4.518161574505775, + "learning_rate": 2.2971557031538607e-07, + "loss": 1.6696, + "step": 10590 + }, + { + "epoch": 0.7139247329941714, + "grad_norm": 4.390328603188015, + "learning_rate": 2.2922103191969828e-07, + "loss": 1.7425, + "step": 10595 + }, + { + "epoch": 0.7142616488662781, + "grad_norm": 4.573838466451132, + "learning_rate": 2.2872686806712032e-07, + "loss": 1.6373, + "step": 10600 + }, + { + "epoch": 0.7145985647383848, + "grad_norm": 4.732632510738885, + "learning_rate": 2.2823307944118254e-07, + "loss": 1.6169, + "step": 10605 + }, + { + "epoch": 0.7149354806104916, + "grad_norm": 4.817100325987074, + "learning_rate": 2.2773966672489665e-07, + "loss": 1.7377, + "step": 10610 + }, + { + "epoch": 0.7152723964825983, + "grad_norm": 4.597601777541554, + "learning_rate": 2.2724663060075368e-07, + "loss": 1.644, + "step": 10615 + }, + { + "epoch": 0.715609312354705, + "grad_norm": 4.521845052152514, + "learning_rate": 2.2675397175072437e-07, + "loss": 1.6741, + "step": 10620 + }, + { + "epoch": 0.7159462282268118, + "grad_norm": 4.103897229849213, + "learning_rate": 2.2626169085625762e-07, + "loss": 1.6386, + "step": 10625 + }, + { + "epoch": 0.7162831440989185, + "grad_norm": 4.501031185020447, + "learning_rate": 2.25769788598279e-07, + "loss": 1.667, + "step": 10630 + }, + { + "epoch": 0.7166200599710252, + "grad_norm": 4.635506503161774, + "learning_rate": 2.2527826565719084e-07, + "loss": 1.6141, + "step": 10635 + }, + { + "epoch": 0.716956975843132, + "grad_norm": 4.404740981961707, + "learning_rate": 2.2478712271287087e-07, + "loss": 1.7058, + "step": 10640 + }, + { + "epoch": 0.7172938917152387, + "grad_norm": 4.688543744546726, + "learning_rate": 2.2429636044467059e-07, + "loss": 1.6766, + "step": 10645 + }, + { + "epoch": 0.7176308075873454, + "grad_norm": 4.211449995644894, + "learning_rate": 2.2380597953141573e-07, + "loss": 1.7425, + "step": 10650 + }, + { + "epoch": 0.7179677234594521, + "grad_norm": 4.504845812085762, + "learning_rate": 2.2331598065140396e-07, + "loss": 1.6595, + "step": 10655 + }, + { + "epoch": 0.7183046393315589, + "grad_norm": 4.676702954559052, + "learning_rate": 2.228263644824045e-07, + "loss": 1.7615, + "step": 10660 + }, + { + "epoch": 0.7186415552036657, + "grad_norm": 4.186233739778813, + "learning_rate": 2.2233713170165757e-07, + "loss": 1.6644, + "step": 10665 + }, + { + "epoch": 0.7189784710757724, + "grad_norm": 4.680317760117085, + "learning_rate": 2.2184828298587298e-07, + "loss": 1.6968, + "step": 10670 + }, + { + "epoch": 0.7193153869478791, + "grad_norm": 4.355114775428761, + "learning_rate": 2.213598190112294e-07, + "loss": 1.6111, + "step": 10675 + }, + { + "epoch": 0.7196523028199858, + "grad_norm": 4.722909090510962, + "learning_rate": 2.2087174045337275e-07, + "loss": 1.6572, + "step": 10680 + }, + { + "epoch": 0.7199892186920925, + "grad_norm": 4.268160931804694, + "learning_rate": 2.2038404798741644e-07, + "loss": 1.678, + "step": 10685 + }, + { + "epoch": 0.7203261345641994, + "grad_norm": 4.3897851171954, + "learning_rate": 2.1989674228793987e-07, + "loss": 1.6375, + "step": 10690 + }, + { + "epoch": 0.7206630504363061, + "grad_norm": 4.946929325535152, + "learning_rate": 2.1940982402898684e-07, + "loss": 1.6924, + "step": 10695 + }, + { + "epoch": 0.7209999663084128, + "grad_norm": 4.429108850083181, + "learning_rate": 2.1892329388406582e-07, + "loss": 1.6301, + "step": 10700 + }, + { + "epoch": 0.7213368821805195, + "grad_norm": 4.576282749266952, + "learning_rate": 2.1843715252614847e-07, + "loss": 1.6423, + "step": 10705 + }, + { + "epoch": 0.7216737980526262, + "grad_norm": 4.7374662176046956, + "learning_rate": 2.179514006276681e-07, + "loss": 1.694, + "step": 10710 + }, + { + "epoch": 0.722010713924733, + "grad_norm": 4.108799578698973, + "learning_rate": 2.1746603886051978e-07, + "loss": 1.7134, + "step": 10715 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 4.562017959059164, + "learning_rate": 2.169810678960591e-07, + "loss": 1.6636, + "step": 10720 + }, + { + "epoch": 0.7226845456689465, + "grad_norm": 4.1553309206914655, + "learning_rate": 2.1649648840510047e-07, + "loss": 1.6234, + "step": 10725 + }, + { + "epoch": 0.7230214615410532, + "grad_norm": 5.012536579623761, + "learning_rate": 2.1601230105791751e-07, + "loss": 1.709, + "step": 10730 + }, + { + "epoch": 0.7233583774131599, + "grad_norm": 4.8962071217764125, + "learning_rate": 2.1552850652424077e-07, + "loss": 1.7263, + "step": 10735 + }, + { + "epoch": 0.7236952932852667, + "grad_norm": 4.720798604388035, + "learning_rate": 2.150451054732581e-07, + "loss": 1.6333, + "step": 10740 + }, + { + "epoch": 0.7240322091573734, + "grad_norm": 4.7913493095136825, + "learning_rate": 2.1456209857361246e-07, + "loss": 1.659, + "step": 10745 + }, + { + "epoch": 0.7243691250294801, + "grad_norm": 4.498833574428302, + "learning_rate": 2.1407948649340208e-07, + "loss": 1.6841, + "step": 10750 + }, + { + "epoch": 0.7247060409015869, + "grad_norm": 5.1590058288463645, + "learning_rate": 2.1359726990017908e-07, + "loss": 1.6044, + "step": 10755 + }, + { + "epoch": 0.7250429567736936, + "grad_norm": 4.936858137916012, + "learning_rate": 2.13115449460948e-07, + "loss": 1.6876, + "step": 10760 + }, + { + "epoch": 0.7253798726458004, + "grad_norm": 4.799281359022529, + "learning_rate": 2.12634025842166e-07, + "loss": 1.6138, + "step": 10765 + }, + { + "epoch": 0.7257167885179071, + "grad_norm": 5.074248882474168, + "learning_rate": 2.1215299970974132e-07, + "loss": 1.6955, + "step": 10770 + }, + { + "epoch": 0.7260537043900138, + "grad_norm": 4.311575992161255, + "learning_rate": 2.116723717290318e-07, + "loss": 1.7084, + "step": 10775 + }, + { + "epoch": 0.7263906202621205, + "grad_norm": 5.082312637120859, + "learning_rate": 2.111921425648453e-07, + "loss": 1.7333, + "step": 10780 + }, + { + "epoch": 0.7267275361342272, + "grad_norm": 4.6329981092644195, + "learning_rate": 2.1071231288143777e-07, + "loss": 1.7053, + "step": 10785 + }, + { + "epoch": 0.7270644520063341, + "grad_norm": 4.775432945237623, + "learning_rate": 2.1023288334251222e-07, + "loss": 1.6273, + "step": 10790 + }, + { + "epoch": 0.7274013678784408, + "grad_norm": 4.185322290611864, + "learning_rate": 2.0975385461121864e-07, + "loss": 1.7184, + "step": 10795 + }, + { + "epoch": 0.7277382837505475, + "grad_norm": 4.7166590882906, + "learning_rate": 2.0927522735015268e-07, + "loss": 1.5728, + "step": 10800 + }, + { + "epoch": 0.7280751996226542, + "grad_norm": 4.4664703136065596, + "learning_rate": 2.0879700222135416e-07, + "loss": 1.6866, + "step": 10805 + }, + { + "epoch": 0.7284121154947609, + "grad_norm": 4.844312254152635, + "learning_rate": 2.083191798863072e-07, + "loss": 1.6721, + "step": 10810 + }, + { + "epoch": 0.7287490313668676, + "grad_norm": 4.6986867193190065, + "learning_rate": 2.0784176100593836e-07, + "loss": 1.6653, + "step": 10815 + }, + { + "epoch": 0.7290859472389745, + "grad_norm": 4.896429792837326, + "learning_rate": 2.0736474624061655e-07, + "loss": 1.7465, + "step": 10820 + }, + { + "epoch": 0.7294228631110812, + "grad_norm": 4.492589280114266, + "learning_rate": 2.0688813625015123e-07, + "loss": 1.6479, + "step": 10825 + }, + { + "epoch": 0.7297597789831879, + "grad_norm": 4.778220852460387, + "learning_rate": 2.064119316937923e-07, + "loss": 1.7129, + "step": 10830 + }, + { + "epoch": 0.7300966948552946, + "grad_norm": 4.643533713787365, + "learning_rate": 2.0593613323022907e-07, + "loss": 1.6287, + "step": 10835 + }, + { + "epoch": 0.7304336107274013, + "grad_norm": 5.0151875683234834, + "learning_rate": 2.054607415175884e-07, + "loss": 1.6879, + "step": 10840 + }, + { + "epoch": 0.7307705265995081, + "grad_norm": 4.846287699529274, + "learning_rate": 2.0498575721343525e-07, + "loss": 1.69, + "step": 10845 + }, + { + "epoch": 0.7311074424716149, + "grad_norm": 4.665676909970296, + "learning_rate": 2.0451118097477093e-07, + "loss": 1.6734, + "step": 10850 + }, + { + "epoch": 0.7314443583437216, + "grad_norm": 5.059613017774967, + "learning_rate": 2.0403701345803186e-07, + "loss": 1.6899, + "step": 10855 + }, + { + "epoch": 0.7317812742158283, + "grad_norm": 4.647770298843087, + "learning_rate": 2.0356325531908952e-07, + "loss": 1.6748, + "step": 10860 + }, + { + "epoch": 0.732118190087935, + "grad_norm": 4.3547322930879435, + "learning_rate": 2.0308990721324926e-07, + "loss": 1.6818, + "step": 10865 + }, + { + "epoch": 0.7324551059600418, + "grad_norm": 4.3281885006679, + "learning_rate": 2.0261696979524873e-07, + "loss": 1.6646, + "step": 10870 + }, + { + "epoch": 0.7327920218321485, + "grad_norm": 4.585176262897073, + "learning_rate": 2.0214444371925793e-07, + "loss": 1.5786, + "step": 10875 + }, + { + "epoch": 0.7331289377042552, + "grad_norm": 4.47905166124732, + "learning_rate": 2.0167232963887787e-07, + "loss": 1.6532, + "step": 10880 + }, + { + "epoch": 0.733465853576362, + "grad_norm": 4.852975997183129, + "learning_rate": 2.0120062820713974e-07, + "loss": 1.6787, + "step": 10885 + }, + { + "epoch": 0.7338027694484687, + "grad_norm": 4.907158016902662, + "learning_rate": 2.0072934007650345e-07, + "loss": 1.6677, + "step": 10890 + }, + { + "epoch": 0.7341396853205755, + "grad_norm": 4.440284305274986, + "learning_rate": 2.0025846589885798e-07, + "loss": 1.6863, + "step": 10895 + }, + { + "epoch": 0.7344766011926822, + "grad_norm": 4.427979545085985, + "learning_rate": 1.99788006325519e-07, + "loss": 1.5922, + "step": 10900 + }, + { + "epoch": 0.7348135170647889, + "grad_norm": 4.416115150299774, + "learning_rate": 1.9931796200722943e-07, + "loss": 1.6723, + "step": 10905 + }, + { + "epoch": 0.7351504329368956, + "grad_norm": 4.736296900364095, + "learning_rate": 1.9884833359415698e-07, + "loss": 1.6714, + "step": 10910 + }, + { + "epoch": 0.7354873488090024, + "grad_norm": 4.682669507953387, + "learning_rate": 1.9837912173589494e-07, + "loss": 1.7085, + "step": 10915 + }, + { + "epoch": 0.7358242646811092, + "grad_norm": 4.710098455479335, + "learning_rate": 1.979103270814596e-07, + "loss": 1.7463, + "step": 10920 + }, + { + "epoch": 0.7361611805532159, + "grad_norm": 4.56398031316537, + "learning_rate": 1.9744195027929072e-07, + "loss": 1.691, + "step": 10925 + }, + { + "epoch": 0.7364980964253226, + "grad_norm": 4.883794149339468, + "learning_rate": 1.9697399197725023e-07, + "loss": 1.7055, + "step": 10930 + }, + { + "epoch": 0.7368350122974293, + "grad_norm": 4.420876529292157, + "learning_rate": 1.965064528226204e-07, + "loss": 1.7008, + "step": 10935 + }, + { + "epoch": 0.737171928169536, + "grad_norm": 4.765157524613675, + "learning_rate": 1.9603933346210445e-07, + "loss": 1.7264, + "step": 10940 + }, + { + "epoch": 0.7375088440416429, + "grad_norm": 4.578533250992113, + "learning_rate": 1.9557263454182476e-07, + "loss": 1.6727, + "step": 10945 + }, + { + "epoch": 0.7378457599137496, + "grad_norm": 4.4105063470620625, + "learning_rate": 1.9510635670732216e-07, + "loss": 1.669, + "step": 10950 + }, + { + "epoch": 0.7381826757858563, + "grad_norm": 4.762507051489227, + "learning_rate": 1.946405006035548e-07, + "loss": 1.6808, + "step": 10955 + }, + { + "epoch": 0.738519591657963, + "grad_norm": 4.59971833063233, + "learning_rate": 1.9417506687489772e-07, + "loss": 1.6845, + "step": 10960 + }, + { + "epoch": 0.7388565075300697, + "grad_norm": 4.442562019236419, + "learning_rate": 1.937100561651418e-07, + "loss": 1.6556, + "step": 10965 + }, + { + "epoch": 0.7391934234021765, + "grad_norm": 4.809988398340547, + "learning_rate": 1.9324546911749246e-07, + "loss": 1.6909, + "step": 10970 + }, + { + "epoch": 0.7395303392742832, + "grad_norm": 4.294425250731902, + "learning_rate": 1.9278130637456957e-07, + "loss": 1.5945, + "step": 10975 + }, + { + "epoch": 0.73986725514639, + "grad_norm": 5.316130213272725, + "learning_rate": 1.923175685784056e-07, + "loss": 1.6772, + "step": 10980 + }, + { + "epoch": 0.7402041710184967, + "grad_norm": 4.942738826736831, + "learning_rate": 1.9185425637044567e-07, + "loss": 1.6838, + "step": 10985 + }, + { + "epoch": 0.7405410868906034, + "grad_norm": 4.547439281413283, + "learning_rate": 1.9139137039154584e-07, + "loss": 1.7118, + "step": 10990 + }, + { + "epoch": 0.7408780027627101, + "grad_norm": 4.497209974141303, + "learning_rate": 1.9092891128197308e-07, + "loss": 1.6566, + "step": 10995 + }, + { + "epoch": 0.7412149186348169, + "grad_norm": 4.723607597807449, + "learning_rate": 1.904668796814033e-07, + "loss": 1.782, + "step": 11000 + }, + { + "epoch": 0.7415518345069236, + "grad_norm": 5.126840547555821, + "learning_rate": 1.9000527622892154e-07, + "loss": 1.6948, + "step": 11005 + }, + { + "epoch": 0.7418887503790303, + "grad_norm": 4.564915672077015, + "learning_rate": 1.895441015630206e-07, + "loss": 1.6858, + "step": 11010 + }, + { + "epoch": 0.7422256662511371, + "grad_norm": 4.565825346690561, + "learning_rate": 1.8908335632160011e-07, + "loss": 1.7515, + "step": 11015 + }, + { + "epoch": 0.7425625821232438, + "grad_norm": 4.607141520570874, + "learning_rate": 1.8862304114196542e-07, + "loss": 1.6714, + "step": 11020 + }, + { + "epoch": 0.7428994979953506, + "grad_norm": 4.878415824316645, + "learning_rate": 1.8816315666082744e-07, + "loss": 1.6519, + "step": 11025 + }, + { + "epoch": 0.7432364138674573, + "grad_norm": 4.502110040030405, + "learning_rate": 1.877037035143013e-07, + "loss": 1.696, + "step": 11030 + }, + { + "epoch": 0.743573329739564, + "grad_norm": 4.419294474942364, + "learning_rate": 1.8724468233790512e-07, + "loss": 1.6732, + "step": 11035 + }, + { + "epoch": 0.7439102456116707, + "grad_norm": 4.957901599135894, + "learning_rate": 1.867860937665599e-07, + "loss": 1.7591, + "step": 11040 + }, + { + "epoch": 0.7442471614837775, + "grad_norm": 4.307693778434295, + "learning_rate": 1.8632793843458827e-07, + "loss": 1.7046, + "step": 11045 + }, + { + "epoch": 0.7445840773558843, + "grad_norm": 5.0797146696117155, + "learning_rate": 1.8587021697571313e-07, + "loss": 1.668, + "step": 11050 + }, + { + "epoch": 0.744920993227991, + "grad_norm": 4.733020859471026, + "learning_rate": 1.854129300230578e-07, + "loss": 1.6791, + "step": 11055 + }, + { + "epoch": 0.7452579091000977, + "grad_norm": 4.7002276751867385, + "learning_rate": 1.849560782091445e-07, + "loss": 1.6457, + "step": 11060 + }, + { + "epoch": 0.7455948249722044, + "grad_norm": 4.77322186606534, + "learning_rate": 1.8449966216589319e-07, + "loss": 1.6891, + "step": 11065 + }, + { + "epoch": 0.7459317408443111, + "grad_norm": 4.959855386815174, + "learning_rate": 1.8404368252462128e-07, + "loss": 1.6506, + "step": 11070 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 4.726346375566695, + "learning_rate": 1.8358813991604262e-07, + "loss": 1.7413, + "step": 11075 + }, + { + "epoch": 0.7466055725885247, + "grad_norm": 4.487389952604869, + "learning_rate": 1.8313303497026673e-07, + "loss": 1.6747, + "step": 11080 + }, + { + "epoch": 0.7469424884606314, + "grad_norm": 5.067852653243485, + "learning_rate": 1.8267836831679718e-07, + "loss": 1.7386, + "step": 11085 + }, + { + "epoch": 0.7472794043327381, + "grad_norm": 4.796721785210641, + "learning_rate": 1.8222414058453183e-07, + "loss": 1.7148, + "step": 11090 + }, + { + "epoch": 0.7476163202048448, + "grad_norm": 4.357543678187077, + "learning_rate": 1.8177035240176136e-07, + "loss": 1.7123, + "step": 11095 + }, + { + "epoch": 0.7479532360769516, + "grad_norm": 5.122671623146191, + "learning_rate": 1.8131700439616803e-07, + "loss": 1.7409, + "step": 11100 + }, + { + "epoch": 0.7482901519490583, + "grad_norm": 4.448290814257407, + "learning_rate": 1.8086409719482576e-07, + "loss": 1.6186, + "step": 11105 + }, + { + "epoch": 0.7486270678211651, + "grad_norm": 4.511243100329989, + "learning_rate": 1.8041163142419857e-07, + "loss": 1.7047, + "step": 11110 + }, + { + "epoch": 0.7489639836932718, + "grad_norm": 4.772657575056115, + "learning_rate": 1.7995960771013962e-07, + "loss": 1.6571, + "step": 11115 + }, + { + "epoch": 0.7493008995653785, + "grad_norm": 5.016379241843775, + "learning_rate": 1.7950802667789107e-07, + "loss": 1.7197, + "step": 11120 + }, + { + "epoch": 0.7496378154374853, + "grad_norm": 4.31718338088045, + "learning_rate": 1.7905688895208259e-07, + "loss": 1.6686, + "step": 11125 + }, + { + "epoch": 0.749974731309592, + "grad_norm": 4.460748644736323, + "learning_rate": 1.7860619515673032e-07, + "loss": 1.6984, + "step": 11130 + }, + { + "epoch": 0.7503116471816987, + "grad_norm": 4.940196253932972, + "learning_rate": 1.7815594591523687e-07, + "loss": 1.7274, + "step": 11135 + }, + { + "epoch": 0.7506485630538055, + "grad_norm": 4.619600331904886, + "learning_rate": 1.777061418503898e-07, + "loss": 1.6933, + "step": 11140 + }, + { + "epoch": 0.7509854789259122, + "grad_norm": 4.643905236837146, + "learning_rate": 1.7725678358436053e-07, + "loss": 1.671, + "step": 11145 + }, + { + "epoch": 0.751322394798019, + "grad_norm": 4.49307652200486, + "learning_rate": 1.7680787173870454e-07, + "loss": 1.6081, + "step": 11150 + }, + { + "epoch": 0.7516593106701257, + "grad_norm": 4.390677025866737, + "learning_rate": 1.763594069343589e-07, + "loss": 1.6297, + "step": 11155 + }, + { + "epoch": 0.7519962265422324, + "grad_norm": 5.09476617875323, + "learning_rate": 1.7591138979164337e-07, + "loss": 1.6135, + "step": 11160 + }, + { + "epoch": 0.7523331424143391, + "grad_norm": 4.56890945684151, + "learning_rate": 1.7546382093025758e-07, + "loss": 1.6907, + "step": 11165 + }, + { + "epoch": 0.7526700582864458, + "grad_norm": 4.894976797363558, + "learning_rate": 1.7501670096928162e-07, + "loss": 1.7158, + "step": 11170 + }, + { + "epoch": 0.7530069741585526, + "grad_norm": 4.160433485102773, + "learning_rate": 1.7457003052717473e-07, + "loss": 1.7441, + "step": 11175 + }, + { + "epoch": 0.7533438900306594, + "grad_norm": 5.015497260674881, + "learning_rate": 1.741238102217738e-07, + "loss": 1.6565, + "step": 11180 + }, + { + "epoch": 0.7536808059027661, + "grad_norm": 4.777756060009474, + "learning_rate": 1.736780406702937e-07, + "loss": 1.6735, + "step": 11185 + }, + { + "epoch": 0.7540177217748728, + "grad_norm": 4.831225099167053, + "learning_rate": 1.7323272248932564e-07, + "loss": 1.6308, + "step": 11190 + }, + { + "epoch": 0.7543546376469795, + "grad_norm": 4.364088966880527, + "learning_rate": 1.727878562948362e-07, + "loss": 1.6505, + "step": 11195 + }, + { + "epoch": 0.7546915535190862, + "grad_norm": 5.26461802860794, + "learning_rate": 1.723434427021671e-07, + "loss": 1.6145, + "step": 11200 + }, + { + "epoch": 0.7550284693911931, + "grad_norm": 4.703754105055933, + "learning_rate": 1.7189948232603412e-07, + "loss": 1.6986, + "step": 11205 + }, + { + "epoch": 0.7553653852632998, + "grad_norm": 4.1370611882167685, + "learning_rate": 1.7145597578052557e-07, + "loss": 1.7309, + "step": 11210 + }, + { + "epoch": 0.7557023011354065, + "grad_norm": 4.56949563785717, + "learning_rate": 1.7101292367910259e-07, + "loss": 1.7101, + "step": 11215 + }, + { + "epoch": 0.7560392170075132, + "grad_norm": 4.734655354891645, + "learning_rate": 1.7057032663459768e-07, + "loss": 1.7221, + "step": 11220 + }, + { + "epoch": 0.7563761328796199, + "grad_norm": 5.206838932444794, + "learning_rate": 1.701281852592134e-07, + "loss": 1.6888, + "step": 11225 + }, + { + "epoch": 0.7567130487517267, + "grad_norm": 4.6074064994191835, + "learning_rate": 1.696865001645228e-07, + "loss": 1.6462, + "step": 11230 + }, + { + "epoch": 0.7570499646238334, + "grad_norm": 4.177604176716723, + "learning_rate": 1.6924527196146692e-07, + "loss": 1.738, + "step": 11235 + }, + { + "epoch": 0.7573868804959402, + "grad_norm": 4.560519854232525, + "learning_rate": 1.6880450126035572e-07, + "loss": 1.6563, + "step": 11240 + }, + { + "epoch": 0.7577237963680469, + "grad_norm": 4.882181848246992, + "learning_rate": 1.683641886708655e-07, + "loss": 1.6493, + "step": 11245 + }, + { + "epoch": 0.7580607122401536, + "grad_norm": 4.570694014559624, + "learning_rate": 1.6792433480203955e-07, + "loss": 1.6609, + "step": 11250 + }, + { + "epoch": 0.7583976281122604, + "grad_norm": 4.615121303405667, + "learning_rate": 1.674849402622865e-07, + "loss": 1.6867, + "step": 11255 + }, + { + "epoch": 0.7587345439843671, + "grad_norm": 4.570503569438364, + "learning_rate": 1.6704600565937927e-07, + "loss": 1.7028, + "step": 11260 + }, + { + "epoch": 0.7590714598564738, + "grad_norm": 4.702666604621073, + "learning_rate": 1.6660753160045498e-07, + "loss": 1.6276, + "step": 11265 + }, + { + "epoch": 0.7594083757285806, + "grad_norm": 4.62630415804712, + "learning_rate": 1.6616951869201378e-07, + "loss": 1.6527, + "step": 11270 + }, + { + "epoch": 0.7597452916006873, + "grad_norm": 4.096803295319819, + "learning_rate": 1.6573196753991747e-07, + "loss": 1.7448, + "step": 11275 + }, + { + "epoch": 0.7600822074727941, + "grad_norm": 4.56230206217443, + "learning_rate": 1.652948787493896e-07, + "loss": 1.6791, + "step": 11280 + }, + { + "epoch": 0.7604191233449008, + "grad_norm": 4.79645465297604, + "learning_rate": 1.64858252925014e-07, + "loss": 1.7032, + "step": 11285 + }, + { + "epoch": 0.7607560392170075, + "grad_norm": 4.617093579090467, + "learning_rate": 1.6442209067073442e-07, + "loss": 1.6793, + "step": 11290 + }, + { + "epoch": 0.7610929550891142, + "grad_norm": 4.69948875641977, + "learning_rate": 1.639863925898527e-07, + "loss": 1.6961, + "step": 11295 + }, + { + "epoch": 0.761429870961221, + "grad_norm": 4.811597345530875, + "learning_rate": 1.6355115928502934e-07, + "loss": 1.7011, + "step": 11300 + }, + { + "epoch": 0.7617667868333278, + "grad_norm": 4.718286454240784, + "learning_rate": 1.6311639135828176e-07, + "loss": 1.7075, + "step": 11305 + }, + { + "epoch": 0.7621037027054345, + "grad_norm": 5.058155884325266, + "learning_rate": 1.6268208941098344e-07, + "loss": 1.6541, + "step": 11310 + }, + { + "epoch": 0.7624406185775412, + "grad_norm": 4.513026537436967, + "learning_rate": 1.6224825404386326e-07, + "loss": 1.6382, + "step": 11315 + }, + { + "epoch": 0.7627775344496479, + "grad_norm": 4.346149115041241, + "learning_rate": 1.6181488585700541e-07, + "loss": 1.5756, + "step": 11320 + }, + { + "epoch": 0.7631144503217546, + "grad_norm": 5.096384285195435, + "learning_rate": 1.6138198544984692e-07, + "loss": 1.7126, + "step": 11325 + }, + { + "epoch": 0.7634513661938614, + "grad_norm": 4.680091495711113, + "learning_rate": 1.609495534211785e-07, + "loss": 1.6963, + "step": 11330 + }, + { + "epoch": 0.7637882820659682, + "grad_norm": 4.456805907972039, + "learning_rate": 1.6051759036914286e-07, + "loss": 1.6394, + "step": 11335 + }, + { + "epoch": 0.7641251979380749, + "grad_norm": 4.667742373252477, + "learning_rate": 1.6008609689123364e-07, + "loss": 1.7233, + "step": 11340 + }, + { + "epoch": 0.7644621138101816, + "grad_norm": 4.881823308634071, + "learning_rate": 1.596550735842953e-07, + "loss": 1.7119, + "step": 11345 + }, + { + "epoch": 0.7647990296822883, + "grad_norm": 4.588255263137127, + "learning_rate": 1.5922452104452204e-07, + "loss": 1.6889, + "step": 11350 + }, + { + "epoch": 0.765135945554395, + "grad_norm": 4.937138907995441, + "learning_rate": 1.5879443986745678e-07, + "loss": 1.709, + "step": 11355 + }, + { + "epoch": 0.7654728614265018, + "grad_norm": 4.623024164988955, + "learning_rate": 1.583648306479901e-07, + "loss": 1.6697, + "step": 11360 + }, + { + "epoch": 0.7658097772986086, + "grad_norm": 4.8931561862953865, + "learning_rate": 1.5793569398036032e-07, + "loss": 1.7047, + "step": 11365 + }, + { + "epoch": 0.7661466931707153, + "grad_norm": 4.33678060112718, + "learning_rate": 1.57507030458152e-07, + "loss": 1.6264, + "step": 11370 + }, + { + "epoch": 0.766483609042822, + "grad_norm": 4.519741492496316, + "learning_rate": 1.5707884067429471e-07, + "loss": 1.7615, + "step": 11375 + }, + { + "epoch": 0.7668205249149287, + "grad_norm": 4.4632645139705085, + "learning_rate": 1.566511252210635e-07, + "loss": 1.645, + "step": 11380 + }, + { + "epoch": 0.7671574407870355, + "grad_norm": 4.230664086216158, + "learning_rate": 1.5622388469007696e-07, + "loss": 1.6724, + "step": 11385 + }, + { + "epoch": 0.7674943566591422, + "grad_norm": 4.568825730112045, + "learning_rate": 1.5579711967229652e-07, + "loss": 1.6245, + "step": 11390 + }, + { + "epoch": 0.7678312725312489, + "grad_norm": 4.781519886710452, + "learning_rate": 1.5537083075802647e-07, + "loss": 1.7008, + "step": 11395 + }, + { + "epoch": 0.7681681884033557, + "grad_norm": 4.4330956754887705, + "learning_rate": 1.5494501853691195e-07, + "loss": 1.6979, + "step": 11400 + }, + { + "epoch": 0.7685051042754624, + "grad_norm": 5.104008891311832, + "learning_rate": 1.5451968359793927e-07, + "loss": 1.6358, + "step": 11405 + }, + { + "epoch": 0.7688420201475692, + "grad_norm": 4.526648061418248, + "learning_rate": 1.5409482652943396e-07, + "loss": 1.641, + "step": 11410 + }, + { + "epoch": 0.7691789360196759, + "grad_norm": 4.78227432726269, + "learning_rate": 1.536704479190611e-07, + "loss": 1.6142, + "step": 11415 + }, + { + "epoch": 0.7695158518917826, + "grad_norm": 4.3575853373097155, + "learning_rate": 1.5324654835382384e-07, + "loss": 1.7033, + "step": 11420 + }, + { + "epoch": 0.7698527677638893, + "grad_norm": 4.574266836550961, + "learning_rate": 1.5282312842006238e-07, + "loss": 1.6819, + "step": 11425 + }, + { + "epoch": 0.770189683635996, + "grad_norm": 4.453632511820118, + "learning_rate": 1.5240018870345388e-07, + "loss": 1.6517, + "step": 11430 + }, + { + "epoch": 0.7705265995081029, + "grad_norm": 4.8945122291555005, + "learning_rate": 1.519777297890113e-07, + "loss": 1.6195, + "step": 11435 + }, + { + "epoch": 0.7708635153802096, + "grad_norm": 5.369730796017658, + "learning_rate": 1.5155575226108198e-07, + "loss": 1.6486, + "step": 11440 + }, + { + "epoch": 0.7712004312523163, + "grad_norm": 4.538548054640214, + "learning_rate": 1.51134256703348e-07, + "loss": 1.7399, + "step": 11445 + }, + { + "epoch": 0.771537347124423, + "grad_norm": 4.460016295353051, + "learning_rate": 1.5071324369882478e-07, + "loss": 1.6284, + "step": 11450 + }, + { + "epoch": 0.7718742629965297, + "grad_norm": 4.738163460247311, + "learning_rate": 1.5029271382985964e-07, + "loss": 1.683, + "step": 11455 + }, + { + "epoch": 0.7722111788686365, + "grad_norm": 4.641725415440229, + "learning_rate": 1.498726676781323e-07, + "loss": 1.7271, + "step": 11460 + }, + { + "epoch": 0.7725480947407433, + "grad_norm": 4.921504012646379, + "learning_rate": 1.4945310582465327e-07, + "loss": 1.7093, + "step": 11465 + }, + { + "epoch": 0.77288501061285, + "grad_norm": 4.554110504574998, + "learning_rate": 1.4903402884976262e-07, + "loss": 1.7071, + "step": 11470 + }, + { + "epoch": 0.7732219264849567, + "grad_norm": 5.112272763733612, + "learning_rate": 1.4861543733313065e-07, + "loss": 1.6968, + "step": 11475 + }, + { + "epoch": 0.7735588423570634, + "grad_norm": 4.556784459285417, + "learning_rate": 1.4819733185375531e-07, + "loss": 1.7668, + "step": 11480 + }, + { + "epoch": 0.7738957582291702, + "grad_norm": 4.508391558534176, + "learning_rate": 1.4777971298996288e-07, + "loss": 1.6464, + "step": 11485 + }, + { + "epoch": 0.7742326741012769, + "grad_norm": 4.411711682730823, + "learning_rate": 1.4736258131940605e-07, + "loss": 1.6264, + "step": 11490 + }, + { + "epoch": 0.7745695899733837, + "grad_norm": 5.015932123347625, + "learning_rate": 1.4694593741906403e-07, + "loss": 1.7101, + "step": 11495 + }, + { + "epoch": 0.7749065058454904, + "grad_norm": 4.787863644165874, + "learning_rate": 1.4652978186524135e-07, + "loss": 1.6964, + "step": 11500 + }, + { + "epoch": 0.7752434217175971, + "grad_norm": 4.2832899134958815, + "learning_rate": 1.4611411523356653e-07, + "loss": 1.6985, + "step": 11505 + }, + { + "epoch": 0.7755803375897039, + "grad_norm": 4.406066609439304, + "learning_rate": 1.4569893809899242e-07, + "loss": 1.7463, + "step": 11510 + }, + { + "epoch": 0.7759172534618106, + "grad_norm": 4.635127474132326, + "learning_rate": 1.452842510357946e-07, + "loss": 1.6584, + "step": 11515 + }, + { + "epoch": 0.7762541693339173, + "grad_norm": 4.433167172796796, + "learning_rate": 1.4487005461757051e-07, + "loss": 1.6375, + "step": 11520 + }, + { + "epoch": 0.776591085206024, + "grad_norm": 4.447787680123461, + "learning_rate": 1.4445634941723927e-07, + "loss": 1.6763, + "step": 11525 + }, + { + "epoch": 0.7769280010781308, + "grad_norm": 4.5157465552949505, + "learning_rate": 1.4404313600704054e-07, + "loss": 1.7647, + "step": 11530 + }, + { + "epoch": 0.7772649169502375, + "grad_norm": 4.968898885521456, + "learning_rate": 1.4363041495853334e-07, + "loss": 1.6746, + "step": 11535 + }, + { + "epoch": 0.7776018328223443, + "grad_norm": 4.778289663537236, + "learning_rate": 1.4321818684259607e-07, + "loss": 1.7011, + "step": 11540 + }, + { + "epoch": 0.777938748694451, + "grad_norm": 4.433981527727124, + "learning_rate": 1.4280645222942535e-07, + "loss": 1.6845, + "step": 11545 + }, + { + "epoch": 0.7782756645665577, + "grad_norm": 4.404414874555339, + "learning_rate": 1.4239521168853458e-07, + "loss": 1.6378, + "step": 11550 + }, + { + "epoch": 0.7786125804386644, + "grad_norm": 4.4047141653845046, + "learning_rate": 1.4198446578875444e-07, + "loss": 1.6676, + "step": 11555 + }, + { + "epoch": 0.7789494963107712, + "grad_norm": 4.5657660248855505, + "learning_rate": 1.4157421509823119e-07, + "loss": 1.6269, + "step": 11560 + }, + { + "epoch": 0.779286412182878, + "grad_norm": 4.45762595000863, + "learning_rate": 1.4116446018442608e-07, + "loss": 1.5861, + "step": 11565 + }, + { + "epoch": 0.7796233280549847, + "grad_norm": 5.183066557912235, + "learning_rate": 1.4075520161411425e-07, + "loss": 1.6269, + "step": 11570 + }, + { + "epoch": 0.7799602439270914, + "grad_norm": 4.763250474607571, + "learning_rate": 1.403464399533849e-07, + "loss": 1.6923, + "step": 11575 + }, + { + "epoch": 0.7802971597991981, + "grad_norm": 4.606123422875513, + "learning_rate": 1.3993817576763983e-07, + "loss": 1.7652, + "step": 11580 + }, + { + "epoch": 0.7806340756713048, + "grad_norm": 4.8621835506838496, + "learning_rate": 1.3953040962159207e-07, + "loss": 1.686, + "step": 11585 + }, + { + "epoch": 0.7809709915434117, + "grad_norm": 4.839325519832752, + "learning_rate": 1.3912314207926657e-07, + "loss": 1.6517, + "step": 11590 + }, + { + "epoch": 0.7813079074155184, + "grad_norm": 4.441109434606644, + "learning_rate": 1.3871637370399824e-07, + "loss": 1.6852, + "step": 11595 + }, + { + "epoch": 0.7816448232876251, + "grad_norm": 4.643129780812165, + "learning_rate": 1.3831010505843139e-07, + "loss": 1.6516, + "step": 11600 + }, + { + "epoch": 0.7819817391597318, + "grad_norm": 4.509617088988801, + "learning_rate": 1.3790433670451927e-07, + "loss": 1.6263, + "step": 11605 + }, + { + "epoch": 0.7823186550318385, + "grad_norm": 4.4981983354573245, + "learning_rate": 1.374990692035235e-07, + "loss": 1.6663, + "step": 11610 + }, + { + "epoch": 0.7826555709039453, + "grad_norm": 4.709987900224131, + "learning_rate": 1.3709430311601205e-07, + "loss": 1.6481, + "step": 11615 + }, + { + "epoch": 0.782992486776052, + "grad_norm": 4.519779682868838, + "learning_rate": 1.366900390018601e-07, + "loss": 1.5717, + "step": 11620 + }, + { + "epoch": 0.7833294026481588, + "grad_norm": 4.5215699973254395, + "learning_rate": 1.3628627742024812e-07, + "loss": 1.738, + "step": 11625 + }, + { + "epoch": 0.7836663185202655, + "grad_norm": 5.250318064412388, + "learning_rate": 1.3588301892966182e-07, + "loss": 1.5477, + "step": 11630 + }, + { + "epoch": 0.7840032343923722, + "grad_norm": 4.88331049164947, + "learning_rate": 1.3548026408789044e-07, + "loss": 1.7003, + "step": 11635 + }, + { + "epoch": 0.784340150264479, + "grad_norm": 4.931461361213741, + "learning_rate": 1.350780134520272e-07, + "loss": 1.7331, + "step": 11640 + }, + { + "epoch": 0.7846770661365857, + "grad_norm": 4.53955275256855, + "learning_rate": 1.3467626757846733e-07, + "loss": 1.6507, + "step": 11645 + }, + { + "epoch": 0.7850139820086924, + "grad_norm": 4.583660293606964, + "learning_rate": 1.342750270229085e-07, + "loss": 1.6317, + "step": 11650 + }, + { + "epoch": 0.7853508978807991, + "grad_norm": 4.975257250805664, + "learning_rate": 1.338742923403487e-07, + "loss": 1.5883, + "step": 11655 + }, + { + "epoch": 0.7856878137529059, + "grad_norm": 5.910548010361466, + "learning_rate": 1.3347406408508694e-07, + "loss": 1.6157, + "step": 11660 + }, + { + "epoch": 0.7860247296250127, + "grad_norm": 4.495450387975911, + "learning_rate": 1.3307434281072106e-07, + "loss": 1.7168, + "step": 11665 + }, + { + "epoch": 0.7863616454971194, + "grad_norm": 4.871421983989498, + "learning_rate": 1.326751290701481e-07, + "loss": 1.6807, + "step": 11670 + }, + { + "epoch": 0.7866985613692261, + "grad_norm": 4.3982658790692, + "learning_rate": 1.3227642341556306e-07, + "loss": 1.703, + "step": 11675 + }, + { + "epoch": 0.7870354772413328, + "grad_norm": 4.295004600481432, + "learning_rate": 1.318782263984577e-07, + "loss": 1.6987, + "step": 11680 + }, + { + "epoch": 0.7873723931134395, + "grad_norm": 4.58100374784792, + "learning_rate": 1.314805385696207e-07, + "loss": 1.6905, + "step": 11685 + }, + { + "epoch": 0.7877093089855464, + "grad_norm": 4.834714800486215, + "learning_rate": 1.3108336047913633e-07, + "loss": 1.7305, + "step": 11690 + }, + { + "epoch": 0.7880462248576531, + "grad_norm": 4.328656303402874, + "learning_rate": 1.3068669267638377e-07, + "loss": 1.6603, + "step": 11695 + }, + { + "epoch": 0.7883831407297598, + "grad_norm": 4.512299890876141, + "learning_rate": 1.3029053571003619e-07, + "loss": 1.6928, + "step": 11700 + }, + { + "epoch": 0.7887200566018665, + "grad_norm": 4.760951108743499, + "learning_rate": 1.2989489012806033e-07, + "loss": 1.7068, + "step": 11705 + }, + { + "epoch": 0.7890569724739732, + "grad_norm": 5.381231127812247, + "learning_rate": 1.294997564777157e-07, + "loss": 1.6708, + "step": 11710 + }, + { + "epoch": 0.7893938883460799, + "grad_norm": 4.90579764808831, + "learning_rate": 1.291051353055534e-07, + "loss": 1.7308, + "step": 11715 + }, + { + "epoch": 0.7897308042181868, + "grad_norm": 4.641833152824364, + "learning_rate": 1.28711027157416e-07, + "loss": 1.6489, + "step": 11720 + }, + { + "epoch": 0.7900677200902935, + "grad_norm": 4.5697377870925235, + "learning_rate": 1.2831743257843597e-07, + "loss": 1.6771, + "step": 11725 + }, + { + "epoch": 0.7904046359624002, + "grad_norm": 4.638576200215656, + "learning_rate": 1.279243521130361e-07, + "loss": 1.6553, + "step": 11730 + }, + { + "epoch": 0.7907415518345069, + "grad_norm": 4.898438222833477, + "learning_rate": 1.2753178630492733e-07, + "loss": 1.7493, + "step": 11735 + }, + { + "epoch": 0.7910784677066136, + "grad_norm": 4.8068008257623696, + "learning_rate": 1.271397356971094e-07, + "loss": 1.6344, + "step": 11740 + }, + { + "epoch": 0.7914153835787204, + "grad_norm": 5.032163072622715, + "learning_rate": 1.267482008318687e-07, + "loss": 1.6641, + "step": 11745 + }, + { + "epoch": 0.7917522994508271, + "grad_norm": 4.739291643331048, + "learning_rate": 1.2635718225077884e-07, + "loss": 1.7942, + "step": 11750 + }, + { + "epoch": 0.7920892153229339, + "grad_norm": 4.620255847392547, + "learning_rate": 1.259666804946991e-07, + "loss": 1.6376, + "step": 11755 + }, + { + "epoch": 0.7924261311950406, + "grad_norm": 4.826991904490677, + "learning_rate": 1.2557669610377397e-07, + "loss": 1.6789, + "step": 11760 + }, + { + "epoch": 0.7927630470671473, + "grad_norm": 4.653677114654714, + "learning_rate": 1.25187229617432e-07, + "loss": 1.7308, + "step": 11765 + }, + { + "epoch": 0.7930999629392541, + "grad_norm": 4.354166048350967, + "learning_rate": 1.247982815743857e-07, + "loss": 1.6842, + "step": 11770 + }, + { + "epoch": 0.7934368788113608, + "grad_norm": 4.791557277404768, + "learning_rate": 1.2440985251263054e-07, + "loss": 1.6012, + "step": 11775 + }, + { + "epoch": 0.7937737946834675, + "grad_norm": 4.593305862093821, + "learning_rate": 1.2402194296944363e-07, + "loss": 1.6843, + "step": 11780 + }, + { + "epoch": 0.7941107105555743, + "grad_norm": 4.673979764626909, + "learning_rate": 1.236345534813839e-07, + "loss": 1.6214, + "step": 11785 + }, + { + "epoch": 0.794447626427681, + "grad_norm": 4.479878569305674, + "learning_rate": 1.2324768458429107e-07, + "loss": 1.6624, + "step": 11790 + }, + { + "epoch": 0.7947845422997878, + "grad_norm": 4.945547685723316, + "learning_rate": 1.228613368132842e-07, + "loss": 1.7297, + "step": 11795 + }, + { + "epoch": 0.7951214581718945, + "grad_norm": 4.36357357134437, + "learning_rate": 1.2247551070276207e-07, + "loss": 1.5405, + "step": 11800 + }, + { + "epoch": 0.7954583740440012, + "grad_norm": 4.782501621716016, + "learning_rate": 1.2209020678640176e-07, + "loss": 1.743, + "step": 11805 + }, + { + "epoch": 0.7957952899161079, + "grad_norm": 4.780308924663592, + "learning_rate": 1.2170542559715775e-07, + "loss": 1.6265, + "step": 11810 + }, + { + "epoch": 0.7961322057882146, + "grad_norm": 4.7301440980568215, + "learning_rate": 1.2132116766726196e-07, + "loss": 1.6937, + "step": 11815 + }, + { + "epoch": 0.7964691216603215, + "grad_norm": 4.7183597873504795, + "learning_rate": 1.2093743352822206e-07, + "loss": 1.6882, + "step": 11820 + }, + { + "epoch": 0.7968060375324282, + "grad_norm": 5.4563281880583165, + "learning_rate": 1.2055422371082168e-07, + "loss": 1.6439, + "step": 11825 + }, + { + "epoch": 0.7971429534045349, + "grad_norm": 4.558590412599557, + "learning_rate": 1.2017153874511865e-07, + "loss": 1.6541, + "step": 11830 + }, + { + "epoch": 0.7974798692766416, + "grad_norm": 4.769716161730735, + "learning_rate": 1.1978937916044534e-07, + "loss": 1.6999, + "step": 11835 + }, + { + "epoch": 0.7978167851487483, + "grad_norm": 4.766641399567675, + "learning_rate": 1.1940774548540733e-07, + "loss": 1.6257, + "step": 11840 + }, + { + "epoch": 0.7981537010208551, + "grad_norm": 4.4988420622667755, + "learning_rate": 1.1902663824788233e-07, + "loss": 1.6595, + "step": 11845 + }, + { + "epoch": 0.7984906168929619, + "grad_norm": 4.384373031174729, + "learning_rate": 1.1864605797502031e-07, + "loss": 1.6879, + "step": 11850 + }, + { + "epoch": 0.7988275327650686, + "grad_norm": 4.172777385405688, + "learning_rate": 1.1826600519324237e-07, + "loss": 1.7245, + "step": 11855 + }, + { + "epoch": 0.7991644486371753, + "grad_norm": 4.817536572367274, + "learning_rate": 1.1788648042823956e-07, + "loss": 1.6383, + "step": 11860 + }, + { + "epoch": 0.799501364509282, + "grad_norm": 4.74080247539222, + "learning_rate": 1.1750748420497298e-07, + "loss": 1.635, + "step": 11865 + }, + { + "epoch": 0.7998382803813888, + "grad_norm": 4.966353394973762, + "learning_rate": 1.1712901704767253e-07, + "loss": 1.649, + "step": 11870 + }, + { + "epoch": 0.8001751962534955, + "grad_norm": 4.347264494303227, + "learning_rate": 1.1675107947983615e-07, + "loss": 1.698, + "step": 11875 + }, + { + "epoch": 0.8005121121256022, + "grad_norm": 4.573477117456977, + "learning_rate": 1.1637367202422943e-07, + "loss": 1.7489, + "step": 11880 + }, + { + "epoch": 0.800849027997709, + "grad_norm": 4.351379168903883, + "learning_rate": 1.159967952028848e-07, + "loss": 1.7147, + "step": 11885 + }, + { + "epoch": 0.8011859438698157, + "grad_norm": 4.5742306257265755, + "learning_rate": 1.1562044953710032e-07, + "loss": 1.5991, + "step": 11890 + }, + { + "epoch": 0.8015228597419224, + "grad_norm": 4.24646806235061, + "learning_rate": 1.152446355474398e-07, + "loss": 1.6763, + "step": 11895 + }, + { + "epoch": 0.8018597756140292, + "grad_norm": 5.2105463873461035, + "learning_rate": 1.1486935375373124e-07, + "loss": 1.7155, + "step": 11900 + }, + { + "epoch": 0.8021966914861359, + "grad_norm": 5.47357024870505, + "learning_rate": 1.1449460467506689e-07, + "loss": 1.6485, + "step": 11905 + }, + { + "epoch": 0.8025336073582426, + "grad_norm": 4.295231435421802, + "learning_rate": 1.1412038882980174e-07, + "loss": 1.6926, + "step": 11910 + }, + { + "epoch": 0.8028705232303494, + "grad_norm": 4.339085780781735, + "learning_rate": 1.1374670673555348e-07, + "loss": 1.6742, + "step": 11915 + }, + { + "epoch": 0.8032074391024561, + "grad_norm": 4.371106160876894, + "learning_rate": 1.1337355890920169e-07, + "loss": 1.7035, + "step": 11920 + }, + { + "epoch": 0.8035443549745629, + "grad_norm": 4.112654956639733, + "learning_rate": 1.130009458668863e-07, + "loss": 1.6723, + "step": 11925 + }, + { + "epoch": 0.8038812708466696, + "grad_norm": 4.500666264030187, + "learning_rate": 1.1262886812400813e-07, + "loss": 1.6475, + "step": 11930 + }, + { + "epoch": 0.8042181867187763, + "grad_norm": 4.561129079706414, + "learning_rate": 1.1225732619522754e-07, + "loss": 1.6282, + "step": 11935 + }, + { + "epoch": 0.804555102590883, + "grad_norm": 4.683667511801432, + "learning_rate": 1.118863205944633e-07, + "loss": 1.6257, + "step": 11940 + }, + { + "epoch": 0.8048920184629897, + "grad_norm": 5.150745324206002, + "learning_rate": 1.1151585183489266e-07, + "loss": 1.6967, + "step": 11945 + }, + { + "epoch": 0.8052289343350966, + "grad_norm": 4.535603700059998, + "learning_rate": 1.1114592042895044e-07, + "loss": 1.6512, + "step": 11950 + }, + { + "epoch": 0.8055658502072033, + "grad_norm": 4.697947520091324, + "learning_rate": 1.1077652688832772e-07, + "loss": 1.672, + "step": 11955 + }, + { + "epoch": 0.80590276607931, + "grad_norm": 4.58900673943732, + "learning_rate": 1.1040767172397209e-07, + "loss": 1.6287, + "step": 11960 + }, + { + "epoch": 0.8062396819514167, + "grad_norm": 4.697800139739896, + "learning_rate": 1.1003935544608612e-07, + "loss": 1.6606, + "step": 11965 + }, + { + "epoch": 0.8065765978235234, + "grad_norm": 4.517313865394391, + "learning_rate": 1.0967157856412739e-07, + "loss": 1.6497, + "step": 11970 + }, + { + "epoch": 0.8069135136956302, + "grad_norm": 4.488096600110159, + "learning_rate": 1.093043415868069e-07, + "loss": 1.7794, + "step": 11975 + }, + { + "epoch": 0.807250429567737, + "grad_norm": 4.441004794884173, + "learning_rate": 1.0893764502208891e-07, + "loss": 1.7114, + "step": 11980 + }, + { + "epoch": 0.8075873454398437, + "grad_norm": 4.795206946774002, + "learning_rate": 1.0857148937719063e-07, + "loss": 1.6427, + "step": 11985 + }, + { + "epoch": 0.8079242613119504, + "grad_norm": 4.659472879632121, + "learning_rate": 1.0820587515858054e-07, + "loss": 1.6943, + "step": 11990 + }, + { + "epoch": 0.8082611771840571, + "grad_norm": 4.606554652971352, + "learning_rate": 1.078408028719785e-07, + "loss": 1.6448, + "step": 11995 + }, + { + "epoch": 0.8085980930561639, + "grad_norm": 4.291101869310125, + "learning_rate": 1.0747627302235491e-07, + "loss": 1.7005, + "step": 12000 + }, + { + "epoch": 0.8089350089282706, + "grad_norm": 4.841403602767737, + "learning_rate": 1.0711228611392936e-07, + "loss": 1.7057, + "step": 12005 + }, + { + "epoch": 0.8092719248003774, + "grad_norm": 4.756702586221024, + "learning_rate": 1.0674884265017086e-07, + "loss": 1.6921, + "step": 12010 + }, + { + "epoch": 0.8096088406724841, + "grad_norm": 4.727776169377255, + "learning_rate": 1.0638594313379678e-07, + "loss": 1.5935, + "step": 12015 + }, + { + "epoch": 0.8099457565445908, + "grad_norm": 4.685242195126687, + "learning_rate": 1.060235880667717e-07, + "loss": 1.7255, + "step": 12020 + }, + { + "epoch": 0.8102826724166976, + "grad_norm": 4.314878206791833, + "learning_rate": 1.056617779503074e-07, + "loss": 1.6571, + "step": 12025 + }, + { + "epoch": 0.8106195882888043, + "grad_norm": 5.029269653182732, + "learning_rate": 1.053005132848619e-07, + "loss": 1.675, + "step": 12030 + }, + { + "epoch": 0.810956504160911, + "grad_norm": 4.342726447389877, + "learning_rate": 1.0493979457013874e-07, + "loss": 1.7437, + "step": 12035 + }, + { + "epoch": 0.8112934200330177, + "grad_norm": 4.235853993662135, + "learning_rate": 1.0457962230508599e-07, + "loss": 1.6567, + "step": 12040 + }, + { + "epoch": 0.8116303359051245, + "grad_norm": 4.687902595868502, + "learning_rate": 1.042199969878963e-07, + "loss": 1.6397, + "step": 12045 + }, + { + "epoch": 0.8119672517772313, + "grad_norm": 4.520022707269915, + "learning_rate": 1.0386091911600564e-07, + "loss": 1.6126, + "step": 12050 + }, + { + "epoch": 0.812304167649338, + "grad_norm": 4.744073377652037, + "learning_rate": 1.0350238918609244e-07, + "loss": 1.6284, + "step": 12055 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 4.328349619953094, + "learning_rate": 1.0314440769407784e-07, + "loss": 1.6412, + "step": 12060 + }, + { + "epoch": 0.8129779993935514, + "grad_norm": 4.867324986966181, + "learning_rate": 1.0278697513512375e-07, + "loss": 1.5896, + "step": 12065 + }, + { + "epoch": 0.8133149152656581, + "grad_norm": 4.7885057818111365, + "learning_rate": 1.0243009200363295e-07, + "loss": 1.5919, + "step": 12070 + }, + { + "epoch": 0.8136518311377648, + "grad_norm": 5.090797412053734, + "learning_rate": 1.0207375879324853e-07, + "loss": 1.6436, + "step": 12075 + }, + { + "epoch": 0.8139887470098717, + "grad_norm": 4.920044634686522, + "learning_rate": 1.0171797599685284e-07, + "loss": 1.6884, + "step": 12080 + }, + { + "epoch": 0.8143256628819784, + "grad_norm": 4.217021254878892, + "learning_rate": 1.0136274410656653e-07, + "loss": 1.689, + "step": 12085 + }, + { + "epoch": 0.8146625787540851, + "grad_norm": 4.706327392597494, + "learning_rate": 1.010080636137487e-07, + "loss": 1.6614, + "step": 12090 + }, + { + "epoch": 0.8149994946261918, + "grad_norm": 4.917225692982013, + "learning_rate": 1.0065393500899549e-07, + "loss": 1.5856, + "step": 12095 + }, + { + "epoch": 0.8153364104982985, + "grad_norm": 4.909986128400832, + "learning_rate": 1.0030035878213988e-07, + "loss": 1.6692, + "step": 12100 + }, + { + "epoch": 0.8156733263704053, + "grad_norm": 4.634205918538887, + "learning_rate": 9.994733542225037e-08, + "loss": 1.6299, + "step": 12105 + }, + { + "epoch": 0.8160102422425121, + "grad_norm": 4.313804535342684, + "learning_rate": 9.959486541763118e-08, + "loss": 1.6404, + "step": 12110 + }, + { + "epoch": 0.8163471581146188, + "grad_norm": 4.680390901578063, + "learning_rate": 9.924294925582105e-08, + "loss": 1.6987, + "step": 12115 + }, + { + "epoch": 0.8166840739867255, + "grad_norm": 4.753692254136698, + "learning_rate": 9.889158742359233e-08, + "loss": 1.6577, + "step": 12120 + }, + { + "epoch": 0.8170209898588322, + "grad_norm": 4.5693819665720525, + "learning_rate": 9.854078040695096e-08, + "loss": 1.6935, + "step": 12125 + }, + { + "epoch": 0.817357905730939, + "grad_norm": 4.549210623708017, + "learning_rate": 9.819052869113543e-08, + "loss": 1.6723, + "step": 12130 + }, + { + "epoch": 0.8176948216030457, + "grad_norm": 4.8659961093999975, + "learning_rate": 9.784083276061578e-08, + "loss": 1.6669, + "step": 12135 + }, + { + "epoch": 0.8180317374751525, + "grad_norm": 5.086215136308865, + "learning_rate": 9.749169309909382e-08, + "loss": 1.6106, + "step": 12140 + }, + { + "epoch": 0.8183686533472592, + "grad_norm": 4.46335613876711, + "learning_rate": 9.714311018950139e-08, + "loss": 1.6771, + "step": 12145 + }, + { + "epoch": 0.8187055692193659, + "grad_norm": 4.577487028769546, + "learning_rate": 9.67950845140007e-08, + "loss": 1.7008, + "step": 12150 + }, + { + "epoch": 0.8190424850914727, + "grad_norm": 4.742273478717242, + "learning_rate": 9.644761655398276e-08, + "loss": 1.6126, + "step": 12155 + }, + { + "epoch": 0.8193794009635794, + "grad_norm": 5.175051265752663, + "learning_rate": 9.61007067900675e-08, + "loss": 1.7169, + "step": 12160 + }, + { + "epoch": 0.8197163168356861, + "grad_norm": 4.305291595947235, + "learning_rate": 9.575435570210266e-08, + "loss": 1.6949, + "step": 12165 + }, + { + "epoch": 0.8200532327077928, + "grad_norm": 4.412018556271386, + "learning_rate": 9.5408563769163e-08, + "loss": 1.6687, + "step": 12170 + }, + { + "epoch": 0.8203901485798996, + "grad_norm": 4.170303077983749, + "learning_rate": 9.506333146955009e-08, + "loss": 1.5886, + "step": 12175 + }, + { + "epoch": 0.8207270644520064, + "grad_norm": 4.287215524208355, + "learning_rate": 9.471865928079148e-08, + "loss": 1.6817, + "step": 12180 + }, + { + "epoch": 0.8210639803241131, + "grad_norm": 4.1863139310236805, + "learning_rate": 9.437454767963954e-08, + "loss": 1.6514, + "step": 12185 + }, + { + "epoch": 0.8214008961962198, + "grad_norm": 4.782584978988166, + "learning_rate": 9.403099714207174e-08, + "loss": 1.6714, + "step": 12190 + }, + { + "epoch": 0.8217378120683265, + "grad_norm": 4.61780885498575, + "learning_rate": 9.368800814328931e-08, + "loss": 1.6771, + "step": 12195 + }, + { + "epoch": 0.8220747279404332, + "grad_norm": 4.920092623434809, + "learning_rate": 9.334558115771646e-08, + "loss": 1.6746, + "step": 12200 + }, + { + "epoch": 0.8224116438125401, + "grad_norm": 4.308366345769101, + "learning_rate": 9.300371665900048e-08, + "loss": 1.6434, + "step": 12205 + }, + { + "epoch": 0.8227485596846468, + "grad_norm": 4.8831688275939, + "learning_rate": 9.266241512001044e-08, + "loss": 1.5913, + "step": 12210 + }, + { + "epoch": 0.8230854755567535, + "grad_norm": 4.686820772021366, + "learning_rate": 9.23216770128365e-08, + "loss": 1.6709, + "step": 12215 + }, + { + "epoch": 0.8234223914288602, + "grad_norm": 4.719416017221441, + "learning_rate": 9.19815028087898e-08, + "loss": 1.6748, + "step": 12220 + }, + { + "epoch": 0.8237593073009669, + "grad_norm": 4.340410251877525, + "learning_rate": 9.164189297840147e-08, + "loss": 1.6961, + "step": 12225 + }, + { + "epoch": 0.8240962231730737, + "grad_norm": 5.11434879075709, + "learning_rate": 9.130284799142179e-08, + "loss": 1.6409, + "step": 12230 + }, + { + "epoch": 0.8244331390451805, + "grad_norm": 4.862919162784708, + "learning_rate": 9.09643683168197e-08, + "loss": 1.667, + "step": 12235 + }, + { + "epoch": 0.8247700549172872, + "grad_norm": 4.439123911206303, + "learning_rate": 9.062645442278244e-08, + "loss": 1.706, + "step": 12240 + }, + { + "epoch": 0.8251069707893939, + "grad_norm": 4.410873781311947, + "learning_rate": 9.028910677671469e-08, + "loss": 1.7492, + "step": 12245 + }, + { + "epoch": 0.8254438866615006, + "grad_norm": 4.661431721051673, + "learning_rate": 8.995232584523754e-08, + "loss": 1.6371, + "step": 12250 + }, + { + "epoch": 0.8257808025336073, + "grad_norm": 4.885583165906465, + "learning_rate": 8.961611209418851e-08, + "loss": 1.6724, + "step": 12255 + }, + { + "epoch": 0.8261177184057141, + "grad_norm": 4.583887928593409, + "learning_rate": 8.928046598862065e-08, + "loss": 1.6665, + "step": 12260 + }, + { + "epoch": 0.8264546342778208, + "grad_norm": 4.62236702480345, + "learning_rate": 8.894538799280138e-08, + "loss": 1.732, + "step": 12265 + }, + { + "epoch": 0.8267915501499276, + "grad_norm": 4.196005535999056, + "learning_rate": 8.861087857021282e-08, + "loss": 1.6653, + "step": 12270 + }, + { + "epoch": 0.8271284660220343, + "grad_norm": 5.14421976931363, + "learning_rate": 8.827693818355048e-08, + "loss": 1.5841, + "step": 12275 + }, + { + "epoch": 0.827465381894141, + "grad_norm": 4.836820671207669, + "learning_rate": 8.794356729472252e-08, + "loss": 1.6485, + "step": 12280 + }, + { + "epoch": 0.8278022977662478, + "grad_norm": 4.303365938631198, + "learning_rate": 8.76107663648497e-08, + "loss": 1.6251, + "step": 12285 + }, + { + "epoch": 0.8281392136383545, + "grad_norm": 4.4132607946935885, + "learning_rate": 8.727853585426436e-08, + "loss": 1.7011, + "step": 12290 + }, + { + "epoch": 0.8284761295104612, + "grad_norm": 4.489080068730756, + "learning_rate": 8.694687622250963e-08, + "loss": 1.6709, + "step": 12295 + }, + { + "epoch": 0.828813045382568, + "grad_norm": 4.431555011379005, + "learning_rate": 8.661578792833907e-08, + "loss": 1.6917, + "step": 12300 + }, + { + "epoch": 0.8291499612546747, + "grad_norm": 4.606480976685833, + "learning_rate": 8.628527142971632e-08, + "loss": 1.5667, + "step": 12305 + }, + { + "epoch": 0.8294868771267815, + "grad_norm": 4.415920989112211, + "learning_rate": 8.595532718381338e-08, + "loss": 1.6725, + "step": 12310 + }, + { + "epoch": 0.8298237929988882, + "grad_norm": 4.577414386363807, + "learning_rate": 8.562595564701153e-08, + "loss": 1.7033, + "step": 12315 + }, + { + "epoch": 0.8301607088709949, + "grad_norm": 4.807819936598261, + "learning_rate": 8.529715727489912e-08, + "loss": 1.7281, + "step": 12320 + }, + { + "epoch": 0.8304976247431016, + "grad_norm": 4.591824431615779, + "learning_rate": 8.496893252227238e-08, + "loss": 1.6812, + "step": 12325 + }, + { + "epoch": 0.8308345406152083, + "grad_norm": 4.232434383612212, + "learning_rate": 8.464128184313346e-08, + "loss": 1.6564, + "step": 12330 + }, + { + "epoch": 0.8311714564873152, + "grad_norm": 4.660216015221876, + "learning_rate": 8.431420569069093e-08, + "loss": 1.6009, + "step": 12335 + }, + { + "epoch": 0.8315083723594219, + "grad_norm": 4.84665822926301, + "learning_rate": 8.398770451735865e-08, + "loss": 1.6645, + "step": 12340 + }, + { + "epoch": 0.8318452882315286, + "grad_norm": 4.5925610308319165, + "learning_rate": 8.366177877475473e-08, + "loss": 1.6727, + "step": 12345 + }, + { + "epoch": 0.8321822041036353, + "grad_norm": 4.273499635590044, + "learning_rate": 8.333642891370174e-08, + "loss": 1.7604, + "step": 12350 + }, + { + "epoch": 0.832519119975742, + "grad_norm": 4.289284690779844, + "learning_rate": 8.301165538422577e-08, + "loss": 1.6998, + "step": 12355 + }, + { + "epoch": 0.8328560358478488, + "grad_norm": 4.708588816998743, + "learning_rate": 8.268745863555521e-08, + "loss": 1.5891, + "step": 12360 + }, + { + "epoch": 0.8331929517199556, + "grad_norm": 4.731731935159316, + "learning_rate": 8.236383911612116e-08, + "loss": 1.6399, + "step": 12365 + }, + { + "epoch": 0.8335298675920623, + "grad_norm": 4.368128769303862, + "learning_rate": 8.204079727355611e-08, + "loss": 1.6887, + "step": 12370 + }, + { + "epoch": 0.833866783464169, + "grad_norm": 5.031103795653992, + "learning_rate": 8.171833355469354e-08, + "loss": 1.7402, + "step": 12375 + }, + { + "epoch": 0.8342036993362757, + "grad_norm": 5.039376453564224, + "learning_rate": 8.139644840556703e-08, + "loss": 1.6775, + "step": 12380 + }, + { + "epoch": 0.8345406152083825, + "grad_norm": 4.652131385190657, + "learning_rate": 8.107514227141032e-08, + "loss": 1.6316, + "step": 12385 + }, + { + "epoch": 0.8348775310804892, + "grad_norm": 4.482192191227387, + "learning_rate": 8.075441559665569e-08, + "loss": 1.6908, + "step": 12390 + }, + { + "epoch": 0.835214446952596, + "grad_norm": 4.178251657515203, + "learning_rate": 8.04342688249346e-08, + "loss": 1.7173, + "step": 12395 + }, + { + "epoch": 0.8355513628247027, + "grad_norm": 4.644346393479015, + "learning_rate": 8.011470239907558e-08, + "loss": 1.6686, + "step": 12400 + }, + { + "epoch": 0.8358882786968094, + "grad_norm": 4.762354253384477, + "learning_rate": 7.979571676110525e-08, + "loss": 1.6626, + "step": 12405 + }, + { + "epoch": 0.8362251945689162, + "grad_norm": 5.008823526360671, + "learning_rate": 7.947731235224614e-08, + "loss": 1.7438, + "step": 12410 + }, + { + "epoch": 0.8365621104410229, + "grad_norm": 4.502314779203351, + "learning_rate": 7.915948961291729e-08, + "loss": 1.6587, + "step": 12415 + }, + { + "epoch": 0.8368990263131296, + "grad_norm": 4.793427388427419, + "learning_rate": 7.884224898273322e-08, + "loss": 1.6446, + "step": 12420 + }, + { + "epoch": 0.8372359421852363, + "grad_norm": 4.939647629786324, + "learning_rate": 7.852559090050276e-08, + "loss": 1.6378, + "step": 12425 + }, + { + "epoch": 0.837572858057343, + "grad_norm": 4.808437181984224, + "learning_rate": 7.820951580422952e-08, + "loss": 1.6157, + "step": 12430 + }, + { + "epoch": 0.8379097739294498, + "grad_norm": 4.546950694277577, + "learning_rate": 7.789402413111041e-08, + "loss": 1.6062, + "step": 12435 + }, + { + "epoch": 0.8382466898015566, + "grad_norm": 4.4368217224106505, + "learning_rate": 7.757911631753556e-08, + "loss": 1.6451, + "step": 12440 + }, + { + "epoch": 0.8385836056736633, + "grad_norm": 4.455951392656074, + "learning_rate": 7.72647927990871e-08, + "loss": 1.7056, + "step": 12445 + }, + { + "epoch": 0.83892052154577, + "grad_norm": 4.4739168194216195, + "learning_rate": 7.695105401053942e-08, + "loss": 1.7121, + "step": 12450 + }, + { + "epoch": 0.8392574374178767, + "grad_norm": 4.440868666208389, + "learning_rate": 7.663790038585794e-08, + "loss": 1.7122, + "step": 12455 + }, + { + "epoch": 0.8395943532899834, + "grad_norm": 4.547711961346844, + "learning_rate": 7.63253323581985e-08, + "loss": 1.6672, + "step": 12460 + }, + { + "epoch": 0.8399312691620903, + "grad_norm": 4.768378256765404, + "learning_rate": 7.601335035990714e-08, + "loss": 1.7202, + "step": 12465 + }, + { + "epoch": 0.840268185034197, + "grad_norm": 4.852415223059402, + "learning_rate": 7.57019548225194e-08, + "loss": 1.6276, + "step": 12470 + }, + { + "epoch": 0.8406051009063037, + "grad_norm": 4.900333256099145, + "learning_rate": 7.539114617675941e-08, + "loss": 1.6976, + "step": 12475 + }, + { + "epoch": 0.8409420167784104, + "grad_norm": 4.267078803636009, + "learning_rate": 7.508092485253936e-08, + "loss": 1.6408, + "step": 12480 + }, + { + "epoch": 0.8412789326505171, + "grad_norm": 4.717401305057401, + "learning_rate": 7.477129127895954e-08, + "loss": 1.6725, + "step": 12485 + }, + { + "epoch": 0.8416158485226239, + "grad_norm": 4.46002079290251, + "learning_rate": 7.446224588430678e-08, + "loss": 1.6679, + "step": 12490 + }, + { + "epoch": 0.8419527643947307, + "grad_norm": 4.269028617491822, + "learning_rate": 7.415378909605457e-08, + "loss": 1.75, + "step": 12495 + }, + { + "epoch": 0.8422896802668374, + "grad_norm": 4.578804117489516, + "learning_rate": 7.384592134086231e-08, + "loss": 1.6489, + "step": 12500 + }, + { + "epoch": 0.8426265961389441, + "grad_norm": 4.81371661431306, + "learning_rate": 7.353864304457463e-08, + "loss": 1.687, + "step": 12505 + }, + { + "epoch": 0.8429635120110508, + "grad_norm": 4.980069333005033, + "learning_rate": 7.323195463222054e-08, + "loss": 1.6767, + "step": 12510 + }, + { + "epoch": 0.8433004278831576, + "grad_norm": 4.524134363057272, + "learning_rate": 7.292585652801331e-08, + "loss": 1.6008, + "step": 12515 + }, + { + "epoch": 0.8436373437552643, + "grad_norm": 4.4534725851542, + "learning_rate": 7.262034915534993e-08, + "loss": 1.6912, + "step": 12520 + }, + { + "epoch": 0.843974259627371, + "grad_norm": 4.747417878677229, + "learning_rate": 7.231543293680969e-08, + "loss": 1.6438, + "step": 12525 + }, + { + "epoch": 0.8443111754994778, + "grad_norm": 5.1122874036227755, + "learning_rate": 7.20111082941548e-08, + "loss": 1.7049, + "step": 12530 + }, + { + "epoch": 0.8446480913715845, + "grad_norm": 4.536409694031237, + "learning_rate": 7.170737564832902e-08, + "loss": 1.7071, + "step": 12535 + }, + { + "epoch": 0.8449850072436913, + "grad_norm": 4.587192664311009, + "learning_rate": 7.14042354194569e-08, + "loss": 1.6938, + "step": 12540 + }, + { + "epoch": 0.845321923115798, + "grad_norm": 4.385212609700637, + "learning_rate": 7.110168802684408e-08, + "loss": 1.589, + "step": 12545 + }, + { + "epoch": 0.8456588389879047, + "grad_norm": 4.395202562503766, + "learning_rate": 7.079973388897592e-08, + "loss": 1.6925, + "step": 12550 + }, + { + "epoch": 0.8459957548600114, + "grad_norm": 4.560065549888014, + "learning_rate": 7.049837342351706e-08, + "loss": 1.6441, + "step": 12555 + }, + { + "epoch": 0.8463326707321182, + "grad_norm": 5.114443613657291, + "learning_rate": 7.019760704731131e-08, + "loss": 1.716, + "step": 12560 + }, + { + "epoch": 0.846669586604225, + "grad_norm": 4.429698035578612, + "learning_rate": 6.989743517638053e-08, + "loss": 1.6688, + "step": 12565 + }, + { + "epoch": 0.8470065024763317, + "grad_norm": 4.620772449380527, + "learning_rate": 6.959785822592402e-08, + "loss": 1.7159, + "step": 12570 + }, + { + "epoch": 0.8473434183484384, + "grad_norm": 4.888061963996254, + "learning_rate": 6.929887661031864e-08, + "loss": 1.6162, + "step": 12575 + }, + { + "epoch": 0.8476803342205451, + "grad_norm": 4.2420582924615795, + "learning_rate": 6.900049074311753e-08, + "loss": 1.651, + "step": 12580 + }, + { + "epoch": 0.8480172500926518, + "grad_norm": 4.839083686871792, + "learning_rate": 6.870270103705e-08, + "loss": 1.6384, + "step": 12585 + }, + { + "epoch": 0.8483541659647587, + "grad_norm": 4.6168931337714465, + "learning_rate": 6.840550790402027e-08, + "loss": 1.69, + "step": 12590 + }, + { + "epoch": 0.8486910818368654, + "grad_norm": 4.487334521520262, + "learning_rate": 6.810891175510792e-08, + "loss": 1.7088, + "step": 12595 + }, + { + "epoch": 0.8490279977089721, + "grad_norm": 4.6057406298661, + "learning_rate": 6.781291300056647e-08, + "loss": 1.6772, + "step": 12600 + }, + { + "epoch": 0.8493649135810788, + "grad_norm": 4.6230805899692236, + "learning_rate": 6.751751204982309e-08, + "loss": 1.5621, + "step": 12605 + }, + { + "epoch": 0.8497018294531855, + "grad_norm": 4.645235329625154, + "learning_rate": 6.722270931147827e-08, + "loss": 1.6998, + "step": 12610 + }, + { + "epoch": 0.8500387453252922, + "grad_norm": 4.556960627434323, + "learning_rate": 6.692850519330506e-08, + "loss": 1.6397, + "step": 12615 + }, + { + "epoch": 0.850375661197399, + "grad_norm": 4.723727212397595, + "learning_rate": 6.66349001022481e-08, + "loss": 1.5809, + "step": 12620 + }, + { + "epoch": 0.8507125770695058, + "grad_norm": 4.276618460688099, + "learning_rate": 6.634189444442389e-08, + "loss": 1.7389, + "step": 12625 + }, + { + "epoch": 0.8510494929416125, + "grad_norm": 4.8765777916342845, + "learning_rate": 6.604948862511977e-08, + "loss": 1.7438, + "step": 12630 + }, + { + "epoch": 0.8513864088137192, + "grad_norm": 4.6285342572963195, + "learning_rate": 6.575768304879292e-08, + "loss": 1.6929, + "step": 12635 + }, + { + "epoch": 0.8517233246858259, + "grad_norm": 4.637050698862129, + "learning_rate": 6.546647811907091e-08, + "loss": 1.5862, + "step": 12640 + }, + { + "epoch": 0.8520602405579327, + "grad_norm": 4.592917589678165, + "learning_rate": 6.517587423874988e-08, + "loss": 1.6897, + "step": 12645 + }, + { + "epoch": 0.8523971564300394, + "grad_norm": 4.345754092042132, + "learning_rate": 6.48858718097951e-08, + "loss": 1.5901, + "step": 12650 + }, + { + "epoch": 0.8527340723021462, + "grad_norm": 4.297560031338897, + "learning_rate": 6.459647123333956e-08, + "loss": 1.5688, + "step": 12655 + }, + { + "epoch": 0.8530709881742529, + "grad_norm": 4.712073899028917, + "learning_rate": 6.430767290968387e-08, + "loss": 1.6449, + "step": 12660 + }, + { + "epoch": 0.8534079040463596, + "grad_norm": 4.744460867221172, + "learning_rate": 6.401947723829576e-08, + "loss": 1.6518, + "step": 12665 + }, + { + "epoch": 0.8537448199184664, + "grad_norm": 4.4916813376687355, + "learning_rate": 6.373188461780904e-08, + "loss": 1.6605, + "step": 12670 + }, + { + "epoch": 0.8540817357905731, + "grad_norm": 4.551996624487793, + "learning_rate": 6.344489544602371e-08, + "loss": 1.7003, + "step": 12675 + }, + { + "epoch": 0.8544186516626798, + "grad_norm": 4.699348939364616, + "learning_rate": 6.315851011990498e-08, + "loss": 1.7217, + "step": 12680 + }, + { + "epoch": 0.8547555675347865, + "grad_norm": 4.314103210677744, + "learning_rate": 6.28727290355826e-08, + "loss": 1.6255, + "step": 12685 + }, + { + "epoch": 0.8550924834068933, + "grad_norm": 4.616664419421673, + "learning_rate": 6.258755258835075e-08, + "loss": 1.674, + "step": 12690 + }, + { + "epoch": 0.8554293992790001, + "grad_norm": 4.769742677903213, + "learning_rate": 6.230298117266736e-08, + "loss": 1.6362, + "step": 12695 + }, + { + "epoch": 0.8557663151511068, + "grad_norm": 4.750891827945278, + "learning_rate": 6.201901518215313e-08, + "loss": 1.6539, + "step": 12700 + }, + { + "epoch": 0.8561032310232135, + "grad_norm": 5.300031969588006, + "learning_rate": 6.173565500959165e-08, + "loss": 1.7062, + "step": 12705 + }, + { + "epoch": 0.8564401468953202, + "grad_norm": 4.9877195879588925, + "learning_rate": 6.14529010469284e-08, + "loss": 1.7033, + "step": 12710 + }, + { + "epoch": 0.8567770627674269, + "grad_norm": 4.465620946765142, + "learning_rate": 6.117075368527053e-08, + "loss": 1.5669, + "step": 12715 + }, + { + "epoch": 0.8571139786395338, + "grad_norm": 4.567565940501714, + "learning_rate": 6.088921331488566e-08, + "loss": 1.6474, + "step": 12720 + }, + { + "epoch": 0.8574508945116405, + "grad_norm": 4.5006741327829465, + "learning_rate": 6.060828032520249e-08, + "loss": 1.662, + "step": 12725 + }, + { + "epoch": 0.8577878103837472, + "grad_norm": 4.7060071164822155, + "learning_rate": 6.032795510480904e-08, + "loss": 1.6264, + "step": 12730 + }, + { + "epoch": 0.8581247262558539, + "grad_norm": 4.648153265111918, + "learning_rate": 6.004823804145276e-08, + "loss": 1.7249, + "step": 12735 + }, + { + "epoch": 0.8584616421279606, + "grad_norm": 4.47213474728508, + "learning_rate": 5.976912952204016e-08, + "loss": 1.6887, + "step": 12740 + }, + { + "epoch": 0.8587985580000674, + "grad_norm": 4.524244747562429, + "learning_rate": 5.9490629932635815e-08, + "loss": 1.7353, + "step": 12745 + }, + { + "epoch": 0.8591354738721741, + "grad_norm": 5.297426837911389, + "learning_rate": 5.921273965846191e-08, + "loss": 1.648, + "step": 12750 + }, + { + "epoch": 0.8594723897442809, + "grad_norm": 4.965235141374218, + "learning_rate": 5.893545908389807e-08, + "loss": 1.5881, + "step": 12755 + }, + { + "epoch": 0.8598093056163876, + "grad_norm": 4.652217619220733, + "learning_rate": 5.865878859248058e-08, + "loss": 1.5889, + "step": 12760 + }, + { + "epoch": 0.8601462214884943, + "grad_norm": 4.837036525431273, + "learning_rate": 5.838272856690146e-08, + "loss": 1.596, + "step": 12765 + }, + { + "epoch": 0.860483137360601, + "grad_norm": 4.317476977979007, + "learning_rate": 5.810727938900878e-08, + "loss": 1.6771, + "step": 12770 + }, + { + "epoch": 0.8608200532327078, + "grad_norm": 4.485083808259918, + "learning_rate": 5.7832441439805536e-08, + "loss": 1.6996, + "step": 12775 + }, + { + "epoch": 0.8611569691048145, + "grad_norm": 4.602719696368838, + "learning_rate": 5.755821509944925e-08, + "loss": 1.6995, + "step": 12780 + }, + { + "epoch": 0.8614938849769213, + "grad_norm": 4.82873736404331, + "learning_rate": 5.728460074725133e-08, + "loss": 1.6445, + "step": 12785 + }, + { + "epoch": 0.861830800849028, + "grad_norm": 4.613031757102517, + "learning_rate": 5.701159876167688e-08, + "loss": 1.6395, + "step": 12790 + }, + { + "epoch": 0.8621677167211347, + "grad_norm": 4.739452889007518, + "learning_rate": 5.673920952034406e-08, + "loss": 1.6522, + "step": 12795 + }, + { + "epoch": 0.8625046325932415, + "grad_norm": 4.825662079465411, + "learning_rate": 5.646743340002302e-08, + "loss": 1.6343, + "step": 12800 + }, + { + "epoch": 0.8628415484653482, + "grad_norm": 5.3045173960429635, + "learning_rate": 5.619627077663636e-08, + "loss": 1.7305, + "step": 12805 + }, + { + "epoch": 0.8631784643374549, + "grad_norm": 4.556268810510644, + "learning_rate": 5.5925722025257746e-08, + "loss": 1.735, + "step": 12810 + }, + { + "epoch": 0.8635153802095616, + "grad_norm": 4.584380506792433, + "learning_rate": 5.5655787520111966e-08, + "loss": 1.7302, + "step": 12815 + }, + { + "epoch": 0.8638522960816684, + "grad_norm": 4.495296345448851, + "learning_rate": 5.538646763457389e-08, + "loss": 1.6434, + "step": 12820 + }, + { + "epoch": 0.8641892119537752, + "grad_norm": 4.587139082186452, + "learning_rate": 5.511776274116864e-08, + "loss": 1.6176, + "step": 12825 + }, + { + "epoch": 0.8645261278258819, + "grad_norm": 4.861128343283088, + "learning_rate": 5.484967321157019e-08, + "loss": 1.6692, + "step": 12830 + }, + { + "epoch": 0.8648630436979886, + "grad_norm": 4.753311652307588, + "learning_rate": 5.4582199416601746e-08, + "loss": 1.6115, + "step": 12835 + }, + { + "epoch": 0.8651999595700953, + "grad_norm": 4.775924852820231, + "learning_rate": 5.43153417262347e-08, + "loss": 1.7149, + "step": 12840 + }, + { + "epoch": 0.865536875442202, + "grad_norm": 5.105625607161886, + "learning_rate": 5.404910050958833e-08, + "loss": 1.6891, + "step": 12845 + }, + { + "epoch": 0.8658737913143089, + "grad_norm": 4.469873946974137, + "learning_rate": 5.378347613492884e-08, + "loss": 1.6519, + "step": 12850 + }, + { + "epoch": 0.8662107071864156, + "grad_norm": 4.364768077012948, + "learning_rate": 5.351846896966966e-08, + "loss": 1.6981, + "step": 12855 + }, + { + "epoch": 0.8665476230585223, + "grad_norm": 4.9119501639318415, + "learning_rate": 5.32540793803703e-08, + "loss": 1.6897, + "step": 12860 + }, + { + "epoch": 0.866884538930629, + "grad_norm": 4.43964878962444, + "learning_rate": 5.299030773273594e-08, + "loss": 1.6619, + "step": 12865 + }, + { + "epoch": 0.8672214548027357, + "grad_norm": 4.672406178991766, + "learning_rate": 5.272715439161718e-08, + "loss": 1.6509, + "step": 12870 + }, + { + "epoch": 0.8675583706748425, + "grad_norm": 4.772545341332354, + "learning_rate": 5.246461972100941e-08, + "loss": 1.6213, + "step": 12875 + }, + { + "epoch": 0.8678952865469493, + "grad_norm": 4.286308937980272, + "learning_rate": 5.220270408405197e-08, + "loss": 1.6349, + "step": 12880 + }, + { + "epoch": 0.868232202419056, + "grad_norm": 4.634791844153955, + "learning_rate": 5.194140784302836e-08, + "loss": 1.6495, + "step": 12885 + }, + { + "epoch": 0.8685691182911627, + "grad_norm": 3.8557299998167927, + "learning_rate": 5.168073135936496e-08, + "loss": 1.6539, + "step": 12890 + }, + { + "epoch": 0.8689060341632694, + "grad_norm": 4.746078778573034, + "learning_rate": 5.1420674993631285e-08, + "loss": 1.6101, + "step": 12895 + }, + { + "epoch": 0.8692429500353762, + "grad_norm": 4.596974751954337, + "learning_rate": 5.116123910553854e-08, + "loss": 1.6959, + "step": 12900 + }, + { + "epoch": 0.8695798659074829, + "grad_norm": 4.183227031111133, + "learning_rate": 5.0902424053940406e-08, + "loss": 1.7306, + "step": 12905 + }, + { + "epoch": 0.8699167817795896, + "grad_norm": 4.706195131417817, + "learning_rate": 5.064423019683106e-08, + "loss": 1.6512, + "step": 12910 + }, + { + "epoch": 0.8702536976516964, + "grad_norm": 4.883634643878311, + "learning_rate": 5.0386657891346e-08, + "loss": 1.6405, + "step": 12915 + }, + { + "epoch": 0.8705906135238031, + "grad_norm": 4.121854755010217, + "learning_rate": 5.012970749376083e-08, + "loss": 1.6358, + "step": 12920 + }, + { + "epoch": 0.8709275293959099, + "grad_norm": 4.713311198178696, + "learning_rate": 4.987337935949087e-08, + "loss": 1.6824, + "step": 12925 + }, + { + "epoch": 0.8712644452680166, + "grad_norm": 4.535731428384727, + "learning_rate": 4.961767384309068e-08, + "loss": 1.7024, + "step": 12930 + }, + { + "epoch": 0.8716013611401233, + "grad_norm": 4.190502023346841, + "learning_rate": 4.936259129825376e-08, + "loss": 1.662, + "step": 12935 + }, + { + "epoch": 0.87193827701223, + "grad_norm": 4.541026079557271, + "learning_rate": 4.9108132077811836e-08, + "loss": 1.6413, + "step": 12940 + }, + { + "epoch": 0.8722751928843367, + "grad_norm": 4.692232599207379, + "learning_rate": 4.885429653373435e-08, + "loss": 1.6707, + "step": 12945 + }, + { + "epoch": 0.8726121087564435, + "grad_norm": 5.017348687830675, + "learning_rate": 4.860108501712823e-08, + "loss": 1.6969, + "step": 12950 + }, + { + "epoch": 0.8729490246285503, + "grad_norm": 4.679428429265382, + "learning_rate": 4.834849787823725e-08, + "loss": 1.6445, + "step": 12955 + }, + { + "epoch": 0.873285940500657, + "grad_norm": 5.00770849393764, + "learning_rate": 4.809653546644132e-08, + "loss": 1.7464, + "step": 12960 + }, + { + "epoch": 0.8736228563727637, + "grad_norm": 4.780319317876716, + "learning_rate": 4.7845198130256395e-08, + "loss": 1.7075, + "step": 12965 + }, + { + "epoch": 0.8739597722448704, + "grad_norm": 4.568955739105629, + "learning_rate": 4.759448621733403e-08, + "loss": 1.6402, + "step": 12970 + }, + { + "epoch": 0.8742966881169771, + "grad_norm": 4.670815008886996, + "learning_rate": 4.7344400074460276e-08, + "loss": 1.6548, + "step": 12975 + }, + { + "epoch": 0.874633603989084, + "grad_norm": 4.6192757150973565, + "learning_rate": 4.709494004755571e-08, + "loss": 1.6799, + "step": 12980 + }, + { + "epoch": 0.8749705198611907, + "grad_norm": 4.607392535169383, + "learning_rate": 4.684610648167503e-08, + "loss": 1.6268, + "step": 12985 + }, + { + "epoch": 0.8753074357332974, + "grad_norm": 4.975556056688224, + "learning_rate": 4.659789972100647e-08, + "loss": 1.6801, + "step": 12990 + }, + { + "epoch": 0.8756443516054041, + "grad_norm": 4.383102938364268, + "learning_rate": 4.635032010887097e-08, + "loss": 1.7203, + "step": 12995 + }, + { + "epoch": 0.8759812674775108, + "grad_norm": 4.367463559610928, + "learning_rate": 4.610336798772213e-08, + "loss": 1.6703, + "step": 13000 + }, + { + "epoch": 0.8763181833496176, + "grad_norm": 4.442038336994325, + "learning_rate": 4.5857043699145834e-08, + "loss": 1.6961, + "step": 13005 + }, + { + "epoch": 0.8766550992217244, + "grad_norm": 4.793599970051047, + "learning_rate": 4.5611347583859095e-08, + "loss": 1.6731, + "step": 13010 + }, + { + "epoch": 0.8769920150938311, + "grad_norm": 4.9522389045914625, + "learning_rate": 4.536627998171033e-08, + "loss": 1.6996, + "step": 13015 + }, + { + "epoch": 0.8773289309659378, + "grad_norm": 4.980252194482352, + "learning_rate": 4.512184123167867e-08, + "loss": 1.6263, + "step": 13020 + }, + { + "epoch": 0.8776658468380445, + "grad_norm": 4.9646395476564384, + "learning_rate": 4.487803167187304e-08, + "loss": 1.6801, + "step": 13025 + }, + { + "epoch": 0.8780027627101513, + "grad_norm": 4.778315521986718, + "learning_rate": 4.463485163953246e-08, + "loss": 1.7304, + "step": 13030 + }, + { + "epoch": 0.878339678582258, + "grad_norm": 4.36506282947457, + "learning_rate": 4.4392301471025074e-08, + "loss": 1.7238, + "step": 13035 + }, + { + "epoch": 0.8786765944543647, + "grad_norm": 4.4129541893069835, + "learning_rate": 4.415038150184758e-08, + "loss": 1.6556, + "step": 13040 + }, + { + "epoch": 0.8790135103264715, + "grad_norm": 4.600992013387407, + "learning_rate": 4.3909092066625245e-08, + "loss": 1.6354, + "step": 13045 + }, + { + "epoch": 0.8793504261985782, + "grad_norm": 4.501353404766527, + "learning_rate": 4.366843349911109e-08, + "loss": 1.6193, + "step": 13050 + }, + { + "epoch": 0.879687342070685, + "grad_norm": 4.793533490720773, + "learning_rate": 4.342840613218546e-08, + "loss": 1.6213, + "step": 13055 + }, + { + "epoch": 0.8800242579427917, + "grad_norm": 4.873768264726616, + "learning_rate": 4.318901029785571e-08, + "loss": 1.6917, + "step": 13060 + }, + { + "epoch": 0.8803611738148984, + "grad_norm": 4.514341122154152, + "learning_rate": 4.2950246327255523e-08, + "loss": 1.6181, + "step": 13065 + }, + { + "epoch": 0.8806980896870051, + "grad_norm": 4.267152040052711, + "learning_rate": 4.271211455064483e-08, + "loss": 1.7006, + "step": 13070 + }, + { + "epoch": 0.8810350055591119, + "grad_norm": 4.785818717093623, + "learning_rate": 4.2474615297408754e-08, + "loss": 1.647, + "step": 13075 + }, + { + "epoch": 0.8813719214312187, + "grad_norm": 4.30173252242199, + "learning_rate": 4.223774889605775e-08, + "loss": 1.6541, + "step": 13080 + }, + { + "epoch": 0.8817088373033254, + "grad_norm": 4.565940042679433, + "learning_rate": 4.200151567422699e-08, + "loss": 1.6905, + "step": 13085 + }, + { + "epoch": 0.8820457531754321, + "grad_norm": 4.69539783865438, + "learning_rate": 4.176591595867557e-08, + "loss": 1.6944, + "step": 13090 + }, + { + "epoch": 0.8823826690475388, + "grad_norm": 4.380967972336634, + "learning_rate": 4.153095007528645e-08, + "loss": 1.6728, + "step": 13095 + }, + { + "epoch": 0.8827195849196455, + "grad_norm": 4.516035300863843, + "learning_rate": 4.1296618349066e-08, + "loss": 1.603, + "step": 13100 + }, + { + "epoch": 0.8830565007917524, + "grad_norm": 4.296821049635356, + "learning_rate": 4.106292110414311e-08, + "loss": 1.7034, + "step": 13105 + }, + { + "epoch": 0.8833934166638591, + "grad_norm": 4.437609089662178, + "learning_rate": 4.082985866376926e-08, + "loss": 1.7042, + "step": 13110 + }, + { + "epoch": 0.8837303325359658, + "grad_norm": 5.020502142421371, + "learning_rate": 4.05974313503179e-08, + "loss": 1.7177, + "step": 13115 + }, + { + "epoch": 0.8840672484080725, + "grad_norm": 4.797384079762569, + "learning_rate": 4.036563948528393e-08, + "loss": 1.6754, + "step": 13120 + }, + { + "epoch": 0.8844041642801792, + "grad_norm": 4.420504657788444, + "learning_rate": 4.01344833892831e-08, + "loss": 1.7012, + "step": 13125 + }, + { + "epoch": 0.8847410801522859, + "grad_norm": 5.010665239478789, + "learning_rate": 3.990396338205204e-08, + "loss": 1.6727, + "step": 13130 + }, + { + "epoch": 0.8850779960243927, + "grad_norm": 4.875792248941043, + "learning_rate": 3.967407978244747e-08, + "loss": 1.5873, + "step": 13135 + }, + { + "epoch": 0.8854149118964995, + "grad_norm": 4.920319954446455, + "learning_rate": 3.944483290844575e-08, + "loss": 1.6536, + "step": 13140 + }, + { + "epoch": 0.8857518277686062, + "grad_norm": 4.702279123087858, + "learning_rate": 3.9216223077142394e-08, + "loss": 1.7247, + "step": 13145 + }, + { + "epoch": 0.8860887436407129, + "grad_norm": 4.690033811483646, + "learning_rate": 3.8988250604752135e-08, + "loss": 1.6406, + "step": 13150 + }, + { + "epoch": 0.8864256595128196, + "grad_norm": 5.082844262732006, + "learning_rate": 3.876091580660762e-08, + "loss": 1.7613, + "step": 13155 + }, + { + "epoch": 0.8867625753849264, + "grad_norm": 4.662418270415757, + "learning_rate": 3.853421899715992e-08, + "loss": 1.638, + "step": 13160 + }, + { + "epoch": 0.8870994912570331, + "grad_norm": 5.224077723143157, + "learning_rate": 3.8308160489977424e-08, + "loss": 1.709, + "step": 13165 + }, + { + "epoch": 0.8874364071291398, + "grad_norm": 4.947169326251201, + "learning_rate": 3.808274059774552e-08, + "loss": 1.7174, + "step": 13170 + }, + { + "epoch": 0.8877733230012466, + "grad_norm": 4.600213672685725, + "learning_rate": 3.785795963226646e-08, + "loss": 1.6272, + "step": 13175 + }, + { + "epoch": 0.8881102388733533, + "grad_norm": 4.644106545994671, + "learning_rate": 3.7633817904458574e-08, + "loss": 1.709, + "step": 13180 + }, + { + "epoch": 0.8884471547454601, + "grad_norm": 4.881787365902224, + "learning_rate": 3.741031572435615e-08, + "loss": 1.6412, + "step": 13185 + }, + { + "epoch": 0.8887840706175668, + "grad_norm": 4.601119029827488, + "learning_rate": 3.718745340110868e-08, + "loss": 1.6599, + "step": 13190 + }, + { + "epoch": 0.8891209864896735, + "grad_norm": 4.876499663076449, + "learning_rate": 3.6965231242980624e-08, + "loss": 1.6504, + "step": 13195 + }, + { + "epoch": 0.8894579023617802, + "grad_norm": 4.614845675235782, + "learning_rate": 3.6743649557351265e-08, + "loss": 1.6961, + "step": 13200 + }, + { + "epoch": 0.889794818233887, + "grad_norm": 4.2887236933033055, + "learning_rate": 3.652270865071344e-08, + "loss": 1.6644, + "step": 13205 + }, + { + "epoch": 0.8901317341059938, + "grad_norm": 4.013386340686639, + "learning_rate": 3.630240882867408e-08, + "loss": 1.609, + "step": 13210 + }, + { + "epoch": 0.8904686499781005, + "grad_norm": 4.589800677924639, + "learning_rate": 3.608275039595332e-08, + "loss": 1.6885, + "step": 13215 + }, + { + "epoch": 0.8908055658502072, + "grad_norm": 4.996418734298018, + "learning_rate": 3.5863733656383844e-08, + "loss": 1.6432, + "step": 13220 + }, + { + "epoch": 0.8911424817223139, + "grad_norm": 4.561616629992068, + "learning_rate": 3.564535891291115e-08, + "loss": 1.6334, + "step": 13225 + }, + { + "epoch": 0.8914793975944206, + "grad_norm": 4.5190475576262195, + "learning_rate": 3.542762646759234e-08, + "loss": 1.6786, + "step": 13230 + }, + { + "epoch": 0.8918163134665275, + "grad_norm": 4.885770968192226, + "learning_rate": 3.521053662159629e-08, + "loss": 1.6563, + "step": 13235 + }, + { + "epoch": 0.8921532293386342, + "grad_norm": 4.311300465911783, + "learning_rate": 3.499408967520295e-08, + "loss": 1.6554, + "step": 13240 + }, + { + "epoch": 0.8924901452107409, + "grad_norm": 4.661435819756512, + "learning_rate": 3.477828592780319e-08, + "loss": 1.6824, + "step": 13245 + }, + { + "epoch": 0.8928270610828476, + "grad_norm": 4.507106001053341, + "learning_rate": 3.456312567789793e-08, + "loss": 1.7095, + "step": 13250 + }, + { + "epoch": 0.8931639769549543, + "grad_norm": 4.3042004781365675, + "learning_rate": 3.4348609223098125e-08, + "loss": 1.6764, + "step": 13255 + }, + { + "epoch": 0.8935008928270611, + "grad_norm": 4.459285002947242, + "learning_rate": 3.41347368601243e-08, + "loss": 1.608, + "step": 13260 + }, + { + "epoch": 0.8938378086991678, + "grad_norm": 5.012943421245698, + "learning_rate": 3.39215088848061e-08, + "loss": 1.7396, + "step": 13265 + }, + { + "epoch": 0.8941747245712746, + "grad_norm": 5.035475395776995, + "learning_rate": 3.370892559208155e-08, + "loss": 1.6454, + "step": 13270 + }, + { + "epoch": 0.8945116404433813, + "grad_norm": 4.46377384535415, + "learning_rate": 3.34969872759972e-08, + "loss": 1.6899, + "step": 13275 + }, + { + "epoch": 0.894848556315488, + "grad_norm": 4.47489904509951, + "learning_rate": 3.328569422970762e-08, + "loss": 1.7486, + "step": 13280 + }, + { + "epoch": 0.8951854721875948, + "grad_norm": 4.39226374900135, + "learning_rate": 3.307504674547429e-08, + "loss": 1.6666, + "step": 13285 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 6.3409003089939935, + "learning_rate": 3.286504511466631e-08, + "loss": 1.7211, + "step": 13290 + }, + { + "epoch": 0.8958593039318082, + "grad_norm": 4.92187136039575, + "learning_rate": 3.265568962775927e-08, + "loss": 1.672, + "step": 13295 + }, + { + "epoch": 0.896196219803915, + "grad_norm": 4.862516911103943, + "learning_rate": 3.2446980574334706e-08, + "loss": 1.6782, + "step": 13300 + }, + { + "epoch": 0.8965331356760217, + "grad_norm": 4.954131476977874, + "learning_rate": 3.2238918243080505e-08, + "loss": 1.6374, + "step": 13305 + }, + { + "epoch": 0.8968700515481284, + "grad_norm": 5.185311614713857, + "learning_rate": 3.203150292178952e-08, + "loss": 1.6918, + "step": 13310 + }, + { + "epoch": 0.8972069674202352, + "grad_norm": 4.859702788623765, + "learning_rate": 3.182473489736004e-08, + "loss": 1.6278, + "step": 13315 + }, + { + "epoch": 0.8975438832923419, + "grad_norm": 5.444427223124619, + "learning_rate": 3.161861445579478e-08, + "loss": 1.6935, + "step": 13320 + }, + { + "epoch": 0.8978807991644486, + "grad_norm": 4.6618984522024345, + "learning_rate": 3.1413141882200736e-08, + "loss": 1.6618, + "step": 13325 + }, + { + "epoch": 0.8982177150365553, + "grad_norm": 4.597572632532803, + "learning_rate": 3.120831746078895e-08, + "loss": 1.6797, + "step": 13330 + }, + { + "epoch": 0.898554630908662, + "grad_norm": 4.66671806352106, + "learning_rate": 3.100414147487368e-08, + "loss": 1.7287, + "step": 13335 + }, + { + "epoch": 0.8988915467807689, + "grad_norm": 4.432222168219578, + "learning_rate": 3.0800614206872413e-08, + "loss": 1.6574, + "step": 13340 + }, + { + "epoch": 0.8992284626528756, + "grad_norm": 5.389463648441865, + "learning_rate": 3.059773593830539e-08, + "loss": 1.6732, + "step": 13345 + }, + { + "epoch": 0.8995653785249823, + "grad_norm": 4.51707048609424, + "learning_rate": 3.039550694979492e-08, + "loss": 1.6256, + "step": 13350 + }, + { + "epoch": 0.899902294397089, + "grad_norm": 4.596592919058153, + "learning_rate": 3.019392752106548e-08, + "loss": 1.6426, + "step": 13355 + }, + { + "epoch": 0.9002392102691957, + "grad_norm": 4.534597115346769, + "learning_rate": 2.9992997930942954e-08, + "loss": 1.6079, + "step": 13360 + }, + { + "epoch": 0.9005761261413026, + "grad_norm": 4.501842736749262, + "learning_rate": 2.979271845735426e-08, + "loss": 1.5888, + "step": 13365 + }, + { + "epoch": 0.9009130420134093, + "grad_norm": 4.543584289239795, + "learning_rate": 2.9593089377327242e-08, + "loss": 1.6151, + "step": 13370 + }, + { + "epoch": 0.901249957885516, + "grad_norm": 4.656857934626395, + "learning_rate": 2.9394110966990184e-08, + "loss": 1.6873, + "step": 13375 + }, + { + "epoch": 0.9015868737576227, + "grad_norm": 4.501681745676623, + "learning_rate": 2.9195783501570982e-08, + "loss": 1.6992, + "step": 13380 + }, + { + "epoch": 0.9019237896297294, + "grad_norm": 4.907087522799292, + "learning_rate": 2.8998107255397643e-08, + "loss": 1.6739, + "step": 13385 + }, + { + "epoch": 0.9022607055018362, + "grad_norm": 4.490445324603956, + "learning_rate": 2.880108250189689e-08, + "loss": 1.6372, + "step": 13390 + }, + { + "epoch": 0.902597621373943, + "grad_norm": 4.762399631606492, + "learning_rate": 2.860470951359478e-08, + "loss": 1.649, + "step": 13395 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 4.965958597373103, + "learning_rate": 2.8408988562115488e-08, + "loss": 1.678, + "step": 13400 + }, + { + "epoch": 0.9032714531181564, + "grad_norm": 4.873670679901971, + "learning_rate": 2.8213919918181393e-08, + "loss": 1.7255, + "step": 13405 + }, + { + "epoch": 0.9036083689902631, + "grad_norm": 4.637630650167888, + "learning_rate": 2.8019503851612837e-08, + "loss": 1.6833, + "step": 13410 + }, + { + "epoch": 0.9039452848623699, + "grad_norm": 4.614907766408516, + "learning_rate": 2.782574063132703e-08, + "loss": 1.7035, + "step": 13415 + }, + { + "epoch": 0.9042822007344766, + "grad_norm": 4.698127813148663, + "learning_rate": 2.7632630525338597e-08, + "loss": 1.6726, + "step": 13420 + }, + { + "epoch": 0.9046191166065833, + "grad_norm": 4.908958225004308, + "learning_rate": 2.7440173800758583e-08, + "loss": 1.693, + "step": 13425 + }, + { + "epoch": 0.90495603247869, + "grad_norm": 4.553215982139727, + "learning_rate": 2.7248370723794268e-08, + "loss": 1.6548, + "step": 13430 + }, + { + "epoch": 0.9052929483507968, + "grad_norm": 4.873564430369931, + "learning_rate": 2.7057221559748822e-08, + "loss": 1.6762, + "step": 13435 + }, + { + "epoch": 0.9056298642229036, + "grad_norm": 4.485392880009042, + "learning_rate": 2.6866726573021025e-08, + "loss": 1.6518, + "step": 13440 + }, + { + "epoch": 0.9059667800950103, + "grad_norm": 4.932314532115502, + "learning_rate": 2.667688602710455e-08, + "loss": 1.6006, + "step": 13445 + }, + { + "epoch": 0.906303695967117, + "grad_norm": 4.503122097316531, + "learning_rate": 2.648770018458807e-08, + "loss": 1.6812, + "step": 13450 + }, + { + "epoch": 0.9066406118392237, + "grad_norm": 4.574449543803867, + "learning_rate": 2.6299169307154535e-08, + "loss": 1.6163, + "step": 13455 + }, + { + "epoch": 0.9069775277113304, + "grad_norm": 4.834936908101332, + "learning_rate": 2.611129365558118e-08, + "loss": 1.6345, + "step": 13460 + }, + { + "epoch": 0.9073144435834373, + "grad_norm": 4.628745448605451, + "learning_rate": 2.592407348973852e-08, + "loss": 1.6528, + "step": 13465 + }, + { + "epoch": 0.907651359455544, + "grad_norm": 4.335332294524596, + "learning_rate": 2.573750906859079e-08, + "loss": 1.6507, + "step": 13470 + }, + { + "epoch": 0.9079882753276507, + "grad_norm": 4.6604992266607095, + "learning_rate": 2.5551600650194906e-08, + "loss": 1.6526, + "step": 13475 + }, + { + "epoch": 0.9083251911997574, + "grad_norm": 4.4333865162999055, + "learning_rate": 2.536634849170055e-08, + "loss": 1.6162, + "step": 13480 + }, + { + "epoch": 0.9086621070718641, + "grad_norm": 4.85890159742819, + "learning_rate": 2.5181752849349593e-08, + "loss": 1.7159, + "step": 13485 + }, + { + "epoch": 0.9089990229439708, + "grad_norm": 4.729622761774583, + "learning_rate": 2.4997813978476e-08, + "loss": 1.629, + "step": 13490 + }, + { + "epoch": 0.9093359388160777, + "grad_norm": 4.445209264708288, + "learning_rate": 2.481453213350493e-08, + "loss": 1.6073, + "step": 13495 + }, + { + "epoch": 0.9096728546881844, + "grad_norm": 4.331937548330955, + "learning_rate": 2.463190756795308e-08, + "loss": 1.7122, + "step": 13500 + }, + { + "epoch": 0.9100097705602911, + "grad_norm": 4.210089541483122, + "learning_rate": 2.4449940534427836e-08, + "loss": 1.6794, + "step": 13505 + }, + { + "epoch": 0.9103466864323978, + "grad_norm": 4.522200369117284, + "learning_rate": 2.4268631284627027e-08, + "loss": 1.6555, + "step": 13510 + }, + { + "epoch": 0.9106836023045045, + "grad_norm": 4.620662367302598, + "learning_rate": 2.408798006933882e-08, + "loss": 1.6215, + "step": 13515 + }, + { + "epoch": 0.9110205181766113, + "grad_norm": 4.935550615365647, + "learning_rate": 2.3907987138440945e-08, + "loss": 1.745, + "step": 13520 + }, + { + "epoch": 0.911357434048718, + "grad_norm": 4.809365117645508, + "learning_rate": 2.3728652740900856e-08, + "loss": 1.7111, + "step": 13525 + }, + { + "epoch": 0.9116943499208248, + "grad_norm": 4.958322472785031, + "learning_rate": 2.3549977124774857e-08, + "loss": 1.6901, + "step": 13530 + }, + { + "epoch": 0.9120312657929315, + "grad_norm": 4.935261215153186, + "learning_rate": 2.337196053720819e-08, + "loss": 1.73, + "step": 13535 + }, + { + "epoch": 0.9123681816650382, + "grad_norm": 4.511277252542813, + "learning_rate": 2.319460322443456e-08, + "loss": 1.7142, + "step": 13540 + }, + { + "epoch": 0.912705097537145, + "grad_norm": 4.723475456942302, + "learning_rate": 2.301790543177551e-08, + "loss": 1.7186, + "step": 13545 + }, + { + "epoch": 0.9130420134092517, + "grad_norm": 4.584142787010846, + "learning_rate": 2.284186740364069e-08, + "loss": 1.6963, + "step": 13550 + }, + { + "epoch": 0.9133789292813584, + "grad_norm": 4.415909599002564, + "learning_rate": 2.266648938352672e-08, + "loss": 1.6398, + "step": 13555 + }, + { + "epoch": 0.9137158451534652, + "grad_norm": 4.569776155689798, + "learning_rate": 2.249177161401783e-08, + "loss": 1.633, + "step": 13560 + }, + { + "epoch": 0.9140527610255719, + "grad_norm": 4.694072322744868, + "learning_rate": 2.2317714336784422e-08, + "loss": 1.6676, + "step": 13565 + }, + { + "epoch": 0.9143896768976787, + "grad_norm": 4.6866620357865605, + "learning_rate": 2.21443177925838e-08, + "loss": 1.6787, + "step": 13570 + }, + { + "epoch": 0.9147265927697854, + "grad_norm": 4.860394188870198, + "learning_rate": 2.1971582221258944e-08, + "loss": 1.5489, + "step": 13575 + }, + { + "epoch": 0.9150635086418921, + "grad_norm": 4.196003767451703, + "learning_rate": 2.1799507861738788e-08, + "loss": 1.6717, + "step": 13580 + }, + { + "epoch": 0.9154004245139988, + "grad_norm": 4.565008110985724, + "learning_rate": 2.1628094952037713e-08, + "loss": 1.657, + "step": 13585 + }, + { + "epoch": 0.9157373403861055, + "grad_norm": 4.4052643514904695, + "learning_rate": 2.1457343729255062e-08, + "loss": 1.6563, + "step": 13590 + }, + { + "epoch": 0.9160742562582124, + "grad_norm": 4.565101687356525, + "learning_rate": 2.128725442957491e-08, + "loss": 1.6745, + "step": 13595 + }, + { + "epoch": 0.9164111721303191, + "grad_norm": 4.6695200252580955, + "learning_rate": 2.111782728826583e-08, + "loss": 1.6678, + "step": 13600 + }, + { + "epoch": 0.9167480880024258, + "grad_norm": 4.512246052954719, + "learning_rate": 2.0949062539680486e-08, + "loss": 1.7268, + "step": 13605 + }, + { + "epoch": 0.9170850038745325, + "grad_norm": 4.864232850938187, + "learning_rate": 2.07809604172553e-08, + "loss": 1.644, + "step": 13610 + }, + { + "epoch": 0.9174219197466392, + "grad_norm": 4.5261117115863545, + "learning_rate": 2.0613521153510115e-08, + "loss": 1.6487, + "step": 13615 + }, + { + "epoch": 0.917758835618746, + "grad_norm": 4.720489538078507, + "learning_rate": 2.0446744980048002e-08, + "loss": 1.6534, + "step": 13620 + }, + { + "epoch": 0.9180957514908528, + "grad_norm": 4.8300960188069055, + "learning_rate": 2.0280632127554708e-08, + "loss": 1.7123, + "step": 13625 + }, + { + "epoch": 0.9184326673629595, + "grad_norm": 4.36414883570687, + "learning_rate": 2.011518282579855e-08, + "loss": 1.6826, + "step": 13630 + }, + { + "epoch": 0.9187695832350662, + "grad_norm": 4.419503790016664, + "learning_rate": 1.9950397303630075e-08, + "loss": 1.7687, + "step": 13635 + }, + { + "epoch": 0.9191064991071729, + "grad_norm": 4.517493564314203, + "learning_rate": 1.9786275788981565e-08, + "loss": 1.6415, + "step": 13640 + }, + { + "epoch": 0.9194434149792797, + "grad_norm": 4.63746474526432, + "learning_rate": 1.9622818508866823e-08, + "loss": 1.6636, + "step": 13645 + }, + { + "epoch": 0.9197803308513864, + "grad_norm": 4.791877863432056, + "learning_rate": 1.9460025689381043e-08, + "loss": 1.7187, + "step": 13650 + }, + { + "epoch": 0.9201172467234932, + "grad_norm": 4.074508619807229, + "learning_rate": 1.9297897555700216e-08, + "loss": 1.6615, + "step": 13655 + }, + { + "epoch": 0.9204541625955999, + "grad_norm": 4.488988677170429, + "learning_rate": 1.9136434332080898e-08, + "loss": 1.6658, + "step": 13660 + }, + { + "epoch": 0.9207910784677066, + "grad_norm": 4.572599413440709, + "learning_rate": 1.8975636241860048e-08, + "loss": 1.6673, + "step": 13665 + }, + { + "epoch": 0.9211279943398133, + "grad_norm": 4.591900564378601, + "learning_rate": 1.8815503507454644e-08, + "loss": 1.5942, + "step": 13670 + }, + { + "epoch": 0.9214649102119201, + "grad_norm": 4.651051241662432, + "learning_rate": 1.8656036350361117e-08, + "loss": 1.6395, + "step": 13675 + }, + { + "epoch": 0.9218018260840268, + "grad_norm": 4.819658152787535, + "learning_rate": 1.8497234991155463e-08, + "loss": 1.6572, + "step": 13680 + }, + { + "epoch": 0.9221387419561335, + "grad_norm": 4.300806686835021, + "learning_rate": 1.8339099649492762e-08, + "loss": 1.6533, + "step": 13685 + }, + { + "epoch": 0.9224756578282403, + "grad_norm": 4.702436296805395, + "learning_rate": 1.8181630544106653e-08, + "loss": 1.6403, + "step": 13690 + }, + { + "epoch": 0.922812573700347, + "grad_norm": 5.236256063034201, + "learning_rate": 1.8024827892809346e-08, + "loss": 1.7093, + "step": 13695 + }, + { + "epoch": 0.9231494895724538, + "grad_norm": 4.781425278603808, + "learning_rate": 1.7868691912491352e-08, + "loss": 1.6634, + "step": 13700 + }, + { + "epoch": 0.9234864054445605, + "grad_norm": 5.000408465760196, + "learning_rate": 1.77132228191208e-08, + "loss": 1.7284, + "step": 13705 + }, + { + "epoch": 0.9238233213166672, + "grad_norm": 4.511488885466128, + "learning_rate": 1.7558420827743505e-08, + "loss": 1.6045, + "step": 13710 + }, + { + "epoch": 0.9241602371887739, + "grad_norm": 4.4336520051897415, + "learning_rate": 1.7404286152482573e-08, + "loss": 1.7259, + "step": 13715 + }, + { + "epoch": 0.9244971530608806, + "grad_norm": 4.913660389081854, + "learning_rate": 1.725081900653791e-08, + "loss": 1.6808, + "step": 13720 + }, + { + "epoch": 0.9248340689329875, + "grad_norm": 4.697631827271392, + "learning_rate": 1.7098019602186376e-08, + "loss": 1.6687, + "step": 13725 + }, + { + "epoch": 0.9251709848050942, + "grad_norm": 4.553841575367267, + "learning_rate": 1.6945888150780797e-08, + "loss": 1.6873, + "step": 13730 + }, + { + "epoch": 0.9255079006772009, + "grad_norm": 4.4992528881221485, + "learning_rate": 1.6794424862750568e-08, + "loss": 1.6747, + "step": 13735 + }, + { + "epoch": 0.9258448165493076, + "grad_norm": 5.104827419984812, + "learning_rate": 1.664362994760038e-08, + "loss": 1.6637, + "step": 13740 + }, + { + "epoch": 0.9261817324214143, + "grad_norm": 4.680722518422123, + "learning_rate": 1.649350361391083e-08, + "loss": 1.6732, + "step": 13745 + }, + { + "epoch": 0.9265186482935212, + "grad_norm": 5.31436095897836, + "learning_rate": 1.6344046069337646e-08, + "loss": 1.6788, + "step": 13750 + }, + { + "epoch": 0.9268555641656279, + "grad_norm": 4.234586773332621, + "learning_rate": 1.6195257520611182e-08, + "loss": 1.6974, + "step": 13755 + }, + { + "epoch": 0.9271924800377346, + "grad_norm": 4.272670471430088, + "learning_rate": 1.604713817353681e-08, + "loss": 1.6658, + "step": 13760 + }, + { + "epoch": 0.9275293959098413, + "grad_norm": 4.6885220919191966, + "learning_rate": 1.5899688232994147e-08, + "loss": 1.6807, + "step": 13765 + }, + { + "epoch": 0.927866311781948, + "grad_norm": 4.662232899644492, + "learning_rate": 1.5752907902936707e-08, + "loss": 1.7172, + "step": 13770 + }, + { + "epoch": 0.9282032276540548, + "grad_norm": 4.6833045439786405, + "learning_rate": 1.560679738639198e-08, + "loss": 1.68, + "step": 13775 + }, + { + "epoch": 0.9285401435261615, + "grad_norm": 4.756429528552997, + "learning_rate": 1.5461356885461075e-08, + "loss": 1.6828, + "step": 13780 + }, + { + "epoch": 0.9288770593982683, + "grad_norm": 4.710974308535877, + "learning_rate": 1.5316586601317905e-08, + "loss": 1.6583, + "step": 13785 + }, + { + "epoch": 0.929213975270375, + "grad_norm": 4.489974244824489, + "learning_rate": 1.5172486734209788e-08, + "loss": 1.6617, + "step": 13790 + }, + { + "epoch": 0.9295508911424817, + "grad_norm": 4.54926827654004, + "learning_rate": 1.502905748345651e-08, + "loss": 1.712, + "step": 13795 + }, + { + "epoch": 0.9298878070145885, + "grad_norm": 4.55393602811608, + "learning_rate": 1.4886299047450257e-08, + "loss": 1.7058, + "step": 13800 + }, + { + "epoch": 0.9302247228866952, + "grad_norm": 4.473944643605008, + "learning_rate": 1.4744211623655356e-08, + "loss": 1.6269, + "step": 13805 + }, + { + "epoch": 0.9305616387588019, + "grad_norm": 4.874670195538181, + "learning_rate": 1.4602795408607982e-08, + "loss": 1.739, + "step": 13810 + }, + { + "epoch": 0.9308985546309086, + "grad_norm": 4.354158570656502, + "learning_rate": 1.4462050597915942e-08, + "loss": 1.6832, + "step": 13815 + }, + { + "epoch": 0.9312354705030154, + "grad_norm": 4.510572112208238, + "learning_rate": 1.4321977386258289e-08, + "loss": 1.6461, + "step": 13820 + }, + { + "epoch": 0.9315723863751222, + "grad_norm": 4.497444842424594, + "learning_rate": 1.4182575967385092e-08, + "loss": 1.683, + "step": 13825 + }, + { + "epoch": 0.9319093022472289, + "grad_norm": 4.651578766108472, + "learning_rate": 1.4043846534117331e-08, + "loss": 1.7365, + "step": 13830 + }, + { + "epoch": 0.9322462181193356, + "grad_norm": 4.604232959356164, + "learning_rate": 1.3905789278346347e-08, + "loss": 1.7186, + "step": 13835 + }, + { + "epoch": 0.9325831339914423, + "grad_norm": 4.8235544459128885, + "learning_rate": 1.3768404391033717e-08, + "loss": 1.6283, + "step": 13840 + }, + { + "epoch": 0.932920049863549, + "grad_norm": 4.535664336555867, + "learning_rate": 1.3631692062211209e-08, + "loss": 1.674, + "step": 13845 + }, + { + "epoch": 0.9332569657356558, + "grad_norm": 4.220888842317781, + "learning_rate": 1.3495652480979947e-08, + "loss": 1.593, + "step": 13850 + }, + { + "epoch": 0.9335938816077626, + "grad_norm": 4.7901986238588234, + "learning_rate": 1.3360285835510854e-08, + "loss": 1.6707, + "step": 13855 + }, + { + "epoch": 0.9339307974798693, + "grad_norm": 4.325902736144492, + "learning_rate": 1.322559231304382e-08, + "loss": 1.6401, + "step": 13860 + }, + { + "epoch": 0.934267713351976, + "grad_norm": 4.222275752497884, + "learning_rate": 1.3091572099887816e-08, + "loss": 1.6086, + "step": 13865 + }, + { + "epoch": 0.9346046292240827, + "grad_norm": 4.971828618203986, + "learning_rate": 1.2958225381420329e-08, + "loss": 1.7208, + "step": 13870 + }, + { + "epoch": 0.9349415450961894, + "grad_norm": 4.478315582636149, + "learning_rate": 1.282555234208732e-08, + "loss": 1.6177, + "step": 13875 + }, + { + "epoch": 0.9352784609682963, + "grad_norm": 4.480192189419776, + "learning_rate": 1.2693553165403104e-08, + "loss": 1.6847, + "step": 13880 + }, + { + "epoch": 0.935615376840403, + "grad_norm": 4.280287532922316, + "learning_rate": 1.2562228033949628e-08, + "loss": 1.6754, + "step": 13885 + }, + { + "epoch": 0.9359522927125097, + "grad_norm": 4.6862475175220135, + "learning_rate": 1.243157712937659e-08, + "loss": 1.673, + "step": 13890 + }, + { + "epoch": 0.9362892085846164, + "grad_norm": 4.444473930426543, + "learning_rate": 1.230160063240121e-08, + "loss": 1.675, + "step": 13895 + }, + { + "epoch": 0.9366261244567231, + "grad_norm": 4.765538180006135, + "learning_rate": 1.2172298722807617e-08, + "loss": 1.6323, + "step": 13900 + }, + { + "epoch": 0.9369630403288299, + "grad_norm": 5.033579348083475, + "learning_rate": 1.204367157944708e-08, + "loss": 1.6684, + "step": 13905 + }, + { + "epoch": 0.9372999562009366, + "grad_norm": 4.739347042185302, + "learning_rate": 1.19157193802375e-08, + "loss": 1.6221, + "step": 13910 + }, + { + "epoch": 0.9376368720730434, + "grad_norm": 4.498216776239286, + "learning_rate": 1.1788442302163026e-08, + "loss": 1.5921, + "step": 13915 + }, + { + "epoch": 0.9379737879451501, + "grad_norm": 4.109009748201763, + "learning_rate": 1.1661840521274168e-08, + "loss": 1.6288, + "step": 13920 + }, + { + "epoch": 0.9383107038172568, + "grad_norm": 4.419909793832432, + "learning_rate": 1.1535914212687237e-08, + "loss": 1.6096, + "step": 13925 + }, + { + "epoch": 0.9386476196893636, + "grad_norm": 4.7263234586750125, + "learning_rate": 1.1410663550584287e-08, + "loss": 1.6956, + "step": 13930 + }, + { + "epoch": 0.9389845355614703, + "grad_norm": 4.617355329685817, + "learning_rate": 1.1286088708212793e-08, + "loss": 1.6558, + "step": 13935 + }, + { + "epoch": 0.939321451433577, + "grad_norm": 4.721202684803363, + "learning_rate": 1.1162189857885362e-08, + "loss": 1.5686, + "step": 13940 + }, + { + "epoch": 0.9396583673056838, + "grad_norm": 4.7742840434126474, + "learning_rate": 1.1038967170979741e-08, + "loss": 1.718, + "step": 13945 + }, + { + "epoch": 0.9399952831777905, + "grad_norm": 4.78729301438576, + "learning_rate": 1.0916420817938254e-08, + "loss": 1.6548, + "step": 13950 + }, + { + "epoch": 0.9403321990498973, + "grad_norm": 4.736016915905797, + "learning_rate": 1.0794550968267701e-08, + "loss": 1.6462, + "step": 13955 + }, + { + "epoch": 0.940669114922004, + "grad_norm": 3.9679066975931954, + "learning_rate": 1.0673357790539294e-08, + "loss": 1.6619, + "step": 13960 + }, + { + "epoch": 0.9410060307941107, + "grad_norm": 4.608423358301694, + "learning_rate": 1.0552841452388105e-08, + "loss": 1.6532, + "step": 13965 + }, + { + "epoch": 0.9413429466662174, + "grad_norm": 4.90480375150542, + "learning_rate": 1.0433002120513123e-08, + "loss": 1.7195, + "step": 13970 + }, + { + "epoch": 0.9416798625383241, + "grad_norm": 4.2680828511473665, + "learning_rate": 1.0313839960676751e-08, + "loss": 1.6184, + "step": 13975 + }, + { + "epoch": 0.942016778410431, + "grad_norm": 4.180504127514293, + "learning_rate": 1.019535513770492e-08, + "loss": 1.5969, + "step": 13980 + }, + { + "epoch": 0.9423536942825377, + "grad_norm": 4.751749615006871, + "learning_rate": 1.0077547815486476e-08, + "loss": 1.7189, + "step": 13985 + }, + { + "epoch": 0.9426906101546444, + "grad_norm": 4.557131971025592, + "learning_rate": 9.960418156973238e-09, + "loss": 1.7, + "step": 13990 + }, + { + "epoch": 0.9430275260267511, + "grad_norm": 4.8323487176650355, + "learning_rate": 9.843966324179609e-09, + "loss": 1.668, + "step": 13995 + }, + { + "epoch": 0.9433644418988578, + "grad_norm": 4.241230044588303, + "learning_rate": 9.728192478182573e-09, + "loss": 1.6605, + "step": 14000 + }, + { + "epoch": 0.9437013577709646, + "grad_norm": 4.681284350687982, + "learning_rate": 9.613096779121089e-09, + "loss": 1.6712, + "step": 14005 + }, + { + "epoch": 0.9440382736430714, + "grad_norm": 4.862508336979253, + "learning_rate": 9.498679386196417e-09, + "loss": 1.5407, + "step": 14010 + }, + { + "epoch": 0.9443751895151781, + "grad_norm": 4.877091231184954, + "learning_rate": 9.384940457671186e-09, + "loss": 1.6501, + "step": 14015 + }, + { + "epoch": 0.9447121053872848, + "grad_norm": 4.841069261109426, + "learning_rate": 9.271880150869882e-09, + "loss": 1.7064, + "step": 14020 + }, + { + "epoch": 0.9450490212593915, + "grad_norm": 4.725458488667362, + "learning_rate": 9.15949862217824e-09, + "loss": 1.684, + "step": 14025 + }, + { + "epoch": 0.9453859371314982, + "grad_norm": 4.8010599888096595, + "learning_rate": 9.04779602704292e-09, + "loss": 1.7179, + "step": 14030 + }, + { + "epoch": 0.945722853003605, + "grad_norm": 4.666711180892035, + "learning_rate": 8.936772519971769e-09, + "loss": 1.6452, + "step": 14035 + }, + { + "epoch": 0.9460597688757117, + "grad_norm": 4.563602390830038, + "learning_rate": 8.826428254533169e-09, + "loss": 1.709, + "step": 14040 + }, + { + "epoch": 0.9463966847478185, + "grad_norm": 4.238437561918158, + "learning_rate": 8.716763383355862e-09, + "loss": 1.6716, + "step": 14045 + }, + { + "epoch": 0.9467336006199252, + "grad_norm": 4.777398386226606, + "learning_rate": 8.607778058129122e-09, + "loss": 1.6786, + "step": 14050 + }, + { + "epoch": 0.9470705164920319, + "grad_norm": 4.558655591881119, + "learning_rate": 8.499472429601972e-09, + "loss": 1.618, + "step": 14055 + }, + { + "epoch": 0.9474074323641387, + "grad_norm": 4.747135549338924, + "learning_rate": 8.391846647583468e-09, + "loss": 1.692, + "step": 14060 + }, + { + "epoch": 0.9477443482362454, + "grad_norm": 4.881503970705126, + "learning_rate": 8.284900860942246e-09, + "loss": 1.7147, + "step": 14065 + }, + { + "epoch": 0.9480812641083521, + "grad_norm": 4.694090559014302, + "learning_rate": 8.178635217606367e-09, + "loss": 1.6388, + "step": 14070 + }, + { + "epoch": 0.9484181799804589, + "grad_norm": 4.462356406892563, + "learning_rate": 8.073049864563142e-09, + "loss": 1.6103, + "step": 14075 + }, + { + "epoch": 0.9487550958525656, + "grad_norm": 4.268419903555753, + "learning_rate": 7.968144947858801e-09, + "loss": 1.6273, + "step": 14080 + }, + { + "epoch": 0.9490920117246724, + "grad_norm": 4.72808438390043, + "learning_rate": 7.863920612598496e-09, + "loss": 1.6714, + "step": 14085 + }, + { + "epoch": 0.9494289275967791, + "grad_norm": 4.456322087598728, + "learning_rate": 7.760377002945961e-09, + "loss": 1.677, + "step": 14090 + }, + { + "epoch": 0.9497658434688858, + "grad_norm": 4.381592471889056, + "learning_rate": 7.657514262123354e-09, + "loss": 1.6303, + "step": 14095 + }, + { + "epoch": 0.9501027593409925, + "grad_norm": 5.084398199395626, + "learning_rate": 7.55533253241103e-09, + "loss": 1.6657, + "step": 14100 + }, + { + "epoch": 0.9504396752130992, + "grad_norm": 4.506764254750484, + "learning_rate": 7.453831955147428e-09, + "loss": 1.67, + "step": 14105 + }, + { + "epoch": 0.9507765910852061, + "grad_norm": 4.1594856114480265, + "learning_rate": 7.353012670728631e-09, + "loss": 1.6293, + "step": 14110 + }, + { + "epoch": 0.9511135069573128, + "grad_norm": 4.2947296679424225, + "learning_rate": 7.252874818608645e-09, + "loss": 1.6333, + "step": 14115 + }, + { + "epoch": 0.9514504228294195, + "grad_norm": 4.54393570232016, + "learning_rate": 7.153418537298617e-09, + "loss": 1.7011, + "step": 14120 + }, + { + "epoch": 0.9517873387015262, + "grad_norm": 4.728057239456153, + "learning_rate": 7.0546439643671685e-09, + "loss": 1.6314, + "step": 14125 + }, + { + "epoch": 0.9521242545736329, + "grad_norm": 4.668701870639253, + "learning_rate": 6.9565512364398445e-09, + "loss": 1.6822, + "step": 14130 + }, + { + "epoch": 0.9524611704457397, + "grad_norm": 4.381883653333877, + "learning_rate": 6.859140489199167e-09, + "loss": 1.6452, + "step": 14135 + }, + { + "epoch": 0.9527980863178465, + "grad_norm": 4.4705819028917535, + "learning_rate": 6.762411857384187e-09, + "loss": 1.71, + "step": 14140 + }, + { + "epoch": 0.9531350021899532, + "grad_norm": 5.099898183629039, + "learning_rate": 6.666365474790492e-09, + "loss": 1.6914, + "step": 14145 + }, + { + "epoch": 0.9534719180620599, + "grad_norm": 5.008579652213383, + "learning_rate": 6.571001474270144e-09, + "loss": 1.6008, + "step": 14150 + }, + { + "epoch": 0.9538088339341666, + "grad_norm": 4.4435992523929215, + "learning_rate": 6.4763199877311825e-09, + "loss": 1.6745, + "step": 14155 + }, + { + "epoch": 0.9541457498062734, + "grad_norm": 4.416720324518652, + "learning_rate": 6.382321146137571e-09, + "loss": 1.7335, + "step": 14160 + }, + { + "epoch": 0.9544826656783801, + "grad_norm": 4.6157251630017155, + "learning_rate": 6.28900507950908e-09, + "loss": 1.6982, + "step": 14165 + }, + { + "epoch": 0.9548195815504869, + "grad_norm": 4.576436969845522, + "learning_rate": 6.196371916921073e-09, + "loss": 1.6821, + "step": 14170 + }, + { + "epoch": 0.9551564974225936, + "grad_norm": 4.8030806252738385, + "learning_rate": 6.1044217865043325e-09, + "loss": 1.6766, + "step": 14175 + }, + { + "epoch": 0.9554934132947003, + "grad_norm": 4.788114104485619, + "learning_rate": 6.013154815444732e-09, + "loss": 1.7202, + "step": 14180 + }, + { + "epoch": 0.9558303291668071, + "grad_norm": 4.497660090547918, + "learning_rate": 5.922571129983456e-09, + "loss": 1.716, + "step": 14185 + }, + { + "epoch": 0.9561672450389138, + "grad_norm": 4.977854172865926, + "learning_rate": 5.832670855416277e-09, + "loss": 1.6901, + "step": 14190 + }, + { + "epoch": 0.9565041609110205, + "grad_norm": 4.665062252009661, + "learning_rate": 5.7434541160938375e-09, + "loss": 1.6698, + "step": 14195 + }, + { + "epoch": 0.9568410767831272, + "grad_norm": 4.503462544044519, + "learning_rate": 5.6549210354212565e-09, + "loss": 1.7074, + "step": 14200 + }, + { + "epoch": 0.957177992655234, + "grad_norm": 4.828820266872503, + "learning_rate": 5.567071735858131e-09, + "loss": 1.6173, + "step": 14205 + }, + { + "epoch": 0.9575149085273407, + "grad_norm": 4.577331498261228, + "learning_rate": 5.4799063389179834e-09, + "loss": 1.6618, + "step": 14210 + }, + { + "epoch": 0.9578518243994475, + "grad_norm": 4.70378738630458, + "learning_rate": 5.393424965168702e-09, + "loss": 1.7063, + "step": 14215 + }, + { + "epoch": 0.9581887402715542, + "grad_norm": 5.166081051847853, + "learning_rate": 5.307627734231657e-09, + "loss": 1.7223, + "step": 14220 + }, + { + "epoch": 0.9585256561436609, + "grad_norm": 4.51665984078035, + "learning_rate": 5.222514764782193e-09, + "loss": 1.735, + "step": 14225 + }, + { + "epoch": 0.9588625720157676, + "grad_norm": 4.452215666971605, + "learning_rate": 5.138086174549083e-09, + "loss": 1.6995, + "step": 14230 + }, + { + "epoch": 0.9591994878878743, + "grad_norm": 4.794617784858695, + "learning_rate": 5.054342080314522e-09, + "loss": 1.6653, + "step": 14235 + }, + { + "epoch": 0.9595364037599812, + "grad_norm": 4.384539843873983, + "learning_rate": 4.97128259791374e-09, + "loss": 1.5975, + "step": 14240 + }, + { + "epoch": 0.9598733196320879, + "grad_norm": 4.7112314437798775, + "learning_rate": 4.888907842235113e-09, + "loss": 1.6114, + "step": 14245 + }, + { + "epoch": 0.9602102355041946, + "grad_norm": 4.718198652581357, + "learning_rate": 4.807217927220053e-09, + "loss": 1.6625, + "step": 14250 + }, + { + "epoch": 0.9605471513763013, + "grad_norm": 5.051593564690264, + "learning_rate": 4.726212965862342e-09, + "loss": 1.6603, + "step": 14255 + }, + { + "epoch": 0.960884067248408, + "grad_norm": 4.688799213508077, + "learning_rate": 4.645893070208684e-09, + "loss": 1.6349, + "step": 14260 + }, + { + "epoch": 0.9612209831205148, + "grad_norm": 4.922175546380499, + "learning_rate": 4.566258351357988e-09, + "loss": 1.6075, + "step": 14265 + }, + { + "epoch": 0.9615578989926216, + "grad_norm": 4.917762419900809, + "learning_rate": 4.48730891946153e-09, + "loss": 1.5986, + "step": 14270 + }, + { + "epoch": 0.9618948148647283, + "grad_norm": 4.598810064957941, + "learning_rate": 4.409044883722568e-09, + "loss": 1.6256, + "step": 14275 + }, + { + "epoch": 0.962231730736835, + "grad_norm": 4.356705697302099, + "learning_rate": 4.331466352396396e-09, + "loss": 1.7101, + "step": 14280 + }, + { + "epoch": 0.9625686466089417, + "grad_norm": 4.328389990749196, + "learning_rate": 4.2545734327902315e-09, + "loss": 1.6263, + "step": 14285 + }, + { + "epoch": 0.9629055624810485, + "grad_norm": 4.995883922084056, + "learning_rate": 4.178366231262665e-09, + "loss": 1.6551, + "step": 14290 + }, + { + "epoch": 0.9632424783531552, + "grad_norm": 4.475910128620982, + "learning_rate": 4.102844853224041e-09, + "loss": 1.6777, + "step": 14295 + }, + { + "epoch": 0.963579394225262, + "grad_norm": 4.658822634083728, + "learning_rate": 4.028009403135968e-09, + "loss": 1.6598, + "step": 14300 + }, + { + "epoch": 0.9639163100973687, + "grad_norm": 4.7826733156830255, + "learning_rate": 3.95385998451131e-09, + "loss": 1.7016, + "step": 14305 + }, + { + "epoch": 0.9642532259694754, + "grad_norm": 4.233830555284409, + "learning_rate": 3.880396699913968e-09, + "loss": 1.6825, + "step": 14310 + }, + { + "epoch": 0.9645901418415822, + "grad_norm": 4.995866302785352, + "learning_rate": 3.807619650958827e-09, + "loss": 1.6545, + "step": 14315 + }, + { + "epoch": 0.9649270577136889, + "grad_norm": 4.873719835419959, + "learning_rate": 3.7355289383115276e-09, + "loss": 1.6973, + "step": 14320 + }, + { + "epoch": 0.9652639735857956, + "grad_norm": 4.569288481946704, + "learning_rate": 3.664124661688417e-09, + "loss": 1.7109, + "step": 14325 + }, + { + "epoch": 0.9656008894579023, + "grad_norm": 5.142916963140143, + "learning_rate": 3.5934069198562677e-09, + "loss": 1.5927, + "step": 14330 + }, + { + "epoch": 0.9659378053300091, + "grad_norm": 4.503412551781982, + "learning_rate": 3.5233758106322787e-09, + "loss": 1.6719, + "step": 14335 + }, + { + "epoch": 0.9662747212021159, + "grad_norm": 4.78116076892331, + "learning_rate": 3.4540314308839635e-09, + "loss": 1.6598, + "step": 14340 + }, + { + "epoch": 0.9666116370742226, + "grad_norm": 4.503686920939599, + "learning_rate": 3.385373876528874e-09, + "loss": 1.7066, + "step": 14345 + }, + { + "epoch": 0.9669485529463293, + "grad_norm": 4.86363545716925, + "learning_rate": 3.3174032425345444e-09, + "loss": 1.6455, + "step": 14350 + }, + { + "epoch": 0.967285468818436, + "grad_norm": 4.69142574421157, + "learning_rate": 3.250119622918379e-09, + "loss": 1.6145, + "step": 14355 + }, + { + "epoch": 0.9676223846905427, + "grad_norm": 4.598668097010771, + "learning_rate": 3.1835231107474323e-09, + "loss": 1.6796, + "step": 14360 + }, + { + "epoch": 0.9679593005626496, + "grad_norm": 4.704583569768571, + "learning_rate": 3.1176137981385185e-09, + "loss": 1.6821, + "step": 14365 + }, + { + "epoch": 0.9682962164347563, + "grad_norm": 4.451333372128447, + "learning_rate": 3.0523917762576568e-09, + "loss": 1.6355, + "step": 14370 + }, + { + "epoch": 0.968633132306863, + "grad_norm": 4.419630961259664, + "learning_rate": 2.9878571353204595e-09, + "loss": 1.7504, + "step": 14375 + }, + { + "epoch": 0.9689700481789697, + "grad_norm": 4.668303542736468, + "learning_rate": 2.924009964591578e-09, + "loss": 1.6296, + "step": 14380 + }, + { + "epoch": 0.9693069640510764, + "grad_norm": 4.481187207413503, + "learning_rate": 2.8608503523848136e-09, + "loss": 1.7121, + "step": 14385 + }, + { + "epoch": 0.9696438799231831, + "grad_norm": 5.098540734967322, + "learning_rate": 2.7983783860629496e-09, + "loss": 1.5944, + "step": 14390 + }, + { + "epoch": 0.96998079579529, + "grad_norm": 4.44133751167953, + "learning_rate": 2.7365941520375303e-09, + "loss": 1.6733, + "step": 14395 + }, + { + "epoch": 0.9703177116673967, + "grad_norm": 4.46888227527462, + "learning_rate": 2.6754977357689724e-09, + "loss": 1.6552, + "step": 14400 + }, + { + "epoch": 0.9706546275395034, + "grad_norm": 4.348433001446189, + "learning_rate": 2.6150892217660647e-09, + "loss": 1.5949, + "step": 14405 + }, + { + "epoch": 0.9709915434116101, + "grad_norm": 4.788880812069154, + "learning_rate": 2.5553686935864126e-09, + "loss": 1.7321, + "step": 14410 + }, + { + "epoch": 0.9713284592837168, + "grad_norm": 4.64048628521751, + "learning_rate": 2.496336233835661e-09, + "loss": 1.6318, + "step": 14415 + }, + { + "epoch": 0.9716653751558236, + "grad_norm": 5.386824440878254, + "learning_rate": 2.437991924167937e-09, + "loss": 1.72, + "step": 14420 + }, + { + "epoch": 0.9720022910279303, + "grad_norm": 4.345030594331005, + "learning_rate": 2.380335845285464e-09, + "loss": 1.6684, + "step": 14425 + }, + { + "epoch": 0.9723392069000371, + "grad_norm": 5.205111442516552, + "learning_rate": 2.323368076938448e-09, + "loss": 1.6683, + "step": 14430 + }, + { + "epoch": 0.9726761227721438, + "grad_norm": 4.539297425674057, + "learning_rate": 2.2670886979250235e-09, + "loss": 1.6197, + "step": 14435 + }, + { + "epoch": 0.9730130386442505, + "grad_norm": 5.3078680962201386, + "learning_rate": 2.211497786091143e-09, + "loss": 1.6613, + "step": 14440 + }, + { + "epoch": 0.9733499545163573, + "grad_norm": 4.093183911517651, + "learning_rate": 2.1565954183306313e-09, + "loss": 1.6449, + "step": 14445 + }, + { + "epoch": 0.973686870388464, + "grad_norm": 4.956998797141432, + "learning_rate": 2.1023816705846853e-09, + "loss": 1.677, + "step": 14450 + }, + { + "epoch": 0.9740237862605707, + "grad_norm": 4.401344494084036, + "learning_rate": 2.048856617842043e-09, + "loss": 1.5762, + "step": 14455 + }, + { + "epoch": 0.9743607021326774, + "grad_norm": 4.460249220034049, + "learning_rate": 1.9960203341389813e-09, + "loss": 1.6107, + "step": 14460 + }, + { + "epoch": 0.9746976180047842, + "grad_norm": 4.684091839068217, + "learning_rate": 1.943872892558929e-09, + "loss": 1.693, + "step": 14465 + }, + { + "epoch": 0.975034533876891, + "grad_norm": 4.627584897372627, + "learning_rate": 1.8924143652325196e-09, + "loss": 1.6675, + "step": 14470 + }, + { + "epoch": 0.9753714497489977, + "grad_norm": 4.5804086807255855, + "learning_rate": 1.8416448233374848e-09, + "loss": 1.6034, + "step": 14475 + }, + { + "epoch": 0.9757083656211044, + "grad_norm": 4.784239841374212, + "learning_rate": 1.79156433709865e-09, + "loss": 1.698, + "step": 14480 + }, + { + "epoch": 0.9760452814932111, + "grad_norm": 4.50530685834436, + "learning_rate": 1.742172975787548e-09, + "loss": 1.6361, + "step": 14485 + }, + { + "epoch": 0.9763821973653178, + "grad_norm": 4.68302514179738, + "learning_rate": 1.6934708077226411e-09, + "loss": 1.6463, + "step": 14490 + }, + { + "epoch": 0.9767191132374247, + "grad_norm": 4.602230205378361, + "learning_rate": 1.6454579002690982e-09, + "loss": 1.6715, + "step": 14495 + }, + { + "epoch": 0.9770560291095314, + "grad_norm": 4.913828834206417, + "learning_rate": 1.5981343198386288e-09, + "loss": 1.6651, + "step": 14500 + }, + { + "epoch": 0.9773929449816381, + "grad_norm": 4.734186118385281, + "learning_rate": 1.5515001318895382e-09, + "loss": 1.6073, + "step": 14505 + }, + { + "epoch": 0.9777298608537448, + "grad_norm": 4.4545084698284505, + "learning_rate": 1.5055554009264505e-09, + "loss": 1.6929, + "step": 14510 + }, + { + "epoch": 0.9780667767258515, + "grad_norm": 4.785564771151706, + "learning_rate": 1.4603001905004187e-09, + "loss": 1.6346, + "step": 14515 + }, + { + "epoch": 0.9784036925979583, + "grad_norm": 4.645996933378816, + "learning_rate": 1.4157345632087592e-09, + "loss": 1.5743, + "step": 14520 + }, + { + "epoch": 0.978740608470065, + "grad_norm": 4.391419944898517, + "learning_rate": 1.37185858069494e-09, + "loss": 1.6349, + "step": 14525 + }, + { + "epoch": 0.9790775243421718, + "grad_norm": 4.301204477830003, + "learning_rate": 1.328672303648415e-09, + "loss": 1.6671, + "step": 14530 + }, + { + "epoch": 0.9794144402142785, + "grad_norm": 4.65876078456801, + "learning_rate": 1.2861757918046778e-09, + "loss": 1.6325, + "step": 14535 + }, + { + "epoch": 0.9797513560863852, + "grad_norm": 4.656387193046978, + "learning_rate": 1.2443691039452642e-09, + "loss": 1.6799, + "step": 14540 + }, + { + "epoch": 0.980088271958492, + "grad_norm": 4.856871488888764, + "learning_rate": 1.203252297897417e-09, + "loss": 1.6712, + "step": 14545 + }, + { + "epoch": 0.9804251878305987, + "grad_norm": 4.4246714925954755, + "learning_rate": 1.1628254305340869e-09, + "loss": 1.6516, + "step": 14550 + }, + { + "epoch": 0.9807621037027054, + "grad_norm": 4.861361000481014, + "learning_rate": 1.123088557773988e-09, + "loss": 1.6693, + "step": 14555 + }, + { + "epoch": 0.9810990195748122, + "grad_norm": 4.5445239138117595, + "learning_rate": 1.0840417345814312e-09, + "loss": 1.6838, + "step": 14560 + }, + { + "epoch": 0.9814359354469189, + "grad_norm": 4.296831382491651, + "learning_rate": 1.0456850149662134e-09, + "loss": 1.6672, + "step": 14565 + }, + { + "epoch": 0.9817728513190256, + "grad_norm": 4.360980348229525, + "learning_rate": 1.0080184519835056e-09, + "loss": 1.6291, + "step": 14570 + }, + { + "epoch": 0.9821097671911324, + "grad_norm": 4.6410736774559584, + "learning_rate": 9.71042097734076e-10, + "loss": 1.5869, + "step": 14575 + }, + { + "epoch": 0.9824466830632391, + "grad_norm": 4.672315989477686, + "learning_rate": 9.347560033637347e-10, + "loss": 1.687, + "step": 14580 + }, + { + "epoch": 0.9827835989353458, + "grad_norm": 4.760298790473192, + "learning_rate": 8.991602190636105e-10, + "loss": 1.6779, + "step": 14585 + }, + { + "epoch": 0.9831205148074526, + "grad_norm": 4.5956833044108105, + "learning_rate": 8.642547940700961e-10, + "loss": 1.6209, + "step": 14590 + }, + { + "epoch": 0.9834574306795593, + "grad_norm": 4.852699110524061, + "learning_rate": 8.300397766644595e-10, + "loss": 1.66, + "step": 14595 + }, + { + "epoch": 0.9837943465516661, + "grad_norm": 4.573099918969528, + "learning_rate": 7.965152141732878e-10, + "loss": 1.6092, + "step": 14600 + }, + { + "epoch": 0.9841312624237728, + "grad_norm": 5.041789334977715, + "learning_rate": 7.636811529678211e-10, + "loss": 1.7897, + "step": 14605 + }, + { + "epoch": 0.9844681782958795, + "grad_norm": 4.865882608674358, + "learning_rate": 7.315376384643968e-10, + "loss": 1.6522, + "step": 14610 + }, + { + "epoch": 0.9848050941679862, + "grad_norm": 4.754816598507756, + "learning_rate": 7.000847151240608e-10, + "loss": 1.6973, + "step": 14615 + }, + { + "epoch": 0.9851420100400929, + "grad_norm": 4.544176767785006, + "learning_rate": 6.693224264527897e-10, + "loss": 1.6629, + "step": 14620 + }, + { + "epoch": 0.9854789259121998, + "grad_norm": 4.345974794927777, + "learning_rate": 6.392508150011023e-10, + "loss": 1.6632, + "step": 14625 + }, + { + "epoch": 0.9858158417843065, + "grad_norm": 4.473605639509804, + "learning_rate": 6.098699223641701e-10, + "loss": 1.6382, + "step": 14630 + }, + { + "epoch": 0.9861527576564132, + "grad_norm": 4.2474880352741256, + "learning_rate": 5.811797891819847e-10, + "loss": 1.6289, + "step": 14635 + }, + { + "epoch": 0.9864896735285199, + "grad_norm": 4.706147394472757, + "learning_rate": 5.531804551387464e-10, + "loss": 1.6884, + "step": 14640 + }, + { + "epoch": 0.9868265894006266, + "grad_norm": 4.6198804722109506, + "learning_rate": 5.258719589634198e-10, + "loss": 1.5749, + "step": 14645 + }, + { + "epoch": 0.9871635052727334, + "grad_norm": 5.169200607824497, + "learning_rate": 4.992543384291781e-10, + "loss": 1.6929, + "step": 14650 + }, + { + "epoch": 0.9875004211448402, + "grad_norm": 5.093086280987769, + "learning_rate": 4.733276303537925e-10, + "loss": 1.7123, + "step": 14655 + }, + { + "epoch": 0.9878373370169469, + "grad_norm": 4.469256470389324, + "learning_rate": 4.480918705991321e-10, + "loss": 1.6728, + "step": 14660 + }, + { + "epoch": 0.9881742528890536, + "grad_norm": 4.937986515455233, + "learning_rate": 4.235470940715524e-10, + "loss": 1.6923, + "step": 14665 + }, + { + "epoch": 0.9885111687611603, + "grad_norm": 4.774095402654197, + "learning_rate": 3.99693334721507e-10, + "loss": 1.6331, + "step": 14670 + }, + { + "epoch": 0.9888480846332671, + "grad_norm": 4.895959891617084, + "learning_rate": 3.765306255436029e-10, + "loss": 1.6954, + "step": 14675 + }, + { + "epoch": 0.9891850005053738, + "grad_norm": 4.588526481995063, + "learning_rate": 3.540589985766562e-10, + "loss": 1.6825, + "step": 14680 + }, + { + "epoch": 0.9895219163774805, + "grad_norm": 4.632597593371431, + "learning_rate": 3.322784849036364e-10, + "loss": 1.6749, + "step": 14685 + }, + { + "epoch": 0.9898588322495873, + "grad_norm": 4.7885886471647, + "learning_rate": 3.11189114651389e-10, + "loss": 1.5306, + "step": 14690 + }, + { + "epoch": 0.990195748121694, + "grad_norm": 5.4431857664056285, + "learning_rate": 2.9079091699091287e-10, + "loss": 1.6503, + "step": 14695 + }, + { + "epoch": 0.9905326639938008, + "grad_norm": 4.560136830848719, + "learning_rate": 2.710839201370829e-10, + "loss": 1.7277, + "step": 14700 + }, + { + "epoch": 0.9908695798659075, + "grad_norm": 4.534976189027352, + "learning_rate": 2.5206815134881655e-10, + "loss": 1.6648, + "step": 14705 + }, + { + "epoch": 0.9912064957380142, + "grad_norm": 4.549435008199678, + "learning_rate": 2.337436369287404e-10, + "loss": 1.6607, + "step": 14710 + }, + { + "epoch": 0.9915434116101209, + "grad_norm": 4.429287754065413, + "learning_rate": 2.1611040222346833e-10, + "loss": 1.6488, + "step": 14715 + }, + { + "epoch": 0.9918803274822277, + "grad_norm": 4.450792157939868, + "learning_rate": 1.9916847162343432e-10, + "loss": 1.6863, + "step": 14720 + }, + { + "epoch": 0.9922172433543345, + "grad_norm": 4.456301955946831, + "learning_rate": 1.829178685627264e-10, + "loss": 1.6766, + "step": 14725 + }, + { + "epoch": 0.9925541592264412, + "grad_norm": 4.536387361240234, + "learning_rate": 1.6735861551936402e-10, + "loss": 1.6072, + "step": 14730 + }, + { + "epoch": 0.9928910750985479, + "grad_norm": 4.5301760503399064, + "learning_rate": 1.5249073401502055e-10, + "loss": 1.6165, + "step": 14735 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 4.626033879980004, + "learning_rate": 1.3831424461496766e-10, + "loss": 1.6704, + "step": 14740 + }, + { + "epoch": 0.9935649068427613, + "grad_norm": 4.638098471878353, + "learning_rate": 1.2482916692824197e-10, + "loss": 1.6164, + "step": 14745 + }, + { + "epoch": 0.993901822714868, + "grad_norm": 4.640388344532539, + "learning_rate": 1.1203551960742297e-10, + "loss": 1.6507, + "step": 14750 + }, + { + "epoch": 0.9942387385869749, + "grad_norm": 4.703240878023018, + "learning_rate": 9.993332034891056e-11, + "loss": 1.6233, + "step": 14755 + }, + { + "epoch": 0.9945756544590816, + "grad_norm": 4.59940282508, + "learning_rate": 8.852258589236994e-11, + "loss": 1.6481, + "step": 14760 + }, + { + "epoch": 0.9949125703311883, + "grad_norm": 4.95979390658074, + "learning_rate": 7.780333202134226e-11, + "loss": 1.6457, + "step": 14765 + }, + { + "epoch": 0.995249486203295, + "grad_norm": 4.709459134710041, + "learning_rate": 6.777557356263397e-11, + "loss": 1.6773, + "step": 14770 + }, + { + "epoch": 0.9955864020754017, + "grad_norm": 4.6127703869383545, + "learning_rate": 5.843932438681643e-11, + "loss": 1.7132, + "step": 14775 + }, + { + "epoch": 0.9959233179475085, + "grad_norm": 4.708135222754572, + "learning_rate": 4.97945974077818e-11, + "loss": 1.7054, + "step": 14780 + }, + { + "epoch": 0.9962602338196153, + "grad_norm": 5.013474030961767, + "learning_rate": 4.1841404582965143e-11, + "loss": 1.6342, + "step": 14785 + }, + { + "epoch": 0.996597149691722, + "grad_norm": 4.300498430663266, + "learning_rate": 3.457975691334436e-11, + "loss": 1.5682, + "step": 14790 + }, + { + "epoch": 0.9969340655638287, + "grad_norm": 4.445836794215863, + "learning_rate": 2.800966444316266e-11, + "loss": 1.6325, + "step": 14795 + }, + { + "epoch": 0.9972709814359354, + "grad_norm": 4.491154818955257, + "learning_rate": 2.213113626026164e-11, + "loss": 1.6928, + "step": 14800 + }, + { + "epoch": 0.9976078973080422, + "grad_norm": 4.732056867538516, + "learning_rate": 1.6944180495914728e-11, + "loss": 1.5832, + "step": 14805 + }, + { + "epoch": 0.9979448131801489, + "grad_norm": 4.736707269460872, + "learning_rate": 1.2448804324660667e-11, + "loss": 1.7032, + "step": 14810 + }, + { + "epoch": 0.9982817290522557, + "grad_norm": 4.805801710749307, + "learning_rate": 8.645013964581061e-12, + "loss": 1.7129, + "step": 14815 + }, + { + "epoch": 0.9986186449243624, + "grad_norm": 4.748295330431418, + "learning_rate": 5.532814677133846e-12, + "loss": 1.684, + "step": 14820 + }, + { + "epoch": 0.9989555607964691, + "grad_norm": 4.76094035726937, + "learning_rate": 3.112210767042267e-12, + "loss": 1.692, + "step": 14825 + }, + { + "epoch": 0.9992924766685759, + "grad_norm": 4.743959538402305, + "learning_rate": 1.383205582516922e-12, + "loss": 1.5827, + "step": 14830 + }, + { + "epoch": 0.9996293925406826, + "grad_norm": 4.50407060815357, + "learning_rate": 3.4580151520025024e-13, + "loss": 1.7556, + "step": 14835 + }, + { + "epoch": 0.9999663084127893, + "grad_norm": 4.787374440275167, + "learning_rate": 0.0, + "loss": 1.6964, + "step": 14840 + }, + { + "epoch": 0.9999663084127893, + "eval_loss": NaN, + "eval_runtime": 130.4563, + "eval_samples_per_second": 36.779, + "eval_steps_per_second": 4.599, + "step": 14840 + }, + { + "epoch": 0.9999663084127893, + "step": 14840, + "total_flos": 1.8023025589165752e+18, + "train_loss": 1.7489710538856424, + "train_runtime": 43730.9212, + "train_samples_per_second": 10.859, + "train_steps_per_second": 0.339 + } + ], + "logging_steps": 5, + "max_steps": 14840, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8023025589165752e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}