diff --git "a/checkpoint-6500/trainer_state.json" "b/checkpoint-6500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6500/trainer_state.json" @@ -0,0 +1,4584 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0051031256644696, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015464017165059054, + "grad_norm": 4352.0, + "learning_rate": 1.9972307692307693e-05, + "loss": 10.9174, + "step": 10 + }, + { + "epoch": 0.0030928034330118107, + "grad_norm": 71168.0, + "learning_rate": 1.9941538461538464e-05, + "loss": 11.9649, + "step": 20 + }, + { + "epoch": 0.004639205149517716, + "grad_norm": 190.0, + "learning_rate": 1.9910769230769232e-05, + "loss": 5.27, + "step": 30 + }, + { + "epoch": 0.0061856068660236215, + "grad_norm": 16.125, + "learning_rate": 1.9880000000000003e-05, + "loss": 0.3647, + "step": 40 + }, + { + "epoch": 0.007732008582529527, + "grad_norm": 3.890625, + "learning_rate": 1.984923076923077e-05, + "loss": 0.3099, + "step": 50 + }, + { + "epoch": 0.009278410299035433, + "grad_norm": 1.8828125, + "learning_rate": 1.9818461538461538e-05, + "loss": 0.2842, + "step": 60 + }, + { + "epoch": 0.010824812015541337, + "grad_norm": 1.6953125, + "learning_rate": 1.978769230769231e-05, + "loss": 0.2943, + "step": 70 + }, + { + "epoch": 0.012371213732047243, + "grad_norm": 1.1640625, + "learning_rate": 1.9756923076923077e-05, + "loss": 0.3539, + "step": 80 + }, + { + "epoch": 0.013917615448553147, + "grad_norm": 1.046875, + "learning_rate": 1.9726153846153848e-05, + "loss": 0.259, + "step": 90 + }, + { + "epoch": 0.015464017165059053, + "grad_norm": 1.203125, + "learning_rate": 1.9695384615384616e-05, + "loss": 0.2741, + "step": 100 + }, + { + "epoch": 0.01701041888156496, + "grad_norm": 1.0390625, + "learning_rate": 1.9664615384615387e-05, + "loss": 0.281, + "step": 110 + }, + { + "epoch": 0.018556820598070865, + "grad_norm": 0.921875, + "learning_rate": 1.9633846153846155e-05, + "loss": 0.2586, + "step": 120 + }, + { + "epoch": 0.020103222314576768, + "grad_norm": 0.921875, + "learning_rate": 1.9603076923076926e-05, + "loss": 0.2776, + "step": 130 + }, + { + "epoch": 0.021649624031082674, + "grad_norm": 1.140625, + "learning_rate": 1.9572307692307693e-05, + "loss": 0.3186, + "step": 140 + }, + { + "epoch": 0.02319602574758858, + "grad_norm": 0.85546875, + "learning_rate": 1.9541538461538464e-05, + "loss": 0.3315, + "step": 150 + }, + { + "epoch": 0.024742427464094486, + "grad_norm": 1.1171875, + "learning_rate": 1.9510769230769232e-05, + "loss": 0.257, + "step": 160 + }, + { + "epoch": 0.026288829180600392, + "grad_norm": 1.1640625, + "learning_rate": 1.948e-05, + "loss": 0.2592, + "step": 170 + }, + { + "epoch": 0.027835230897106295, + "grad_norm": 0.91015625, + "learning_rate": 1.944923076923077e-05, + "loss": 0.2703, + "step": 180 + }, + { + "epoch": 0.0293816326136122, + "grad_norm": 0.94140625, + "learning_rate": 1.941846153846154e-05, + "loss": 0.2547, + "step": 190 + }, + { + "epoch": 0.030928034330118107, + "grad_norm": 1.078125, + "learning_rate": 1.938769230769231e-05, + "loss": 0.3182, + "step": 200 + }, + { + "epoch": 0.03247443604662401, + "grad_norm": 0.94140625, + "learning_rate": 1.9356923076923077e-05, + "loss": 0.3005, + "step": 210 + }, + { + "epoch": 0.03402083776312992, + "grad_norm": 1.09375, + "learning_rate": 1.932615384615385e-05, + "loss": 0.2693, + "step": 220 + }, + { + "epoch": 0.035567239479635825, + "grad_norm": 0.95703125, + "learning_rate": 1.929538461538462e-05, + "loss": 0.2925, + "step": 230 + }, + { + "epoch": 0.03711364119614173, + "grad_norm": 1.078125, + "learning_rate": 1.9264615384615387e-05, + "loss": 0.3165, + "step": 240 + }, + { + "epoch": 0.03866004291264763, + "grad_norm": 0.82421875, + "learning_rate": 1.9233846153846155e-05, + "loss": 0.2606, + "step": 250 + }, + { + "epoch": 0.040206444629153536, + "grad_norm": 1.109375, + "learning_rate": 1.9203076923076923e-05, + "loss": 0.324, + "step": 260 + }, + { + "epoch": 0.04175284634565944, + "grad_norm": 1.0390625, + "learning_rate": 1.9172307692307694e-05, + "loss": 0.2787, + "step": 270 + }, + { + "epoch": 0.04329924806216535, + "grad_norm": 1.046875, + "learning_rate": 1.914153846153846e-05, + "loss": 0.3092, + "step": 280 + }, + { + "epoch": 0.044845649778671254, + "grad_norm": 0.875, + "learning_rate": 1.9110769230769233e-05, + "loss": 0.2831, + "step": 290 + }, + { + "epoch": 0.04639205149517716, + "grad_norm": 1.0390625, + "learning_rate": 1.908e-05, + "loss": 0.282, + "step": 300 + }, + { + "epoch": 0.047938453211683066, + "grad_norm": 1.8125, + "learning_rate": 1.904923076923077e-05, + "loss": 0.3863, + "step": 310 + }, + { + "epoch": 0.04948485492818897, + "grad_norm": 0.85546875, + "learning_rate": 1.901846153846154e-05, + "loss": 0.246, + "step": 320 + }, + { + "epoch": 0.05103125664469488, + "grad_norm": 1.3984375, + "learning_rate": 1.898769230769231e-05, + "loss": 0.3483, + "step": 330 + }, + { + "epoch": 0.052577658361200784, + "grad_norm": 1.0546875, + "learning_rate": 1.8956923076923078e-05, + "loss": 0.4107, + "step": 340 + }, + { + "epoch": 0.05412406007770668, + "grad_norm": 1.0, + "learning_rate": 1.892615384615385e-05, + "loss": 0.2813, + "step": 350 + }, + { + "epoch": 0.05567046179421259, + "grad_norm": 0.828125, + "learning_rate": 1.8895384615384617e-05, + "loss": 0.283, + "step": 360 + }, + { + "epoch": 0.057216863510718495, + "grad_norm": 0.9765625, + "learning_rate": 1.8864615384615384e-05, + "loss": 0.268, + "step": 370 + }, + { + "epoch": 0.0587632652272244, + "grad_norm": 1.0859375, + "learning_rate": 1.8833846153846155e-05, + "loss": 0.2852, + "step": 380 + }, + { + "epoch": 0.06030966694373031, + "grad_norm": 0.921875, + "learning_rate": 1.8803076923076923e-05, + "loss": 0.2477, + "step": 390 + }, + { + "epoch": 0.06185606866023621, + "grad_norm": 1.0, + "learning_rate": 1.8772307692307694e-05, + "loss": 0.2418, + "step": 400 + }, + { + "epoch": 0.06340247037674211, + "grad_norm": 0.85546875, + "learning_rate": 1.8741538461538462e-05, + "loss": 0.2218, + "step": 410 + }, + { + "epoch": 0.06494887209324803, + "grad_norm": 1.1484375, + "learning_rate": 1.8710769230769233e-05, + "loss": 0.2616, + "step": 420 + }, + { + "epoch": 0.06649527380975392, + "grad_norm": 1.1171875, + "learning_rate": 1.8680000000000004e-05, + "loss": 0.3475, + "step": 430 + }, + { + "epoch": 0.06804167552625984, + "grad_norm": 1.125, + "learning_rate": 1.8649230769230772e-05, + "loss": 0.3025, + "step": 440 + }, + { + "epoch": 0.06958807724276574, + "grad_norm": 0.93359375, + "learning_rate": 1.861846153846154e-05, + "loss": 0.3119, + "step": 450 + }, + { + "epoch": 0.07113447895927165, + "grad_norm": 0.90625, + "learning_rate": 1.8587692307692307e-05, + "loss": 0.3004, + "step": 460 + }, + { + "epoch": 0.07268088067577755, + "grad_norm": 0.97265625, + "learning_rate": 1.8556923076923078e-05, + "loss": 0.2957, + "step": 470 + }, + { + "epoch": 0.07422728239228346, + "grad_norm": 1.1328125, + "learning_rate": 1.8526153846153846e-05, + "loss": 0.3162, + "step": 480 + }, + { + "epoch": 0.07577368410878936, + "grad_norm": 1.75, + "learning_rate": 1.8495384615384617e-05, + "loss": 0.3637, + "step": 490 + }, + { + "epoch": 0.07732008582529526, + "grad_norm": 1.0390625, + "learning_rate": 1.8464615384615385e-05, + "loss": 0.2379, + "step": 500 + }, + { + "epoch": 0.07886648754180117, + "grad_norm": 0.984375, + "learning_rate": 1.8433846153846156e-05, + "loss": 0.3098, + "step": 510 + }, + { + "epoch": 0.08041288925830707, + "grad_norm": 0.99609375, + "learning_rate": 1.8403076923076924e-05, + "loss": 0.3977, + "step": 520 + }, + { + "epoch": 0.08195929097481298, + "grad_norm": 0.9609375, + "learning_rate": 1.8372307692307695e-05, + "loss": 0.3034, + "step": 530 + }, + { + "epoch": 0.08350569269131888, + "grad_norm": 0.7421875, + "learning_rate": 1.8341538461538462e-05, + "loss": 0.2327, + "step": 540 + }, + { + "epoch": 0.0850520944078248, + "grad_norm": 1.0625, + "learning_rate": 1.8310769230769233e-05, + "loss": 0.2561, + "step": 550 + }, + { + "epoch": 0.0865984961243307, + "grad_norm": 0.8515625, + "learning_rate": 1.828e-05, + "loss": 0.3739, + "step": 560 + }, + { + "epoch": 0.08814489784083661, + "grad_norm": 0.87890625, + "learning_rate": 1.824923076923077e-05, + "loss": 0.3605, + "step": 570 + }, + { + "epoch": 0.08969129955734251, + "grad_norm": 1.0, + "learning_rate": 1.821846153846154e-05, + "loss": 0.2557, + "step": 580 + }, + { + "epoch": 0.09123770127384842, + "grad_norm": 1.03125, + "learning_rate": 1.8187692307692308e-05, + "loss": 0.2806, + "step": 590 + }, + { + "epoch": 0.09278410299035432, + "grad_norm": 0.875, + "learning_rate": 1.815692307692308e-05, + "loss": 0.2977, + "step": 600 + }, + { + "epoch": 0.09433050470686022, + "grad_norm": 0.7890625, + "learning_rate": 1.8126153846153846e-05, + "loss": 0.2845, + "step": 610 + }, + { + "epoch": 0.09587690642336613, + "grad_norm": 0.859375, + "learning_rate": 1.8095384615384618e-05, + "loss": 0.3309, + "step": 620 + }, + { + "epoch": 0.09742330813987203, + "grad_norm": 0.95703125, + "learning_rate": 1.806461538461539e-05, + "loss": 0.3197, + "step": 630 + }, + { + "epoch": 0.09896970985637794, + "grad_norm": 0.83203125, + "learning_rate": 1.8033846153846156e-05, + "loss": 0.2654, + "step": 640 + }, + { + "epoch": 0.10051611157288384, + "grad_norm": 1.09375, + "learning_rate": 1.8003076923076924e-05, + "loss": 0.2954, + "step": 650 + }, + { + "epoch": 0.10206251328938976, + "grad_norm": 1.0390625, + "learning_rate": 1.7972307692307692e-05, + "loss": 0.3237, + "step": 660 + }, + { + "epoch": 0.10360891500589565, + "grad_norm": 0.82421875, + "learning_rate": 1.7941538461538463e-05, + "loss": 0.2887, + "step": 670 + }, + { + "epoch": 0.10515531672240157, + "grad_norm": 1.0546875, + "learning_rate": 1.791076923076923e-05, + "loss": 0.3018, + "step": 680 + }, + { + "epoch": 0.10670171843890747, + "grad_norm": 0.890625, + "learning_rate": 1.788e-05, + "loss": 0.261, + "step": 690 + }, + { + "epoch": 0.10824812015541337, + "grad_norm": 0.84375, + "learning_rate": 1.784923076923077e-05, + "loss": 0.254, + "step": 700 + }, + { + "epoch": 0.10979452187191928, + "grad_norm": 0.96875, + "learning_rate": 1.781846153846154e-05, + "loss": 0.2944, + "step": 710 + }, + { + "epoch": 0.11134092358842518, + "grad_norm": 0.9375, + "learning_rate": 1.778769230769231e-05, + "loss": 0.3163, + "step": 720 + }, + { + "epoch": 0.11288732530493109, + "grad_norm": 0.640625, + "learning_rate": 1.775692307692308e-05, + "loss": 0.2838, + "step": 730 + }, + { + "epoch": 0.11443372702143699, + "grad_norm": 0.98828125, + "learning_rate": 1.7726153846153847e-05, + "loss": 0.236, + "step": 740 + }, + { + "epoch": 0.1159801287379429, + "grad_norm": 0.6796875, + "learning_rate": 1.7695384615384618e-05, + "loss": 0.2164, + "step": 750 + }, + { + "epoch": 0.1175265304544488, + "grad_norm": 0.7109375, + "learning_rate": 1.7664615384615386e-05, + "loss": 0.3331, + "step": 760 + }, + { + "epoch": 0.11907293217095472, + "grad_norm": 0.859375, + "learning_rate": 1.7633846153846153e-05, + "loss": 0.303, + "step": 770 + }, + { + "epoch": 0.12061933388746061, + "grad_norm": 0.80078125, + "learning_rate": 1.7603076923076924e-05, + "loss": 0.3264, + "step": 780 + }, + { + "epoch": 0.12216573560396653, + "grad_norm": 0.85546875, + "learning_rate": 1.7572307692307692e-05, + "loss": 0.2097, + "step": 790 + }, + { + "epoch": 0.12371213732047243, + "grad_norm": 0.80859375, + "learning_rate": 1.7541538461538463e-05, + "loss": 0.2456, + "step": 800 + }, + { + "epoch": 0.12525853903697834, + "grad_norm": 0.875, + "learning_rate": 1.751076923076923e-05, + "loss": 0.2877, + "step": 810 + }, + { + "epoch": 0.12680494075348422, + "grad_norm": 0.93359375, + "learning_rate": 1.7480000000000002e-05, + "loss": 0.2902, + "step": 820 + }, + { + "epoch": 0.12835134246999014, + "grad_norm": 0.875, + "learning_rate": 1.7449230769230773e-05, + "loss": 0.2357, + "step": 830 + }, + { + "epoch": 0.12989774418649605, + "grad_norm": 0.79296875, + "learning_rate": 1.741846153846154e-05, + "loss": 0.2926, + "step": 840 + }, + { + "epoch": 0.13144414590300196, + "grad_norm": 0.85546875, + "learning_rate": 1.738769230769231e-05, + "loss": 0.2301, + "step": 850 + }, + { + "epoch": 0.13299054761950785, + "grad_norm": 1.1015625, + "learning_rate": 1.7356923076923076e-05, + "loss": 0.2501, + "step": 860 + }, + { + "epoch": 0.13453694933601376, + "grad_norm": 1.0234375, + "learning_rate": 1.7326153846153847e-05, + "loss": 0.2393, + "step": 870 + }, + { + "epoch": 0.13608335105251967, + "grad_norm": 1.0625, + "learning_rate": 1.7295384615384615e-05, + "loss": 0.2337, + "step": 880 + }, + { + "epoch": 0.1376297527690256, + "grad_norm": 1.1171875, + "learning_rate": 1.7264615384615386e-05, + "loss": 0.3147, + "step": 890 + }, + { + "epoch": 0.13917615448553147, + "grad_norm": 0.828125, + "learning_rate": 1.7233846153846154e-05, + "loss": 0.2949, + "step": 900 + }, + { + "epoch": 0.14072255620203739, + "grad_norm": 1.1484375, + "learning_rate": 1.7203076923076925e-05, + "loss": 0.3394, + "step": 910 + }, + { + "epoch": 0.1422689579185433, + "grad_norm": 0.72265625, + "learning_rate": 1.7172307692307696e-05, + "loss": 0.3119, + "step": 920 + }, + { + "epoch": 0.14381535963504918, + "grad_norm": 0.8828125, + "learning_rate": 1.7141538461538464e-05, + "loss": 0.2959, + "step": 930 + }, + { + "epoch": 0.1453617613515551, + "grad_norm": 0.86328125, + "learning_rate": 1.711076923076923e-05, + "loss": 0.2677, + "step": 940 + }, + { + "epoch": 0.146908163068061, + "grad_norm": 0.99609375, + "learning_rate": 1.7080000000000002e-05, + "loss": 0.2575, + "step": 950 + }, + { + "epoch": 0.14845456478456692, + "grad_norm": 1.1640625, + "learning_rate": 1.704923076923077e-05, + "loss": 0.2419, + "step": 960 + }, + { + "epoch": 0.1500009665010728, + "grad_norm": 0.86328125, + "learning_rate": 1.7018461538461538e-05, + "loss": 0.2631, + "step": 970 + }, + { + "epoch": 0.15154736821757872, + "grad_norm": 0.8125, + "learning_rate": 1.698769230769231e-05, + "loss": 0.2415, + "step": 980 + }, + { + "epoch": 0.15309376993408463, + "grad_norm": 0.96484375, + "learning_rate": 1.6956923076923077e-05, + "loss": 0.2498, + "step": 990 + }, + { + "epoch": 0.15464017165059052, + "grad_norm": 0.93359375, + "learning_rate": 1.6926153846153848e-05, + "loss": 0.2845, + "step": 1000 + }, + { + "epoch": 0.15618657336709643, + "grad_norm": 1.0, + "learning_rate": 1.6895384615384615e-05, + "loss": 0.3159, + "step": 1010 + }, + { + "epoch": 0.15773297508360234, + "grad_norm": 1.15625, + "learning_rate": 1.6864615384615387e-05, + "loss": 0.2969, + "step": 1020 + }, + { + "epoch": 0.15927937680010826, + "grad_norm": 0.9765625, + "learning_rate": 1.6833846153846158e-05, + "loss": 0.3195, + "step": 1030 + }, + { + "epoch": 0.16082577851661414, + "grad_norm": 0.96484375, + "learning_rate": 1.6803076923076925e-05, + "loss": 0.3086, + "step": 1040 + }, + { + "epoch": 0.16237218023312006, + "grad_norm": 0.87109375, + "learning_rate": 1.6772307692307693e-05, + "loss": 0.297, + "step": 1050 + }, + { + "epoch": 0.16391858194962597, + "grad_norm": 1.015625, + "learning_rate": 1.674153846153846e-05, + "loss": 0.2677, + "step": 1060 + }, + { + "epoch": 0.16546498366613188, + "grad_norm": 1.015625, + "learning_rate": 1.6710769230769232e-05, + "loss": 0.294, + "step": 1070 + }, + { + "epoch": 0.16701138538263777, + "grad_norm": 0.953125, + "learning_rate": 1.668e-05, + "loss": 0.2483, + "step": 1080 + }, + { + "epoch": 0.16855778709914368, + "grad_norm": 0.9375, + "learning_rate": 1.664923076923077e-05, + "loss": 0.2564, + "step": 1090 + }, + { + "epoch": 0.1701041888156496, + "grad_norm": 0.96484375, + "learning_rate": 1.661846153846154e-05, + "loss": 0.2363, + "step": 1100 + }, + { + "epoch": 0.17165059053215548, + "grad_norm": 1.015625, + "learning_rate": 1.658769230769231e-05, + "loss": 0.2486, + "step": 1110 + }, + { + "epoch": 0.1731969922486614, + "grad_norm": 1.046875, + "learning_rate": 1.655692307692308e-05, + "loss": 0.3142, + "step": 1120 + }, + { + "epoch": 0.1747433939651673, + "grad_norm": 1.609375, + "learning_rate": 1.6526153846153848e-05, + "loss": 0.4319, + "step": 1130 + }, + { + "epoch": 0.17628979568167322, + "grad_norm": 0.9609375, + "learning_rate": 1.6495384615384616e-05, + "loss": 0.2727, + "step": 1140 + }, + { + "epoch": 0.1778361973981791, + "grad_norm": 0.92578125, + "learning_rate": 1.6464615384615387e-05, + "loss": 0.2472, + "step": 1150 + }, + { + "epoch": 0.17938259911468502, + "grad_norm": 0.9609375, + "learning_rate": 1.6433846153846155e-05, + "loss": 0.3036, + "step": 1160 + }, + { + "epoch": 0.18092900083119093, + "grad_norm": 0.99609375, + "learning_rate": 1.6403076923076922e-05, + "loss": 0.2199, + "step": 1170 + }, + { + "epoch": 0.18247540254769684, + "grad_norm": 1.1484375, + "learning_rate": 1.6372307692307693e-05, + "loss": 0.2474, + "step": 1180 + }, + { + "epoch": 0.18402180426420273, + "grad_norm": 0.9609375, + "learning_rate": 1.634153846153846e-05, + "loss": 0.2892, + "step": 1190 + }, + { + "epoch": 0.18556820598070864, + "grad_norm": 0.94140625, + "learning_rate": 1.6310769230769232e-05, + "loss": 0.3317, + "step": 1200 + }, + { + "epoch": 0.18711460769721455, + "grad_norm": 0.85546875, + "learning_rate": 1.628e-05, + "loss": 0.3066, + "step": 1210 + }, + { + "epoch": 0.18866100941372044, + "grad_norm": 0.88671875, + "learning_rate": 1.624923076923077e-05, + "loss": 0.2811, + "step": 1220 + }, + { + "epoch": 0.19020741113022635, + "grad_norm": 0.87890625, + "learning_rate": 1.6218461538461542e-05, + "loss": 0.2503, + "step": 1230 + }, + { + "epoch": 0.19175381284673226, + "grad_norm": 0.9140625, + "learning_rate": 1.618769230769231e-05, + "loss": 0.3128, + "step": 1240 + }, + { + "epoch": 0.19330021456323818, + "grad_norm": 0.953125, + "learning_rate": 1.6156923076923078e-05, + "loss": 0.3067, + "step": 1250 + }, + { + "epoch": 0.19484661627974406, + "grad_norm": 1.203125, + "learning_rate": 1.6126153846153845e-05, + "loss": 0.2975, + "step": 1260 + }, + { + "epoch": 0.19639301799624997, + "grad_norm": 1.140625, + "learning_rate": 1.6095384615384616e-05, + "loss": 0.3083, + "step": 1270 + }, + { + "epoch": 0.1979394197127559, + "grad_norm": 0.66796875, + "learning_rate": 1.6064615384615384e-05, + "loss": 0.2786, + "step": 1280 + }, + { + "epoch": 0.19948582142926177, + "grad_norm": 0.890625, + "learning_rate": 1.6033846153846155e-05, + "loss": 0.404, + "step": 1290 + }, + { + "epoch": 0.20103222314576769, + "grad_norm": 0.8984375, + "learning_rate": 1.6003076923076923e-05, + "loss": 0.3213, + "step": 1300 + }, + { + "epoch": 0.2025786248622736, + "grad_norm": 0.91015625, + "learning_rate": 1.5972307692307694e-05, + "loss": 0.24, + "step": 1310 + }, + { + "epoch": 0.2041250265787795, + "grad_norm": 1.2890625, + "learning_rate": 1.5941538461538465e-05, + "loss": 0.2711, + "step": 1320 + }, + { + "epoch": 0.2056714282952854, + "grad_norm": 0.83203125, + "learning_rate": 1.5910769230769233e-05, + "loss": 0.2493, + "step": 1330 + }, + { + "epoch": 0.2072178300117913, + "grad_norm": 1.4609375, + "learning_rate": 1.588e-05, + "loss": 0.353, + "step": 1340 + }, + { + "epoch": 0.20876423172829722, + "grad_norm": 0.9375, + "learning_rate": 1.584923076923077e-05, + "loss": 0.2534, + "step": 1350 + }, + { + "epoch": 0.21031063344480314, + "grad_norm": 0.64453125, + "learning_rate": 1.581846153846154e-05, + "loss": 0.2088, + "step": 1360 + }, + { + "epoch": 0.21185703516130902, + "grad_norm": 0.94921875, + "learning_rate": 1.5787692307692307e-05, + "loss": 0.3146, + "step": 1370 + }, + { + "epoch": 0.21340343687781493, + "grad_norm": 1.125, + "learning_rate": 1.5756923076923078e-05, + "loss": 0.2947, + "step": 1380 + }, + { + "epoch": 0.21494983859432085, + "grad_norm": 0.6796875, + "learning_rate": 1.5726153846153846e-05, + "loss": 0.2039, + "step": 1390 + }, + { + "epoch": 0.21649624031082673, + "grad_norm": 0.828125, + "learning_rate": 1.5695384615384617e-05, + "loss": 0.252, + "step": 1400 + }, + { + "epoch": 0.21804264202733264, + "grad_norm": 0.875, + "learning_rate": 1.5664615384615388e-05, + "loss": 0.2689, + "step": 1410 + }, + { + "epoch": 0.21958904374383856, + "grad_norm": 1.0390625, + "learning_rate": 1.5633846153846156e-05, + "loss": 0.3239, + "step": 1420 + }, + { + "epoch": 0.22113544546034447, + "grad_norm": 1.0859375, + "learning_rate": 1.5603076923076927e-05, + "loss": 0.2891, + "step": 1430 + }, + { + "epoch": 0.22268184717685036, + "grad_norm": 0.75390625, + "learning_rate": 1.5572307692307694e-05, + "loss": 0.3306, + "step": 1440 + }, + { + "epoch": 0.22422824889335627, + "grad_norm": 1.0859375, + "learning_rate": 1.5541538461538462e-05, + "loss": 0.2971, + "step": 1450 + }, + { + "epoch": 0.22577465060986218, + "grad_norm": 0.953125, + "learning_rate": 1.551076923076923e-05, + "loss": 0.2892, + "step": 1460 + }, + { + "epoch": 0.2273210523263681, + "grad_norm": 0.75390625, + "learning_rate": 1.548e-05, + "loss": 0.2773, + "step": 1470 + }, + { + "epoch": 0.22886745404287398, + "grad_norm": 0.9453125, + "learning_rate": 1.544923076923077e-05, + "loss": 0.2767, + "step": 1480 + }, + { + "epoch": 0.2304138557593799, + "grad_norm": 1.046875, + "learning_rate": 1.541846153846154e-05, + "loss": 0.2899, + "step": 1490 + }, + { + "epoch": 0.2319602574758858, + "grad_norm": 0.890625, + "learning_rate": 1.5387692307692307e-05, + "loss": 0.2521, + "step": 1500 + }, + { + "epoch": 0.2335066591923917, + "grad_norm": 1.03125, + "learning_rate": 1.535692307692308e-05, + "loss": 0.2479, + "step": 1510 + }, + { + "epoch": 0.2350530609088976, + "grad_norm": 1.0, + "learning_rate": 1.532615384615385e-05, + "loss": 0.3154, + "step": 1520 + }, + { + "epoch": 0.23659946262540352, + "grad_norm": 0.83984375, + "learning_rate": 1.5295384615384617e-05, + "loss": 0.3391, + "step": 1530 + }, + { + "epoch": 0.23814586434190943, + "grad_norm": 1.2265625, + "learning_rate": 1.5264615384615385e-05, + "loss": 0.265, + "step": 1540 + }, + { + "epoch": 0.23969226605841532, + "grad_norm": 0.99609375, + "learning_rate": 1.5233846153846154e-05, + "loss": 0.2949, + "step": 1550 + }, + { + "epoch": 0.24123866777492123, + "grad_norm": 1.265625, + "learning_rate": 1.5203076923076925e-05, + "loss": 0.3136, + "step": 1560 + }, + { + "epoch": 0.24278506949142714, + "grad_norm": 1.4609375, + "learning_rate": 1.5172307692307693e-05, + "loss": 0.3073, + "step": 1570 + }, + { + "epoch": 0.24433147120793305, + "grad_norm": 1.140625, + "learning_rate": 1.5141538461538463e-05, + "loss": 0.3271, + "step": 1580 + }, + { + "epoch": 0.24587787292443894, + "grad_norm": 1.03125, + "learning_rate": 1.5110769230769232e-05, + "loss": 0.2722, + "step": 1590 + }, + { + "epoch": 0.24742427464094485, + "grad_norm": 0.87109375, + "learning_rate": 1.5080000000000001e-05, + "loss": 0.3513, + "step": 1600 + }, + { + "epoch": 0.24897067635745077, + "grad_norm": 1.1796875, + "learning_rate": 1.504923076923077e-05, + "loss": 0.2212, + "step": 1610 + }, + { + "epoch": 0.2505170780739567, + "grad_norm": 0.890625, + "learning_rate": 1.501846153846154e-05, + "loss": 0.2914, + "step": 1620 + }, + { + "epoch": 0.25206347979046256, + "grad_norm": 0.83984375, + "learning_rate": 1.498769230769231e-05, + "loss": 0.281, + "step": 1630 + }, + { + "epoch": 0.25360988150696845, + "grad_norm": 0.86328125, + "learning_rate": 1.4956923076923077e-05, + "loss": 0.2509, + "step": 1640 + }, + { + "epoch": 0.2551562832234744, + "grad_norm": 1.2578125, + "learning_rate": 1.4926153846153848e-05, + "loss": 0.2994, + "step": 1650 + }, + { + "epoch": 0.2567026849399803, + "grad_norm": 0.90625, + "learning_rate": 1.4895384615384616e-05, + "loss": 0.2839, + "step": 1660 + }, + { + "epoch": 0.2582490866564862, + "grad_norm": 0.81640625, + "learning_rate": 1.4864615384615385e-05, + "loss": 0.229, + "step": 1670 + }, + { + "epoch": 0.2597954883729921, + "grad_norm": 0.9453125, + "learning_rate": 1.4833846153846155e-05, + "loss": 0.2381, + "step": 1680 + }, + { + "epoch": 0.261341890089498, + "grad_norm": 0.9140625, + "learning_rate": 1.4803076923076924e-05, + "loss": 0.3495, + "step": 1690 + }, + { + "epoch": 0.2628882918060039, + "grad_norm": 0.7421875, + "learning_rate": 1.4772307692307692e-05, + "loss": 0.2756, + "step": 1700 + }, + { + "epoch": 0.2644346935225098, + "grad_norm": 0.9375, + "learning_rate": 1.4741538461538463e-05, + "loss": 0.3189, + "step": 1710 + }, + { + "epoch": 0.2659810952390157, + "grad_norm": 0.765625, + "learning_rate": 1.4710769230769232e-05, + "loss": 0.289, + "step": 1720 + }, + { + "epoch": 0.26752749695552164, + "grad_norm": 1.0703125, + "learning_rate": 1.4680000000000002e-05, + "loss": 0.2848, + "step": 1730 + }, + { + "epoch": 0.2690738986720275, + "grad_norm": 2.96875, + "learning_rate": 1.4649230769230771e-05, + "loss": 0.3115, + "step": 1740 + }, + { + "epoch": 0.2706203003885334, + "grad_norm": 0.79296875, + "learning_rate": 1.4618461538461539e-05, + "loss": 0.3075, + "step": 1750 + }, + { + "epoch": 0.27216670210503935, + "grad_norm": 1.0625, + "learning_rate": 1.458769230769231e-05, + "loss": 0.2705, + "step": 1760 + }, + { + "epoch": 0.27371310382154523, + "grad_norm": 0.98046875, + "learning_rate": 1.4556923076923078e-05, + "loss": 0.266, + "step": 1770 + }, + { + "epoch": 0.2752595055380512, + "grad_norm": 0.8203125, + "learning_rate": 1.4526153846153847e-05, + "loss": 0.3179, + "step": 1780 + }, + { + "epoch": 0.27680590725455706, + "grad_norm": 0.953125, + "learning_rate": 1.4495384615384616e-05, + "loss": 0.2775, + "step": 1790 + }, + { + "epoch": 0.27835230897106295, + "grad_norm": 0.87890625, + "learning_rate": 1.4464615384615386e-05, + "loss": 0.3279, + "step": 1800 + }, + { + "epoch": 0.2798987106875689, + "grad_norm": 0.8515625, + "learning_rate": 1.4433846153846155e-05, + "loss": 0.2373, + "step": 1810 + }, + { + "epoch": 0.28144511240407477, + "grad_norm": 0.80859375, + "learning_rate": 1.4403076923076925e-05, + "loss": 0.2216, + "step": 1820 + }, + { + "epoch": 0.28299151412058066, + "grad_norm": 0.93359375, + "learning_rate": 1.4372307692307694e-05, + "loss": 0.3206, + "step": 1830 + }, + { + "epoch": 0.2845379158370866, + "grad_norm": 1.109375, + "learning_rate": 1.4341538461538462e-05, + "loss": 0.2467, + "step": 1840 + }, + { + "epoch": 0.2860843175535925, + "grad_norm": 0.9765625, + "learning_rate": 1.4310769230769233e-05, + "loss": 0.2818, + "step": 1850 + }, + { + "epoch": 0.28763071927009837, + "grad_norm": 0.8984375, + "learning_rate": 1.428e-05, + "loss": 0.2477, + "step": 1860 + }, + { + "epoch": 0.2891771209866043, + "grad_norm": 0.734375, + "learning_rate": 1.4249230769230772e-05, + "loss": 0.2576, + "step": 1870 + }, + { + "epoch": 0.2907235227031102, + "grad_norm": 0.91015625, + "learning_rate": 1.421846153846154e-05, + "loss": 0.2841, + "step": 1880 + }, + { + "epoch": 0.2922699244196161, + "grad_norm": 0.98828125, + "learning_rate": 1.4187692307692309e-05, + "loss": 0.3371, + "step": 1890 + }, + { + "epoch": 0.293816326136122, + "grad_norm": 1.0, + "learning_rate": 1.4156923076923076e-05, + "loss": 0.3037, + "step": 1900 + }, + { + "epoch": 0.2953627278526279, + "grad_norm": 1.015625, + "learning_rate": 1.4126153846153847e-05, + "loss": 0.2526, + "step": 1910 + }, + { + "epoch": 0.29690912956913385, + "grad_norm": 0.98828125, + "learning_rate": 1.4095384615384617e-05, + "loss": 0.2125, + "step": 1920 + }, + { + "epoch": 0.29845553128563973, + "grad_norm": 0.8125, + "learning_rate": 1.4064615384615386e-05, + "loss": 0.2783, + "step": 1930 + }, + { + "epoch": 0.3000019330021456, + "grad_norm": 0.90625, + "learning_rate": 1.4033846153846156e-05, + "loss": 0.3131, + "step": 1940 + }, + { + "epoch": 0.30154833471865156, + "grad_norm": 0.78125, + "learning_rate": 1.4003076923076923e-05, + "loss": 0.3226, + "step": 1950 + }, + { + "epoch": 0.30309473643515744, + "grad_norm": 0.83203125, + "learning_rate": 1.3972307692307694e-05, + "loss": 0.2819, + "step": 1960 + }, + { + "epoch": 0.3046411381516633, + "grad_norm": 0.78515625, + "learning_rate": 1.3941538461538462e-05, + "loss": 0.2868, + "step": 1970 + }, + { + "epoch": 0.30618753986816927, + "grad_norm": 0.92578125, + "learning_rate": 1.3910769230769232e-05, + "loss": 0.2615, + "step": 1980 + }, + { + "epoch": 0.30773394158467515, + "grad_norm": 0.87109375, + "learning_rate": 1.3880000000000001e-05, + "loss": 0.255, + "step": 1990 + }, + { + "epoch": 0.30928034330118104, + "grad_norm": 0.82421875, + "learning_rate": 1.384923076923077e-05, + "loss": 0.251, + "step": 2000 + }, + { + "epoch": 0.310826745017687, + "grad_norm": 0.9609375, + "learning_rate": 1.3818461538461541e-05, + "loss": 0.2983, + "step": 2010 + }, + { + "epoch": 0.31237314673419286, + "grad_norm": 1.0859375, + "learning_rate": 1.3787692307692309e-05, + "loss": 0.2705, + "step": 2020 + }, + { + "epoch": 0.3139195484506988, + "grad_norm": 0.890625, + "learning_rate": 1.3756923076923079e-05, + "loss": 0.2937, + "step": 2030 + }, + { + "epoch": 0.3154659501672047, + "grad_norm": 1.1953125, + "learning_rate": 1.3726153846153846e-05, + "loss": 0.3296, + "step": 2040 + }, + { + "epoch": 0.3170123518837106, + "grad_norm": 1.140625, + "learning_rate": 1.3695384615384617e-05, + "loss": 0.2666, + "step": 2050 + }, + { + "epoch": 0.3185587536002165, + "grad_norm": 1.0390625, + "learning_rate": 1.3664615384615385e-05, + "loss": 0.3124, + "step": 2060 + }, + { + "epoch": 0.3201051553167224, + "grad_norm": 1.015625, + "learning_rate": 1.3633846153846156e-05, + "loss": 0.3752, + "step": 2070 + }, + { + "epoch": 0.3216515570332283, + "grad_norm": 0.9296875, + "learning_rate": 1.3603076923076924e-05, + "loss": 0.2622, + "step": 2080 + }, + { + "epoch": 0.3231979587497342, + "grad_norm": 0.734375, + "learning_rate": 1.3572307692307693e-05, + "loss": 0.2526, + "step": 2090 + }, + { + "epoch": 0.3247443604662401, + "grad_norm": 0.80078125, + "learning_rate": 1.3541538461538464e-05, + "loss": 0.2775, + "step": 2100 + }, + { + "epoch": 0.326290762182746, + "grad_norm": 1.0546875, + "learning_rate": 1.3510769230769232e-05, + "loss": 0.3322, + "step": 2110 + }, + { + "epoch": 0.32783716389925194, + "grad_norm": 1.0, + "learning_rate": 1.3480000000000001e-05, + "loss": 0.2897, + "step": 2120 + }, + { + "epoch": 0.3293835656157578, + "grad_norm": 1.0234375, + "learning_rate": 1.344923076923077e-05, + "loss": 0.2815, + "step": 2130 + }, + { + "epoch": 0.33092996733226376, + "grad_norm": 0.7109375, + "learning_rate": 1.341846153846154e-05, + "loss": 0.2959, + "step": 2140 + }, + { + "epoch": 0.33247636904876965, + "grad_norm": 0.91015625, + "learning_rate": 1.3387692307692308e-05, + "loss": 0.2571, + "step": 2150 + }, + { + "epoch": 0.33402277076527553, + "grad_norm": 0.91015625, + "learning_rate": 1.3356923076923079e-05, + "loss": 0.247, + "step": 2160 + }, + { + "epoch": 0.3355691724817815, + "grad_norm": 0.828125, + "learning_rate": 1.3326153846153847e-05, + "loss": 0.248, + "step": 2170 + }, + { + "epoch": 0.33711557419828736, + "grad_norm": 0.94140625, + "learning_rate": 1.3295384615384616e-05, + "loss": 0.2438, + "step": 2180 + }, + { + "epoch": 0.33866197591479325, + "grad_norm": 1.2734375, + "learning_rate": 1.3264615384615385e-05, + "loss": 0.3612, + "step": 2190 + }, + { + "epoch": 0.3402083776312992, + "grad_norm": 0.9609375, + "learning_rate": 1.3233846153846155e-05, + "loss": 0.3287, + "step": 2200 + }, + { + "epoch": 0.34175477934780507, + "grad_norm": 0.9609375, + "learning_rate": 1.3203076923076926e-05, + "loss": 0.2756, + "step": 2210 + }, + { + "epoch": 0.34330118106431096, + "grad_norm": 0.91796875, + "learning_rate": 1.3172307692307694e-05, + "loss": 0.2886, + "step": 2220 + }, + { + "epoch": 0.3448475827808169, + "grad_norm": 0.87109375, + "learning_rate": 1.3141538461538463e-05, + "loss": 0.2446, + "step": 2230 + }, + { + "epoch": 0.3463939844973228, + "grad_norm": 0.94140625, + "learning_rate": 1.311076923076923e-05, + "loss": 0.3064, + "step": 2240 + }, + { + "epoch": 0.3479403862138287, + "grad_norm": 0.8203125, + "learning_rate": 1.3080000000000002e-05, + "loss": 0.2376, + "step": 2250 + }, + { + "epoch": 0.3494867879303346, + "grad_norm": 0.8359375, + "learning_rate": 1.304923076923077e-05, + "loss": 0.2932, + "step": 2260 + }, + { + "epoch": 0.3510331896468405, + "grad_norm": 1.4296875, + "learning_rate": 1.301846153846154e-05, + "loss": 0.2979, + "step": 2270 + }, + { + "epoch": 0.35257959136334643, + "grad_norm": 0.84375, + "learning_rate": 1.2987692307692308e-05, + "loss": 0.2897, + "step": 2280 + }, + { + "epoch": 0.3541259930798523, + "grad_norm": 0.99609375, + "learning_rate": 1.2956923076923078e-05, + "loss": 0.2744, + "step": 2290 + }, + { + "epoch": 0.3556723947963582, + "grad_norm": 0.96484375, + "learning_rate": 1.2926153846153849e-05, + "loss": 0.2708, + "step": 2300 + }, + { + "epoch": 0.35721879651286415, + "grad_norm": 0.81640625, + "learning_rate": 1.2895384615384616e-05, + "loss": 0.2224, + "step": 2310 + }, + { + "epoch": 0.35876519822937003, + "grad_norm": 1.1015625, + "learning_rate": 1.2864615384615386e-05, + "loss": 0.2728, + "step": 2320 + }, + { + "epoch": 0.3603115999458759, + "grad_norm": 1.0859375, + "learning_rate": 1.2833846153846155e-05, + "loss": 0.2661, + "step": 2330 + }, + { + "epoch": 0.36185800166238186, + "grad_norm": 1.046875, + "learning_rate": 1.2803076923076925e-05, + "loss": 0.2892, + "step": 2340 + }, + { + "epoch": 0.36340440337888774, + "grad_norm": 0.6640625, + "learning_rate": 1.2772307692307692e-05, + "loss": 0.3092, + "step": 2350 + }, + { + "epoch": 0.3649508050953937, + "grad_norm": 0.84765625, + "learning_rate": 1.2741538461538463e-05, + "loss": 0.2542, + "step": 2360 + }, + { + "epoch": 0.36649720681189957, + "grad_norm": 0.91796875, + "learning_rate": 1.2710769230769231e-05, + "loss": 0.3589, + "step": 2370 + }, + { + "epoch": 0.36804360852840545, + "grad_norm": 0.71484375, + "learning_rate": 1.268e-05, + "loss": 0.2237, + "step": 2380 + }, + { + "epoch": 0.3695900102449114, + "grad_norm": 1.0703125, + "learning_rate": 1.264923076923077e-05, + "loss": 0.3413, + "step": 2390 + }, + { + "epoch": 0.3711364119614173, + "grad_norm": 1.0234375, + "learning_rate": 1.261846153846154e-05, + "loss": 0.2556, + "step": 2400 + }, + { + "epoch": 0.37268281367792316, + "grad_norm": 0.9296875, + "learning_rate": 1.258769230769231e-05, + "loss": 0.3087, + "step": 2410 + }, + { + "epoch": 0.3742292153944291, + "grad_norm": 0.95703125, + "learning_rate": 1.2556923076923078e-05, + "loss": 0.2609, + "step": 2420 + }, + { + "epoch": 0.375775617110935, + "grad_norm": 1.1328125, + "learning_rate": 1.2526153846153848e-05, + "loss": 0.2572, + "step": 2430 + }, + { + "epoch": 0.3773220188274409, + "grad_norm": 1.1484375, + "learning_rate": 1.2495384615384615e-05, + "loss": 0.3003, + "step": 2440 + }, + { + "epoch": 0.3788684205439468, + "grad_norm": 1.1484375, + "learning_rate": 1.2464615384615386e-05, + "loss": 0.259, + "step": 2450 + }, + { + "epoch": 0.3804148222604527, + "grad_norm": 1.203125, + "learning_rate": 1.2433846153846154e-05, + "loss": 0.2606, + "step": 2460 + }, + { + "epoch": 0.38196122397695864, + "grad_norm": 0.85546875, + "learning_rate": 1.2403076923076925e-05, + "loss": 0.2351, + "step": 2470 + }, + { + "epoch": 0.3835076256934645, + "grad_norm": 0.90625, + "learning_rate": 1.2372307692307693e-05, + "loss": 0.2664, + "step": 2480 + }, + { + "epoch": 0.3850540274099704, + "grad_norm": 0.91796875, + "learning_rate": 1.2341538461538462e-05, + "loss": 0.245, + "step": 2490 + }, + { + "epoch": 0.38660042912647635, + "grad_norm": 1.109375, + "learning_rate": 1.2310769230769233e-05, + "loss": 0.2781, + "step": 2500 + }, + { + "epoch": 0.38814683084298224, + "grad_norm": 1.046875, + "learning_rate": 1.2280000000000001e-05, + "loss": 0.2847, + "step": 2510 + }, + { + "epoch": 0.3896932325594881, + "grad_norm": 0.8203125, + "learning_rate": 1.224923076923077e-05, + "loss": 0.3223, + "step": 2520 + }, + { + "epoch": 0.39123963427599406, + "grad_norm": 0.9765625, + "learning_rate": 1.221846153846154e-05, + "loss": 0.3068, + "step": 2530 + }, + { + "epoch": 0.39278603599249995, + "grad_norm": 0.9609375, + "learning_rate": 1.218769230769231e-05, + "loss": 0.2393, + "step": 2540 + }, + { + "epoch": 0.39433243770900583, + "grad_norm": 1.09375, + "learning_rate": 1.2156923076923077e-05, + "loss": 0.2918, + "step": 2550 + }, + { + "epoch": 0.3958788394255118, + "grad_norm": 0.84375, + "learning_rate": 1.2126153846153848e-05, + "loss": 0.2146, + "step": 2560 + }, + { + "epoch": 0.39742524114201766, + "grad_norm": 0.875, + "learning_rate": 1.2095384615384616e-05, + "loss": 0.3178, + "step": 2570 + }, + { + "epoch": 0.39897164285852355, + "grad_norm": 0.78515625, + "learning_rate": 1.2064615384615385e-05, + "loss": 0.2247, + "step": 2580 + }, + { + "epoch": 0.4005180445750295, + "grad_norm": 0.93359375, + "learning_rate": 1.2033846153846154e-05, + "loss": 0.2684, + "step": 2590 + }, + { + "epoch": 0.40206444629153537, + "grad_norm": 0.99609375, + "learning_rate": 1.2003076923076924e-05, + "loss": 0.2332, + "step": 2600 + }, + { + "epoch": 0.4036108480080413, + "grad_norm": 0.86328125, + "learning_rate": 1.1972307692307695e-05, + "loss": 0.4153, + "step": 2610 + }, + { + "epoch": 0.4051572497245472, + "grad_norm": 0.96484375, + "learning_rate": 1.1941538461538463e-05, + "loss": 0.2559, + "step": 2620 + }, + { + "epoch": 0.4067036514410531, + "grad_norm": 1.53125, + "learning_rate": 1.1910769230769232e-05, + "loss": 0.2974, + "step": 2630 + }, + { + "epoch": 0.408250053157559, + "grad_norm": 1.0078125, + "learning_rate": 1.188e-05, + "loss": 0.3523, + "step": 2640 + }, + { + "epoch": 0.4097964548740649, + "grad_norm": 1.1171875, + "learning_rate": 1.1849230769230771e-05, + "loss": 0.2619, + "step": 2650 + }, + { + "epoch": 0.4113428565905708, + "grad_norm": 1.109375, + "learning_rate": 1.1818461538461539e-05, + "loss": 0.2833, + "step": 2660 + }, + { + "epoch": 0.41288925830707673, + "grad_norm": 0.96875, + "learning_rate": 1.178769230769231e-05, + "loss": 0.244, + "step": 2670 + }, + { + "epoch": 0.4144356600235826, + "grad_norm": 0.8046875, + "learning_rate": 1.1756923076923077e-05, + "loss": 0.2238, + "step": 2680 + }, + { + "epoch": 0.4159820617400885, + "grad_norm": 0.87890625, + "learning_rate": 1.1726153846153847e-05, + "loss": 0.2839, + "step": 2690 + }, + { + "epoch": 0.41752846345659445, + "grad_norm": 1.46875, + "learning_rate": 1.1695384615384618e-05, + "loss": 0.3264, + "step": 2700 + }, + { + "epoch": 0.41907486517310033, + "grad_norm": 0.953125, + "learning_rate": 1.1664615384615386e-05, + "loss": 0.2501, + "step": 2710 + }, + { + "epoch": 0.42062126688960627, + "grad_norm": 0.94140625, + "learning_rate": 1.1633846153846155e-05, + "loss": 0.3618, + "step": 2720 + }, + { + "epoch": 0.42216766860611216, + "grad_norm": 1.015625, + "learning_rate": 1.1603076923076924e-05, + "loss": 0.2353, + "step": 2730 + }, + { + "epoch": 0.42371407032261804, + "grad_norm": 0.79296875, + "learning_rate": 1.1572307692307694e-05, + "loss": 0.2745, + "step": 2740 + }, + { + "epoch": 0.425260472039124, + "grad_norm": 1.1171875, + "learning_rate": 1.1541538461538461e-05, + "loss": 0.2673, + "step": 2750 + }, + { + "epoch": 0.42680687375562987, + "grad_norm": 1.1796875, + "learning_rate": 1.1510769230769232e-05, + "loss": 0.2448, + "step": 2760 + }, + { + "epoch": 0.42835327547213575, + "grad_norm": 0.82421875, + "learning_rate": 1.148e-05, + "loss": 0.2428, + "step": 2770 + }, + { + "epoch": 0.4298996771886417, + "grad_norm": 0.97265625, + "learning_rate": 1.144923076923077e-05, + "loss": 0.2259, + "step": 2780 + }, + { + "epoch": 0.4314460789051476, + "grad_norm": 1.0625, + "learning_rate": 1.141846153846154e-05, + "loss": 0.3214, + "step": 2790 + }, + { + "epoch": 0.43299248062165346, + "grad_norm": 0.90625, + "learning_rate": 1.1387692307692308e-05, + "loss": 0.272, + "step": 2800 + }, + { + "epoch": 0.4345388823381594, + "grad_norm": 0.7890625, + "learning_rate": 1.135692307692308e-05, + "loss": 0.3055, + "step": 2810 + }, + { + "epoch": 0.4360852840546653, + "grad_norm": 1.3359375, + "learning_rate": 1.1326153846153847e-05, + "loss": 0.3783, + "step": 2820 + }, + { + "epoch": 0.43763168577117123, + "grad_norm": 0.83203125, + "learning_rate": 1.1295384615384617e-05, + "loss": 0.2318, + "step": 2830 + }, + { + "epoch": 0.4391780874876771, + "grad_norm": 1.0, + "learning_rate": 1.1264615384615384e-05, + "loss": 0.3072, + "step": 2840 + }, + { + "epoch": 0.440724489204183, + "grad_norm": 1.453125, + "learning_rate": 1.1233846153846155e-05, + "loss": 0.3272, + "step": 2850 + }, + { + "epoch": 0.44227089092068894, + "grad_norm": 1.015625, + "learning_rate": 1.1203076923076923e-05, + "loss": 0.2422, + "step": 2860 + }, + { + "epoch": 0.4438172926371948, + "grad_norm": 0.83984375, + "learning_rate": 1.1172307692307694e-05, + "loss": 0.2424, + "step": 2870 + }, + { + "epoch": 0.4453636943537007, + "grad_norm": 1.0703125, + "learning_rate": 1.1141538461538462e-05, + "loss": 0.288, + "step": 2880 + }, + { + "epoch": 0.44691009607020665, + "grad_norm": 0.90234375, + "learning_rate": 1.1110769230769231e-05, + "loss": 0.2376, + "step": 2890 + }, + { + "epoch": 0.44845649778671254, + "grad_norm": 0.80859375, + "learning_rate": 1.1080000000000002e-05, + "loss": 0.2541, + "step": 2900 + }, + { + "epoch": 0.4500028995032184, + "grad_norm": 0.80078125, + "learning_rate": 1.104923076923077e-05, + "loss": 0.2928, + "step": 2910 + }, + { + "epoch": 0.45154930121972436, + "grad_norm": 0.83984375, + "learning_rate": 1.101846153846154e-05, + "loss": 0.2582, + "step": 2920 + }, + { + "epoch": 0.45309570293623025, + "grad_norm": 0.9140625, + "learning_rate": 1.0987692307692309e-05, + "loss": 0.2548, + "step": 2930 + }, + { + "epoch": 0.4546421046527362, + "grad_norm": 1.421875, + "learning_rate": 1.0956923076923078e-05, + "loss": 0.3462, + "step": 2940 + }, + { + "epoch": 0.4561885063692421, + "grad_norm": 1.0, + "learning_rate": 1.0926153846153846e-05, + "loss": 0.3076, + "step": 2950 + }, + { + "epoch": 0.45773490808574796, + "grad_norm": 1.0546875, + "learning_rate": 1.0895384615384617e-05, + "loss": 0.2761, + "step": 2960 + }, + { + "epoch": 0.4592813098022539, + "grad_norm": 0.859375, + "learning_rate": 1.0864615384615385e-05, + "loss": 0.3359, + "step": 2970 + }, + { + "epoch": 0.4608277115187598, + "grad_norm": 0.86328125, + "learning_rate": 1.0833846153846154e-05, + "loss": 0.3213, + "step": 2980 + }, + { + "epoch": 0.46237411323526567, + "grad_norm": 0.85546875, + "learning_rate": 1.0803076923076925e-05, + "loss": 0.2917, + "step": 2990 + }, + { + "epoch": 0.4639205149517716, + "grad_norm": 0.953125, + "learning_rate": 1.0772307692307693e-05, + "loss": 0.2774, + "step": 3000 + }, + { + "epoch": 0.4654669166682775, + "grad_norm": 0.7734375, + "learning_rate": 1.0741538461538464e-05, + "loss": 0.3373, + "step": 3010 + }, + { + "epoch": 0.4670133183847834, + "grad_norm": 0.57421875, + "learning_rate": 1.0710769230769232e-05, + "loss": 0.248, + "step": 3020 + }, + { + "epoch": 0.4685597201012893, + "grad_norm": 1.0078125, + "learning_rate": 1.0680000000000001e-05, + "loss": 0.2782, + "step": 3030 + }, + { + "epoch": 0.4701061218177952, + "grad_norm": 0.80859375, + "learning_rate": 1.0649230769230769e-05, + "loss": 0.3041, + "step": 3040 + }, + { + "epoch": 0.47165252353430115, + "grad_norm": 0.8359375, + "learning_rate": 1.061846153846154e-05, + "loss": 0.2109, + "step": 3050 + }, + { + "epoch": 0.47319892525080703, + "grad_norm": 0.99609375, + "learning_rate": 1.0587692307692308e-05, + "loss": 0.2815, + "step": 3060 + }, + { + "epoch": 0.4747453269673129, + "grad_norm": 1.1875, + "learning_rate": 1.0556923076923079e-05, + "loss": 0.2775, + "step": 3070 + }, + { + "epoch": 0.47629172868381886, + "grad_norm": 1.15625, + "learning_rate": 1.0526153846153846e-05, + "loss": 0.2645, + "step": 3080 + }, + { + "epoch": 0.47783813040032475, + "grad_norm": 0.91796875, + "learning_rate": 1.0495384615384616e-05, + "loss": 0.2738, + "step": 3090 + }, + { + "epoch": 0.47938453211683063, + "grad_norm": 0.73828125, + "learning_rate": 1.0464615384615387e-05, + "loss": 0.2912, + "step": 3100 + }, + { + "epoch": 0.48093093383333657, + "grad_norm": 1.046875, + "learning_rate": 1.0433846153846155e-05, + "loss": 0.217, + "step": 3110 + }, + { + "epoch": 0.48247733554984246, + "grad_norm": 0.96875, + "learning_rate": 1.0403076923076924e-05, + "loss": 0.3397, + "step": 3120 + }, + { + "epoch": 0.48402373726634834, + "grad_norm": 0.80859375, + "learning_rate": 1.0372307692307693e-05, + "loss": 0.215, + "step": 3130 + }, + { + "epoch": 0.4855701389828543, + "grad_norm": 1.125, + "learning_rate": 1.0341538461538463e-05, + "loss": 0.2587, + "step": 3140 + }, + { + "epoch": 0.48711654069936017, + "grad_norm": 0.9765625, + "learning_rate": 1.031076923076923e-05, + "loss": 0.2183, + "step": 3150 + }, + { + "epoch": 0.4886629424158661, + "grad_norm": 1.140625, + "learning_rate": 1.0280000000000002e-05, + "loss": 0.2551, + "step": 3160 + }, + { + "epoch": 0.490209344132372, + "grad_norm": 0.859375, + "learning_rate": 1.024923076923077e-05, + "loss": 0.2389, + "step": 3170 + }, + { + "epoch": 0.4917557458488779, + "grad_norm": 0.859375, + "learning_rate": 1.0218461538461539e-05, + "loss": 0.2774, + "step": 3180 + }, + { + "epoch": 0.4933021475653838, + "grad_norm": 0.8828125, + "learning_rate": 1.018769230769231e-05, + "loss": 0.2608, + "step": 3190 + }, + { + "epoch": 0.4948485492818897, + "grad_norm": 1.0078125, + "learning_rate": 1.0156923076923077e-05, + "loss": 0.3287, + "step": 3200 + }, + { + "epoch": 0.4963949509983956, + "grad_norm": 0.80859375, + "learning_rate": 1.0126153846153849e-05, + "loss": 0.236, + "step": 3210 + }, + { + "epoch": 0.49794135271490153, + "grad_norm": 1.0, + "learning_rate": 1.0095384615384616e-05, + "loss": 0.259, + "step": 3220 + }, + { + "epoch": 0.4994877544314074, + "grad_norm": 0.6796875, + "learning_rate": 1.0064615384615386e-05, + "loss": 0.2668, + "step": 3230 + }, + { + "epoch": 0.5010341561479134, + "grad_norm": 0.86328125, + "learning_rate": 1.0033846153846153e-05, + "loss": 0.3078, + "step": 3240 + }, + { + "epoch": 0.5025805578644192, + "grad_norm": 1.1328125, + "learning_rate": 1.0003076923076924e-05, + "loss": 0.2674, + "step": 3250 + }, + { + "epoch": 0.5041269595809251, + "grad_norm": 1.03125, + "learning_rate": 9.972307692307694e-06, + "loss": 0.276, + "step": 3260 + }, + { + "epoch": 0.505673361297431, + "grad_norm": 0.58203125, + "learning_rate": 9.941538461538463e-06, + "loss": 0.2331, + "step": 3270 + }, + { + "epoch": 0.5072197630139369, + "grad_norm": 0.7734375, + "learning_rate": 9.910769230769231e-06, + "loss": 0.2518, + "step": 3280 + }, + { + "epoch": 0.5087661647304429, + "grad_norm": 0.79296875, + "learning_rate": 9.88e-06, + "loss": 0.3217, + "step": 3290 + }, + { + "epoch": 0.5103125664469488, + "grad_norm": 0.78125, + "learning_rate": 9.84923076923077e-06, + "loss": 0.2582, + "step": 3300 + }, + { + "epoch": 0.5118589681634547, + "grad_norm": 0.921875, + "learning_rate": 9.818461538461539e-06, + "loss": 0.2967, + "step": 3310 + }, + { + "epoch": 0.5134053698799605, + "grad_norm": 0.921875, + "learning_rate": 9.787692307692308e-06, + "loss": 0.2508, + "step": 3320 + }, + { + "epoch": 0.5149517715964664, + "grad_norm": 1.3203125, + "learning_rate": 9.756923076923078e-06, + "loss": 0.247, + "step": 3330 + }, + { + "epoch": 0.5164981733129724, + "grad_norm": 0.94140625, + "learning_rate": 9.726153846153847e-06, + "loss": 0.2664, + "step": 3340 + }, + { + "epoch": 0.5180445750294783, + "grad_norm": 0.96875, + "learning_rate": 9.695384615384617e-06, + "loss": 0.2963, + "step": 3350 + }, + { + "epoch": 0.5195909767459842, + "grad_norm": 1.2109375, + "learning_rate": 9.664615384615386e-06, + "loss": 0.2869, + "step": 3360 + }, + { + "epoch": 0.5211373784624901, + "grad_norm": 0.796875, + "learning_rate": 9.633846153846155e-06, + "loss": 0.2125, + "step": 3370 + }, + { + "epoch": 0.522683780178996, + "grad_norm": 0.83203125, + "learning_rate": 9.603076923076923e-06, + "loss": 0.2379, + "step": 3380 + }, + { + "epoch": 0.5242301818955019, + "grad_norm": 1.2578125, + "learning_rate": 9.572307692307693e-06, + "loss": 0.2897, + "step": 3390 + }, + { + "epoch": 0.5257765836120079, + "grad_norm": 0.94140625, + "learning_rate": 9.541538461538462e-06, + "loss": 0.2954, + "step": 3400 + }, + { + "epoch": 0.5273229853285137, + "grad_norm": 0.75390625, + "learning_rate": 9.510769230769231e-06, + "loss": 0.257, + "step": 3410 + }, + { + "epoch": 0.5288693870450196, + "grad_norm": 0.77734375, + "learning_rate": 9.48e-06, + "loss": 0.3107, + "step": 3420 + }, + { + "epoch": 0.5304157887615255, + "grad_norm": 0.7890625, + "learning_rate": 9.44923076923077e-06, + "loss": 0.2178, + "step": 3430 + }, + { + "epoch": 0.5319621904780314, + "grad_norm": 1.0625, + "learning_rate": 9.41846153846154e-06, + "loss": 0.2407, + "step": 3440 + }, + { + "epoch": 0.5335085921945374, + "grad_norm": 1.2421875, + "learning_rate": 9.387692307692309e-06, + "loss": 0.284, + "step": 3450 + }, + { + "epoch": 0.5350549939110433, + "grad_norm": 0.8984375, + "learning_rate": 9.356923076923078e-06, + "loss": 0.2358, + "step": 3460 + }, + { + "epoch": 0.5366013956275492, + "grad_norm": 1.1328125, + "learning_rate": 9.326153846153848e-06, + "loss": 0.2455, + "step": 3470 + }, + { + "epoch": 0.538147797344055, + "grad_norm": 1.171875, + "learning_rate": 9.295384615384615e-06, + "loss": 0.3416, + "step": 3480 + }, + { + "epoch": 0.5396941990605609, + "grad_norm": 1.171875, + "learning_rate": 9.264615384615385e-06, + "loss": 0.2908, + "step": 3490 + }, + { + "epoch": 0.5412406007770668, + "grad_norm": 0.83984375, + "learning_rate": 9.233846153846154e-06, + "loss": 0.2648, + "step": 3500 + }, + { + "epoch": 0.5427870024935728, + "grad_norm": 0.984375, + "learning_rate": 9.203076923076924e-06, + "loss": 0.2159, + "step": 3510 + }, + { + "epoch": 0.5443334042100787, + "grad_norm": 1.078125, + "learning_rate": 9.172307692307693e-06, + "loss": 0.3019, + "step": 3520 + }, + { + "epoch": 0.5458798059265846, + "grad_norm": 1.4375, + "learning_rate": 9.141538461538462e-06, + "loss": 0.2886, + "step": 3530 + }, + { + "epoch": 0.5474262076430905, + "grad_norm": 1.1484375, + "learning_rate": 9.110769230769232e-06, + "loss": 0.2674, + "step": 3540 + }, + { + "epoch": 0.5489726093595964, + "grad_norm": 1.015625, + "learning_rate": 9.080000000000001e-06, + "loss": 0.2809, + "step": 3550 + }, + { + "epoch": 0.5505190110761023, + "grad_norm": 0.94140625, + "learning_rate": 9.04923076923077e-06, + "loss": 0.3105, + "step": 3560 + }, + { + "epoch": 0.5520654127926082, + "grad_norm": 0.859375, + "learning_rate": 9.01846153846154e-06, + "loss": 0.3115, + "step": 3570 + }, + { + "epoch": 0.5536118145091141, + "grad_norm": 0.90234375, + "learning_rate": 8.987692307692308e-06, + "loss": 0.2605, + "step": 3580 + }, + { + "epoch": 0.55515821622562, + "grad_norm": 0.703125, + "learning_rate": 8.956923076923077e-06, + "loss": 0.2281, + "step": 3590 + }, + { + "epoch": 0.5567046179421259, + "grad_norm": 0.953125, + "learning_rate": 8.926153846153846e-06, + "loss": 0.2732, + "step": 3600 + }, + { + "epoch": 0.5582510196586318, + "grad_norm": 1.0078125, + "learning_rate": 8.895384615384616e-06, + "loss": 0.2134, + "step": 3610 + }, + { + "epoch": 0.5597974213751378, + "grad_norm": 1.140625, + "learning_rate": 8.864615384615385e-06, + "loss": 0.2788, + "step": 3620 + }, + { + "epoch": 0.5613438230916437, + "grad_norm": 0.8984375, + "learning_rate": 8.833846153846155e-06, + "loss": 0.2558, + "step": 3630 + }, + { + "epoch": 0.5628902248081495, + "grad_norm": 0.7265625, + "learning_rate": 8.803076923076924e-06, + "loss": 0.2719, + "step": 3640 + }, + { + "epoch": 0.5644366265246554, + "grad_norm": 1.0703125, + "learning_rate": 8.772307692307693e-06, + "loss": 0.2596, + "step": 3650 + }, + { + "epoch": 0.5659830282411613, + "grad_norm": 0.8828125, + "learning_rate": 8.741538461538463e-06, + "loss": 0.2484, + "step": 3660 + }, + { + "epoch": 0.5675294299576673, + "grad_norm": 0.734375, + "learning_rate": 8.710769230769232e-06, + "loss": 0.2734, + "step": 3670 + }, + { + "epoch": 0.5690758316741732, + "grad_norm": 0.91796875, + "learning_rate": 8.68e-06, + "loss": 0.2728, + "step": 3680 + }, + { + "epoch": 0.5706222333906791, + "grad_norm": 0.9765625, + "learning_rate": 8.64923076923077e-06, + "loss": 0.2746, + "step": 3690 + }, + { + "epoch": 0.572168635107185, + "grad_norm": 0.94140625, + "learning_rate": 8.618461538461539e-06, + "loss": 0.2767, + "step": 3700 + }, + { + "epoch": 0.5737150368236908, + "grad_norm": 0.80859375, + "learning_rate": 8.587692307692308e-06, + "loss": 0.2798, + "step": 3710 + }, + { + "epoch": 0.5752614385401967, + "grad_norm": 1.0703125, + "learning_rate": 8.556923076923077e-06, + "loss": 0.2573, + "step": 3720 + }, + { + "epoch": 0.5768078402567027, + "grad_norm": 1.1015625, + "learning_rate": 8.526153846153847e-06, + "loss": 0.2756, + "step": 3730 + }, + { + "epoch": 0.5783542419732086, + "grad_norm": 1.03125, + "learning_rate": 8.495384615384616e-06, + "loss": 0.2819, + "step": 3740 + }, + { + "epoch": 0.5799006436897145, + "grad_norm": 0.8046875, + "learning_rate": 8.464615384615386e-06, + "loss": 0.22, + "step": 3750 + }, + { + "epoch": 0.5814470454062204, + "grad_norm": 1.0078125, + "learning_rate": 8.433846153846155e-06, + "loss": 0.2857, + "step": 3760 + }, + { + "epoch": 0.5829934471227263, + "grad_norm": 1.0703125, + "learning_rate": 8.403076923076924e-06, + "loss": 0.2803, + "step": 3770 + }, + { + "epoch": 0.5845398488392322, + "grad_norm": 1.0625, + "learning_rate": 8.372307692307692e-06, + "loss": 0.2207, + "step": 3780 + }, + { + "epoch": 0.5860862505557382, + "grad_norm": 1.375, + "learning_rate": 8.341538461538462e-06, + "loss": 0.2684, + "step": 3790 + }, + { + "epoch": 0.587632652272244, + "grad_norm": 0.7578125, + "learning_rate": 8.310769230769231e-06, + "loss": 0.2353, + "step": 3800 + }, + { + "epoch": 0.5891790539887499, + "grad_norm": 0.6484375, + "learning_rate": 8.28e-06, + "loss": 0.3244, + "step": 3810 + }, + { + "epoch": 0.5907254557052558, + "grad_norm": 1.15625, + "learning_rate": 8.24923076923077e-06, + "loss": 0.288, + "step": 3820 + }, + { + "epoch": 0.5922718574217617, + "grad_norm": 0.91015625, + "learning_rate": 8.218461538461539e-06, + "loss": 0.261, + "step": 3830 + }, + { + "epoch": 0.5938182591382677, + "grad_norm": 0.78125, + "learning_rate": 8.187692307692309e-06, + "loss": 0.3277, + "step": 3840 + }, + { + "epoch": 0.5953646608547736, + "grad_norm": 0.87890625, + "learning_rate": 8.156923076923078e-06, + "loss": 0.2727, + "step": 3850 + }, + { + "epoch": 0.5969110625712795, + "grad_norm": 0.80078125, + "learning_rate": 8.126153846153847e-06, + "loss": 0.2319, + "step": 3860 + }, + { + "epoch": 0.5984574642877853, + "grad_norm": 0.97265625, + "learning_rate": 8.095384615384617e-06, + "loss": 0.252, + "step": 3870 + }, + { + "epoch": 0.6000038660042912, + "grad_norm": 0.96875, + "learning_rate": 8.064615384615384e-06, + "loss": 0.2683, + "step": 3880 + }, + { + "epoch": 0.6015502677207971, + "grad_norm": 0.89453125, + "learning_rate": 8.033846153846154e-06, + "loss": 0.3251, + "step": 3890 + }, + { + "epoch": 0.6030966694373031, + "grad_norm": 1.015625, + "learning_rate": 8.003076923076923e-06, + "loss": 0.3153, + "step": 3900 + }, + { + "epoch": 0.604643071153809, + "grad_norm": 0.9296875, + "learning_rate": 7.972307692307693e-06, + "loss": 0.3234, + "step": 3910 + }, + { + "epoch": 0.6061894728703149, + "grad_norm": 0.88671875, + "learning_rate": 7.941538461538462e-06, + "loss": 0.2812, + "step": 3920 + }, + { + "epoch": 0.6077358745868208, + "grad_norm": 0.890625, + "learning_rate": 7.910769230769231e-06, + "loss": 0.2959, + "step": 3930 + }, + { + "epoch": 0.6092822763033267, + "grad_norm": 0.98046875, + "learning_rate": 7.88e-06, + "loss": 0.2818, + "step": 3940 + }, + { + "epoch": 0.6108286780198326, + "grad_norm": 1.15625, + "learning_rate": 7.84923076923077e-06, + "loss": 0.3152, + "step": 3950 + }, + { + "epoch": 0.6123750797363385, + "grad_norm": 0.9375, + "learning_rate": 7.81846153846154e-06, + "loss": 0.2988, + "step": 3960 + }, + { + "epoch": 0.6139214814528444, + "grad_norm": 0.984375, + "learning_rate": 7.787692307692309e-06, + "loss": 0.2835, + "step": 3970 + }, + { + "epoch": 0.6154678831693503, + "grad_norm": 1.359375, + "learning_rate": 7.756923076923077e-06, + "loss": 0.3486, + "step": 3980 + }, + { + "epoch": 0.6170142848858562, + "grad_norm": 1.3828125, + "learning_rate": 7.726153846153846e-06, + "loss": 0.2934, + "step": 3990 + }, + { + "epoch": 0.6185606866023621, + "grad_norm": 0.92578125, + "learning_rate": 7.695384615384615e-06, + "loss": 0.2678, + "step": 4000 + }, + { + "epoch": 0.6201070883188681, + "grad_norm": 0.90625, + "learning_rate": 7.664615384615385e-06, + "loss": 0.2608, + "step": 4010 + }, + { + "epoch": 0.621653490035374, + "grad_norm": 1.0703125, + "learning_rate": 7.633846153846154e-06, + "loss": 0.289, + "step": 4020 + }, + { + "epoch": 0.6231998917518798, + "grad_norm": 1.421875, + "learning_rate": 7.6030769230769245e-06, + "loss": 0.2856, + "step": 4030 + }, + { + "epoch": 0.6247462934683857, + "grad_norm": 0.67578125, + "learning_rate": 7.572307692307693e-06, + "loss": 0.2569, + "step": 4040 + }, + { + "epoch": 0.6262926951848916, + "grad_norm": 0.96484375, + "learning_rate": 7.5415384615384624e-06, + "loss": 0.2727, + "step": 4050 + }, + { + "epoch": 0.6278390969013976, + "grad_norm": 0.75390625, + "learning_rate": 7.510769230769232e-06, + "loss": 0.279, + "step": 4060 + }, + { + "epoch": 0.6293854986179035, + "grad_norm": 0.9453125, + "learning_rate": 7.48e-06, + "loss": 0.3606, + "step": 4070 + }, + { + "epoch": 0.6309319003344094, + "grad_norm": 0.7109375, + "learning_rate": 7.44923076923077e-06, + "loss": 0.2959, + "step": 4080 + }, + { + "epoch": 0.6324783020509153, + "grad_norm": 1.03125, + "learning_rate": 7.418461538461539e-06, + "loss": 0.2622, + "step": 4090 + }, + { + "epoch": 0.6340247037674211, + "grad_norm": 0.8046875, + "learning_rate": 7.387692307692308e-06, + "loss": 0.2207, + "step": 4100 + }, + { + "epoch": 0.635571105483927, + "grad_norm": 1.0234375, + "learning_rate": 7.356923076923077e-06, + "loss": 0.3007, + "step": 4110 + }, + { + "epoch": 0.637117507200433, + "grad_norm": 1.09375, + "learning_rate": 7.326153846153847e-06, + "loss": 0.2815, + "step": 4120 + }, + { + "epoch": 0.6386639089169389, + "grad_norm": 0.9453125, + "learning_rate": 7.295384615384617e-06, + "loss": 0.2587, + "step": 4130 + }, + { + "epoch": 0.6402103106334448, + "grad_norm": 0.9453125, + "learning_rate": 7.264615384615385e-06, + "loss": 0.2999, + "step": 4140 + }, + { + "epoch": 0.6417567123499507, + "grad_norm": 0.90234375, + "learning_rate": 7.233846153846155e-06, + "loss": 0.2398, + "step": 4150 + }, + { + "epoch": 0.6433031140664566, + "grad_norm": 0.98046875, + "learning_rate": 7.203076923076924e-06, + "loss": 0.2716, + "step": 4160 + }, + { + "epoch": 0.6448495157829626, + "grad_norm": 1.0234375, + "learning_rate": 7.172307692307693e-06, + "loss": 0.2222, + "step": 4170 + }, + { + "epoch": 0.6463959174994685, + "grad_norm": 0.73828125, + "learning_rate": 7.141538461538462e-06, + "loss": 0.285, + "step": 4180 + }, + { + "epoch": 0.6479423192159743, + "grad_norm": 0.74609375, + "learning_rate": 7.1107692307692314e-06, + "loss": 0.3562, + "step": 4190 + }, + { + "epoch": 0.6494887209324802, + "grad_norm": 0.9921875, + "learning_rate": 7.08e-06, + "loss": 0.283, + "step": 4200 + }, + { + "epoch": 0.6510351226489861, + "grad_norm": 0.85546875, + "learning_rate": 7.049230769230769e-06, + "loss": 0.2915, + "step": 4210 + }, + { + "epoch": 0.652581524365492, + "grad_norm": 0.765625, + "learning_rate": 7.01846153846154e-06, + "loss": 0.2424, + "step": 4220 + }, + { + "epoch": 0.654127926081998, + "grad_norm": 0.81640625, + "learning_rate": 6.987692307692309e-06, + "loss": 0.2456, + "step": 4230 + }, + { + "epoch": 0.6556743277985039, + "grad_norm": 0.97265625, + "learning_rate": 6.9569230769230776e-06, + "loss": 0.2946, + "step": 4240 + }, + { + "epoch": 0.6572207295150098, + "grad_norm": 0.77734375, + "learning_rate": 6.926153846153847e-06, + "loss": 0.3338, + "step": 4250 + }, + { + "epoch": 0.6587671312315156, + "grad_norm": 1.2265625, + "learning_rate": 6.895384615384616e-06, + "loss": 0.2645, + "step": 4260 + }, + { + "epoch": 0.6603135329480215, + "grad_norm": 0.96484375, + "learning_rate": 6.864615384615385e-06, + "loss": 0.2671, + "step": 4270 + }, + { + "epoch": 0.6618599346645275, + "grad_norm": 1.09375, + "learning_rate": 6.833846153846154e-06, + "loss": 0.2627, + "step": 4280 + }, + { + "epoch": 0.6634063363810334, + "grad_norm": 1.2890625, + "learning_rate": 6.803076923076924e-06, + "loss": 0.2972, + "step": 4290 + }, + { + "epoch": 0.6649527380975393, + "grad_norm": 0.8359375, + "learning_rate": 6.772307692307692e-06, + "loss": 0.2637, + "step": 4300 + }, + { + "epoch": 0.6664991398140452, + "grad_norm": 0.87109375, + "learning_rate": 6.741538461538462e-06, + "loss": 0.2459, + "step": 4310 + }, + { + "epoch": 0.6680455415305511, + "grad_norm": 0.93359375, + "learning_rate": 6.710769230769232e-06, + "loss": 0.3008, + "step": 4320 + }, + { + "epoch": 0.669591943247057, + "grad_norm": 1.109375, + "learning_rate": 6.680000000000001e-06, + "loss": 0.2881, + "step": 4330 + }, + { + "epoch": 0.671138344963563, + "grad_norm": 0.67578125, + "learning_rate": 6.64923076923077e-06, + "loss": 0.2887, + "step": 4340 + }, + { + "epoch": 0.6726847466800688, + "grad_norm": 0.875, + "learning_rate": 6.618461538461539e-06, + "loss": 0.3097, + "step": 4350 + }, + { + "epoch": 0.6742311483965747, + "grad_norm": 0.8671875, + "learning_rate": 6.587692307692309e-06, + "loss": 0.2623, + "step": 4360 + }, + { + "epoch": 0.6757775501130806, + "grad_norm": 0.8984375, + "learning_rate": 6.556923076923077e-06, + "loss": 0.2589, + "step": 4370 + }, + { + "epoch": 0.6773239518295865, + "grad_norm": 0.77734375, + "learning_rate": 6.5261538461538465e-06, + "loss": 0.2149, + "step": 4380 + }, + { + "epoch": 0.6788703535460925, + "grad_norm": 0.9453125, + "learning_rate": 6.495384615384616e-06, + "loss": 0.22, + "step": 4390 + }, + { + "epoch": 0.6804167552625984, + "grad_norm": 0.98828125, + "learning_rate": 6.4646153846153845e-06, + "loss": 0.2636, + "step": 4400 + }, + { + "epoch": 0.6819631569791043, + "grad_norm": 1.0234375, + "learning_rate": 6.433846153846154e-06, + "loss": 0.3088, + "step": 4410 + }, + { + "epoch": 0.6835095586956101, + "grad_norm": 0.92578125, + "learning_rate": 6.403076923076924e-06, + "loss": 0.2378, + "step": 4420 + }, + { + "epoch": 0.685055960412116, + "grad_norm": 0.91796875, + "learning_rate": 6.3723076923076935e-06, + "loss": 0.2405, + "step": 4430 + }, + { + "epoch": 0.6866023621286219, + "grad_norm": 0.83203125, + "learning_rate": 6.341538461538462e-06, + "loss": 0.2825, + "step": 4440 + }, + { + "epoch": 0.6881487638451279, + "grad_norm": 0.9296875, + "learning_rate": 6.3107692307692315e-06, + "loss": 0.3513, + "step": 4450 + }, + { + "epoch": 0.6896951655616338, + "grad_norm": 1.078125, + "learning_rate": 6.280000000000001e-06, + "loss": 0.3271, + "step": 4460 + }, + { + "epoch": 0.6912415672781397, + "grad_norm": 0.82421875, + "learning_rate": 6.249230769230769e-06, + "loss": 0.2742, + "step": 4470 + }, + { + "epoch": 0.6927879689946456, + "grad_norm": 0.9921875, + "learning_rate": 6.218461538461539e-06, + "loss": 0.2642, + "step": 4480 + }, + { + "epoch": 0.6943343707111514, + "grad_norm": 0.96875, + "learning_rate": 6.187692307692308e-06, + "loss": 0.2364, + "step": 4490 + }, + { + "epoch": 0.6958807724276574, + "grad_norm": 1.1875, + "learning_rate": 6.156923076923077e-06, + "loss": 0.2883, + "step": 4500 + }, + { + "epoch": 0.6974271741441633, + "grad_norm": 1.03125, + "learning_rate": 6.126153846153846e-06, + "loss": 0.2219, + "step": 4510 + }, + { + "epoch": 0.6989735758606692, + "grad_norm": 0.9453125, + "learning_rate": 6.095384615384616e-06, + "loss": 0.2639, + "step": 4520 + }, + { + "epoch": 0.7005199775771751, + "grad_norm": 0.80859375, + "learning_rate": 6.064615384615386e-06, + "loss": 0.2764, + "step": 4530 + }, + { + "epoch": 0.702066379293681, + "grad_norm": 1.0390625, + "learning_rate": 6.033846153846154e-06, + "loss": 0.3097, + "step": 4540 + }, + { + "epoch": 0.7036127810101869, + "grad_norm": 0.890625, + "learning_rate": 6.003076923076924e-06, + "loss": 0.2262, + "step": 4550 + }, + { + "epoch": 0.7051591827266929, + "grad_norm": 0.82421875, + "learning_rate": 5.972307692307693e-06, + "loss": 0.2537, + "step": 4560 + }, + { + "epoch": 0.7067055844431988, + "grad_norm": 0.7734375, + "learning_rate": 5.941538461538462e-06, + "loss": 0.347, + "step": 4570 + }, + { + "epoch": 0.7082519861597046, + "grad_norm": 1.4140625, + "learning_rate": 5.910769230769231e-06, + "loss": 0.3193, + "step": 4580 + }, + { + "epoch": 0.7097983878762105, + "grad_norm": 0.75, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.273, + "step": 4590 + }, + { + "epoch": 0.7113447895927164, + "grad_norm": 0.921875, + "learning_rate": 5.849230769230769e-06, + "loss": 0.2902, + "step": 4600 + }, + { + "epoch": 0.7128911913092224, + "grad_norm": 0.8046875, + "learning_rate": 5.818461538461538e-06, + "loss": 0.3653, + "step": 4610 + }, + { + "epoch": 0.7144375930257283, + "grad_norm": 0.76171875, + "learning_rate": 5.787692307692309e-06, + "loss": 0.3106, + "step": 4620 + }, + { + "epoch": 0.7159839947422342, + "grad_norm": 0.953125, + "learning_rate": 5.756923076923078e-06, + "loss": 0.2368, + "step": 4630 + }, + { + "epoch": 0.7175303964587401, + "grad_norm": 0.8203125, + "learning_rate": 5.726153846153847e-06, + "loss": 0.249, + "step": 4640 + }, + { + "epoch": 0.719076798175246, + "grad_norm": 1.1328125, + "learning_rate": 5.695384615384616e-06, + "loss": 0.3709, + "step": 4650 + }, + { + "epoch": 0.7206231998917518, + "grad_norm": 0.88671875, + "learning_rate": 5.664615384615385e-06, + "loss": 0.2921, + "step": 4660 + }, + { + "epoch": 0.7221696016082578, + "grad_norm": 1.015625, + "learning_rate": 5.633846153846154e-06, + "loss": 0.3115, + "step": 4670 + }, + { + "epoch": 0.7237160033247637, + "grad_norm": 0.87890625, + "learning_rate": 5.603076923076923e-06, + "loss": 0.2479, + "step": 4680 + }, + { + "epoch": 0.7252624050412696, + "grad_norm": 1.046875, + "learning_rate": 5.572307692307693e-06, + "loss": 0.2297, + "step": 4690 + }, + { + "epoch": 0.7268088067577755, + "grad_norm": 0.89453125, + "learning_rate": 5.541538461538461e-06, + "loss": 0.2454, + "step": 4700 + }, + { + "epoch": 0.7283552084742814, + "grad_norm": 0.79296875, + "learning_rate": 5.5107692307692315e-06, + "loss": 0.2849, + "step": 4710 + }, + { + "epoch": 0.7299016101907874, + "grad_norm": 0.74609375, + "learning_rate": 5.480000000000001e-06, + "loss": 0.2797, + "step": 4720 + }, + { + "epoch": 0.7314480119072932, + "grad_norm": 1.0234375, + "learning_rate": 5.44923076923077e-06, + "loss": 0.3882, + "step": 4730 + }, + { + "epoch": 0.7329944136237991, + "grad_norm": 0.90234375, + "learning_rate": 5.418461538461539e-06, + "loss": 0.2509, + "step": 4740 + }, + { + "epoch": 0.734540815340305, + "grad_norm": 0.81640625, + "learning_rate": 5.387692307692308e-06, + "loss": 0.2408, + "step": 4750 + }, + { + "epoch": 0.7360872170568109, + "grad_norm": 0.8828125, + "learning_rate": 5.356923076923078e-06, + "loss": 0.2413, + "step": 4760 + }, + { + "epoch": 0.7376336187733168, + "grad_norm": 0.7890625, + "learning_rate": 5.326153846153846e-06, + "loss": 0.2432, + "step": 4770 + }, + { + "epoch": 0.7391800204898228, + "grad_norm": 0.9921875, + "learning_rate": 5.2953846153846156e-06, + "loss": 0.277, + "step": 4780 + }, + { + "epoch": 0.7407264222063287, + "grad_norm": 1.15625, + "learning_rate": 5.264615384615385e-06, + "loss": 0.2486, + "step": 4790 + }, + { + "epoch": 0.7422728239228346, + "grad_norm": 0.8203125, + "learning_rate": 5.2338461538461535e-06, + "loss": 0.3, + "step": 4800 + }, + { + "epoch": 0.7438192256393404, + "grad_norm": 0.87109375, + "learning_rate": 5.203076923076924e-06, + "loss": 0.26, + "step": 4810 + }, + { + "epoch": 0.7453656273558463, + "grad_norm": 1.328125, + "learning_rate": 5.172307692307693e-06, + "loss": 0.2937, + "step": 4820 + }, + { + "epoch": 0.7469120290723523, + "grad_norm": 0.9453125, + "learning_rate": 5.1415384615384625e-06, + "loss": 0.3057, + "step": 4830 + }, + { + "epoch": 0.7484584307888582, + "grad_norm": 0.8828125, + "learning_rate": 5.110769230769231e-06, + "loss": 0.3284, + "step": 4840 + }, + { + "epoch": 0.7500048325053641, + "grad_norm": 0.98046875, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.2434, + "step": 4850 + }, + { + "epoch": 0.75155123422187, + "grad_norm": 0.97265625, + "learning_rate": 5.04923076923077e-06, + "loss": 0.257, + "step": 4860 + }, + { + "epoch": 0.7530976359383759, + "grad_norm": 0.91015625, + "learning_rate": 5.0184615384615384e-06, + "loss": 0.2714, + "step": 4870 + }, + { + "epoch": 0.7546440376548817, + "grad_norm": 0.609375, + "learning_rate": 4.987692307692308e-06, + "loss": 0.2182, + "step": 4880 + }, + { + "epoch": 0.7561904393713877, + "grad_norm": 0.85546875, + "learning_rate": 4.956923076923077e-06, + "loss": 0.2855, + "step": 4890 + }, + { + "epoch": 0.7577368410878936, + "grad_norm": 1.0703125, + "learning_rate": 4.926153846153847e-06, + "loss": 0.2774, + "step": 4900 + }, + { + "epoch": 0.7592832428043995, + "grad_norm": 0.86328125, + "learning_rate": 4.895384615384616e-06, + "loss": 0.2489, + "step": 4910 + }, + { + "epoch": 0.7608296445209054, + "grad_norm": 1.1328125, + "learning_rate": 4.8646153846153846e-06, + "loss": 0.3157, + "step": 4920 + }, + { + "epoch": 0.7623760462374113, + "grad_norm": 0.9609375, + "learning_rate": 4.833846153846154e-06, + "loss": 0.2704, + "step": 4930 + }, + { + "epoch": 0.7639224479539173, + "grad_norm": 0.82421875, + "learning_rate": 4.803076923076923e-06, + "loss": 0.2995, + "step": 4940 + }, + { + "epoch": 0.7654688496704232, + "grad_norm": 0.98046875, + "learning_rate": 4.772307692307693e-06, + "loss": 0.2422, + "step": 4950 + }, + { + "epoch": 0.767015251386929, + "grad_norm": 1.296875, + "learning_rate": 4.741538461538462e-06, + "loss": 0.2692, + "step": 4960 + }, + { + "epoch": 0.7685616531034349, + "grad_norm": 1.015625, + "learning_rate": 4.710769230769231e-06, + "loss": 0.2704, + "step": 4970 + }, + { + "epoch": 0.7701080548199408, + "grad_norm": 0.85546875, + "learning_rate": 4.680000000000001e-06, + "loss": 0.3147, + "step": 4980 + }, + { + "epoch": 0.7716544565364467, + "grad_norm": 0.9609375, + "learning_rate": 4.6492307692307695e-06, + "loss": 0.2867, + "step": 4990 + }, + { + "epoch": 0.7732008582529527, + "grad_norm": 1.09375, + "learning_rate": 4.618461538461539e-06, + "loss": 0.2896, + "step": 5000 + }, + { + "epoch": 0.7747472599694586, + "grad_norm": 1.0546875, + "learning_rate": 4.587692307692308e-06, + "loss": 0.2335, + "step": 5010 + }, + { + "epoch": 0.7762936616859645, + "grad_norm": 1.0390625, + "learning_rate": 4.556923076923077e-06, + "loss": 0.2441, + "step": 5020 + }, + { + "epoch": 0.7778400634024704, + "grad_norm": 1.4453125, + "learning_rate": 4.526153846153847e-06, + "loss": 0.3049, + "step": 5030 + }, + { + "epoch": 0.7793864651189762, + "grad_norm": 1.09375, + "learning_rate": 4.495384615384616e-06, + "loss": 0.2605, + "step": 5040 + }, + { + "epoch": 0.7809328668354822, + "grad_norm": 1.2265625, + "learning_rate": 4.464615384615385e-06, + "loss": 0.2876, + "step": 5050 + }, + { + "epoch": 0.7824792685519881, + "grad_norm": 1.09375, + "learning_rate": 4.433846153846154e-06, + "loss": 0.3434, + "step": 5060 + }, + { + "epoch": 0.784025670268494, + "grad_norm": 1.046875, + "learning_rate": 4.403076923076923e-06, + "loss": 0.2956, + "step": 5070 + }, + { + "epoch": 0.7855720719849999, + "grad_norm": 1.0078125, + "learning_rate": 4.372307692307693e-06, + "loss": 0.3175, + "step": 5080 + }, + { + "epoch": 0.7871184737015058, + "grad_norm": 1.1015625, + "learning_rate": 4.341538461538462e-06, + "loss": 0.2914, + "step": 5090 + }, + { + "epoch": 0.7886648754180117, + "grad_norm": 1.125, + "learning_rate": 4.310769230769231e-06, + "loss": 0.2657, + "step": 5100 + }, + { + "epoch": 0.7902112771345177, + "grad_norm": 0.9453125, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.3168, + "step": 5110 + }, + { + "epoch": 0.7917576788510235, + "grad_norm": 1.0390625, + "learning_rate": 4.249230769230769e-06, + "loss": 0.2422, + "step": 5120 + }, + { + "epoch": 0.7933040805675294, + "grad_norm": 0.8125, + "learning_rate": 4.218461538461539e-06, + "loss": 0.2651, + "step": 5130 + }, + { + "epoch": 0.7948504822840353, + "grad_norm": 0.984375, + "learning_rate": 4.187692307692308e-06, + "loss": 0.245, + "step": 5140 + }, + { + "epoch": 0.7963968840005412, + "grad_norm": 0.82421875, + "learning_rate": 4.156923076923077e-06, + "loss": 0.3055, + "step": 5150 + }, + { + "epoch": 0.7979432857170471, + "grad_norm": 1.015625, + "learning_rate": 4.126153846153847e-06, + "loss": 0.2992, + "step": 5160 + }, + { + "epoch": 0.7994896874335531, + "grad_norm": 0.796875, + "learning_rate": 4.095384615384615e-06, + "loss": 0.3123, + "step": 5170 + }, + { + "epoch": 0.801036089150059, + "grad_norm": 1.1796875, + "learning_rate": 4.0646153846153854e-06, + "loss": 0.2849, + "step": 5180 + }, + { + "epoch": 0.8025824908665649, + "grad_norm": 0.84765625, + "learning_rate": 4.033846153846154e-06, + "loss": 0.317, + "step": 5190 + }, + { + "epoch": 0.8041288925830707, + "grad_norm": 0.88671875, + "learning_rate": 4.003076923076923e-06, + "loss": 0.2567, + "step": 5200 + }, + { + "epoch": 0.8056752942995766, + "grad_norm": 1.109375, + "learning_rate": 3.972307692307693e-06, + "loss": 0.2918, + "step": 5210 + }, + { + "epoch": 0.8072216960160826, + "grad_norm": 0.9765625, + "learning_rate": 3.941538461538461e-06, + "loss": 0.3973, + "step": 5220 + }, + { + "epoch": 0.8087680977325885, + "grad_norm": 1.1015625, + "learning_rate": 3.9107692307692316e-06, + "loss": 0.3034, + "step": 5230 + }, + { + "epoch": 0.8103144994490944, + "grad_norm": 1.0546875, + "learning_rate": 3.88e-06, + "loss": 0.2369, + "step": 5240 + }, + { + "epoch": 0.8118609011656003, + "grad_norm": 0.8125, + "learning_rate": 3.8492307692307695e-06, + "loss": 0.261, + "step": 5250 + }, + { + "epoch": 0.8134073028821062, + "grad_norm": 1.015625, + "learning_rate": 3.818461538461539e-06, + "loss": 0.2657, + "step": 5260 + }, + { + "epoch": 0.814953704598612, + "grad_norm": 0.91796875, + "learning_rate": 3.787692307692308e-06, + "loss": 0.2336, + "step": 5270 + }, + { + "epoch": 0.816500106315118, + "grad_norm": 0.83984375, + "learning_rate": 3.7569230769230773e-06, + "loss": 0.2683, + "step": 5280 + }, + { + "epoch": 0.8180465080316239, + "grad_norm": 1.09375, + "learning_rate": 3.7261538461538467e-06, + "loss": 0.2703, + "step": 5290 + }, + { + "epoch": 0.8195929097481298, + "grad_norm": 1.28125, + "learning_rate": 3.6953846153846156e-06, + "loss": 0.2907, + "step": 5300 + }, + { + "epoch": 0.8211393114646357, + "grad_norm": 0.95703125, + "learning_rate": 3.6646153846153846e-06, + "loss": 0.3177, + "step": 5310 + }, + { + "epoch": 0.8226857131811416, + "grad_norm": 0.796875, + "learning_rate": 3.633846153846154e-06, + "loss": 0.3023, + "step": 5320 + }, + { + "epoch": 0.8242321148976476, + "grad_norm": 0.8359375, + "learning_rate": 3.6030769230769234e-06, + "loss": 0.2169, + "step": 5330 + }, + { + "epoch": 0.8257785166141535, + "grad_norm": 0.7890625, + "learning_rate": 3.572307692307693e-06, + "loss": 0.24, + "step": 5340 + }, + { + "epoch": 0.8273249183306594, + "grad_norm": 1.0546875, + "learning_rate": 3.5415384615384618e-06, + "loss": 0.3394, + "step": 5350 + }, + { + "epoch": 0.8288713200471652, + "grad_norm": 1.1875, + "learning_rate": 3.5107692307692307e-06, + "loss": 0.2527, + "step": 5360 + }, + { + "epoch": 0.8304177217636711, + "grad_norm": 0.69921875, + "learning_rate": 3.48e-06, + "loss": 0.2447, + "step": 5370 + }, + { + "epoch": 0.831964123480177, + "grad_norm": 1.0703125, + "learning_rate": 3.4492307692307695e-06, + "loss": 0.2509, + "step": 5380 + }, + { + "epoch": 0.833510525196683, + "grad_norm": 0.80859375, + "learning_rate": 3.418461538461539e-06, + "loss": 0.3633, + "step": 5390 + }, + { + "epoch": 0.8350569269131889, + "grad_norm": 0.69921875, + "learning_rate": 3.387692307692308e-06, + "loss": 0.3206, + "step": 5400 + }, + { + "epoch": 0.8366033286296948, + "grad_norm": 1.5078125, + "learning_rate": 3.356923076923077e-06, + "loss": 0.3542, + "step": 5410 + }, + { + "epoch": 0.8381497303462007, + "grad_norm": 1.0234375, + "learning_rate": 3.3261538461538463e-06, + "loss": 0.2731, + "step": 5420 + }, + { + "epoch": 0.8396961320627065, + "grad_norm": 0.92578125, + "learning_rate": 3.2953846153846157e-06, + "loss": 0.3256, + "step": 5430 + }, + { + "epoch": 0.8412425337792125, + "grad_norm": 0.8359375, + "learning_rate": 3.264615384615385e-06, + "loss": 0.2471, + "step": 5440 + }, + { + "epoch": 0.8427889354957184, + "grad_norm": 0.9453125, + "learning_rate": 3.233846153846154e-06, + "loss": 0.2755, + "step": 5450 + }, + { + "epoch": 0.8443353372122243, + "grad_norm": 1.046875, + "learning_rate": 3.203076923076923e-06, + "loss": 0.3139, + "step": 5460 + }, + { + "epoch": 0.8458817389287302, + "grad_norm": 0.98046875, + "learning_rate": 3.1723076923076924e-06, + "loss": 0.2722, + "step": 5470 + }, + { + "epoch": 0.8474281406452361, + "grad_norm": 0.828125, + "learning_rate": 3.141538461538462e-06, + "loss": 0.3058, + "step": 5480 + }, + { + "epoch": 0.848974542361742, + "grad_norm": 1.03125, + "learning_rate": 3.110769230769231e-06, + "loss": 0.2424, + "step": 5490 + }, + { + "epoch": 0.850520944078248, + "grad_norm": 0.98828125, + "learning_rate": 3.08e-06, + "loss": 0.2752, + "step": 5500 + }, + { + "epoch": 0.8520673457947538, + "grad_norm": 0.6640625, + "learning_rate": 3.049230769230769e-06, + "loss": 0.2309, + "step": 5510 + }, + { + "epoch": 0.8536137475112597, + "grad_norm": 0.85546875, + "learning_rate": 3.0184615384615385e-06, + "loss": 0.33, + "step": 5520 + }, + { + "epoch": 0.8551601492277656, + "grad_norm": 0.765625, + "learning_rate": 2.987692307692308e-06, + "loss": 0.2942, + "step": 5530 + }, + { + "epoch": 0.8567065509442715, + "grad_norm": 0.76171875, + "learning_rate": 2.9569230769230773e-06, + "loss": 0.3103, + "step": 5540 + }, + { + "epoch": 0.8582529526607775, + "grad_norm": 0.91796875, + "learning_rate": 2.9261538461538463e-06, + "loss": 0.2775, + "step": 5550 + }, + { + "epoch": 0.8597993543772834, + "grad_norm": 1.0234375, + "learning_rate": 2.8953846153846153e-06, + "loss": 0.2941, + "step": 5560 + }, + { + "epoch": 0.8613457560937893, + "grad_norm": 0.72265625, + "learning_rate": 2.8646153846153847e-06, + "loss": 0.2591, + "step": 5570 + }, + { + "epoch": 0.8628921578102952, + "grad_norm": 1.0390625, + "learning_rate": 2.833846153846154e-06, + "loss": 0.2801, + "step": 5580 + }, + { + "epoch": 0.864438559526801, + "grad_norm": 0.859375, + "learning_rate": 2.8030769230769234e-06, + "loss": 0.3041, + "step": 5590 + }, + { + "epoch": 0.8659849612433069, + "grad_norm": 1.2265625, + "learning_rate": 2.7723076923076924e-06, + "loss": 0.2866, + "step": 5600 + }, + { + "epoch": 0.8675313629598129, + "grad_norm": 0.99609375, + "learning_rate": 2.7415384615384614e-06, + "loss": 0.3128, + "step": 5610 + }, + { + "epoch": 0.8690777646763188, + "grad_norm": 1.0390625, + "learning_rate": 2.710769230769231e-06, + "loss": 0.3121, + "step": 5620 + }, + { + "epoch": 0.8706241663928247, + "grad_norm": 0.7578125, + "learning_rate": 2.68e-06, + "loss": 0.2264, + "step": 5630 + }, + { + "epoch": 0.8721705681093306, + "grad_norm": 1.0390625, + "learning_rate": 2.6492307692307696e-06, + "loss": 0.2619, + "step": 5640 + }, + { + "epoch": 0.8737169698258365, + "grad_norm": 0.70703125, + "learning_rate": 2.6184615384615385e-06, + "loss": 0.2631, + "step": 5650 + }, + { + "epoch": 0.8752633715423425, + "grad_norm": 0.9765625, + "learning_rate": 2.587692307692308e-06, + "loss": 0.2636, + "step": 5660 + }, + { + "epoch": 0.8768097732588483, + "grad_norm": 1.03125, + "learning_rate": 2.5569230769230773e-06, + "loss": 0.3569, + "step": 5670 + }, + { + "epoch": 0.8783561749753542, + "grad_norm": 0.76953125, + "learning_rate": 2.5261538461538463e-06, + "loss": 0.2297, + "step": 5680 + }, + { + "epoch": 0.8799025766918601, + "grad_norm": 0.89453125, + "learning_rate": 2.4953846153846157e-06, + "loss": 0.2181, + "step": 5690 + }, + { + "epoch": 0.881448978408366, + "grad_norm": 1.3359375, + "learning_rate": 2.4646153846153847e-06, + "loss": 0.3117, + "step": 5700 + }, + { + "epoch": 0.8829953801248719, + "grad_norm": 0.9296875, + "learning_rate": 2.433846153846154e-06, + "loss": 0.3071, + "step": 5710 + }, + { + "epoch": 0.8845417818413779, + "grad_norm": 0.828125, + "learning_rate": 2.4030769230769235e-06, + "loss": 0.2599, + "step": 5720 + }, + { + "epoch": 0.8860881835578838, + "grad_norm": 1.0234375, + "learning_rate": 2.3723076923076924e-06, + "loss": 0.265, + "step": 5730 + }, + { + "epoch": 0.8876345852743897, + "grad_norm": 1.0078125, + "learning_rate": 2.341538461538462e-06, + "loss": 0.2922, + "step": 5740 + }, + { + "epoch": 0.8891809869908955, + "grad_norm": 0.984375, + "learning_rate": 2.310769230769231e-06, + "loss": 0.3616, + "step": 5750 + }, + { + "epoch": 0.8907273887074014, + "grad_norm": 1.2578125, + "learning_rate": 2.28e-06, + "loss": 0.2587, + "step": 5760 + }, + { + "epoch": 0.8922737904239074, + "grad_norm": 0.7890625, + "learning_rate": 2.2492307692307696e-06, + "loss": 0.335, + "step": 5770 + }, + { + "epoch": 0.8938201921404133, + "grad_norm": 0.9375, + "learning_rate": 2.218461538461539e-06, + "loss": 0.288, + "step": 5780 + }, + { + "epoch": 0.8953665938569192, + "grad_norm": 0.921875, + "learning_rate": 2.187692307692308e-06, + "loss": 0.2932, + "step": 5790 + }, + { + "epoch": 0.8969129955734251, + "grad_norm": 1.109375, + "learning_rate": 2.156923076923077e-06, + "loss": 0.282, + "step": 5800 + }, + { + "epoch": 0.898459397289931, + "grad_norm": 0.8671875, + "learning_rate": 2.1261538461538463e-06, + "loss": 0.2073, + "step": 5810 + }, + { + "epoch": 0.9000057990064368, + "grad_norm": 0.87890625, + "learning_rate": 2.0953846153846157e-06, + "loss": 0.2583, + "step": 5820 + }, + { + "epoch": 0.9015522007229428, + "grad_norm": 1.0390625, + "learning_rate": 2.064615384615385e-06, + "loss": 0.2805, + "step": 5830 + }, + { + "epoch": 0.9030986024394487, + "grad_norm": 0.828125, + "learning_rate": 2.033846153846154e-06, + "loss": 0.2416, + "step": 5840 + }, + { + "epoch": 0.9046450041559546, + "grad_norm": 0.7890625, + "learning_rate": 2.003076923076923e-06, + "loss": 0.2826, + "step": 5850 + }, + { + "epoch": 0.9061914058724605, + "grad_norm": 0.9375, + "learning_rate": 1.9723076923076924e-06, + "loss": 0.3072, + "step": 5860 + }, + { + "epoch": 0.9077378075889664, + "grad_norm": 1.171875, + "learning_rate": 1.941538461538462e-06, + "loss": 0.3551, + "step": 5870 + }, + { + "epoch": 0.9092842093054724, + "grad_norm": 0.84765625, + "learning_rate": 1.9107692307692312e-06, + "loss": 0.3224, + "step": 5880 + }, + { + "epoch": 0.9108306110219783, + "grad_norm": 0.890625, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.2501, + "step": 5890 + }, + { + "epoch": 0.9123770127384841, + "grad_norm": 0.671875, + "learning_rate": 1.8492307692307692e-06, + "loss": 0.2555, + "step": 5900 + }, + { + "epoch": 0.91392341445499, + "grad_norm": 0.8671875, + "learning_rate": 1.8184615384615386e-06, + "loss": 0.329, + "step": 5910 + }, + { + "epoch": 0.9154698161714959, + "grad_norm": 0.953125, + "learning_rate": 1.7876923076923078e-06, + "loss": 0.3193, + "step": 5920 + }, + { + "epoch": 0.9170162178880018, + "grad_norm": 1.2265625, + "learning_rate": 1.7569230769230772e-06, + "loss": 0.3162, + "step": 5930 + }, + { + "epoch": 0.9185626196045078, + "grad_norm": 0.87109375, + "learning_rate": 1.7261538461538463e-06, + "loss": 0.29, + "step": 5940 + }, + { + "epoch": 0.9201090213210137, + "grad_norm": 0.859375, + "learning_rate": 1.6953846153846153e-06, + "loss": 0.3122, + "step": 5950 + }, + { + "epoch": 0.9216554230375196, + "grad_norm": 0.75, + "learning_rate": 1.6646153846153847e-06, + "loss": 0.2374, + "step": 5960 + }, + { + "epoch": 0.9232018247540255, + "grad_norm": 0.80859375, + "learning_rate": 1.6338461538461539e-06, + "loss": 0.2562, + "step": 5970 + }, + { + "epoch": 0.9247482264705313, + "grad_norm": 1.0859375, + "learning_rate": 1.6030769230769233e-06, + "loss": 0.2854, + "step": 5980 + }, + { + "epoch": 0.9262946281870373, + "grad_norm": 1.1015625, + "learning_rate": 1.5723076923076925e-06, + "loss": 0.3549, + "step": 5990 + }, + { + "epoch": 0.9278410299035432, + "grad_norm": 1.1953125, + "learning_rate": 1.5415384615384614e-06, + "loss": 0.4152, + "step": 6000 + }, + { + "epoch": 0.9293874316200491, + "grad_norm": 1.0546875, + "learning_rate": 1.5107692307692308e-06, + "loss": 0.2626, + "step": 6010 + }, + { + "epoch": 0.930933833336555, + "grad_norm": 0.9921875, + "learning_rate": 1.48e-06, + "loss": 0.3302, + "step": 6020 + }, + { + "epoch": 0.9324802350530609, + "grad_norm": 1.0390625, + "learning_rate": 1.4492307692307694e-06, + "loss": 0.3063, + "step": 6030 + }, + { + "epoch": 0.9340266367695668, + "grad_norm": 1.2265625, + "learning_rate": 1.4184615384615386e-06, + "loss": 0.2758, + "step": 6040 + }, + { + "epoch": 0.9355730384860728, + "grad_norm": 1.015625, + "learning_rate": 1.3876923076923076e-06, + "loss": 0.2999, + "step": 6050 + }, + { + "epoch": 0.9371194402025786, + "grad_norm": 1.0234375, + "learning_rate": 1.356923076923077e-06, + "loss": 0.3438, + "step": 6060 + }, + { + "epoch": 0.9386658419190845, + "grad_norm": 0.70703125, + "learning_rate": 1.3261538461538461e-06, + "loss": 0.2368, + "step": 6070 + }, + { + "epoch": 0.9402122436355904, + "grad_norm": 0.80859375, + "learning_rate": 1.2953846153846155e-06, + "loss": 0.2524, + "step": 6080 + }, + { + "epoch": 0.9417586453520963, + "grad_norm": 0.74609375, + "learning_rate": 1.2646153846153847e-06, + "loss": 0.2264, + "step": 6090 + }, + { + "epoch": 0.9433050470686023, + "grad_norm": 0.75390625, + "learning_rate": 1.233846153846154e-06, + "loss": 0.2067, + "step": 6100 + }, + { + "epoch": 0.9448514487851082, + "grad_norm": 0.81640625, + "learning_rate": 1.2030769230769233e-06, + "loss": 0.231, + "step": 6110 + }, + { + "epoch": 0.9463978505016141, + "grad_norm": 0.9765625, + "learning_rate": 1.1723076923076925e-06, + "loss": 0.2731, + "step": 6120 + }, + { + "epoch": 0.94794425221812, + "grad_norm": 0.9140625, + "learning_rate": 1.1415384615384617e-06, + "loss": 0.2837, + "step": 6130 + }, + { + "epoch": 0.9494906539346258, + "grad_norm": 1.0078125, + "learning_rate": 1.1107692307692309e-06, + "loss": 0.2272, + "step": 6140 + }, + { + "epoch": 0.9510370556511317, + "grad_norm": 0.9921875, + "learning_rate": 1.08e-06, + "loss": 0.3152, + "step": 6150 + }, + { + "epoch": 0.9525834573676377, + "grad_norm": 0.86328125, + "learning_rate": 1.0492307692307694e-06, + "loss": 0.3017, + "step": 6160 + }, + { + "epoch": 0.9541298590841436, + "grad_norm": 0.8828125, + "learning_rate": 1.0184615384615386e-06, + "loss": 0.3378, + "step": 6170 + }, + { + "epoch": 0.9556762608006495, + "grad_norm": 0.92578125, + "learning_rate": 9.876923076923078e-07, + "loss": 0.2503, + "step": 6180 + }, + { + "epoch": 0.9572226625171554, + "grad_norm": 0.87109375, + "learning_rate": 9.56923076923077e-07, + "loss": 0.3438, + "step": 6190 + }, + { + "epoch": 0.9587690642336613, + "grad_norm": 0.875, + "learning_rate": 9.261538461538462e-07, + "loss": 0.2667, + "step": 6200 + }, + { + "epoch": 0.9603154659501673, + "grad_norm": 0.64453125, + "learning_rate": 8.953846153846155e-07, + "loss": 0.2745, + "step": 6210 + }, + { + "epoch": 0.9618618676666731, + "grad_norm": 0.84765625, + "learning_rate": 8.646153846153847e-07, + "loss": 0.2683, + "step": 6220 + }, + { + "epoch": 0.963408269383179, + "grad_norm": 0.91015625, + "learning_rate": 8.338461538461539e-07, + "loss": 0.2422, + "step": 6230 + }, + { + "epoch": 0.9649546710996849, + "grad_norm": 0.89453125, + "learning_rate": 8.030769230769231e-07, + "loss": 0.2588, + "step": 6240 + }, + { + "epoch": 0.9665010728161908, + "grad_norm": 0.87890625, + "learning_rate": 7.723076923076923e-07, + "loss": 0.2812, + "step": 6250 + }, + { + "epoch": 0.9680474745326967, + "grad_norm": 1.09375, + "learning_rate": 7.415384615384616e-07, + "loss": 0.3232, + "step": 6260 + }, + { + "epoch": 0.9695938762492027, + "grad_norm": 0.92578125, + "learning_rate": 7.107692307692309e-07, + "loss": 0.2422, + "step": 6270 + }, + { + "epoch": 0.9711402779657086, + "grad_norm": 0.9765625, + "learning_rate": 6.800000000000001e-07, + "loss": 0.2833, + "step": 6280 + }, + { + "epoch": 0.9726866796822145, + "grad_norm": 0.890625, + "learning_rate": 6.492307692307692e-07, + "loss": 0.2517, + "step": 6290 + }, + { + "epoch": 0.9742330813987203, + "grad_norm": 0.99609375, + "learning_rate": 6.184615384615385e-07, + "loss": 0.2534, + "step": 6300 + }, + { + "epoch": 0.9757794831152262, + "grad_norm": 0.8125, + "learning_rate": 5.876923076923077e-07, + "loss": 0.2911, + "step": 6310 + }, + { + "epoch": 0.9773258848317322, + "grad_norm": 1.1015625, + "learning_rate": 5.56923076923077e-07, + "loss": 0.2628, + "step": 6320 + }, + { + "epoch": 0.9788722865482381, + "grad_norm": 1.1015625, + "learning_rate": 5.261538461538462e-07, + "loss": 0.3191, + "step": 6330 + }, + { + "epoch": 0.980418688264744, + "grad_norm": 0.9765625, + "learning_rate": 4.953846153846155e-07, + "loss": 0.3225, + "step": 6340 + }, + { + "epoch": 0.9819650899812499, + "grad_norm": 0.92578125, + "learning_rate": 4.6461538461538465e-07, + "loss": 0.2836, + "step": 6350 + }, + { + "epoch": 0.9835114916977558, + "grad_norm": 1.0625, + "learning_rate": 4.3384615384615384e-07, + "loss": 0.303, + "step": 6360 + }, + { + "epoch": 0.9850578934142616, + "grad_norm": 1.171875, + "learning_rate": 4.0307692307692313e-07, + "loss": 0.2381, + "step": 6370 + }, + { + "epoch": 0.9866042951307676, + "grad_norm": 0.77734375, + "learning_rate": 3.7230769230769236e-07, + "loss": 0.2821, + "step": 6380 + }, + { + "epoch": 0.9881506968472735, + "grad_norm": 0.796875, + "learning_rate": 3.4153846153846155e-07, + "loss": 0.3187, + "step": 6390 + }, + { + "epoch": 0.9896970985637794, + "grad_norm": 0.80859375, + "learning_rate": 3.107692307692308e-07, + "loss": 0.2993, + "step": 6400 + }, + { + "epoch": 0.9912435002802853, + "grad_norm": 0.7421875, + "learning_rate": 2.8e-07, + "loss": 0.2923, + "step": 6410 + }, + { + "epoch": 0.9927899019967912, + "grad_norm": 0.953125, + "learning_rate": 2.4923076923076926e-07, + "loss": 0.2407, + "step": 6420 + }, + { + "epoch": 0.9943363037132972, + "grad_norm": 1.0078125, + "learning_rate": 2.1846153846153847e-07, + "loss": 0.257, + "step": 6430 + }, + { + "epoch": 0.9958827054298031, + "grad_norm": 1.171875, + "learning_rate": 1.8769230769230773e-07, + "loss": 0.2531, + "step": 6440 + }, + { + "epoch": 0.997429107146309, + "grad_norm": 0.69921875, + "learning_rate": 1.5692307692307694e-07, + "loss": 0.273, + "step": 6450 + }, + { + "epoch": 0.9989755088628148, + "grad_norm": 0.99609375, + "learning_rate": 1.2615384615384617e-07, + "loss": 0.313, + "step": 6460 + }, + { + "epoch": 1.0004639205149517, + "grad_norm": 0.75, + "learning_rate": 9.53846153846154e-08, + "loss": 0.2218, + "step": 6470 + }, + { + "epoch": 1.0020103222314576, + "grad_norm": 1.0625, + "learning_rate": 6.461538461538462e-08, + "loss": 0.3402, + "step": 6480 + }, + { + "epoch": 1.0035567239479637, + "grad_norm": 0.80078125, + "learning_rate": 3.384615384615385e-08, + "loss": 0.2327, + "step": 6490 + }, + { + "epoch": 1.0051031256644696, + "grad_norm": 0.98046875, + "learning_rate": 3.0769230769230774e-09, + "loss": 0.2185, + "step": 6500 + } + ], + "logging_steps": 10, + "max_steps": 6500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4935783864782275e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}