|
{ |
|
"best_global_step": 626, |
|
"best_metric": 0.90001261, |
|
"best_model_checkpoint": "/proj/checkpoints/zhangchen/tool-rl-dev/sft_models/Qwen2.5-7B-Instruct-MIX-KimiK2-DD3-LR2.0e-5-EPOCHS2/v1-20250919-024521/checkpoint-626", |
|
"epoch": 2.0, |
|
"eval_steps": 16, |
|
"global_step": 626, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032025620496397116, |
|
"grad_norm": 12.095559524428625, |
|
"learning_rate": 1.9999874072618805e-05, |
|
"loss": 1.7495059967041016, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016012810248198558, |
|
"grad_norm": 2.4446721027205087, |
|
"learning_rate": 1.999685197404432e-05, |
|
"loss": 1.3246392011642456, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.032025620496397116, |
|
"grad_norm": 0.8674719311080642, |
|
"learning_rate": 1.9987409878190752e-05, |
|
"loss": 1.1605964660644532, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04803843074459568, |
|
"grad_norm": 0.8078480489924645, |
|
"learning_rate": 1.9971679657231874e-05, |
|
"loss": 1.1610276222229003, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.051240992794235385, |
|
"eval_loss": 1.051803469657898, |
|
"eval_runtime": 14.5335, |
|
"eval_samples_per_second": 7.293, |
|
"eval_steps_per_second": 0.963, |
|
"eval_token_acc": 0.7288482137333997, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06405124099279423, |
|
"grad_norm": 0.7441419049827398, |
|
"learning_rate": 1.9949671214996448e-05, |
|
"loss": 1.1230401992797852, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08006405124099279, |
|
"grad_norm": 0.66006524950857, |
|
"learning_rate": 1.9921398408113955e-05, |
|
"loss": 1.088608741760254, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.09607686148919135, |
|
"grad_norm": 0.5824061800332928, |
|
"learning_rate": 1.9886879037290385e-05, |
|
"loss": 1.0843761444091797, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10248198558847077, |
|
"eval_loss": 1.0146167278289795, |
|
"eval_runtime": 14.4953, |
|
"eval_samples_per_second": 7.313, |
|
"eval_steps_per_second": 0.966, |
|
"eval_token_acc": 0.7360955106479266, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.11208967173738991, |
|
"grad_norm": 0.6124558915875796, |
|
"learning_rate": 1.9846134836100797e-05, |
|
"loss": 1.0593854904174804, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12810248198558846, |
|
"grad_norm": 0.6688978350600125, |
|
"learning_rate": 1.9799191457305767e-05, |
|
"loss": 1.042323684692383, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14411529223378702, |
|
"grad_norm": 0.583235840286963, |
|
"learning_rate": 1.974607845670028e-05, |
|
"loss": 1.0665904998779296, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.15372297838270615, |
|
"eval_loss": 0.9931923151016235, |
|
"eval_runtime": 14.3487, |
|
"eval_samples_per_second": 7.387, |
|
"eval_steps_per_second": 0.976, |
|
"eval_token_acc": 0.740839136705638, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.16012810248198558, |
|
"grad_norm": 0.6834039523199339, |
|
"learning_rate": 1.968682927450523e-05, |
|
"loss": 1.0202735900878905, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17614091273018415, |
|
"grad_norm": 0.7236032273618205, |
|
"learning_rate": 1.9621481214313295e-05, |
|
"loss": 1.048078727722168, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1921537229783827, |
|
"grad_norm": 0.6147507739334064, |
|
"learning_rate": 1.955007541960241e-05, |
|
"loss": 1.0933009147644044, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20496397117694154, |
|
"eval_loss": 0.980483889579773, |
|
"eval_runtime": 14.5799, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.96, |
|
"eval_token_acc": 0.7434513325237371, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.20816653322658127, |
|
"grad_norm": 0.5981978572505926, |
|
"learning_rate": 1.9472656847831595e-05, |
|
"loss": 1.0627766609191895, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22417934347477983, |
|
"grad_norm": 0.5944938726625539, |
|
"learning_rate": 1.9389274242135528e-05, |
|
"loss": 1.0284326553344727, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2401921537229784, |
|
"grad_norm": 0.5690809156814671, |
|
"learning_rate": 1.9299980100635612e-05, |
|
"loss": 0.9859808921813965, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2562049639711769, |
|
"grad_norm": 0.5905137235030116, |
|
"learning_rate": 1.920483064338687e-05, |
|
"loss": 1.0671635627746583, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2562049639711769, |
|
"eval_loss": 0.9712809324264526, |
|
"eval_runtime": 14.4122, |
|
"eval_samples_per_second": 7.355, |
|
"eval_steps_per_second": 0.971, |
|
"eval_token_acc": 0.7450575019507362, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2722177742193755, |
|
"grad_norm": 0.5816842433518039, |
|
"learning_rate": 1.9103885776981517e-05, |
|
"loss": 0.9951675415039063, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.28823058446757405, |
|
"grad_norm": 0.5787862483192674, |
|
"learning_rate": 1.8997209056831462e-05, |
|
"loss": 1.049220657348633, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3042433947157726, |
|
"grad_norm": 0.6164119168224672, |
|
"learning_rate": 1.8884867647153483e-05, |
|
"loss": 1.0655245780944824, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3074459567654123, |
|
"eval_loss": 0.9638978838920593, |
|
"eval_runtime": 14.4222, |
|
"eval_samples_per_second": 7.35, |
|
"eval_steps_per_second": 0.971, |
|
"eval_token_acc": 0.7468503343111432, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.32025620496397117, |
|
"grad_norm": 0.5809223505771192, |
|
"learning_rate": 1.87669322786823e-05, |
|
"loss": 1.0123931884765625, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33626901521216973, |
|
"grad_norm": 0.5370286998411188, |
|
"learning_rate": 1.8643477204138114e-05, |
|
"loss": 0.9670543670654297, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3522818254603683, |
|
"grad_norm": 0.5969517267654925, |
|
"learning_rate": 1.851458015147673e-05, |
|
"loss": 1.0549612045288086, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3586869495596477, |
|
"eval_loss": 0.9577141404151917, |
|
"eval_runtime": 14.3804, |
|
"eval_samples_per_second": 7.371, |
|
"eval_steps_per_second": 0.974, |
|
"eval_token_acc": 0.7479301576759162, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.36829463570856685, |
|
"grad_norm": 0.5551260620868368, |
|
"learning_rate": 1.838032227495163e-05, |
|
"loss": 0.9998938560485839, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3843074459567654, |
|
"grad_norm": 0.5943112601394075, |
|
"learning_rate": 1.8240788104018824e-05, |
|
"loss": 0.9902477264404297, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.400320256204964, |
|
"grad_norm": 0.5771043855645519, |
|
"learning_rate": 1.809606549011667e-05, |
|
"loss": 1.0216222763061524, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4099279423538831, |
|
"eval_loss": 0.9509572386741638, |
|
"eval_runtime": 14.4968, |
|
"eval_samples_per_second": 7.312, |
|
"eval_steps_per_second": 0.966, |
|
"eval_token_acc": 0.7491141650035217, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.41633306645316254, |
|
"grad_norm": 0.5544818028626222, |
|
"learning_rate": 1.7946245551354156e-05, |
|
"loss": 0.9958406448364258, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4323458767013611, |
|
"grad_norm": 0.5968141913341887, |
|
"learning_rate": 1.779142261514247e-05, |
|
"loss": 0.9774517059326172, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.44835868694955966, |
|
"grad_norm": 0.5850121301746795, |
|
"learning_rate": 1.7631694158805945e-05, |
|
"loss": 1.0328359603881836, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4611689351481185, |
|
"eval_loss": 0.9461285471916199, |
|
"eval_runtime": 14.5166, |
|
"eval_samples_per_second": 7.302, |
|
"eval_steps_per_second": 0.964, |
|
"eval_token_acc": 0.750104997900042, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.4643714971977582, |
|
"grad_norm": 0.5752900237956412, |
|
"learning_rate": 1.7467160748209872e-05, |
|
"loss": 1.0004414558410644, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4803843074459568, |
|
"grad_norm": 0.603759980991957, |
|
"learning_rate": 1.7297925974443675e-05, |
|
"loss": 0.9869029998779297, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49639711769415534, |
|
"grad_norm": 0.5943556191906708, |
|
"learning_rate": 1.7124096388599438e-05, |
|
"loss": 0.9789384841918946, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5124099279423538, |
|
"grad_norm": 0.534895902584602, |
|
"learning_rate": 1.6945781434686783e-05, |
|
"loss": 0.9825244903564453, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5124099279423538, |
|
"eval_loss": 0.9406914710998535, |
|
"eval_runtime": 14.543, |
|
"eval_samples_per_second": 7.289, |
|
"eval_steps_per_second": 0.963, |
|
"eval_token_acc": 0.7515060551502148, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5284227381905524, |
|
"grad_norm": 0.570687760675595, |
|
"learning_rate": 1.6763093380726347e-05, |
|
"loss": 0.9817087173461914, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.544435548438751, |
|
"grad_norm": 0.538629612243421, |
|
"learning_rate": 1.6576147248065268e-05, |
|
"loss": 0.9138102531433105, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5604483586869495, |
|
"grad_norm": 0.5367674967998894, |
|
"learning_rate": 1.6385060738959123e-05, |
|
"loss": 1.0132644653320313, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5636509207365893, |
|
"eval_loss": 0.9347953796386719, |
|
"eval_runtime": 14.5385, |
|
"eval_samples_per_second": 7.291, |
|
"eval_steps_per_second": 0.963, |
|
"eval_token_acc": 0.7528409121740046, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5764611689351481, |
|
"grad_norm": 0.5505130343882023, |
|
"learning_rate": 1.618995416246601e-05, |
|
"loss": 0.992160701751709, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5924739791833467, |
|
"grad_norm": 0.5177676977332548, |
|
"learning_rate": 1.5990950358699313e-05, |
|
"loss": 0.9666297912597657, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6084867894315452, |
|
"grad_norm": 0.5115335805994138, |
|
"learning_rate": 1.5788174621486936e-05, |
|
"loss": 1.0041114807128906, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6148919135308246, |
|
"eval_loss": 0.93098384141922, |
|
"eval_runtime": 14.3902, |
|
"eval_samples_per_second": 7.366, |
|
"eval_steps_per_second": 0.973, |
|
"eval_token_acc": 0.7536678723789866, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.6244995996797438, |
|
"grad_norm": 0.5250074708052181, |
|
"learning_rate": 1.5581754619485665e-05, |
|
"loss": 0.9529659271240234, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6405124099279423, |
|
"grad_norm": 0.5229963755514054, |
|
"learning_rate": 1.5371820315800316e-05, |
|
"loss": 0.9871055603027343, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6565252201761409, |
|
"grad_norm": 0.5619728554930467, |
|
"learning_rate": 1.515850388615829e-05, |
|
"loss": 0.9758607864379882, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6661329063250601, |
|
"eval_loss": 0.9274569749832153, |
|
"eval_runtime": 14.4012, |
|
"eval_samples_per_second": 7.36, |
|
"eval_steps_per_second": 0.972, |
|
"eval_token_acc": 0.7543787108753949, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6725380304243395, |
|
"grad_norm": 0.5225355460662542, |
|
"learning_rate": 1.4941939635691036e-05, |
|
"loss": 0.9769810676574707, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.688550840672538, |
|
"grad_norm": 0.568577150338433, |
|
"learning_rate": 1.472226391437487e-05, |
|
"loss": 0.9766319274902344, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7045636509207366, |
|
"grad_norm": 0.5082957989681854, |
|
"learning_rate": 1.4499615031184297e-05, |
|
"loss": 1.0206737518310547, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7173738991192954, |
|
"eval_loss": 0.9242973327636719, |
|
"eval_runtime": 14.3413, |
|
"eval_samples_per_second": 7.391, |
|
"eval_steps_per_second": 0.976, |
|
"eval_token_acc": 0.7550070704012276, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7205764611689351, |
|
"grad_norm": 0.5358194740461656, |
|
"learning_rate": 1.4274133167011974e-05, |
|
"loss": 0.9878074645996093, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7365892714171337, |
|
"grad_norm": 0.5339305541304005, |
|
"learning_rate": 1.4045960286410093e-05, |
|
"loss": 0.9679292678833008, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7526020816653323, |
|
"grad_norm": 0.5593865586297648, |
|
"learning_rate": 1.3815240048208754e-05, |
|
"loss": 1.0053502082824708, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7686148919135308, |
|
"grad_norm": 0.5401750319818177, |
|
"learning_rate": 1.3582117715067628e-05, |
|
"loss": 0.9748393058776855, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7686148919135308, |
|
"eval_loss": 0.9197457432746887, |
|
"eval_runtime": 14.3833, |
|
"eval_samples_per_second": 7.37, |
|
"eval_steps_per_second": 0.973, |
|
"eval_token_acc": 0.7559349588202035, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7846277021617294, |
|
"grad_norm": 0.5111792950923116, |
|
"learning_rate": 1.3346740062017838e-05, |
|
"loss": 0.9668654441833496, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.800640512409928, |
|
"grad_norm": 0.4908084859303256, |
|
"learning_rate": 1.3109255284051615e-05, |
|
"loss": 0.9447157859802247, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8166533226581265, |
|
"grad_norm": 0.562628279362229, |
|
"learning_rate": 1.2869812902817983e-05, |
|
"loss": 1.0060791969299316, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8198558847077662, |
|
"eval_loss": 0.9148392081260681, |
|
"eval_runtime": 14.5143, |
|
"eval_samples_per_second": 7.303, |
|
"eval_steps_per_second": 0.965, |
|
"eval_token_acc": 0.7570961759059393, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8326661329063251, |
|
"grad_norm": 0.5551169665873459, |
|
"learning_rate": 1.2628563672483147e-05, |
|
"loss": 0.9434755325317383, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8486789431545236, |
|
"grad_norm": 0.5384977105444301, |
|
"learning_rate": 1.2385659484814884e-05, |
|
"loss": 0.9425140380859375, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8646917534027222, |
|
"grad_norm": 0.5558312305489408, |
|
"learning_rate": 1.2141253273550698e-05, |
|
"loss": 0.9308363914489746, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8710968775020016, |
|
"eval_loss": 0.9132360816001892, |
|
"eval_runtime": 14.3299, |
|
"eval_samples_per_second": 7.397, |
|
"eval_steps_per_second": 0.977, |
|
"eval_token_acc": 0.756837886498084, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.8807045636509208, |
|
"grad_norm": 0.5408069248699333, |
|
"learning_rate": 1.1895498918109944e-05, |
|
"loss": 0.9968246459960938, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8967173738991193, |
|
"grad_norm": 0.5610077458655038, |
|
"learning_rate": 1.1648551146710557e-05, |
|
"loss": 0.9824394226074219, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9127301841473179, |
|
"grad_norm": 0.5255605897633258, |
|
"learning_rate": 1.1400565438951343e-05, |
|
"loss": 0.9811803817749023, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.922337870296237, |
|
"eval_loss": 0.9093953371047974, |
|
"eval_runtime": 14.6394, |
|
"eval_samples_per_second": 7.241, |
|
"eval_steps_per_second": 0.956, |
|
"eval_token_acc": 0.7579763133419503, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9287429943955164, |
|
"grad_norm": 0.5221824598602125, |
|
"learning_rate": 1.1151697927921242e-05, |
|
"loss": 0.9672993659973145, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.944755804643715, |
|
"grad_norm": 0.5526116046751202, |
|
"learning_rate": 1.0902105301897098e-05, |
|
"loss": 0.9626201629638672, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9607686148919136, |
|
"grad_norm": 0.5572541912245995, |
|
"learning_rate": 1.065194470569193e-05, |
|
"loss": 0.9313549041748047, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9735788630904724, |
|
"eval_loss": 0.9058682322502136, |
|
"eval_runtime": 14.643, |
|
"eval_samples_per_second": 7.239, |
|
"eval_steps_per_second": 0.956, |
|
"eval_token_acc": 0.7587338175717105, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.9767814251401121, |
|
"grad_norm": 0.5274146008466054, |
|
"learning_rate": 1.0401373641715725e-05, |
|
"loss": 0.9810738563537598, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.9927942353883107, |
|
"grad_norm": 0.524106900784095, |
|
"learning_rate": 1.0150549870811108e-05, |
|
"loss": 1.0075571060180664, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0064051240992795, |
|
"grad_norm": 0.6435736665358253, |
|
"learning_rate": 9.899631312926303e-06, |
|
"loss": 0.911602783203125, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.022417934347478, |
|
"grad_norm": 0.5331283028468047, |
|
"learning_rate": 9.648775947687914e-06, |
|
"loss": 0.8135837554931641, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.022417934347478, |
|
"eval_loss": 0.9115325212478638, |
|
"eval_runtime": 14.6079, |
|
"eval_samples_per_second": 7.256, |
|
"eval_steps_per_second": 0.958, |
|
"eval_token_acc": 0.7582769274847061, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0384307445956766, |
|
"grad_norm": 0.5591218522147324, |
|
"learning_rate": 9.398141714936104e-06, |
|
"loss": 0.8059094429016114, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.054443554843875, |
|
"grad_norm": 0.4934373856607571, |
|
"learning_rate": 9.147886415284903e-06, |
|
"loss": 0.7668675422668457, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0704563650920738, |
|
"grad_norm": 0.5225315577164972, |
|
"learning_rate": 8.898167610770075e-06, |
|
"loss": 0.8142057418823242, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.0736589271417134, |
|
"eval_loss": 0.9149956107139587, |
|
"eval_runtime": 14.4577, |
|
"eval_samples_per_second": 7.332, |
|
"eval_steps_per_second": 0.968, |
|
"eval_token_acc": 0.7586025023685573, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.0864691753402722, |
|
"grad_norm": 0.5505230936617594, |
|
"learning_rate": 8.649142525647271e-06, |
|
"loss": 0.7990794658660889, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1024819855884709, |
|
"grad_norm": 0.5093097727784036, |
|
"learning_rate": 8.400967947402802e-06, |
|
"loss": 0.8117720603942871, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1184947958366693, |
|
"grad_norm": 0.5036948822528423, |
|
"learning_rate": 8.153800128039441e-06, |
|
"loss": 0.7948765754699707, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1248999199359488, |
|
"eval_loss": 0.9154974818229675, |
|
"eval_runtime": 14.5423, |
|
"eval_samples_per_second": 7.289, |
|
"eval_steps_per_second": 0.963, |
|
"eval_token_acc": 0.7590235792183381, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.1345076060848678, |
|
"grad_norm": 0.4769674515169561, |
|
"learning_rate": 7.907794685699347e-06, |
|
"loss": 0.7767979621887207, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.1505204163330665, |
|
"grad_norm": 0.46860511663718424, |
|
"learning_rate": 7.663106506686057e-06, |
|
"loss": 0.8007712364196777, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1665332265812651, |
|
"grad_norm": 0.5257245232743394, |
|
"learning_rate": 7.419889647947294e-06, |
|
"loss": 0.7960629463195801, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.176140912730184, |
|
"eval_loss": 0.9117251038551331, |
|
"eval_runtime": 14.5432, |
|
"eval_samples_per_second": 7.289, |
|
"eval_steps_per_second": 0.963, |
|
"eval_token_acc": 0.7595477547813385, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.1825460368294636, |
|
"grad_norm": 0.4875981050318745, |
|
"learning_rate": 7.1782972400798825e-06, |
|
"loss": 0.8045159339904785, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.198558847077662, |
|
"grad_norm": 0.49641916149081433, |
|
"learning_rate": 6.938481390917966e-06, |
|
"loss": 0.7951483726501465, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2145716573258607, |
|
"grad_norm": 0.5035877161539227, |
|
"learning_rate": 6.700593089765086e-06, |
|
"loss": 0.7760053634643554, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2273819055244195, |
|
"eval_loss": 0.9107971787452698, |
|
"eval_runtime": 14.5702, |
|
"eval_samples_per_second": 7.275, |
|
"eval_steps_per_second": 0.961, |
|
"eval_token_acc": 0.76016417656143, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.2305844675740594, |
|
"grad_norm": 0.5325891391407898, |
|
"learning_rate": 6.4647821123305595e-06, |
|
"loss": 0.8205606460571289, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.2465972778222578, |
|
"grad_norm": 0.5135559248558815, |
|
"learning_rate": 6.231196926429913e-06, |
|
"loss": 0.7906920909881592, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2626100880704563, |
|
"grad_norm": 0.5335030354283354, |
|
"learning_rate": 5.9999845985087555e-06, |
|
"loss": 0.7642984867095948, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.278622898318655, |
|
"grad_norm": 0.48895214640029644, |
|
"learning_rate": 5.7712907010490036e-06, |
|
"loss": 0.7710936069488525, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.278622898318655, |
|
"eval_loss": 0.9103301167488098, |
|
"eval_runtime": 14.5639, |
|
"eval_samples_per_second": 7.278, |
|
"eval_steps_per_second": 0.961, |
|
"eval_token_acc": 0.7599558086357653, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2946357085668536, |
|
"grad_norm": 0.48139293101652547, |
|
"learning_rate": 5.54525922091568e-06, |
|
"loss": 0.7966293811798095, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.310648518815052, |
|
"grad_norm": 0.49809536135629684, |
|
"learning_rate": 5.322032468702037e-06, |
|
"loss": 0.7998294830322266, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3266613290632505, |
|
"grad_norm": 0.4763833723377931, |
|
"learning_rate": 5.101750989130061e-06, |
|
"loss": 0.7967349529266358, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.3298638911128904, |
|
"eval_loss": 0.9072456955909729, |
|
"eval_runtime": 14.9836, |
|
"eval_samples_per_second": 7.074, |
|
"eval_steps_per_second": 0.934, |
|
"eval_token_acc": 0.7605136269367636, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.3426741393114492, |
|
"grad_norm": 0.47173729117652685, |
|
"learning_rate": 4.884553472562809e-06, |
|
"loss": 0.7906300544738769, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3586869495596476, |
|
"grad_norm": 0.48564485172021316, |
|
"learning_rate": 4.670576667684217e-06, |
|
"loss": 0.8035748481750489, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.3746997598078463, |
|
"grad_norm": 0.4915778703667535, |
|
"learning_rate": 4.459955295401415e-06, |
|
"loss": 0.7896071434020996, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.3811048839071258, |
|
"eval_loss": 0.9058337807655334, |
|
"eval_runtime": 15.1686, |
|
"eval_samples_per_second": 6.988, |
|
"eval_steps_per_second": 0.923, |
|
"eval_token_acc": 0.7609249365400289, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.3907125700560448, |
|
"grad_norm": 0.5001513581944296, |
|
"learning_rate": 4.2528219640237565e-06, |
|
"loss": 0.771601390838623, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.4067253803042434, |
|
"grad_norm": 0.5015899348463313, |
|
"learning_rate": 4.0493070857719305e-06, |
|
"loss": 0.8050725936889649, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.4227381905524419, |
|
"grad_norm": 0.4385885001536844, |
|
"learning_rate": 3.849538794669767e-06, |
|
"loss": 0.7711515903472901, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.432345876701361, |
|
"eval_loss": 0.9053142666816711, |
|
"eval_runtime": 14.9957, |
|
"eval_samples_per_second": 7.069, |
|
"eval_steps_per_second": 0.934, |
|
"eval_token_acc": 0.7611723734517558, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.4387510008006406, |
|
"grad_norm": 0.4923484323839949, |
|
"learning_rate": 3.6536428658703594e-06, |
|
"loss": 0.8126641273498535, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.454763811048839, |
|
"grad_norm": 0.4696764489825615, |
|
"learning_rate": 3.4617426364674088e-06, |
|
"loss": 0.8082319259643554, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.4707766212970377, |
|
"grad_norm": 0.49222563172262457, |
|
"learning_rate": 3.2739589278415252e-06, |
|
"loss": 0.7895666122436523, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4835868694955965, |
|
"eval_loss": 0.9038885831832886, |
|
"eval_runtime": 15.1795, |
|
"eval_samples_per_second": 6.983, |
|
"eval_steps_per_second": 0.922, |
|
"eval_token_acc": 0.7614545383510934, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.4867894315452361, |
|
"grad_norm": 0.49886868372218357, |
|
"learning_rate": 3.0904099695904677e-06, |
|
"loss": 0.7927409172058105, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5028022417934348, |
|
"grad_norm": 0.4598131832969276, |
|
"learning_rate": 2.9112113250911844e-06, |
|
"loss": 0.7818885803222656, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5188150520416333, |
|
"grad_norm": 0.5625875539857127, |
|
"learning_rate": 2.7364758187404895e-06, |
|
"loss": 0.7819410800933838, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.534827862289832, |
|
"grad_norm": 0.5114133442743116, |
|
"learning_rate": 2.566313464920265e-06, |
|
"loss": 0.7769948959350585, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.534827862289832, |
|
"eval_loss": 0.902407705783844, |
|
"eval_runtime": 15.1633, |
|
"eval_samples_per_second": 6.991, |
|
"eval_steps_per_second": 0.923, |
|
"eval_token_acc": 0.7618387167140378, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5508406725380304, |
|
"grad_norm": 0.4441806210622428, |
|
"learning_rate": 2.4008313987318057e-06, |
|
"loss": 0.771674919128418, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.5668534827862288, |
|
"grad_norm": 0.49075637368256414, |
|
"learning_rate": 2.2401338085430326e-06, |
|
"loss": 0.7837276935577393, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5828662930344275, |
|
"grad_norm": 0.4597708187505928, |
|
"learning_rate": 2.0843218703909197e-06, |
|
"loss": 0.8008305549621582, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.5860688550840671, |
|
"eval_loss": 0.901805579662323, |
|
"eval_runtime": 14.941, |
|
"eval_samples_per_second": 7.095, |
|
"eval_steps_per_second": 0.937, |
|
"eval_token_acc": 0.7620666191327337, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.5988791032826262, |
|
"grad_norm": 0.4720072570006326, |
|
"learning_rate": 1.933493684280574e-06, |
|
"loss": 0.7590449333190918, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6148919135308246, |
|
"grad_norm": 0.4681413165579918, |
|
"learning_rate": 1.7877442124209454e-06, |
|
"loss": 0.7496448516845703, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.630904723779023, |
|
"grad_norm": 0.512917797281379, |
|
"learning_rate": 1.6471652194361131e-06, |
|
"loss": 0.8017988204956055, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6373098478783028, |
|
"eval_loss": 0.9010795950889587, |
|
"eval_runtime": 14.9598, |
|
"eval_samples_per_second": 7.086, |
|
"eval_steps_per_second": 0.936, |
|
"eval_token_acc": 0.762230491824272, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.6469175340272217, |
|
"grad_norm": 0.47200391166538574, |
|
"learning_rate": 1.5118452145898333e-06, |
|
"loss": 0.7879790782928466, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.6629303442754204, |
|
"grad_norm": 0.46302656170029055, |
|
"learning_rate": 1.3818693960596186e-06, |
|
"loss": 0.8126321792602539, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6789431545236189, |
|
"grad_norm": 0.47488283670703535, |
|
"learning_rate": 1.2573195972955366e-06, |
|
"loss": 0.7743080139160157, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.688550840672538, |
|
"eval_loss": 0.9008040428161621, |
|
"eval_runtime": 15.0798, |
|
"eval_samples_per_second": 7.029, |
|
"eval_steps_per_second": 0.928, |
|
"eval_token_acc": 0.7623943645158104, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.6949559647718173, |
|
"grad_norm": 0.4692191060367917, |
|
"learning_rate": 1.138274235497443e-06, |
|
"loss": 0.7871302604675293, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.710968775020016, |
|
"grad_norm": 0.4643020032036482, |
|
"learning_rate": 1.0248082622431089e-06, |
|
"loss": 0.8171819686889649, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.7269815852682147, |
|
"grad_norm": 0.4858630939402049, |
|
"learning_rate": 9.169931162983137e-07, |
|
"loss": 0.799337911605835, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7397918334667735, |
|
"eval_loss": 0.900395929813385, |
|
"eval_runtime": 15.0673, |
|
"eval_samples_per_second": 7.035, |
|
"eval_steps_per_second": 0.929, |
|
"eval_token_acc": 0.7622619640630444, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.742994395516413, |
|
"grad_norm": 0.4799218615796674, |
|
"learning_rate": 8.14896678638627e-07, |
|
"loss": 0.7801701068878174, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.7590072057646116, |
|
"grad_norm": 0.4601161306972559, |
|
"learning_rate": 7.185832297111939e-07, |
|
"loss": 0.7815041065216064, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7750200160128102, |
|
"grad_norm": 0.49546298524642746, |
|
"learning_rate": 6.281134089634344e-07, |
|
"loss": 0.7939868927001953, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.791032826261009, |
|
"grad_norm": 0.450946115728095, |
|
"learning_rate": 5.43544176664137e-07, |
|
"loss": 0.7616684913635254, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.791032826261009, |
|
"eval_loss": 0.9009150862693787, |
|
"eval_runtime": 15.0544, |
|
"eval_samples_per_second": 7.041, |
|
"eval_steps_per_second": 0.93, |
|
"eval_token_acc": 0.7623921940165848, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8070456365092074, |
|
"grad_norm": 0.47911563419797926, |
|
"learning_rate": 4.649287780409639e-07, |
|
"loss": 0.7857556343078613, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.8230584467574058, |
|
"grad_norm": 0.46252106401819404, |
|
"learning_rate": 3.9231670975699354e-07, |
|
"loss": 0.7810296535491943, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8390712570056045, |
|
"grad_norm": 0.4694570979755398, |
|
"learning_rate": 3.257536887473545e-07, |
|
"loss": 0.7967131614685059, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.8422738190552441, |
|
"eval_loss": 0.9001392722129822, |
|
"eval_runtime": 14.9754, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.935, |
|
"eval_token_acc": 0.7625387027143178, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.8550840672538031, |
|
"grad_norm": 0.4661576925242358, |
|
"learning_rate": 2.6528162343561593e-07, |
|
"loss": 0.7634014129638672, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8710968775020016, |
|
"grad_norm": 0.4606424682139549, |
|
"learning_rate": 2.109385873480141e-07, |
|
"loss": 0.803773021697998, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.8871096877502, |
|
"grad_norm": 0.45627441448144695, |
|
"learning_rate": 1.6275879514217052e-07, |
|
"loss": 0.8115141868591309, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8935148118494796, |
|
"eval_loss": 0.9000989198684692, |
|
"eval_runtime": 15.3159, |
|
"eval_samples_per_second": 6.921, |
|
"eval_steps_per_second": 0.914, |
|
"eval_token_acc": 0.7625256797189638, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.9031224979983987, |
|
"grad_norm": 0.46130608210077223, |
|
"learning_rate": 1.2077258106536927e-07, |
|
"loss": 0.8369948387145996, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.9191353082465974, |
|
"grad_norm": 0.4743433129321771, |
|
"learning_rate": 8.50063798559475e-08, |
|
"loss": 0.7816999912261963, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9351481184947958, |
|
"grad_norm": 0.4716524325522153, |
|
"learning_rate": 5.54827100998534e-08, |
|
"loss": 0.7883635520935058, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.944755804643715, |
|
"eval_loss": 0.90007084608078, |
|
"eval_runtime": 15.3084, |
|
"eval_samples_per_second": 6.924, |
|
"eval_steps_per_second": 0.915, |
|
"eval_token_acc": 0.7624746729871604, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.9511609287429943, |
|
"grad_norm": 0.44797137033963047, |
|
"learning_rate": 3.2220160052828245e-08, |
|
"loss": 0.7870570182800293, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.967173738991193, |
|
"grad_norm": 0.45207745564951624, |
|
"learning_rate": 1.523337593714036e-08, |
|
"loss": 0.8020266532897949, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.9831865492393916, |
|
"grad_norm": 0.46560058211157906, |
|
"learning_rate": 4.5330527202480656e-09, |
|
"loss": 0.8163101196289062, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9959967974379502, |
|
"eval_loss": 0.9000772833824158, |
|
"eval_runtime": 14.9354, |
|
"eval_samples_per_second": 7.097, |
|
"eval_steps_per_second": 0.937, |
|
"eval_token_acc": 0.762490951731353, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.99919935948759, |
|
"grad_norm": 0.49036708117796896, |
|
"learning_rate": 1.2592738119709957e-10, |
|
"loss": 0.8205068588256836, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.9000126123428345, |
|
"eval_runtime": 15.1395, |
|
"eval_samples_per_second": 7.002, |
|
"eval_steps_per_second": 0.925, |
|
"eval_token_acc": 0.7625473847112205, |
|
"step": 626 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 626, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6044514006305341e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|