{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9977753058954395, "eval_steps": 500, "global_step": 898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002224694104560623, "grad_norm": 10.055098914546134, "learning_rate": 2.2222222222222224e-07, "loss": 1.3661, "step": 1 }, { "epoch": 0.011123470522803115, "grad_norm": 9.126296513090308, "learning_rate": 1.111111111111111e-06, "loss": 1.2942, "step": 5 }, { "epoch": 0.02224694104560623, "grad_norm": 7.84364716096091, "learning_rate": 2.222222222222222e-06, "loss": 1.313, "step": 10 }, { "epoch": 0.03337041156840934, "grad_norm": 5.7358942086951625, "learning_rate": 3.3333333333333333e-06, "loss": 1.173, "step": 15 }, { "epoch": 0.04449388209121246, "grad_norm": 1.8899122666121964, "learning_rate": 4.444444444444444e-06, "loss": 0.9741, "step": 20 }, { "epoch": 0.05561735261401557, "grad_norm": 1.3213559323931832, "learning_rate": 5.555555555555557e-06, "loss": 0.9238, "step": 25 }, { "epoch": 0.06674082313681869, "grad_norm": 1.041365244653977, "learning_rate": 6.666666666666667e-06, "loss": 0.8728, "step": 30 }, { "epoch": 0.0778642936596218, "grad_norm": 0.9488843220134849, "learning_rate": 7.77777777777778e-06, "loss": 0.9007, "step": 35 }, { "epoch": 0.08898776418242492, "grad_norm": 0.8275801836954026, "learning_rate": 8.888888888888888e-06, "loss": 0.8661, "step": 40 }, { "epoch": 0.10011123470522804, "grad_norm": 0.9030018185879571, "learning_rate": 1e-05, "loss": 0.8484, "step": 45 }, { "epoch": 0.11123470522803114, "grad_norm": 0.7848210903418559, "learning_rate": 1.1111111111111113e-05, "loss": 0.8291, "step": 50 }, { "epoch": 0.12235817575083426, "grad_norm": 0.9313858219522357, "learning_rate": 1.2222222222222224e-05, "loss": 0.8707, "step": 55 }, { "epoch": 0.13348164627363737, "grad_norm": 0.8227239131251839, "learning_rate": 1.3333333333333333e-05, "loss": 0.8055, "step": 60 }, { "epoch": 0.1446051167964405, "grad_norm": 0.8199201964167689, "learning_rate": 1.4444444444444446e-05, "loss": 0.8151, "step": 65 }, { "epoch": 0.1557285873192436, "grad_norm": 0.7767041809184307, "learning_rate": 1.555555555555556e-05, "loss": 0.8384, "step": 70 }, { "epoch": 0.1668520578420467, "grad_norm": 0.9088502789460952, "learning_rate": 1.6666666666666667e-05, "loss": 0.8106, "step": 75 }, { "epoch": 0.17797552836484984, "grad_norm": 0.8343739709762875, "learning_rate": 1.7777777777777777e-05, "loss": 0.7932, "step": 80 }, { "epoch": 0.18909899888765294, "grad_norm": 0.8939354128507568, "learning_rate": 1.888888888888889e-05, "loss": 0.8045, "step": 85 }, { "epoch": 0.20022246941045607, "grad_norm": 0.885411772323368, "learning_rate": 2e-05, "loss": 0.8009, "step": 90 }, { "epoch": 0.21134593993325917, "grad_norm": 0.8890660179859062, "learning_rate": 1.9998110384864614e-05, "loss": 0.8225, "step": 95 }, { "epoch": 0.22246941045606228, "grad_norm": 0.8371399480864414, "learning_rate": 1.9992442253587533e-05, "loss": 0.7893, "step": 100 }, { "epoch": 0.2335928809788654, "grad_norm": 0.8656120072046296, "learning_rate": 1.998299774828608e-05, "loss": 0.812, "step": 105 }, { "epoch": 0.2447163515016685, "grad_norm": 0.8486947049775396, "learning_rate": 1.9969780438256295e-05, "loss": 0.7886, "step": 110 }, { "epoch": 0.25583982202447164, "grad_norm": 0.8507931458580409, "learning_rate": 1.995279531862399e-05, "loss": 0.8078, "step": 115 }, { "epoch": 0.26696329254727474, "grad_norm": 0.8150830104906274, "learning_rate": 1.993204880845699e-05, "loss": 0.7672, "step": 120 }, { "epoch": 0.27808676307007785, "grad_norm": 0.8642627038468238, "learning_rate": 1.9907548748339223e-05, "loss": 0.7929, "step": 125 }, { "epoch": 0.289210233592881, "grad_norm": 0.8541146407873894, "learning_rate": 1.987930439740757e-05, "loss": 0.7873, "step": 130 }, { "epoch": 0.3003337041156841, "grad_norm": 0.8995167237276087, "learning_rate": 1.9847326429852632e-05, "loss": 0.7862, "step": 135 }, { "epoch": 0.3114571746384872, "grad_norm": 0.8173771695441595, "learning_rate": 1.981162693088471e-05, "loss": 0.7983, "step": 140 }, { "epoch": 0.3225806451612903, "grad_norm": 0.8404409912157184, "learning_rate": 1.977221939216652e-05, "loss": 0.8037, "step": 145 }, { "epoch": 0.3337041156840934, "grad_norm": 0.8245961889334814, "learning_rate": 1.9729118706714377e-05, "loss": 0.8027, "step": 150 }, { "epoch": 0.3448275862068966, "grad_norm": 0.8314933715558246, "learning_rate": 1.96823411632698e-05, "loss": 0.7843, "step": 155 }, { "epoch": 0.3559510567296997, "grad_norm": 0.8726345334934287, "learning_rate": 1.9631904440143614e-05, "loss": 0.793, "step": 160 }, { "epoch": 0.3670745272525028, "grad_norm": 0.8415196157334568, "learning_rate": 1.9577827598534888e-05, "loss": 0.7668, "step": 165 }, { "epoch": 0.3781979977753059, "grad_norm": 0.8815634680126696, "learning_rate": 1.95201310753273e-05, "loss": 0.7851, "step": 170 }, { "epoch": 0.389321468298109, "grad_norm": 0.8024209332569573, "learning_rate": 1.945883667536556e-05, "loss": 0.7772, "step": 175 }, { "epoch": 0.40044493882091214, "grad_norm": 0.8490296704540178, "learning_rate": 1.9393967563214833e-05, "loss": 0.7761, "step": 180 }, { "epoch": 0.41156840934371525, "grad_norm": 0.8454078852442415, "learning_rate": 1.9325548254406354e-05, "loss": 0.7624, "step": 185 }, { "epoch": 0.42269187986651835, "grad_norm": 0.8511908385629164, "learning_rate": 1.925360460617242e-05, "loss": 0.7668, "step": 190 }, { "epoch": 0.43381535038932145, "grad_norm": 0.7982929700309319, "learning_rate": 1.9178163807674343e-05, "loss": 0.7634, "step": 195 }, { "epoch": 0.44493882091212456, "grad_norm": 0.7816495059977551, "learning_rate": 1.9099254369727062e-05, "loss": 0.7748, "step": 200 }, { "epoch": 0.4560622914349277, "grad_norm": 0.7563436109378239, "learning_rate": 1.901690611402423e-05, "loss": 0.7722, "step": 205 }, { "epoch": 0.4671857619577308, "grad_norm": 0.8466958907764663, "learning_rate": 1.8931150161867917e-05, "loss": 0.7765, "step": 210 }, { "epoch": 0.4783092324805339, "grad_norm": 0.7620831089947199, "learning_rate": 1.8842018922407153e-05, "loss": 0.7704, "step": 215 }, { "epoch": 0.489432703003337, "grad_norm": 0.84897869766796, "learning_rate": 1.874954608038976e-05, "loss": 0.7729, "step": 220 }, { "epoch": 0.5005561735261401, "grad_norm": 0.8210216367491153, "learning_rate": 1.8653766583432114e-05, "loss": 0.7716, "step": 225 }, { "epoch": 0.5116796440489433, "grad_norm": 0.8730864132572128, "learning_rate": 1.855471662881164e-05, "loss": 0.7882, "step": 230 }, { "epoch": 0.5228031145717463, "grad_norm": 0.7987767766986732, "learning_rate": 1.845243364978702e-05, "loss": 0.7609, "step": 235 }, { "epoch": 0.5339265850945495, "grad_norm": 0.7622467591417209, "learning_rate": 1.8346956301451303e-05, "loss": 0.7551, "step": 240 }, { "epoch": 0.5450500556173526, "grad_norm": 0.831755089876671, "learning_rate": 1.8238324446123265e-05, "loss": 0.7634, "step": 245 }, { "epoch": 0.5561735261401557, "grad_norm": 0.7942015525420777, "learning_rate": 1.8126579138282502e-05, "loss": 0.7541, "step": 250 }, { "epoch": 0.5672969966629589, "grad_norm": 0.824338690380162, "learning_rate": 1.801176260905402e-05, "loss": 0.7641, "step": 255 }, { "epoch": 0.578420467185762, "grad_norm": 0.8160737587426502, "learning_rate": 1.7893918250248106e-05, "loss": 0.747, "step": 260 }, { "epoch": 0.5895439377085651, "grad_norm": 0.7619822007899363, "learning_rate": 1.7773090597961554e-05, "loss": 0.7353, "step": 265 }, { "epoch": 0.6006674082313682, "grad_norm": 0.7898915967361627, "learning_rate": 1.764932531574648e-05, "loss": 0.7588, "step": 270 }, { "epoch": 0.6117908787541713, "grad_norm": 0.9091978830168115, "learning_rate": 1.7522669177352978e-05, "loss": 0.781, "step": 275 }, { "epoch": 0.6229143492769744, "grad_norm": 0.8198662250585645, "learning_rate": 1.7393170049052274e-05, "loss": 0.7545, "step": 280 }, { "epoch": 0.6340378197997776, "grad_norm": 0.7880789917007047, "learning_rate": 1.7260876871546935e-05, "loss": 0.7726, "step": 285 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8385501161327127, "learning_rate": 1.7125839641475074e-05, "loss": 0.7619, "step": 290 }, { "epoch": 0.6562847608453838, "grad_norm": 0.8924470377096518, "learning_rate": 1.6988109392515432e-05, "loss": 0.7346, "step": 295 }, { "epoch": 0.6674082313681868, "grad_norm": 0.7890602183226353, "learning_rate": 1.6847738176100632e-05, "loss": 0.7643, "step": 300 }, { "epoch": 0.67853170189099, "grad_norm": 0.8110214434516344, "learning_rate": 1.6704779041745686e-05, "loss": 0.7603, "step": 305 }, { "epoch": 0.6896551724137931, "grad_norm": 0.7873486916181355, "learning_rate": 1.65592860169994e-05, "loss": 0.7595, "step": 310 }, { "epoch": 0.7007786429365962, "grad_norm": 0.7527591581883117, "learning_rate": 1.6411314087026108e-05, "loss": 0.7508, "step": 315 }, { "epoch": 0.7119021134593994, "grad_norm": 0.8283445625547928, "learning_rate": 1.6260919173825507e-05, "loss": 0.7387, "step": 320 }, { "epoch": 0.7230255839822024, "grad_norm": 0.7262591008119376, "learning_rate": 1.6108158115098443e-05, "loss": 0.7264, "step": 325 }, { "epoch": 0.7341490545050056, "grad_norm": 0.7739070575646189, "learning_rate": 1.595308864276666e-05, "loss": 0.7435, "step": 330 }, { "epoch": 0.7452725250278087, "grad_norm": 0.8489625309544235, "learning_rate": 1.5795769361154548e-05, "loss": 0.7615, "step": 335 }, { "epoch": 0.7563959955506118, "grad_norm": 0.769293674851008, "learning_rate": 1.5636259724841224e-05, "loss": 0.7536, "step": 340 }, { "epoch": 0.7675194660734149, "grad_norm": 0.7920492833518509, "learning_rate": 1.5474620016191296e-05, "loss": 0.7431, "step": 345 }, { "epoch": 0.778642936596218, "grad_norm": 0.7468241826638446, "learning_rate": 1.531091132257275e-05, "loss": 0.732, "step": 350 }, { "epoch": 0.7897664071190211, "grad_norm": 0.743401655764991, "learning_rate": 1.5145195513270644e-05, "loss": 0.7291, "step": 355 }, { "epoch": 0.8008898776418243, "grad_norm": 0.8018681967515083, "learning_rate": 1.4977535216105258e-05, "loss": 0.7257, "step": 360 }, { "epoch": 0.8120133481646273, "grad_norm": 0.7600864193920938, "learning_rate": 1.480799379376362e-05, "loss": 0.741, "step": 365 }, { "epoch": 0.8231368186874305, "grad_norm": 0.8389942553789884, "learning_rate": 1.4636635319853274e-05, "loss": 0.742, "step": 370 }, { "epoch": 0.8342602892102335, "grad_norm": 0.7886124496265561, "learning_rate": 1.4463524554687398e-05, "loss": 0.7545, "step": 375 }, { "epoch": 0.8453837597330367, "grad_norm": 0.7344461106509269, "learning_rate": 1.4288726920810381e-05, "loss": 0.7278, "step": 380 }, { "epoch": 0.8565072302558399, "grad_norm": 0.8189552942167496, "learning_rate": 1.4112308478273144e-05, "loss": 0.7461, "step": 385 }, { "epoch": 0.8676307007786429, "grad_norm": 0.7834178214856152, "learning_rate": 1.3934335899667526e-05, "loss": 0.7378, "step": 390 }, { "epoch": 0.8787541713014461, "grad_norm": 0.7714482088214847, "learning_rate": 1.3754876444929165e-05, "loss": 0.7489, "step": 395 }, { "epoch": 0.8898776418242491, "grad_norm": 0.7755466989206458, "learning_rate": 1.357399793591844e-05, "loss": 0.7469, "step": 400 }, { "epoch": 0.9010011123470523, "grad_norm": 0.8377177614112041, "learning_rate": 1.3391768730789e-05, "loss": 0.739, "step": 405 }, { "epoch": 0.9121245828698554, "grad_norm": 0.7908000259612985, "learning_rate": 1.3208257698153677e-05, "loss": 0.7281, "step": 410 }, { "epoch": 0.9232480533926585, "grad_norm": 0.7818338363656034, "learning_rate": 1.3023534191057427e-05, "loss": 0.753, "step": 415 }, { "epoch": 0.9343715239154616, "grad_norm": 0.749284045444348, "learning_rate": 1.283766802076722e-05, "loss": 0.738, "step": 420 }, { "epoch": 0.9454949944382648, "grad_norm": 0.7839632426822802, "learning_rate": 1.2650729430388764e-05, "loss": 0.7436, "step": 425 }, { "epoch": 0.9566184649610678, "grad_norm": 0.7721084207333134, "learning_rate": 1.2462789068320016e-05, "loss": 0.748, "step": 430 }, { "epoch": 0.967741935483871, "grad_norm": 0.7290139666717954, "learning_rate": 1.2273917961551513e-05, "loss": 0.7239, "step": 435 }, { "epoch": 0.978865406006674, "grad_norm": 0.8056619429328024, "learning_rate": 1.2084187488823657e-05, "loss": 0.738, "step": 440 }, { "epoch": 0.9899888765294772, "grad_norm": 0.7770001841474352, "learning_rate": 1.1893669353651032e-05, "loss": 0.7385, "step": 445 }, { "epoch": 0.9988876529477196, "eval_loss": 0.7580433487892151, "eval_runtime": 5.7595, "eval_samples_per_second": 70.666, "eval_steps_per_second": 2.257, "step": 449 }, { "epoch": 1.0011123470522802, "grad_norm": 0.8135422551691482, "learning_rate": 1.1702435557223988e-05, "loss": 0.7266, "step": 450 }, { "epoch": 1.0122358175750834, "grad_norm": 0.8964435701944624, "learning_rate": 1.1510558371197754e-05, "loss": 0.6412, "step": 455 }, { "epoch": 1.0233592880978866, "grad_norm": 0.8600984235769464, "learning_rate": 1.1318110310379303e-05, "loss": 0.6433, "step": 460 }, { "epoch": 1.0344827586206897, "grad_norm": 0.8458880171077358, "learning_rate": 1.112516410532233e-05, "loss": 0.6292, "step": 465 }, { "epoch": 1.0456062291434929, "grad_norm": 0.8284785412200435, "learning_rate": 1.0931792674840718e-05, "loss": 0.6339, "step": 470 }, { "epoch": 1.0567296996662958, "grad_norm": 0.7444359225044646, "learning_rate": 1.073806909845082e-05, "loss": 0.6355, "step": 475 }, { "epoch": 1.067853170189099, "grad_norm": 0.7723149976392786, "learning_rate": 1.0544066588753044e-05, "loss": 0.6235, "step": 480 }, { "epoch": 1.0789766407119021, "grad_norm": 0.7593187602310192, "learning_rate": 1.0349858463763114e-05, "loss": 0.6105, "step": 485 }, { "epoch": 1.0901001112347053, "grad_norm": 0.7855139607799839, "learning_rate": 1.0155518119203511e-05, "loss": 0.6568, "step": 490 }, { "epoch": 1.1012235817575085, "grad_norm": 0.7898700329509037, "learning_rate": 9.961119000765532e-06, "loss": 0.6225, "step": 495 }, { "epoch": 1.1123470522803114, "grad_norm": 0.8669564798886822, "learning_rate": 9.766734576352478e-06, "loss": 0.6391, "step": 500 }, { "epoch": 1.1234705228031145, "grad_norm": 0.7837289412561955, "learning_rate": 9.572438308314447e-06, "loss": 0.6171, "step": 505 }, { "epoch": 1.1345939933259177, "grad_norm": 0.8070851055141667, "learning_rate": 9.378303625685196e-06, "loss": 0.6282, "step": 510 }, { "epoch": 1.1457174638487209, "grad_norm": 0.7979212620110364, "learning_rate": 9.184403896431649e-06, "loss": 0.6233, "step": 515 }, { "epoch": 1.156840934371524, "grad_norm": 0.7704548256097349, "learning_rate": 8.990812399726435e-06, "loss": 0.5992, "step": 520 }, { "epoch": 1.167964404894327, "grad_norm": 0.8961749452380681, "learning_rate": 8.797602298254005e-06, "loss": 0.6378, "step": 525 }, { "epoch": 1.1790878754171301, "grad_norm": 0.8053259116501744, "learning_rate": 8.604846610560771e-06, "loss": 0.605, "step": 530 }, { "epoch": 1.1902113459399333, "grad_norm": 0.7782548725591264, "learning_rate": 8.412618183459707e-06, "loss": 0.6081, "step": 535 }, { "epoch": 1.2013348164627364, "grad_norm": 0.7780589674933976, "learning_rate": 8.22098966449988e-06, "loss": 0.6251, "step": 540 }, { "epoch": 1.2124582869855396, "grad_norm": 0.8213283153654349, "learning_rate": 8.030033474511248e-06, "loss": 0.6092, "step": 545 }, { "epoch": 1.2235817575083425, "grad_norm": 0.7889430474165346, "learning_rate": 7.839821780235168e-06, "loss": 0.645, "step": 550 }, { "epoch": 1.2347052280311457, "grad_norm": 0.8377912406937705, "learning_rate": 7.650426467050926e-06, "loss": 0.6286, "step": 555 }, { "epoch": 1.2458286985539488, "grad_norm": 0.8488324297083317, "learning_rate": 7.4619191118085955e-06, "loss": 0.6129, "step": 560 }, { "epoch": 1.256952169076752, "grad_norm": 0.7608536209939344, "learning_rate": 7.274370955778498e-06, "loss": 0.6072, "step": 565 }, { "epoch": 1.2680756395995552, "grad_norm": 0.7928583676765779, "learning_rate": 7.0878528777274814e-06, "loss": 0.6042, "step": 570 }, { "epoch": 1.279199110122358, "grad_norm": 1.5882806537259504, "learning_rate": 6.9024353671322086e-06, "loss": 0.647, "step": 575 }, { "epoch": 1.2903225806451613, "grad_norm": 0.8447762128943721, "learning_rate": 6.718188497539554e-06, "loss": 0.6214, "step": 580 }, { "epoch": 1.3014460511679644, "grad_norm": 0.8209371297634136, "learning_rate": 6.535181900084206e-06, "loss": 0.6079, "step": 585 }, { "epoch": 1.3125695216907676, "grad_norm": 0.8624820584895021, "learning_rate": 6.35348473717345e-06, "loss": 0.6221, "step": 590 }, { "epoch": 1.3236929922135707, "grad_norm": 0.8056054069589547, "learning_rate": 6.173165676349103e-06, "loss": 0.6254, "step": 595 }, { "epoch": 1.3348164627363737, "grad_norm": 0.8273703875367165, "learning_rate": 5.994292864336473e-06, "loss": 0.6119, "step": 600 }, { "epoch": 1.3459399332591768, "grad_norm": 0.8723777846098392, "learning_rate": 5.816933901290136e-06, "loss": 0.6395, "step": 605 }, { "epoch": 1.35706340378198, "grad_norm": 0.8674540470170442, "learning_rate": 5.64115581524629e-06, "loss": 0.6163, "step": 610 }, { "epoch": 1.3681868743047831, "grad_norm": 0.8762720501618131, "learning_rate": 5.4670250367913025e-06, "loss": 0.6225, "step": 615 }, { "epoch": 1.3793103448275863, "grad_norm": 0.8491305197357123, "learning_rate": 5.294607373956071e-06, "loss": 0.6093, "step": 620 }, { "epoch": 1.3904338153503892, "grad_norm": 0.8166208130830984, "learning_rate": 5.1239679873456636e-06, "loss": 0.6361, "step": 625 }, { "epoch": 1.4015572858731924, "grad_norm": 0.8009131846316857, "learning_rate": 4.955171365513603e-06, "loss": 0.617, "step": 630 }, { "epoch": 1.4126807563959956, "grad_norm": 0.8366294436519559, "learning_rate": 4.788281300590169e-06, "loss": 0.6118, "step": 635 }, { "epoch": 1.4238042269187987, "grad_norm": 0.8808417873595291, "learning_rate": 4.623360864173893e-06, "loss": 0.6177, "step": 640 }, { "epoch": 1.4349276974416019, "grad_norm": 0.8629287010139255, "learning_rate": 4.4604723834953315e-06, "loss": 0.6251, "step": 645 }, { "epoch": 1.4460511679644048, "grad_norm": 0.7875671775994082, "learning_rate": 4.299677417862174e-06, "loss": 0.6199, "step": 650 }, { "epoch": 1.457174638487208, "grad_norm": 0.7828729245421459, "learning_rate": 4.141036735394575e-06, "loss": 0.6215, "step": 655 }, { "epoch": 1.4682981090100111, "grad_norm": 0.8007957527163657, "learning_rate": 3.984610290059467e-06, "loss": 0.6253, "step": 660 }, { "epoch": 1.4794215795328143, "grad_norm": 0.7910153849628225, "learning_rate": 3.830457199012585e-06, "loss": 0.6157, "step": 665 }, { "epoch": 1.4905450500556174, "grad_norm": 0.8255633946537543, "learning_rate": 3.6786357202567367e-06, "loss": 0.6182, "step": 670 }, { "epoch": 1.5016685205784204, "grad_norm": 0.8740651322002517, "learning_rate": 3.529203230624747e-06, "loss": 0.6334, "step": 675 }, { "epoch": 1.5127919911012235, "grad_norm": 0.7540939777667681, "learning_rate": 3.3822162040954355e-06, "loss": 0.596, "step": 680 }, { "epoch": 1.5239154616240267, "grad_norm": 0.7934734563966453, "learning_rate": 3.2377301904508163e-06, "loss": 0.5951, "step": 685 }, { "epoch": 1.5350389321468298, "grad_norm": 0.8131581917254441, "learning_rate": 3.0957997942825337e-06, "loss": 0.612, "step": 690 }, { "epoch": 1.546162402669633, "grad_norm": 0.8454586171154052, "learning_rate": 2.956478654355539e-06, "loss": 0.6293, "step": 695 }, { "epoch": 1.557285873192436, "grad_norm": 0.8339559981613245, "learning_rate": 2.8198194233367747e-06, "loss": 0.6088, "step": 700 }, { "epoch": 1.568409343715239, "grad_norm": 0.821374605567537, "learning_rate": 2.6858737478965036e-06, "loss": 0.6233, "step": 705 }, { "epoch": 1.5795328142380423, "grad_norm": 0.7711457173871649, "learning_rate": 2.5546922491898497e-06, "loss": 0.6262, "step": 710 }, { "epoch": 1.5906562847608454, "grad_norm": 0.841438327290974, "learning_rate": 2.4263245037258996e-06, "loss": 0.6359, "step": 715 }, { "epoch": 1.6017797552836486, "grad_norm": 0.8546385634639357, "learning_rate": 2.3008190246316033e-06, "loss": 0.6312, "step": 720 }, { "epoch": 1.6129032258064515, "grad_norm": 0.7477788666400325, "learning_rate": 2.178223243317532e-06, "loss": 0.6115, "step": 725 }, { "epoch": 1.624026696329255, "grad_norm": 0.8448273050299596, "learning_rate": 2.058583491552465e-06, "loss": 0.641, "step": 730 }, { "epoch": 1.6351501668520578, "grad_norm": 0.8261008969765296, "learning_rate": 1.9419449839535522e-06, "loss": 0.617, "step": 735 }, { "epoch": 1.646273637374861, "grad_norm": 0.8357122860638048, "learning_rate": 1.8283518008986566e-06, "loss": 0.607, "step": 740 }, { "epoch": 1.6573971078976641, "grad_norm": 0.8350425917666864, "learning_rate": 1.7178468718673712e-06, "loss": 0.607, "step": 745 }, { "epoch": 1.668520578420467, "grad_norm": 0.8305517949246249, "learning_rate": 1.6104719592169905e-06, "loss": 0.6151, "step": 750 }, { "epoch": 1.6796440489432705, "grad_norm": 0.8186870980107259, "learning_rate": 1.506267642399525e-06, "loss": 0.6385, "step": 755 }, { "epoch": 1.6907675194660734, "grad_norm": 0.7636300249499965, "learning_rate": 1.405273302625828e-06, "loss": 0.6075, "step": 760 }, { "epoch": 1.7018909899888766, "grad_norm": 0.8363995841599257, "learning_rate": 1.3075271079825035e-06, "loss": 0.638, "step": 765 }, { "epoch": 1.7130144605116797, "grad_norm": 0.7622345846029582, "learning_rate": 1.2130659990073146e-06, "loss": 0.6125, "step": 770 }, { "epoch": 1.7241379310344827, "grad_norm": 0.7854431717950116, "learning_rate": 1.1219256747285046e-06, "loss": 0.6203, "step": 775 }, { "epoch": 1.735261401557286, "grad_norm": 0.80577828518747, "learning_rate": 1.0341405791733183e-06, "loss": 0.6318, "step": 780 }, { "epoch": 1.746384872080089, "grad_norm": 0.8025241111868774, "learning_rate": 9.497438883507981e-07, "loss": 0.6104, "step": 785 }, { "epoch": 1.7575083426028921, "grad_norm": 0.7484880734203889, "learning_rate": 8.687674977138116e-07, "loss": 0.6111, "step": 790 }, { "epoch": 1.7686318131256953, "grad_norm": 0.7844040069263457, "learning_rate": 7.912420101050366e-07, "loss": 0.6058, "step": 795 }, { "epoch": 1.7797552836484982, "grad_norm": 0.832757853518197, "learning_rate": 7.171967241914224e-07, "loss": 0.6168, "step": 800 }, { "epoch": 1.7908787541713016, "grad_norm": 0.7982807830933683, "learning_rate": 6.466596233915601e-07, "loss": 0.6111, "step": 805 }, { "epoch": 1.8020022246941045, "grad_norm": 0.8285138016892091, "learning_rate": 5.796573653001091e-07, "loss": 0.6206, "step": 810 }, { "epoch": 1.8131256952169077, "grad_norm": 0.8098593367246673, "learning_rate": 5.162152716132662e-07, "loss": 0.6301, "step": 815 }, { "epoch": 1.8242491657397109, "grad_norm": 0.8317063889098834, "learning_rate": 4.563573185591219e-07, "loss": 0.5913, "step": 820 }, { "epoch": 1.8353726362625138, "grad_norm": 0.7802676617619877, "learning_rate": 4.0010612783648927e-07, "loss": 0.6009, "step": 825 }, { "epoch": 1.8464961067853172, "grad_norm": 0.8359715597402506, "learning_rate": 3.474829580656436e-07, "loss": 0.6129, "step": 830 }, { "epoch": 1.85761957730812, "grad_norm": 0.7764499156414768, "learning_rate": 2.9850769675419776e-07, "loss": 0.6233, "step": 835 }, { "epoch": 1.8687430478309233, "grad_norm": 0.8028424415714934, "learning_rate": 2.5319885278115907e-07, "loss": 0.6079, "step": 840 }, { "epoch": 1.8798665183537264, "grad_norm": 0.8831031088714258, "learning_rate": 2.115735494019966e-07, "loss": 0.6258, "step": 845 }, { "epoch": 1.8909899888765294, "grad_norm": 0.7653050119042255, "learning_rate": 1.7364751777736334e-07, "loss": 0.6212, "step": 850 }, { "epoch": 1.9021134593993327, "grad_norm": 0.8479149432867373, "learning_rate": 1.394350910279385e-07, "loss": 0.6006, "step": 855 }, { "epoch": 1.9132369299221357, "grad_norm": 0.8537537714787563, "learning_rate": 1.0894919881760168e-07, "loss": 0.6291, "step": 860 }, { "epoch": 1.9243604004449388, "grad_norm": 0.8315410821072657, "learning_rate": 8.220136246701926e-08, "loss": 0.6226, "step": 865 }, { "epoch": 1.935483870967742, "grad_norm": 0.8500259667306754, "learning_rate": 5.920169059947412e-08, "loss": 0.6108, "step": 870 }, { "epoch": 1.946607341490545, "grad_norm": 0.8522681381543702, "learning_rate": 3.99588753205804e-08, "loss": 0.6239, "step": 875 }, { "epoch": 1.9577308120133483, "grad_norm": 0.7583572716472294, "learning_rate": 2.4480188933336812e-08, "loss": 0.6128, "step": 880 }, { "epoch": 1.9688542825361512, "grad_norm": 0.7629282401468074, "learning_rate": 1.277148118975835e-08, "loss": 0.6022, "step": 885 }, { "epoch": 1.9799777530589544, "grad_norm": 0.817692770966463, "learning_rate": 4.837177080119215e-09, "loss": 0.6154, "step": 890 }, { "epoch": 1.9911012235817576, "grad_norm": 0.774109128612781, "learning_rate": 6.8027516064606e-10, "loss": 0.616, "step": 895 }, { "epoch": 1.9977753058954395, "eval_loss": 0.7532592415809631, "eval_runtime": 5.6688, "eval_samples_per_second": 71.797, "eval_steps_per_second": 2.293, "step": 898 }, { "epoch": 1.9977753058954395, "step": 898, "total_flos": 81428314521600.0, "train_loss": 0.7065894536026868, "train_runtime": 5871.8562, "train_samples_per_second": 19.593, "train_steps_per_second": 0.153 } ], "logging_steps": 5, "max_steps": 898, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 81428314521600.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }