{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9977753058954395,
  "eval_steps": 500,
  "global_step": 898,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002224694104560623,
      "grad_norm": 10.055098914546134,
      "learning_rate": 2.2222222222222224e-07,
      "loss": 1.3661,
      "step": 1
    },
    {
      "epoch": 0.011123470522803115,
      "grad_norm": 9.126296513090308,
      "learning_rate": 1.111111111111111e-06,
      "loss": 1.2942,
      "step": 5
    },
    {
      "epoch": 0.02224694104560623,
      "grad_norm": 7.84364716096091,
      "learning_rate": 2.222222222222222e-06,
      "loss": 1.313,
      "step": 10
    },
    {
      "epoch": 0.03337041156840934,
      "grad_norm": 5.7358942086951625,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.173,
      "step": 15
    },
    {
      "epoch": 0.04449388209121246,
      "grad_norm": 1.8899122666121964,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.9741,
      "step": 20
    },
    {
      "epoch": 0.05561735261401557,
      "grad_norm": 1.3213559323931832,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.9238,
      "step": 25
    },
    {
      "epoch": 0.06674082313681869,
      "grad_norm": 1.041365244653977,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.8728,
      "step": 30
    },
    {
      "epoch": 0.0778642936596218,
      "grad_norm": 0.9488843220134849,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.9007,
      "step": 35
    },
    {
      "epoch": 0.08898776418242492,
      "grad_norm": 0.8275801836954026,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.8661,
      "step": 40
    },
    {
      "epoch": 0.10011123470522804,
      "grad_norm": 0.9030018185879571,
      "learning_rate": 1e-05,
      "loss": 0.8484,
      "step": 45
    },
    {
      "epoch": 0.11123470522803114,
      "grad_norm": 0.7848210903418559,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.8291,
      "step": 50
    },
    {
      "epoch": 0.12235817575083426,
      "grad_norm": 0.9313858219522357,
      "learning_rate": 1.2222222222222224e-05,
      "loss": 0.8707,
      "step": 55
    },
    {
      "epoch": 0.13348164627363737,
      "grad_norm": 0.8227239131251839,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.8055,
      "step": 60
    },
    {
      "epoch": 0.1446051167964405,
      "grad_norm": 0.8199201964167689,
      "learning_rate": 1.4444444444444446e-05,
      "loss": 0.8151,
      "step": 65
    },
    {
      "epoch": 0.1557285873192436,
      "grad_norm": 0.7767041809184307,
      "learning_rate": 1.555555555555556e-05,
      "loss": 0.8384,
      "step": 70
    },
    {
      "epoch": 0.1668520578420467,
      "grad_norm": 0.9088502789460952,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.8106,
      "step": 75
    },
    {
      "epoch": 0.17797552836484984,
      "grad_norm": 0.8343739709762875,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 0.7932,
      "step": 80
    },
    {
      "epoch": 0.18909899888765294,
      "grad_norm": 0.8939354128507568,
      "learning_rate": 1.888888888888889e-05,
      "loss": 0.8045,
      "step": 85
    },
    {
      "epoch": 0.20022246941045607,
      "grad_norm": 0.885411772323368,
      "learning_rate": 2e-05,
      "loss": 0.8009,
      "step": 90
    },
    {
      "epoch": 0.21134593993325917,
      "grad_norm": 0.8890660179859062,
      "learning_rate": 1.9998110384864614e-05,
      "loss": 0.8225,
      "step": 95
    },
    {
      "epoch": 0.22246941045606228,
      "grad_norm": 0.8371399480864414,
      "learning_rate": 1.9992442253587533e-05,
      "loss": 0.7893,
      "step": 100
    },
    {
      "epoch": 0.2335928809788654,
      "grad_norm": 0.8656120072046296,
      "learning_rate": 1.998299774828608e-05,
      "loss": 0.812,
      "step": 105
    },
    {
      "epoch": 0.2447163515016685,
      "grad_norm": 0.8486947049775396,
      "learning_rate": 1.9969780438256295e-05,
      "loss": 0.7886,
      "step": 110
    },
    {
      "epoch": 0.25583982202447164,
      "grad_norm": 0.8507931458580409,
      "learning_rate": 1.995279531862399e-05,
      "loss": 0.8078,
      "step": 115
    },
    {
      "epoch": 0.26696329254727474,
      "grad_norm": 0.8150830104906274,
      "learning_rate": 1.993204880845699e-05,
      "loss": 0.7672,
      "step": 120
    },
    {
      "epoch": 0.27808676307007785,
      "grad_norm": 0.8642627038468238,
      "learning_rate": 1.9907548748339223e-05,
      "loss": 0.7929,
      "step": 125
    },
    {
      "epoch": 0.289210233592881,
      "grad_norm": 0.8541146407873894,
      "learning_rate": 1.987930439740757e-05,
      "loss": 0.7873,
      "step": 130
    },
    {
      "epoch": 0.3003337041156841,
      "grad_norm": 0.8995167237276087,
      "learning_rate": 1.9847326429852632e-05,
      "loss": 0.7862,
      "step": 135
    },
    {
      "epoch": 0.3114571746384872,
      "grad_norm": 0.8173771695441595,
      "learning_rate": 1.981162693088471e-05,
      "loss": 0.7983,
      "step": 140
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 0.8404409912157184,
      "learning_rate": 1.977221939216652e-05,
      "loss": 0.8037,
      "step": 145
    },
    {
      "epoch": 0.3337041156840934,
      "grad_norm": 0.8245961889334814,
      "learning_rate": 1.9729118706714377e-05,
      "loss": 0.8027,
      "step": 150
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.8314933715558246,
      "learning_rate": 1.96823411632698e-05,
      "loss": 0.7843,
      "step": 155
    },
    {
      "epoch": 0.3559510567296997,
      "grad_norm": 0.8726345334934287,
      "learning_rate": 1.9631904440143614e-05,
      "loss": 0.793,
      "step": 160
    },
    {
      "epoch": 0.3670745272525028,
      "grad_norm": 0.8415196157334568,
      "learning_rate": 1.9577827598534888e-05,
      "loss": 0.7668,
      "step": 165
    },
    {
      "epoch": 0.3781979977753059,
      "grad_norm": 0.8815634680126696,
      "learning_rate": 1.95201310753273e-05,
      "loss": 0.7851,
      "step": 170
    },
    {
      "epoch": 0.389321468298109,
      "grad_norm": 0.8024209332569573,
      "learning_rate": 1.945883667536556e-05,
      "loss": 0.7772,
      "step": 175
    },
    {
      "epoch": 0.40044493882091214,
      "grad_norm": 0.8490296704540178,
      "learning_rate": 1.9393967563214833e-05,
      "loss": 0.7761,
      "step": 180
    },
    {
      "epoch": 0.41156840934371525,
      "grad_norm": 0.8454078852442415,
      "learning_rate": 1.9325548254406354e-05,
      "loss": 0.7624,
      "step": 185
    },
    {
      "epoch": 0.42269187986651835,
      "grad_norm": 0.8511908385629164,
      "learning_rate": 1.925360460617242e-05,
      "loss": 0.7668,
      "step": 190
    },
    {
      "epoch": 0.43381535038932145,
      "grad_norm": 0.7982929700309319,
      "learning_rate": 1.9178163807674343e-05,
      "loss": 0.7634,
      "step": 195
    },
    {
      "epoch": 0.44493882091212456,
      "grad_norm": 0.7816495059977551,
      "learning_rate": 1.9099254369727062e-05,
      "loss": 0.7748,
      "step": 200
    },
    {
      "epoch": 0.4560622914349277,
      "grad_norm": 0.7563436109378239,
      "learning_rate": 1.901690611402423e-05,
      "loss": 0.7722,
      "step": 205
    },
    {
      "epoch": 0.4671857619577308,
      "grad_norm": 0.8466958907764663,
      "learning_rate": 1.8931150161867917e-05,
      "loss": 0.7765,
      "step": 210
    },
    {
      "epoch": 0.4783092324805339,
      "grad_norm": 0.7620831089947199,
      "learning_rate": 1.8842018922407153e-05,
      "loss": 0.7704,
      "step": 215
    },
    {
      "epoch": 0.489432703003337,
      "grad_norm": 0.84897869766796,
      "learning_rate": 1.874954608038976e-05,
      "loss": 0.7729,
      "step": 220
    },
    {
      "epoch": 0.5005561735261401,
      "grad_norm": 0.8210216367491153,
      "learning_rate": 1.8653766583432114e-05,
      "loss": 0.7716,
      "step": 225
    },
    {
      "epoch": 0.5116796440489433,
      "grad_norm": 0.8730864132572128,
      "learning_rate": 1.855471662881164e-05,
      "loss": 0.7882,
      "step": 230
    },
    {
      "epoch": 0.5228031145717463,
      "grad_norm": 0.7987767766986732,
      "learning_rate": 1.845243364978702e-05,
      "loss": 0.7609,
      "step": 235
    },
    {
      "epoch": 0.5339265850945495,
      "grad_norm": 0.7622467591417209,
      "learning_rate": 1.8346956301451303e-05,
      "loss": 0.7551,
      "step": 240
    },
    {
      "epoch": 0.5450500556173526,
      "grad_norm": 0.831755089876671,
      "learning_rate": 1.8238324446123265e-05,
      "loss": 0.7634,
      "step": 245
    },
    {
      "epoch": 0.5561735261401557,
      "grad_norm": 0.7942015525420777,
      "learning_rate": 1.8126579138282502e-05,
      "loss": 0.7541,
      "step": 250
    },
    {
      "epoch": 0.5672969966629589,
      "grad_norm": 0.824338690380162,
      "learning_rate": 1.801176260905402e-05,
      "loss": 0.7641,
      "step": 255
    },
    {
      "epoch": 0.578420467185762,
      "grad_norm": 0.8160737587426502,
      "learning_rate": 1.7893918250248106e-05,
      "loss": 0.747,
      "step": 260
    },
    {
      "epoch": 0.5895439377085651,
      "grad_norm": 0.7619822007899363,
      "learning_rate": 1.7773090597961554e-05,
      "loss": 0.7353,
      "step": 265
    },
    {
      "epoch": 0.6006674082313682,
      "grad_norm": 0.7898915967361627,
      "learning_rate": 1.764932531574648e-05,
      "loss": 0.7588,
      "step": 270
    },
    {
      "epoch": 0.6117908787541713,
      "grad_norm": 0.9091978830168115,
      "learning_rate": 1.7522669177352978e-05,
      "loss": 0.781,
      "step": 275
    },
    {
      "epoch": 0.6229143492769744,
      "grad_norm": 0.8198662250585645,
      "learning_rate": 1.7393170049052274e-05,
      "loss": 0.7545,
      "step": 280
    },
    {
      "epoch": 0.6340378197997776,
      "grad_norm": 0.7880789917007047,
      "learning_rate": 1.7260876871546935e-05,
      "loss": 0.7726,
      "step": 285
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 0.8385501161327127,
      "learning_rate": 1.7125839641475074e-05,
      "loss": 0.7619,
      "step": 290
    },
    {
      "epoch": 0.6562847608453838,
      "grad_norm": 0.8924470377096518,
      "learning_rate": 1.6988109392515432e-05,
      "loss": 0.7346,
      "step": 295
    },
    {
      "epoch": 0.6674082313681868,
      "grad_norm": 0.7890602183226353,
      "learning_rate": 1.6847738176100632e-05,
      "loss": 0.7643,
      "step": 300
    },
    {
      "epoch": 0.67853170189099,
      "grad_norm": 0.8110214434516344,
      "learning_rate": 1.6704779041745686e-05,
      "loss": 0.7603,
      "step": 305
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.7873486916181355,
      "learning_rate": 1.65592860169994e-05,
      "loss": 0.7595,
      "step": 310
    },
    {
      "epoch": 0.7007786429365962,
      "grad_norm": 0.7527591581883117,
      "learning_rate": 1.6411314087026108e-05,
      "loss": 0.7508,
      "step": 315
    },
    {
      "epoch": 0.7119021134593994,
      "grad_norm": 0.8283445625547928,
      "learning_rate": 1.6260919173825507e-05,
      "loss": 0.7387,
      "step": 320
    },
    {
      "epoch": 0.7230255839822024,
      "grad_norm": 0.7262591008119376,
      "learning_rate": 1.6108158115098443e-05,
      "loss": 0.7264,
      "step": 325
    },
    {
      "epoch": 0.7341490545050056,
      "grad_norm": 0.7739070575646189,
      "learning_rate": 1.595308864276666e-05,
      "loss": 0.7435,
      "step": 330
    },
    {
      "epoch": 0.7452725250278087,
      "grad_norm": 0.8489625309544235,
      "learning_rate": 1.5795769361154548e-05,
      "loss": 0.7615,
      "step": 335
    },
    {
      "epoch": 0.7563959955506118,
      "grad_norm": 0.769293674851008,
      "learning_rate": 1.5636259724841224e-05,
      "loss": 0.7536,
      "step": 340
    },
    {
      "epoch": 0.7675194660734149,
      "grad_norm": 0.7920492833518509,
      "learning_rate": 1.5474620016191296e-05,
      "loss": 0.7431,
      "step": 345
    },
    {
      "epoch": 0.778642936596218,
      "grad_norm": 0.7468241826638446,
      "learning_rate": 1.531091132257275e-05,
      "loss": 0.732,
      "step": 350
    },
    {
      "epoch": 0.7897664071190211,
      "grad_norm": 0.743401655764991,
      "learning_rate": 1.5145195513270644e-05,
      "loss": 0.7291,
      "step": 355
    },
    {
      "epoch": 0.8008898776418243,
      "grad_norm": 0.8018681967515083,
      "learning_rate": 1.4977535216105258e-05,
      "loss": 0.7257,
      "step": 360
    },
    {
      "epoch": 0.8120133481646273,
      "grad_norm": 0.7600864193920938,
      "learning_rate": 1.480799379376362e-05,
      "loss": 0.741,
      "step": 365
    },
    {
      "epoch": 0.8231368186874305,
      "grad_norm": 0.8389942553789884,
      "learning_rate": 1.4636635319853274e-05,
      "loss": 0.742,
      "step": 370
    },
    {
      "epoch": 0.8342602892102335,
      "grad_norm": 0.7886124496265561,
      "learning_rate": 1.4463524554687398e-05,
      "loss": 0.7545,
      "step": 375
    },
    {
      "epoch": 0.8453837597330367,
      "grad_norm": 0.7344461106509269,
      "learning_rate": 1.4288726920810381e-05,
      "loss": 0.7278,
      "step": 380
    },
    {
      "epoch": 0.8565072302558399,
      "grad_norm": 0.8189552942167496,
      "learning_rate": 1.4112308478273144e-05,
      "loss": 0.7461,
      "step": 385
    },
    {
      "epoch": 0.8676307007786429,
      "grad_norm": 0.7834178214856152,
      "learning_rate": 1.3934335899667526e-05,
      "loss": 0.7378,
      "step": 390
    },
    {
      "epoch": 0.8787541713014461,
      "grad_norm": 0.7714482088214847,
      "learning_rate": 1.3754876444929165e-05,
      "loss": 0.7489,
      "step": 395
    },
    {
      "epoch": 0.8898776418242491,
      "grad_norm": 0.7755466989206458,
      "learning_rate": 1.357399793591844e-05,
      "loss": 0.7469,
      "step": 400
    },
    {
      "epoch": 0.9010011123470523,
      "grad_norm": 0.8377177614112041,
      "learning_rate": 1.3391768730789e-05,
      "loss": 0.739,
      "step": 405
    },
    {
      "epoch": 0.9121245828698554,
      "grad_norm": 0.7908000259612985,
      "learning_rate": 1.3208257698153677e-05,
      "loss": 0.7281,
      "step": 410
    },
    {
      "epoch": 0.9232480533926585,
      "grad_norm": 0.7818338363656034,
      "learning_rate": 1.3023534191057427e-05,
      "loss": 0.753,
      "step": 415
    },
    {
      "epoch": 0.9343715239154616,
      "grad_norm": 0.749284045444348,
      "learning_rate": 1.283766802076722e-05,
      "loss": 0.738,
      "step": 420
    },
    {
      "epoch": 0.9454949944382648,
      "grad_norm": 0.7839632426822802,
      "learning_rate": 1.2650729430388764e-05,
      "loss": 0.7436,
      "step": 425
    },
    {
      "epoch": 0.9566184649610678,
      "grad_norm": 0.7721084207333134,
      "learning_rate": 1.2462789068320016e-05,
      "loss": 0.748,
      "step": 430
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 0.7290139666717954,
      "learning_rate": 1.2273917961551513e-05,
      "loss": 0.7239,
      "step": 435
    },
    {
      "epoch": 0.978865406006674,
      "grad_norm": 0.8056619429328024,
      "learning_rate": 1.2084187488823657e-05,
      "loss": 0.738,
      "step": 440
    },
    {
      "epoch": 0.9899888765294772,
      "grad_norm": 0.7770001841474352,
      "learning_rate": 1.1893669353651032e-05,
      "loss": 0.7385,
      "step": 445
    },
    {
      "epoch": 0.9988876529477196,
      "eval_loss": 0.7580433487892151,
      "eval_runtime": 5.7595,
      "eval_samples_per_second": 70.666,
      "eval_steps_per_second": 2.257,
      "step": 449
    },
    {
      "epoch": 1.0011123470522802,
      "grad_norm": 0.8135422551691482,
      "learning_rate": 1.1702435557223988e-05,
      "loss": 0.7266,
      "step": 450
    },
    {
      "epoch": 1.0122358175750834,
      "grad_norm": 0.8964435701944624,
      "learning_rate": 1.1510558371197754e-05,
      "loss": 0.6412,
      "step": 455
    },
    {
      "epoch": 1.0233592880978866,
      "grad_norm": 0.8600984235769464,
      "learning_rate": 1.1318110310379303e-05,
      "loss": 0.6433,
      "step": 460
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.8458880171077358,
      "learning_rate": 1.112516410532233e-05,
      "loss": 0.6292,
      "step": 465
    },
    {
      "epoch": 1.0456062291434929,
      "grad_norm": 0.8284785412200435,
      "learning_rate": 1.0931792674840718e-05,
      "loss": 0.6339,
      "step": 470
    },
    {
      "epoch": 1.0567296996662958,
      "grad_norm": 0.7444359225044646,
      "learning_rate": 1.073806909845082e-05,
      "loss": 0.6355,
      "step": 475
    },
    {
      "epoch": 1.067853170189099,
      "grad_norm": 0.7723149976392786,
      "learning_rate": 1.0544066588753044e-05,
      "loss": 0.6235,
      "step": 480
    },
    {
      "epoch": 1.0789766407119021,
      "grad_norm": 0.7593187602310192,
      "learning_rate": 1.0349858463763114e-05,
      "loss": 0.6105,
      "step": 485
    },
    {
      "epoch": 1.0901001112347053,
      "grad_norm": 0.7855139607799839,
      "learning_rate": 1.0155518119203511e-05,
      "loss": 0.6568,
      "step": 490
    },
    {
      "epoch": 1.1012235817575085,
      "grad_norm": 0.7898700329509037,
      "learning_rate": 9.961119000765532e-06,
      "loss": 0.6225,
      "step": 495
    },
    {
      "epoch": 1.1123470522803114,
      "grad_norm": 0.8669564798886822,
      "learning_rate": 9.766734576352478e-06,
      "loss": 0.6391,
      "step": 500
    },
    {
      "epoch": 1.1234705228031145,
      "grad_norm": 0.7837289412561955,
      "learning_rate": 9.572438308314447e-06,
      "loss": 0.6171,
      "step": 505
    },
    {
      "epoch": 1.1345939933259177,
      "grad_norm": 0.8070851055141667,
      "learning_rate": 9.378303625685196e-06,
      "loss": 0.6282,
      "step": 510
    },
    {
      "epoch": 1.1457174638487209,
      "grad_norm": 0.7979212620110364,
      "learning_rate": 9.184403896431649e-06,
      "loss": 0.6233,
      "step": 515
    },
    {
      "epoch": 1.156840934371524,
      "grad_norm": 0.7704548256097349,
      "learning_rate": 8.990812399726435e-06,
      "loss": 0.5992,
      "step": 520
    },
    {
      "epoch": 1.167964404894327,
      "grad_norm": 0.8961749452380681,
      "learning_rate": 8.797602298254005e-06,
      "loss": 0.6378,
      "step": 525
    },
    {
      "epoch": 1.1790878754171301,
      "grad_norm": 0.8053259116501744,
      "learning_rate": 8.604846610560771e-06,
      "loss": 0.605,
      "step": 530
    },
    {
      "epoch": 1.1902113459399333,
      "grad_norm": 0.7782548725591264,
      "learning_rate": 8.412618183459707e-06,
      "loss": 0.6081,
      "step": 535
    },
    {
      "epoch": 1.2013348164627364,
      "grad_norm": 0.7780589674933976,
      "learning_rate": 8.22098966449988e-06,
      "loss": 0.6251,
      "step": 540
    },
    {
      "epoch": 1.2124582869855396,
      "grad_norm": 0.8213283153654349,
      "learning_rate": 8.030033474511248e-06,
      "loss": 0.6092,
      "step": 545
    },
    {
      "epoch": 1.2235817575083425,
      "grad_norm": 0.7889430474165346,
      "learning_rate": 7.839821780235168e-06,
      "loss": 0.645,
      "step": 550
    },
    {
      "epoch": 1.2347052280311457,
      "grad_norm": 0.8377912406937705,
      "learning_rate": 7.650426467050926e-06,
      "loss": 0.6286,
      "step": 555
    },
    {
      "epoch": 1.2458286985539488,
      "grad_norm": 0.8488324297083317,
      "learning_rate": 7.4619191118085955e-06,
      "loss": 0.6129,
      "step": 560
    },
    {
      "epoch": 1.256952169076752,
      "grad_norm": 0.7608536209939344,
      "learning_rate": 7.274370955778498e-06,
      "loss": 0.6072,
      "step": 565
    },
    {
      "epoch": 1.2680756395995552,
      "grad_norm": 0.7928583676765779,
      "learning_rate": 7.0878528777274814e-06,
      "loss": 0.6042,
      "step": 570
    },
    {
      "epoch": 1.279199110122358,
      "grad_norm": 1.5882806537259504,
      "learning_rate": 6.9024353671322086e-06,
      "loss": 0.647,
      "step": 575
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 0.8447762128943721,
      "learning_rate": 6.718188497539554e-06,
      "loss": 0.6214,
      "step": 580
    },
    {
      "epoch": 1.3014460511679644,
      "grad_norm": 0.8209371297634136,
      "learning_rate": 6.535181900084206e-06,
      "loss": 0.6079,
      "step": 585
    },
    {
      "epoch": 1.3125695216907676,
      "grad_norm": 0.8624820584895021,
      "learning_rate": 6.35348473717345e-06,
      "loss": 0.6221,
      "step": 590
    },
    {
      "epoch": 1.3236929922135707,
      "grad_norm": 0.8056054069589547,
      "learning_rate": 6.173165676349103e-06,
      "loss": 0.6254,
      "step": 595
    },
    {
      "epoch": 1.3348164627363737,
      "grad_norm": 0.8273703875367165,
      "learning_rate": 5.994292864336473e-06,
      "loss": 0.6119,
      "step": 600
    },
    {
      "epoch": 1.3459399332591768,
      "grad_norm": 0.8723777846098392,
      "learning_rate": 5.816933901290136e-06,
      "loss": 0.6395,
      "step": 605
    },
    {
      "epoch": 1.35706340378198,
      "grad_norm": 0.8674540470170442,
      "learning_rate": 5.64115581524629e-06,
      "loss": 0.6163,
      "step": 610
    },
    {
      "epoch": 1.3681868743047831,
      "grad_norm": 0.8762720501618131,
      "learning_rate": 5.4670250367913025e-06,
      "loss": 0.6225,
      "step": 615
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.8491305197357123,
      "learning_rate": 5.294607373956071e-06,
      "loss": 0.6093,
      "step": 620
    },
    {
      "epoch": 1.3904338153503892,
      "grad_norm": 0.8166208130830984,
      "learning_rate": 5.1239679873456636e-06,
      "loss": 0.6361,
      "step": 625
    },
    {
      "epoch": 1.4015572858731924,
      "grad_norm": 0.8009131846316857,
      "learning_rate": 4.955171365513603e-06,
      "loss": 0.617,
      "step": 630
    },
    {
      "epoch": 1.4126807563959956,
      "grad_norm": 0.8366294436519559,
      "learning_rate": 4.788281300590169e-06,
      "loss": 0.6118,
      "step": 635
    },
    {
      "epoch": 1.4238042269187987,
      "grad_norm": 0.8808417873595291,
      "learning_rate": 4.623360864173893e-06,
      "loss": 0.6177,
      "step": 640
    },
    {
      "epoch": 1.4349276974416019,
      "grad_norm": 0.8629287010139255,
      "learning_rate": 4.4604723834953315e-06,
      "loss": 0.6251,
      "step": 645
    },
    {
      "epoch": 1.4460511679644048,
      "grad_norm": 0.7875671775994082,
      "learning_rate": 4.299677417862174e-06,
      "loss": 0.6199,
      "step": 650
    },
    {
      "epoch": 1.457174638487208,
      "grad_norm": 0.7828729245421459,
      "learning_rate": 4.141036735394575e-06,
      "loss": 0.6215,
      "step": 655
    },
    {
      "epoch": 1.4682981090100111,
      "grad_norm": 0.8007957527163657,
      "learning_rate": 3.984610290059467e-06,
      "loss": 0.6253,
      "step": 660
    },
    {
      "epoch": 1.4794215795328143,
      "grad_norm": 0.7910153849628225,
      "learning_rate": 3.830457199012585e-06,
      "loss": 0.6157,
      "step": 665
    },
    {
      "epoch": 1.4905450500556174,
      "grad_norm": 0.8255633946537543,
      "learning_rate": 3.6786357202567367e-06,
      "loss": 0.6182,
      "step": 670
    },
    {
      "epoch": 1.5016685205784204,
      "grad_norm": 0.8740651322002517,
      "learning_rate": 3.529203230624747e-06,
      "loss": 0.6334,
      "step": 675
    },
    {
      "epoch": 1.5127919911012235,
      "grad_norm": 0.7540939777667681,
      "learning_rate": 3.3822162040954355e-06,
      "loss": 0.596,
      "step": 680
    },
    {
      "epoch": 1.5239154616240267,
      "grad_norm": 0.7934734563966453,
      "learning_rate": 3.2377301904508163e-06,
      "loss": 0.5951,
      "step": 685
    },
    {
      "epoch": 1.5350389321468298,
      "grad_norm": 0.8131581917254441,
      "learning_rate": 3.0957997942825337e-06,
      "loss": 0.612,
      "step": 690
    },
    {
      "epoch": 1.546162402669633,
      "grad_norm": 0.8454586171154052,
      "learning_rate": 2.956478654355539e-06,
      "loss": 0.6293,
      "step": 695
    },
    {
      "epoch": 1.557285873192436,
      "grad_norm": 0.8339559981613245,
      "learning_rate": 2.8198194233367747e-06,
      "loss": 0.6088,
      "step": 700
    },
    {
      "epoch": 1.568409343715239,
      "grad_norm": 0.821374605567537,
      "learning_rate": 2.6858737478965036e-06,
      "loss": 0.6233,
      "step": 705
    },
    {
      "epoch": 1.5795328142380423,
      "grad_norm": 0.7711457173871649,
      "learning_rate": 2.5546922491898497e-06,
      "loss": 0.6262,
      "step": 710
    },
    {
      "epoch": 1.5906562847608454,
      "grad_norm": 0.841438327290974,
      "learning_rate": 2.4263245037258996e-06,
      "loss": 0.6359,
      "step": 715
    },
    {
      "epoch": 1.6017797552836486,
      "grad_norm": 0.8546385634639357,
      "learning_rate": 2.3008190246316033e-06,
      "loss": 0.6312,
      "step": 720
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 0.7477788666400325,
      "learning_rate": 2.178223243317532e-06,
      "loss": 0.6115,
      "step": 725
    },
    {
      "epoch": 1.624026696329255,
      "grad_norm": 0.8448273050299596,
      "learning_rate": 2.058583491552465e-06,
      "loss": 0.641,
      "step": 730
    },
    {
      "epoch": 1.6351501668520578,
      "grad_norm": 0.8261008969765296,
      "learning_rate": 1.9419449839535522e-06,
      "loss": 0.617,
      "step": 735
    },
    {
      "epoch": 1.646273637374861,
      "grad_norm": 0.8357122860638048,
      "learning_rate": 1.8283518008986566e-06,
      "loss": 0.607,
      "step": 740
    },
    {
      "epoch": 1.6573971078976641,
      "grad_norm": 0.8350425917666864,
      "learning_rate": 1.7178468718673712e-06,
      "loss": 0.607,
      "step": 745
    },
    {
      "epoch": 1.668520578420467,
      "grad_norm": 0.8305517949246249,
      "learning_rate": 1.6104719592169905e-06,
      "loss": 0.6151,
      "step": 750
    },
    {
      "epoch": 1.6796440489432705,
      "grad_norm": 0.8186870980107259,
      "learning_rate": 1.506267642399525e-06,
      "loss": 0.6385,
      "step": 755
    },
    {
      "epoch": 1.6907675194660734,
      "grad_norm": 0.7636300249499965,
      "learning_rate": 1.405273302625828e-06,
      "loss": 0.6075,
      "step": 760
    },
    {
      "epoch": 1.7018909899888766,
      "grad_norm": 0.8363995841599257,
      "learning_rate": 1.3075271079825035e-06,
      "loss": 0.638,
      "step": 765
    },
    {
      "epoch": 1.7130144605116797,
      "grad_norm": 0.7622345846029582,
      "learning_rate": 1.2130659990073146e-06,
      "loss": 0.6125,
      "step": 770
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 0.7854431717950116,
      "learning_rate": 1.1219256747285046e-06,
      "loss": 0.6203,
      "step": 775
    },
    {
      "epoch": 1.735261401557286,
      "grad_norm": 0.80577828518747,
      "learning_rate": 1.0341405791733183e-06,
      "loss": 0.6318,
      "step": 780
    },
    {
      "epoch": 1.746384872080089,
      "grad_norm": 0.8025241111868774,
      "learning_rate": 9.497438883507981e-07,
      "loss": 0.6104,
      "step": 785
    },
    {
      "epoch": 1.7575083426028921,
      "grad_norm": 0.7484880734203889,
      "learning_rate": 8.687674977138116e-07,
      "loss": 0.6111,
      "step": 790
    },
    {
      "epoch": 1.7686318131256953,
      "grad_norm": 0.7844040069263457,
      "learning_rate": 7.912420101050366e-07,
      "loss": 0.6058,
      "step": 795
    },
    {
      "epoch": 1.7797552836484982,
      "grad_norm": 0.832757853518197,
      "learning_rate": 7.171967241914224e-07,
      "loss": 0.6168,
      "step": 800
    },
    {
      "epoch": 1.7908787541713016,
      "grad_norm": 0.7982807830933683,
      "learning_rate": 6.466596233915601e-07,
      "loss": 0.6111,
      "step": 805
    },
    {
      "epoch": 1.8020022246941045,
      "grad_norm": 0.8285138016892091,
      "learning_rate": 5.796573653001091e-07,
      "loss": 0.6206,
      "step": 810
    },
    {
      "epoch": 1.8131256952169077,
      "grad_norm": 0.8098593367246673,
      "learning_rate": 5.162152716132662e-07,
      "loss": 0.6301,
      "step": 815
    },
    {
      "epoch": 1.8242491657397109,
      "grad_norm": 0.8317063889098834,
      "learning_rate": 4.563573185591219e-07,
      "loss": 0.5913,
      "step": 820
    },
    {
      "epoch": 1.8353726362625138,
      "grad_norm": 0.7802676617619877,
      "learning_rate": 4.0010612783648927e-07,
      "loss": 0.6009,
      "step": 825
    },
    {
      "epoch": 1.8464961067853172,
      "grad_norm": 0.8359715597402506,
      "learning_rate": 3.474829580656436e-07,
      "loss": 0.6129,
      "step": 830
    },
    {
      "epoch": 1.85761957730812,
      "grad_norm": 0.7764499156414768,
      "learning_rate": 2.9850769675419776e-07,
      "loss": 0.6233,
      "step": 835
    },
    {
      "epoch": 1.8687430478309233,
      "grad_norm": 0.8028424415714934,
      "learning_rate": 2.5319885278115907e-07,
      "loss": 0.6079,
      "step": 840
    },
    {
      "epoch": 1.8798665183537264,
      "grad_norm": 0.8831031088714258,
      "learning_rate": 2.115735494019966e-07,
      "loss": 0.6258,
      "step": 845
    },
    {
      "epoch": 1.8909899888765294,
      "grad_norm": 0.7653050119042255,
      "learning_rate": 1.7364751777736334e-07,
      "loss": 0.6212,
      "step": 850
    },
    {
      "epoch": 1.9021134593993327,
      "grad_norm": 0.8479149432867373,
      "learning_rate": 1.394350910279385e-07,
      "loss": 0.6006,
      "step": 855
    },
    {
      "epoch": 1.9132369299221357,
      "grad_norm": 0.8537537714787563,
      "learning_rate": 1.0894919881760168e-07,
      "loss": 0.6291,
      "step": 860
    },
    {
      "epoch": 1.9243604004449388,
      "grad_norm": 0.8315410821072657,
      "learning_rate": 8.220136246701926e-08,
      "loss": 0.6226,
      "step": 865
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 0.8500259667306754,
      "learning_rate": 5.920169059947412e-08,
      "loss": 0.6108,
      "step": 870
    },
    {
      "epoch": 1.946607341490545,
      "grad_norm": 0.8522681381543702,
      "learning_rate": 3.99588753205804e-08,
      "loss": 0.6239,
      "step": 875
    },
    {
      "epoch": 1.9577308120133483,
      "grad_norm": 0.7583572716472294,
      "learning_rate": 2.4480188933336812e-08,
      "loss": 0.6128,
      "step": 880
    },
    {
      "epoch": 1.9688542825361512,
      "grad_norm": 0.7629282401468074,
      "learning_rate": 1.277148118975835e-08,
      "loss": 0.6022,
      "step": 885
    },
    {
      "epoch": 1.9799777530589544,
      "grad_norm": 0.817692770966463,
      "learning_rate": 4.837177080119215e-09,
      "loss": 0.6154,
      "step": 890
    },
    {
      "epoch": 1.9911012235817576,
      "grad_norm": 0.774109128612781,
      "learning_rate": 6.8027516064606e-10,
      "loss": 0.616,
      "step": 895
    },
    {
      "epoch": 1.9977753058954395,
      "eval_loss": 0.7532592415809631,
      "eval_runtime": 5.6688,
      "eval_samples_per_second": 71.797,
      "eval_steps_per_second": 2.293,
      "step": 898
    },
    {
      "epoch": 1.9977753058954395,
      "step": 898,
      "total_flos": 81428314521600.0,
      "train_loss": 0.7065894536026868,
      "train_runtime": 5871.8562,
      "train_samples_per_second": 19.593,
      "train_steps_per_second": 0.153
    }
  ],
  "logging_steps": 5,
  "max_steps": 898,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "total_flos": 81428314521600.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}