{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9977753058954395,
"eval_steps": 500,
"global_step": 898,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002224694104560623,
"grad_norm": 10.055098914546134,
"learning_rate": 2.2222222222222224e-07,
"loss": 1.3661,
"step": 1
},
{
"epoch": 0.011123470522803115,
"grad_norm": 9.126296513090308,
"learning_rate": 1.111111111111111e-06,
"loss": 1.2942,
"step": 5
},
{
"epoch": 0.02224694104560623,
"grad_norm": 7.84364716096091,
"learning_rate": 2.222222222222222e-06,
"loss": 1.313,
"step": 10
},
{
"epoch": 0.03337041156840934,
"grad_norm": 5.7358942086951625,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.173,
"step": 15
},
{
"epoch": 0.04449388209121246,
"grad_norm": 1.8899122666121964,
"learning_rate": 4.444444444444444e-06,
"loss": 0.9741,
"step": 20
},
{
"epoch": 0.05561735261401557,
"grad_norm": 1.3213559323931832,
"learning_rate": 5.555555555555557e-06,
"loss": 0.9238,
"step": 25
},
{
"epoch": 0.06674082313681869,
"grad_norm": 1.041365244653977,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8728,
"step": 30
},
{
"epoch": 0.0778642936596218,
"grad_norm": 0.9488843220134849,
"learning_rate": 7.77777777777778e-06,
"loss": 0.9007,
"step": 35
},
{
"epoch": 0.08898776418242492,
"grad_norm": 0.8275801836954026,
"learning_rate": 8.888888888888888e-06,
"loss": 0.8661,
"step": 40
},
{
"epoch": 0.10011123470522804,
"grad_norm": 0.9030018185879571,
"learning_rate": 1e-05,
"loss": 0.8484,
"step": 45
},
{
"epoch": 0.11123470522803114,
"grad_norm": 0.7848210903418559,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.8291,
"step": 50
},
{
"epoch": 0.12235817575083426,
"grad_norm": 0.9313858219522357,
"learning_rate": 1.2222222222222224e-05,
"loss": 0.8707,
"step": 55
},
{
"epoch": 0.13348164627363737,
"grad_norm": 0.8227239131251839,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.8055,
"step": 60
},
{
"epoch": 0.1446051167964405,
"grad_norm": 0.8199201964167689,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.8151,
"step": 65
},
{
"epoch": 0.1557285873192436,
"grad_norm": 0.7767041809184307,
"learning_rate": 1.555555555555556e-05,
"loss": 0.8384,
"step": 70
},
{
"epoch": 0.1668520578420467,
"grad_norm": 0.9088502789460952,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.8106,
"step": 75
},
{
"epoch": 0.17797552836484984,
"grad_norm": 0.8343739709762875,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.7932,
"step": 80
},
{
"epoch": 0.18909899888765294,
"grad_norm": 0.8939354128507568,
"learning_rate": 1.888888888888889e-05,
"loss": 0.8045,
"step": 85
},
{
"epoch": 0.20022246941045607,
"grad_norm": 0.885411772323368,
"learning_rate": 2e-05,
"loss": 0.8009,
"step": 90
},
{
"epoch": 0.21134593993325917,
"grad_norm": 0.8890660179859062,
"learning_rate": 1.9998110384864614e-05,
"loss": 0.8225,
"step": 95
},
{
"epoch": 0.22246941045606228,
"grad_norm": 0.8371399480864414,
"learning_rate": 1.9992442253587533e-05,
"loss": 0.7893,
"step": 100
},
{
"epoch": 0.2335928809788654,
"grad_norm": 0.8656120072046296,
"learning_rate": 1.998299774828608e-05,
"loss": 0.812,
"step": 105
},
{
"epoch": 0.2447163515016685,
"grad_norm": 0.8486947049775396,
"learning_rate": 1.9969780438256295e-05,
"loss": 0.7886,
"step": 110
},
{
"epoch": 0.25583982202447164,
"grad_norm": 0.8507931458580409,
"learning_rate": 1.995279531862399e-05,
"loss": 0.8078,
"step": 115
},
{
"epoch": 0.26696329254727474,
"grad_norm": 0.8150830104906274,
"learning_rate": 1.993204880845699e-05,
"loss": 0.7672,
"step": 120
},
{
"epoch": 0.27808676307007785,
"grad_norm": 0.8642627038468238,
"learning_rate": 1.9907548748339223e-05,
"loss": 0.7929,
"step": 125
},
{
"epoch": 0.289210233592881,
"grad_norm": 0.8541146407873894,
"learning_rate": 1.987930439740757e-05,
"loss": 0.7873,
"step": 130
},
{
"epoch": 0.3003337041156841,
"grad_norm": 0.8995167237276087,
"learning_rate": 1.9847326429852632e-05,
"loss": 0.7862,
"step": 135
},
{
"epoch": 0.3114571746384872,
"grad_norm": 0.8173771695441595,
"learning_rate": 1.981162693088471e-05,
"loss": 0.7983,
"step": 140
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.8404409912157184,
"learning_rate": 1.977221939216652e-05,
"loss": 0.8037,
"step": 145
},
{
"epoch": 0.3337041156840934,
"grad_norm": 0.8245961889334814,
"learning_rate": 1.9729118706714377e-05,
"loss": 0.8027,
"step": 150
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.8314933715558246,
"learning_rate": 1.96823411632698e-05,
"loss": 0.7843,
"step": 155
},
{
"epoch": 0.3559510567296997,
"grad_norm": 0.8726345334934287,
"learning_rate": 1.9631904440143614e-05,
"loss": 0.793,
"step": 160
},
{
"epoch": 0.3670745272525028,
"grad_norm": 0.8415196157334568,
"learning_rate": 1.9577827598534888e-05,
"loss": 0.7668,
"step": 165
},
{
"epoch": 0.3781979977753059,
"grad_norm": 0.8815634680126696,
"learning_rate": 1.95201310753273e-05,
"loss": 0.7851,
"step": 170
},
{
"epoch": 0.389321468298109,
"grad_norm": 0.8024209332569573,
"learning_rate": 1.945883667536556e-05,
"loss": 0.7772,
"step": 175
},
{
"epoch": 0.40044493882091214,
"grad_norm": 0.8490296704540178,
"learning_rate": 1.9393967563214833e-05,
"loss": 0.7761,
"step": 180
},
{
"epoch": 0.41156840934371525,
"grad_norm": 0.8454078852442415,
"learning_rate": 1.9325548254406354e-05,
"loss": 0.7624,
"step": 185
},
{
"epoch": 0.42269187986651835,
"grad_norm": 0.8511908385629164,
"learning_rate": 1.925360460617242e-05,
"loss": 0.7668,
"step": 190
},
{
"epoch": 0.43381535038932145,
"grad_norm": 0.7982929700309319,
"learning_rate": 1.9178163807674343e-05,
"loss": 0.7634,
"step": 195
},
{
"epoch": 0.44493882091212456,
"grad_norm": 0.7816495059977551,
"learning_rate": 1.9099254369727062e-05,
"loss": 0.7748,
"step": 200
},
{
"epoch": 0.4560622914349277,
"grad_norm": 0.7563436109378239,
"learning_rate": 1.901690611402423e-05,
"loss": 0.7722,
"step": 205
},
{
"epoch": 0.4671857619577308,
"grad_norm": 0.8466958907764663,
"learning_rate": 1.8931150161867917e-05,
"loss": 0.7765,
"step": 210
},
{
"epoch": 0.4783092324805339,
"grad_norm": 0.7620831089947199,
"learning_rate": 1.8842018922407153e-05,
"loss": 0.7704,
"step": 215
},
{
"epoch": 0.489432703003337,
"grad_norm": 0.84897869766796,
"learning_rate": 1.874954608038976e-05,
"loss": 0.7729,
"step": 220
},
{
"epoch": 0.5005561735261401,
"grad_norm": 0.8210216367491153,
"learning_rate": 1.8653766583432114e-05,
"loss": 0.7716,
"step": 225
},
{
"epoch": 0.5116796440489433,
"grad_norm": 0.8730864132572128,
"learning_rate": 1.855471662881164e-05,
"loss": 0.7882,
"step": 230
},
{
"epoch": 0.5228031145717463,
"grad_norm": 0.7987767766986732,
"learning_rate": 1.845243364978702e-05,
"loss": 0.7609,
"step": 235
},
{
"epoch": 0.5339265850945495,
"grad_norm": 0.7622467591417209,
"learning_rate": 1.8346956301451303e-05,
"loss": 0.7551,
"step": 240
},
{
"epoch": 0.5450500556173526,
"grad_norm": 0.831755089876671,
"learning_rate": 1.8238324446123265e-05,
"loss": 0.7634,
"step": 245
},
{
"epoch": 0.5561735261401557,
"grad_norm": 0.7942015525420777,
"learning_rate": 1.8126579138282502e-05,
"loss": 0.7541,
"step": 250
},
{
"epoch": 0.5672969966629589,
"grad_norm": 0.824338690380162,
"learning_rate": 1.801176260905402e-05,
"loss": 0.7641,
"step": 255
},
{
"epoch": 0.578420467185762,
"grad_norm": 0.8160737587426502,
"learning_rate": 1.7893918250248106e-05,
"loss": 0.747,
"step": 260
},
{
"epoch": 0.5895439377085651,
"grad_norm": 0.7619822007899363,
"learning_rate": 1.7773090597961554e-05,
"loss": 0.7353,
"step": 265
},
{
"epoch": 0.6006674082313682,
"grad_norm": 0.7898915967361627,
"learning_rate": 1.764932531574648e-05,
"loss": 0.7588,
"step": 270
},
{
"epoch": 0.6117908787541713,
"grad_norm": 0.9091978830168115,
"learning_rate": 1.7522669177352978e-05,
"loss": 0.781,
"step": 275
},
{
"epoch": 0.6229143492769744,
"grad_norm": 0.8198662250585645,
"learning_rate": 1.7393170049052274e-05,
"loss": 0.7545,
"step": 280
},
{
"epoch": 0.6340378197997776,
"grad_norm": 0.7880789917007047,
"learning_rate": 1.7260876871546935e-05,
"loss": 0.7726,
"step": 285
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.8385501161327127,
"learning_rate": 1.7125839641475074e-05,
"loss": 0.7619,
"step": 290
},
{
"epoch": 0.6562847608453838,
"grad_norm": 0.8924470377096518,
"learning_rate": 1.6988109392515432e-05,
"loss": 0.7346,
"step": 295
},
{
"epoch": 0.6674082313681868,
"grad_norm": 0.7890602183226353,
"learning_rate": 1.6847738176100632e-05,
"loss": 0.7643,
"step": 300
},
{
"epoch": 0.67853170189099,
"grad_norm": 0.8110214434516344,
"learning_rate": 1.6704779041745686e-05,
"loss": 0.7603,
"step": 305
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.7873486916181355,
"learning_rate": 1.65592860169994e-05,
"loss": 0.7595,
"step": 310
},
{
"epoch": 0.7007786429365962,
"grad_norm": 0.7527591581883117,
"learning_rate": 1.6411314087026108e-05,
"loss": 0.7508,
"step": 315
},
{
"epoch": 0.7119021134593994,
"grad_norm": 0.8283445625547928,
"learning_rate": 1.6260919173825507e-05,
"loss": 0.7387,
"step": 320
},
{
"epoch": 0.7230255839822024,
"grad_norm": 0.7262591008119376,
"learning_rate": 1.6108158115098443e-05,
"loss": 0.7264,
"step": 325
},
{
"epoch": 0.7341490545050056,
"grad_norm": 0.7739070575646189,
"learning_rate": 1.595308864276666e-05,
"loss": 0.7435,
"step": 330
},
{
"epoch": 0.7452725250278087,
"grad_norm": 0.8489625309544235,
"learning_rate": 1.5795769361154548e-05,
"loss": 0.7615,
"step": 335
},
{
"epoch": 0.7563959955506118,
"grad_norm": 0.769293674851008,
"learning_rate": 1.5636259724841224e-05,
"loss": 0.7536,
"step": 340
},
{
"epoch": 0.7675194660734149,
"grad_norm": 0.7920492833518509,
"learning_rate": 1.5474620016191296e-05,
"loss": 0.7431,
"step": 345
},
{
"epoch": 0.778642936596218,
"grad_norm": 0.7468241826638446,
"learning_rate": 1.531091132257275e-05,
"loss": 0.732,
"step": 350
},
{
"epoch": 0.7897664071190211,
"grad_norm": 0.743401655764991,
"learning_rate": 1.5145195513270644e-05,
"loss": 0.7291,
"step": 355
},
{
"epoch": 0.8008898776418243,
"grad_norm": 0.8018681967515083,
"learning_rate": 1.4977535216105258e-05,
"loss": 0.7257,
"step": 360
},
{
"epoch": 0.8120133481646273,
"grad_norm": 0.7600864193920938,
"learning_rate": 1.480799379376362e-05,
"loss": 0.741,
"step": 365
},
{
"epoch": 0.8231368186874305,
"grad_norm": 0.8389942553789884,
"learning_rate": 1.4636635319853274e-05,
"loss": 0.742,
"step": 370
},
{
"epoch": 0.8342602892102335,
"grad_norm": 0.7886124496265561,
"learning_rate": 1.4463524554687398e-05,
"loss": 0.7545,
"step": 375
},
{
"epoch": 0.8453837597330367,
"grad_norm": 0.7344461106509269,
"learning_rate": 1.4288726920810381e-05,
"loss": 0.7278,
"step": 380
},
{
"epoch": 0.8565072302558399,
"grad_norm": 0.8189552942167496,
"learning_rate": 1.4112308478273144e-05,
"loss": 0.7461,
"step": 385
},
{
"epoch": 0.8676307007786429,
"grad_norm": 0.7834178214856152,
"learning_rate": 1.3934335899667526e-05,
"loss": 0.7378,
"step": 390
},
{
"epoch": 0.8787541713014461,
"grad_norm": 0.7714482088214847,
"learning_rate": 1.3754876444929165e-05,
"loss": 0.7489,
"step": 395
},
{
"epoch": 0.8898776418242491,
"grad_norm": 0.7755466989206458,
"learning_rate": 1.357399793591844e-05,
"loss": 0.7469,
"step": 400
},
{
"epoch": 0.9010011123470523,
"grad_norm": 0.8377177614112041,
"learning_rate": 1.3391768730789e-05,
"loss": 0.739,
"step": 405
},
{
"epoch": 0.9121245828698554,
"grad_norm": 0.7908000259612985,
"learning_rate": 1.3208257698153677e-05,
"loss": 0.7281,
"step": 410
},
{
"epoch": 0.9232480533926585,
"grad_norm": 0.7818338363656034,
"learning_rate": 1.3023534191057427e-05,
"loss": 0.753,
"step": 415
},
{
"epoch": 0.9343715239154616,
"grad_norm": 0.749284045444348,
"learning_rate": 1.283766802076722e-05,
"loss": 0.738,
"step": 420
},
{
"epoch": 0.9454949944382648,
"grad_norm": 0.7839632426822802,
"learning_rate": 1.2650729430388764e-05,
"loss": 0.7436,
"step": 425
},
{
"epoch": 0.9566184649610678,
"grad_norm": 0.7721084207333134,
"learning_rate": 1.2462789068320016e-05,
"loss": 0.748,
"step": 430
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.7290139666717954,
"learning_rate": 1.2273917961551513e-05,
"loss": 0.7239,
"step": 435
},
{
"epoch": 0.978865406006674,
"grad_norm": 0.8056619429328024,
"learning_rate": 1.2084187488823657e-05,
"loss": 0.738,
"step": 440
},
{
"epoch": 0.9899888765294772,
"grad_norm": 0.7770001841474352,
"learning_rate": 1.1893669353651032e-05,
"loss": 0.7385,
"step": 445
},
{
"epoch": 0.9988876529477196,
"eval_loss": 0.7580433487892151,
"eval_runtime": 5.7595,
"eval_samples_per_second": 70.666,
"eval_steps_per_second": 2.257,
"step": 449
},
{
"epoch": 1.0011123470522802,
"grad_norm": 0.8135422551691482,
"learning_rate": 1.1702435557223988e-05,
"loss": 0.7266,
"step": 450
},
{
"epoch": 1.0122358175750834,
"grad_norm": 0.8964435701944624,
"learning_rate": 1.1510558371197754e-05,
"loss": 0.6412,
"step": 455
},
{
"epoch": 1.0233592880978866,
"grad_norm": 0.8600984235769464,
"learning_rate": 1.1318110310379303e-05,
"loss": 0.6433,
"step": 460
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.8458880171077358,
"learning_rate": 1.112516410532233e-05,
"loss": 0.6292,
"step": 465
},
{
"epoch": 1.0456062291434929,
"grad_norm": 0.8284785412200435,
"learning_rate": 1.0931792674840718e-05,
"loss": 0.6339,
"step": 470
},
{
"epoch": 1.0567296996662958,
"grad_norm": 0.7444359225044646,
"learning_rate": 1.073806909845082e-05,
"loss": 0.6355,
"step": 475
},
{
"epoch": 1.067853170189099,
"grad_norm": 0.7723149976392786,
"learning_rate": 1.0544066588753044e-05,
"loss": 0.6235,
"step": 480
},
{
"epoch": 1.0789766407119021,
"grad_norm": 0.7593187602310192,
"learning_rate": 1.0349858463763114e-05,
"loss": 0.6105,
"step": 485
},
{
"epoch": 1.0901001112347053,
"grad_norm": 0.7855139607799839,
"learning_rate": 1.0155518119203511e-05,
"loss": 0.6568,
"step": 490
},
{
"epoch": 1.1012235817575085,
"grad_norm": 0.7898700329509037,
"learning_rate": 9.961119000765532e-06,
"loss": 0.6225,
"step": 495
},
{
"epoch": 1.1123470522803114,
"grad_norm": 0.8669564798886822,
"learning_rate": 9.766734576352478e-06,
"loss": 0.6391,
"step": 500
},
{
"epoch": 1.1234705228031145,
"grad_norm": 0.7837289412561955,
"learning_rate": 9.572438308314447e-06,
"loss": 0.6171,
"step": 505
},
{
"epoch": 1.1345939933259177,
"grad_norm": 0.8070851055141667,
"learning_rate": 9.378303625685196e-06,
"loss": 0.6282,
"step": 510
},
{
"epoch": 1.1457174638487209,
"grad_norm": 0.7979212620110364,
"learning_rate": 9.184403896431649e-06,
"loss": 0.6233,
"step": 515
},
{
"epoch": 1.156840934371524,
"grad_norm": 0.7704548256097349,
"learning_rate": 8.990812399726435e-06,
"loss": 0.5992,
"step": 520
},
{
"epoch": 1.167964404894327,
"grad_norm": 0.8961749452380681,
"learning_rate": 8.797602298254005e-06,
"loss": 0.6378,
"step": 525
},
{
"epoch": 1.1790878754171301,
"grad_norm": 0.8053259116501744,
"learning_rate": 8.604846610560771e-06,
"loss": 0.605,
"step": 530
},
{
"epoch": 1.1902113459399333,
"grad_norm": 0.7782548725591264,
"learning_rate": 8.412618183459707e-06,
"loss": 0.6081,
"step": 535
},
{
"epoch": 1.2013348164627364,
"grad_norm": 0.7780589674933976,
"learning_rate": 8.22098966449988e-06,
"loss": 0.6251,
"step": 540
},
{
"epoch": 1.2124582869855396,
"grad_norm": 0.8213283153654349,
"learning_rate": 8.030033474511248e-06,
"loss": 0.6092,
"step": 545
},
{
"epoch": 1.2235817575083425,
"grad_norm": 0.7889430474165346,
"learning_rate": 7.839821780235168e-06,
"loss": 0.645,
"step": 550
},
{
"epoch": 1.2347052280311457,
"grad_norm": 0.8377912406937705,
"learning_rate": 7.650426467050926e-06,
"loss": 0.6286,
"step": 555
},
{
"epoch": 1.2458286985539488,
"grad_norm": 0.8488324297083317,
"learning_rate": 7.4619191118085955e-06,
"loss": 0.6129,
"step": 560
},
{
"epoch": 1.256952169076752,
"grad_norm": 0.7608536209939344,
"learning_rate": 7.274370955778498e-06,
"loss": 0.6072,
"step": 565
},
{
"epoch": 1.2680756395995552,
"grad_norm": 0.7928583676765779,
"learning_rate": 7.0878528777274814e-06,
"loss": 0.6042,
"step": 570
},
{
"epoch": 1.279199110122358,
"grad_norm": 1.5882806537259504,
"learning_rate": 6.9024353671322086e-06,
"loss": 0.647,
"step": 575
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.8447762128943721,
"learning_rate": 6.718188497539554e-06,
"loss": 0.6214,
"step": 580
},
{
"epoch": 1.3014460511679644,
"grad_norm": 0.8209371297634136,
"learning_rate": 6.535181900084206e-06,
"loss": 0.6079,
"step": 585
},
{
"epoch": 1.3125695216907676,
"grad_norm": 0.8624820584895021,
"learning_rate": 6.35348473717345e-06,
"loss": 0.6221,
"step": 590
},
{
"epoch": 1.3236929922135707,
"grad_norm": 0.8056054069589547,
"learning_rate": 6.173165676349103e-06,
"loss": 0.6254,
"step": 595
},
{
"epoch": 1.3348164627363737,
"grad_norm": 0.8273703875367165,
"learning_rate": 5.994292864336473e-06,
"loss": 0.6119,
"step": 600
},
{
"epoch": 1.3459399332591768,
"grad_norm": 0.8723777846098392,
"learning_rate": 5.816933901290136e-06,
"loss": 0.6395,
"step": 605
},
{
"epoch": 1.35706340378198,
"grad_norm": 0.8674540470170442,
"learning_rate": 5.64115581524629e-06,
"loss": 0.6163,
"step": 610
},
{
"epoch": 1.3681868743047831,
"grad_norm": 0.8762720501618131,
"learning_rate": 5.4670250367913025e-06,
"loss": 0.6225,
"step": 615
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.8491305197357123,
"learning_rate": 5.294607373956071e-06,
"loss": 0.6093,
"step": 620
},
{
"epoch": 1.3904338153503892,
"grad_norm": 0.8166208130830984,
"learning_rate": 5.1239679873456636e-06,
"loss": 0.6361,
"step": 625
},
{
"epoch": 1.4015572858731924,
"grad_norm": 0.8009131846316857,
"learning_rate": 4.955171365513603e-06,
"loss": 0.617,
"step": 630
},
{
"epoch": 1.4126807563959956,
"grad_norm": 0.8366294436519559,
"learning_rate": 4.788281300590169e-06,
"loss": 0.6118,
"step": 635
},
{
"epoch": 1.4238042269187987,
"grad_norm": 0.8808417873595291,
"learning_rate": 4.623360864173893e-06,
"loss": 0.6177,
"step": 640
},
{
"epoch": 1.4349276974416019,
"grad_norm": 0.8629287010139255,
"learning_rate": 4.4604723834953315e-06,
"loss": 0.6251,
"step": 645
},
{
"epoch": 1.4460511679644048,
"grad_norm": 0.7875671775994082,
"learning_rate": 4.299677417862174e-06,
"loss": 0.6199,
"step": 650
},
{
"epoch": 1.457174638487208,
"grad_norm": 0.7828729245421459,
"learning_rate": 4.141036735394575e-06,
"loss": 0.6215,
"step": 655
},
{
"epoch": 1.4682981090100111,
"grad_norm": 0.8007957527163657,
"learning_rate": 3.984610290059467e-06,
"loss": 0.6253,
"step": 660
},
{
"epoch": 1.4794215795328143,
"grad_norm": 0.7910153849628225,
"learning_rate": 3.830457199012585e-06,
"loss": 0.6157,
"step": 665
},
{
"epoch": 1.4905450500556174,
"grad_norm": 0.8255633946537543,
"learning_rate": 3.6786357202567367e-06,
"loss": 0.6182,
"step": 670
},
{
"epoch": 1.5016685205784204,
"grad_norm": 0.8740651322002517,
"learning_rate": 3.529203230624747e-06,
"loss": 0.6334,
"step": 675
},
{
"epoch": 1.5127919911012235,
"grad_norm": 0.7540939777667681,
"learning_rate": 3.3822162040954355e-06,
"loss": 0.596,
"step": 680
},
{
"epoch": 1.5239154616240267,
"grad_norm": 0.7934734563966453,
"learning_rate": 3.2377301904508163e-06,
"loss": 0.5951,
"step": 685
},
{
"epoch": 1.5350389321468298,
"grad_norm": 0.8131581917254441,
"learning_rate": 3.0957997942825337e-06,
"loss": 0.612,
"step": 690
},
{
"epoch": 1.546162402669633,
"grad_norm": 0.8454586171154052,
"learning_rate": 2.956478654355539e-06,
"loss": 0.6293,
"step": 695
},
{
"epoch": 1.557285873192436,
"grad_norm": 0.8339559981613245,
"learning_rate": 2.8198194233367747e-06,
"loss": 0.6088,
"step": 700
},
{
"epoch": 1.568409343715239,
"grad_norm": 0.821374605567537,
"learning_rate": 2.6858737478965036e-06,
"loss": 0.6233,
"step": 705
},
{
"epoch": 1.5795328142380423,
"grad_norm": 0.7711457173871649,
"learning_rate": 2.5546922491898497e-06,
"loss": 0.6262,
"step": 710
},
{
"epoch": 1.5906562847608454,
"grad_norm": 0.841438327290974,
"learning_rate": 2.4263245037258996e-06,
"loss": 0.6359,
"step": 715
},
{
"epoch": 1.6017797552836486,
"grad_norm": 0.8546385634639357,
"learning_rate": 2.3008190246316033e-06,
"loss": 0.6312,
"step": 720
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.7477788666400325,
"learning_rate": 2.178223243317532e-06,
"loss": 0.6115,
"step": 725
},
{
"epoch": 1.624026696329255,
"grad_norm": 0.8448273050299596,
"learning_rate": 2.058583491552465e-06,
"loss": 0.641,
"step": 730
},
{
"epoch": 1.6351501668520578,
"grad_norm": 0.8261008969765296,
"learning_rate": 1.9419449839535522e-06,
"loss": 0.617,
"step": 735
},
{
"epoch": 1.646273637374861,
"grad_norm": 0.8357122860638048,
"learning_rate": 1.8283518008986566e-06,
"loss": 0.607,
"step": 740
},
{
"epoch": 1.6573971078976641,
"grad_norm": 0.8350425917666864,
"learning_rate": 1.7178468718673712e-06,
"loss": 0.607,
"step": 745
},
{
"epoch": 1.668520578420467,
"grad_norm": 0.8305517949246249,
"learning_rate": 1.6104719592169905e-06,
"loss": 0.6151,
"step": 750
},
{
"epoch": 1.6796440489432705,
"grad_norm": 0.8186870980107259,
"learning_rate": 1.506267642399525e-06,
"loss": 0.6385,
"step": 755
},
{
"epoch": 1.6907675194660734,
"grad_norm": 0.7636300249499965,
"learning_rate": 1.405273302625828e-06,
"loss": 0.6075,
"step": 760
},
{
"epoch": 1.7018909899888766,
"grad_norm": 0.8363995841599257,
"learning_rate": 1.3075271079825035e-06,
"loss": 0.638,
"step": 765
},
{
"epoch": 1.7130144605116797,
"grad_norm": 0.7622345846029582,
"learning_rate": 1.2130659990073146e-06,
"loss": 0.6125,
"step": 770
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.7854431717950116,
"learning_rate": 1.1219256747285046e-06,
"loss": 0.6203,
"step": 775
},
{
"epoch": 1.735261401557286,
"grad_norm": 0.80577828518747,
"learning_rate": 1.0341405791733183e-06,
"loss": 0.6318,
"step": 780
},
{
"epoch": 1.746384872080089,
"grad_norm": 0.8025241111868774,
"learning_rate": 9.497438883507981e-07,
"loss": 0.6104,
"step": 785
},
{
"epoch": 1.7575083426028921,
"grad_norm": 0.7484880734203889,
"learning_rate": 8.687674977138116e-07,
"loss": 0.6111,
"step": 790
},
{
"epoch": 1.7686318131256953,
"grad_norm": 0.7844040069263457,
"learning_rate": 7.912420101050366e-07,
"loss": 0.6058,
"step": 795
},
{
"epoch": 1.7797552836484982,
"grad_norm": 0.832757853518197,
"learning_rate": 7.171967241914224e-07,
"loss": 0.6168,
"step": 800
},
{
"epoch": 1.7908787541713016,
"grad_norm": 0.7982807830933683,
"learning_rate": 6.466596233915601e-07,
"loss": 0.6111,
"step": 805
},
{
"epoch": 1.8020022246941045,
"grad_norm": 0.8285138016892091,
"learning_rate": 5.796573653001091e-07,
"loss": 0.6206,
"step": 810
},
{
"epoch": 1.8131256952169077,
"grad_norm": 0.8098593367246673,
"learning_rate": 5.162152716132662e-07,
"loss": 0.6301,
"step": 815
},
{
"epoch": 1.8242491657397109,
"grad_norm": 0.8317063889098834,
"learning_rate": 4.563573185591219e-07,
"loss": 0.5913,
"step": 820
},
{
"epoch": 1.8353726362625138,
"grad_norm": 0.7802676617619877,
"learning_rate": 4.0010612783648927e-07,
"loss": 0.6009,
"step": 825
},
{
"epoch": 1.8464961067853172,
"grad_norm": 0.8359715597402506,
"learning_rate": 3.474829580656436e-07,
"loss": 0.6129,
"step": 830
},
{
"epoch": 1.85761957730812,
"grad_norm": 0.7764499156414768,
"learning_rate": 2.9850769675419776e-07,
"loss": 0.6233,
"step": 835
},
{
"epoch": 1.8687430478309233,
"grad_norm": 0.8028424415714934,
"learning_rate": 2.5319885278115907e-07,
"loss": 0.6079,
"step": 840
},
{
"epoch": 1.8798665183537264,
"grad_norm": 0.8831031088714258,
"learning_rate": 2.115735494019966e-07,
"loss": 0.6258,
"step": 845
},
{
"epoch": 1.8909899888765294,
"grad_norm": 0.7653050119042255,
"learning_rate": 1.7364751777736334e-07,
"loss": 0.6212,
"step": 850
},
{
"epoch": 1.9021134593993327,
"grad_norm": 0.8479149432867373,
"learning_rate": 1.394350910279385e-07,
"loss": 0.6006,
"step": 855
},
{
"epoch": 1.9132369299221357,
"grad_norm": 0.8537537714787563,
"learning_rate": 1.0894919881760168e-07,
"loss": 0.6291,
"step": 860
},
{
"epoch": 1.9243604004449388,
"grad_norm": 0.8315410821072657,
"learning_rate": 8.220136246701926e-08,
"loss": 0.6226,
"step": 865
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.8500259667306754,
"learning_rate": 5.920169059947412e-08,
"loss": 0.6108,
"step": 870
},
{
"epoch": 1.946607341490545,
"grad_norm": 0.8522681381543702,
"learning_rate": 3.99588753205804e-08,
"loss": 0.6239,
"step": 875
},
{
"epoch": 1.9577308120133483,
"grad_norm": 0.7583572716472294,
"learning_rate": 2.4480188933336812e-08,
"loss": 0.6128,
"step": 880
},
{
"epoch": 1.9688542825361512,
"grad_norm": 0.7629282401468074,
"learning_rate": 1.277148118975835e-08,
"loss": 0.6022,
"step": 885
},
{
"epoch": 1.9799777530589544,
"grad_norm": 0.817692770966463,
"learning_rate": 4.837177080119215e-09,
"loss": 0.6154,
"step": 890
},
{
"epoch": 1.9911012235817576,
"grad_norm": 0.774109128612781,
"learning_rate": 6.8027516064606e-10,
"loss": 0.616,
"step": 895
},
{
"epoch": 1.9977753058954395,
"eval_loss": 0.7532592415809631,
"eval_runtime": 5.6688,
"eval_samples_per_second": 71.797,
"eval_steps_per_second": 2.293,
"step": 898
},
{
"epoch": 1.9977753058954395,
"step": 898,
"total_flos": 81428314521600.0,
"train_loss": 0.7065894536026868,
"train_runtime": 5871.8562,
"train_samples_per_second": 19.593,
"train_steps_per_second": 0.153
}
],
"logging_steps": 5,
"max_steps": 898,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 81428314521600.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}