{
  "best_global_step": 2000,
  "best_metric": 0.8275940579902538,
  "best_model_checkpoint": "./results-2/checkpoint-2000",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010416666666666666,
      "grad_norm": 7.359697341918945,
      "learning_rate": 6.249999999999999e-07,
      "loss": 2.1862,
      "step": 5
    },
    {
      "epoch": 0.020833333333333332,
      "grad_norm": 8.139945030212402,
      "learning_rate": 1.40625e-06,
      "loss": 2.1488,
      "step": 10
    },
    {
      "epoch": 0.03125,
      "grad_norm": 5.7283034324646,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 2.1511,
      "step": 15
    },
    {
      "epoch": 0.041666666666666664,
      "grad_norm": 5.567634582519531,
      "learning_rate": 2.9687499999999997e-06,
      "loss": 2.0677,
      "step": 20
    },
    {
      "epoch": 0.052083333333333336,
      "grad_norm": 6.152674674987793,
      "learning_rate": 3.75e-06,
      "loss": 1.9718,
      "step": 25
    },
    {
      "epoch": 0.0625,
      "grad_norm": 5.273221015930176,
      "learning_rate": 4.53125e-06,
      "loss": 1.9334,
      "step": 30
    },
    {
      "epoch": 0.07291666666666667,
      "grad_norm": 6.301107406616211,
      "learning_rate": 5.3125e-06,
      "loss": 1.8644,
      "step": 35
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 6.154560089111328,
      "learning_rate": 6.09375e-06,
      "loss": 1.7975,
      "step": 40
    },
    {
      "epoch": 0.09375,
      "grad_norm": 7.833106994628906,
      "learning_rate": 6.875e-06,
      "loss": 1.694,
      "step": 45
    },
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 6.560196399688721,
      "learning_rate": 7.65625e-06,
      "loss": 1.647,
      "step": 50
    },
    {
      "epoch": 0.11458333333333333,
      "grad_norm": 5.196597099304199,
      "learning_rate": 8.4375e-06,
      "loss": 1.5169,
      "step": 55
    },
    {
      "epoch": 0.125,
      "grad_norm": 4.286194801330566,
      "learning_rate": 9.21875e-06,
      "loss": 1.4902,
      "step": 60
    },
    {
      "epoch": 0.13541666666666666,
      "grad_norm": 5.786597728729248,
      "learning_rate": 9.999999999999999e-06,
      "loss": 1.4229,
      "step": 65
    },
    {
      "epoch": 0.14583333333333334,
      "grad_norm": 4.149752616882324,
      "learning_rate": 1.078125e-05,
      "loss": 1.4483,
      "step": 70
    },
    {
      "epoch": 0.15625,
      "grad_norm": 3.51442551612854,
      "learning_rate": 1.1562500000000002e-05,
      "loss": 1.2548,
      "step": 75
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 3.1361498832702637,
      "learning_rate": 1.234375e-05,
      "loss": 1.264,
      "step": 80
    },
    {
      "epoch": 0.17708333333333334,
      "grad_norm": 3.731511116027832,
      "learning_rate": 1.3125e-05,
      "loss": 1.2934,
      "step": 85
    },
    {
      "epoch": 0.1875,
      "grad_norm": 6.455506801605225,
      "learning_rate": 1.3906250000000001e-05,
      "loss": 1.19,
      "step": 90
    },
    {
      "epoch": 0.19791666666666666,
      "grad_norm": 2.6737632751464844,
      "learning_rate": 1.46875e-05,
      "loss": 1.2097,
      "step": 95
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 2.970381021499634,
      "learning_rate": 1.546875e-05,
      "loss": 1.2297,
      "step": 100
    },
    {
      "epoch": 0.21875,
      "grad_norm": 3.7188503742218018,
      "learning_rate": 1.625e-05,
      "loss": 1.1337,
      "step": 105
    },
    {
      "epoch": 0.22916666666666666,
      "grad_norm": 3.1286487579345703,
      "learning_rate": 1.703125e-05,
      "loss": 1.0445,
      "step": 110
    },
    {
      "epoch": 0.23958333333333334,
      "grad_norm": 3.423229694366455,
      "learning_rate": 1.78125e-05,
      "loss": 1.0965,
      "step": 115
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.224884510040283,
      "learning_rate": 1.8593749999999998e-05,
      "loss": 1.0723,
      "step": 120
    },
    {
      "epoch": 0.2604166666666667,
      "grad_norm": 4.343578815460205,
      "learning_rate": 1.9375e-05,
      "loss": 1.1381,
      "step": 125
    },
    {
      "epoch": 0.2708333333333333,
      "grad_norm": 4.854529857635498,
      "learning_rate": 2.0156250000000002e-05,
      "loss": 1.0556,
      "step": 130
    },
    {
      "epoch": 0.28125,
      "grad_norm": 3.126429796218872,
      "learning_rate": 2.09375e-05,
      "loss": 1.0332,
      "step": 135
    },
    {
      "epoch": 0.2916666666666667,
      "grad_norm": 2.374694585800171,
      "learning_rate": 2.1718750000000003e-05,
      "loss": 0.9925,
      "step": 140
    },
    {
      "epoch": 0.3020833333333333,
      "grad_norm": 3.0362579822540283,
      "learning_rate": 2.25e-05,
      "loss": 0.9653,
      "step": 145
    },
    {
      "epoch": 0.3125,
      "grad_norm": 3.6780600547790527,
      "learning_rate": 2.328125e-05,
      "loss": 1.0233,
      "step": 150
    },
    {
      "epoch": 0.3229166666666667,
      "grad_norm": 2.872868537902832,
      "learning_rate": 2.4062500000000002e-05,
      "loss": 0.9576,
      "step": 155
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 4.167120456695557,
      "learning_rate": 2.484375e-05,
      "loss": 0.9723,
      "step": 160
    },
    {
      "epoch": 0.34375,
      "grad_norm": 4.553691387176514,
      "learning_rate": 2.5625e-05,
      "loss": 0.9644,
      "step": 165
    },
    {
      "epoch": 0.3541666666666667,
      "grad_norm": 2.3838977813720703,
      "learning_rate": 2.640625e-05,
      "loss": 0.9377,
      "step": 170
    },
    {
      "epoch": 0.3645833333333333,
      "grad_norm": 2.1098973751068115,
      "learning_rate": 2.71875e-05,
      "loss": 0.8711,
      "step": 175
    },
    {
      "epoch": 0.375,
      "grad_norm": 2.547675132751465,
      "learning_rate": 2.796875e-05,
      "loss": 0.8746,
      "step": 180
    },
    {
      "epoch": 0.3854166666666667,
      "grad_norm": 5.397830963134766,
      "learning_rate": 2.875e-05,
      "loss": 1.06,
      "step": 185
    },
    {
      "epoch": 0.3958333333333333,
      "grad_norm": 2.9065639972686768,
      "learning_rate": 2.953125e-05,
      "loss": 0.955,
      "step": 190
    },
    {
      "epoch": 0.40625,
      "grad_norm": 2.7439205646514893,
      "learning_rate": 2.999993926730281e-05,
      "loss": 0.9308,
      "step": 195
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 3.4526748657226562,
      "learning_rate": 2.9999256030107375e-05,
      "loss": 0.9358,
      "step": 200
    },
    {
      "epoch": 0.4270833333333333,
      "grad_norm": 3.5641372203826904,
      "learning_rate": 2.999781367453919e-05,
      "loss": 0.9257,
      "step": 205
    },
    {
      "epoch": 0.4375,
      "grad_norm": 3.0541481971740723,
      "learning_rate": 2.9995612273596444e-05,
      "loss": 0.9172,
      "step": 210
    },
    {
      "epoch": 0.4479166666666667,
      "grad_norm": 2.517723798751831,
      "learning_rate": 2.9992651938692928e-05,
      "loss": 0.9616,
      "step": 215
    },
    {
      "epoch": 0.4583333333333333,
      "grad_norm": 3.186232805252075,
      "learning_rate": 2.9988932819652385e-05,
      "loss": 0.8209,
      "step": 220
    },
    {
      "epoch": 0.46875,
      "grad_norm": 2.3760082721710205,
      "learning_rate": 2.9984455104700913e-05,
      "loss": 0.9281,
      "step": 225
    },
    {
      "epoch": 0.4791666666666667,
      "grad_norm": 2.766998291015625,
      "learning_rate": 2.9979219020457473e-05,
      "loss": 0.7327,
      "step": 230
    },
    {
      "epoch": 0.4895833333333333,
      "grad_norm": 3.173574924468994,
      "learning_rate": 2.9973224831922393e-05,
      "loss": 0.8934,
      "step": 235
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.046680212020874,
      "learning_rate": 2.996647284246396e-05,
      "loss": 0.9465,
      "step": 240
    },
    {
      "epoch": 0.5104166666666666,
      "grad_norm": 4.544061660766602,
      "learning_rate": 2.9958963393803064e-05,
      "loss": 0.8779,
      "step": 245
    },
    {
      "epoch": 0.5208333333333334,
      "grad_norm": 2.511229991912842,
      "learning_rate": 2.995069686599593e-05,
      "loss": 0.8632,
      "step": 250
    },
    {
      "epoch": 0.53125,
      "grad_norm": 2.8615193367004395,
      "learning_rate": 2.9941673677414848e-05,
      "loss": 0.9559,
      "step": 255
    },
    {
      "epoch": 0.5416666666666666,
      "grad_norm": 2.5760531425476074,
      "learning_rate": 2.9931894284727012e-05,
      "loss": 0.8413,
      "step": 260
    },
    {
      "epoch": 0.5520833333333334,
      "grad_norm": 2.015033006668091,
      "learning_rate": 2.992135918287142e-05,
      "loss": 0.835,
      "step": 265
    },
    {
      "epoch": 0.5625,
      "grad_norm": 3.511301040649414,
      "learning_rate": 2.991006890503381e-05,
      "loss": 0.8166,
      "step": 270
    },
    {
      "epoch": 0.5729166666666666,
      "grad_norm": 2.851557731628418,
      "learning_rate": 2.9898024022619672e-05,
      "loss": 0.8686,
      "step": 275
    },
    {
      "epoch": 0.5833333333333334,
      "grad_norm": 3.0985593795776367,
      "learning_rate": 2.9885225145225342e-05,
      "loss": 0.8321,
      "step": 280
    },
    {
      "epoch": 0.59375,
      "grad_norm": 2.7321419715881348,
      "learning_rate": 2.9871672920607158e-05,
      "loss": 0.8536,
      "step": 285
    },
    {
      "epoch": 0.6041666666666666,
      "grad_norm": 3.942899227142334,
      "learning_rate": 2.985736803464864e-05,
      "loss": 0.8389,
      "step": 290
    },
    {
      "epoch": 0.6145833333333334,
      "grad_norm": 1.9148991107940674,
      "learning_rate": 2.9842311211325814e-05,
      "loss": 0.7741,
      "step": 295
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.9662061929702759,
      "learning_rate": 2.9826503212670567e-05,
      "loss": 0.8136,
      "step": 300
    },
    {
      "epoch": 0.6354166666666666,
      "grad_norm": 2.9124855995178223,
      "learning_rate": 2.9809944838732046e-05,
      "loss": 0.7814,
      "step": 305
    },
    {
      "epoch": 0.6458333333333334,
      "grad_norm": 2.2057065963745117,
      "learning_rate": 2.9792636927536217e-05,
      "loss": 0.7673,
      "step": 310
    },
    {
      "epoch": 0.65625,
      "grad_norm": 2.531109094619751,
      "learning_rate": 2.9774580355043414e-05,
      "loss": 0.7038,
      "step": 315
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 2.634474992752075,
      "learning_rate": 2.9755776035104025e-05,
      "loss": 0.7753,
      "step": 320
    },
    {
      "epoch": 0.6770833333333334,
      "grad_norm": 3.2095534801483154,
      "learning_rate": 2.973622491941224e-05,
      "loss": 0.775,
      "step": 325
    },
    {
      "epoch": 0.6875,
      "grad_norm": 3.7532143592834473,
      "learning_rate": 2.971592799745787e-05,
      "loss": 0.7815,
      "step": 330
    },
    {
      "epoch": 0.6979166666666666,
      "grad_norm": 3.783160924911499,
      "learning_rate": 2.9694886296476295e-05,
      "loss": 0.6785,
      "step": 335
    },
    {
      "epoch": 0.7083333333333334,
      "grad_norm": 2.9880690574645996,
      "learning_rate": 2.9673100881396446e-05,
      "loss": 0.7276,
      "step": 340
    },
    {
      "epoch": 0.71875,
      "grad_norm": 2.739086627960205,
      "learning_rate": 2.965057285478694e-05,
      "loss": 0.7793,
      "step": 345
    },
    {
      "epoch": 0.7291666666666666,
      "grad_norm": 3.02671217918396,
      "learning_rate": 2.962730335680025e-05,
      "loss": 0.7782,
      "step": 350
    },
    {
      "epoch": 0.7395833333333334,
      "grad_norm": 2.325887441635132,
      "learning_rate": 2.9603293565115015e-05,
      "loss": 0.7952,
      "step": 355
    },
    {
      "epoch": 0.75,
      "grad_norm": 3.7881007194519043,
      "learning_rate": 2.9578544694876436e-05,
      "loss": 0.8265,
      "step": 360
    },
    {
      "epoch": 0.7604166666666666,
      "grad_norm": 3.507401943206787,
      "learning_rate": 2.955305799863478e-05,
      "loss": 0.8544,
      "step": 365
    },
    {
      "epoch": 0.7708333333333334,
      "grad_norm": 3.3894381523132324,
      "learning_rate": 2.952683476628198e-05,
      "loss": 0.79,
      "step": 370
    },
    {
      "epoch": 0.78125,
      "grad_norm": 2.6260082721710205,
      "learning_rate": 2.949987632498636e-05,
      "loss": 0.7581,
      "step": 375
    },
    {
      "epoch": 0.7916666666666666,
      "grad_norm": 3.328601598739624,
      "learning_rate": 2.947218403912546e-05,
      "loss": 0.7402,
      "step": 380
    },
    {
      "epoch": 0.8020833333333334,
      "grad_norm": 2.5484001636505127,
      "learning_rate": 2.944375931021699e-05,
      "loss": 0.7944,
      "step": 385
    },
    {
      "epoch": 0.8125,
      "grad_norm": 4.819921970367432,
      "learning_rate": 2.9414603576847905e-05,
      "loss": 0.8141,
      "step": 390
    },
    {
      "epoch": 0.8229166666666666,
      "grad_norm": 3.3109726905822754,
      "learning_rate": 2.9384718314601575e-05,
      "loss": 0.7434,
      "step": 395
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 3.314681053161621,
      "learning_rate": 2.9354105035983133e-05,
      "loss": 0.7197,
      "step": 400
    },
    {
      "epoch": 0.84375,
      "grad_norm": 2.9838476181030273,
      "learning_rate": 2.9322765290342905e-05,
      "loss": 0.7779,
      "step": 405
    },
    {
      "epoch": 0.8541666666666666,
      "grad_norm": 2.874854326248169,
      "learning_rate": 2.9290700663798007e-05,
      "loss": 0.7811,
      "step": 410
    },
    {
      "epoch": 0.8645833333333334,
      "grad_norm": 2.484381675720215,
      "learning_rate": 2.9257912779152065e-05,
      "loss": 0.7947,
      "step": 415
    },
    {
      "epoch": 0.875,
      "grad_norm": 2.5941007137298584,
      "learning_rate": 2.922440329581309e-05,
      "loss": 0.7329,
      "step": 420
    },
    {
      "epoch": 0.8854166666666666,
      "grad_norm": 2.691617488861084,
      "learning_rate": 2.9190173909709506e-05,
      "loss": 0.708,
      "step": 425
    },
    {
      "epoch": 0.8958333333333334,
      "grad_norm": 3.2808163166046143,
      "learning_rate": 2.915522635320428e-05,
      "loss": 0.7436,
      "step": 430
    },
    {
      "epoch": 0.90625,
      "grad_norm": 2.750916004180908,
      "learning_rate": 2.9119562395007294e-05,
      "loss": 0.7349,
      "step": 435
    },
    {
      "epoch": 0.9166666666666666,
      "grad_norm": 2.680910110473633,
      "learning_rate": 2.9083183840085796e-05,
      "loss": 0.6426,
      "step": 440
    },
    {
      "epoch": 0.9270833333333334,
      "grad_norm": 3.478559732437134,
      "learning_rate": 2.9046092529573063e-05,
      "loss": 0.7669,
      "step": 445
    },
    {
      "epoch": 0.9375,
      "grad_norm": 2.9709181785583496,
      "learning_rate": 2.9008290340675212e-05,
      "loss": 0.7688,
      "step": 450
    },
    {
      "epoch": 0.9479166666666666,
      "grad_norm": 2.9362924098968506,
      "learning_rate": 2.8969779186576223e-05,
      "loss": 0.7391,
      "step": 455
    },
    {
      "epoch": 0.9583333333333334,
      "grad_norm": 4.106695652008057,
      "learning_rate": 2.8930561016341062e-05,
      "loss": 0.7328,
      "step": 460
    },
    {
      "epoch": 0.96875,
      "grad_norm": 2.5602052211761475,
      "learning_rate": 2.8890637814817078e-05,
      "loss": 0.7083,
      "step": 465
    },
    {
      "epoch": 0.9791666666666666,
      "grad_norm": 2.393566846847534,
      "learning_rate": 2.885001160253355e-05,
      "loss": 0.7681,
      "step": 470
    },
    {
      "epoch": 0.9895833333333334,
      "grad_norm": 2.585813522338867,
      "learning_rate": 2.8808684435599382e-05,
      "loss": 0.7112,
      "step": 475
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.867818593978882,
      "learning_rate": 2.876665840559911e-05,
      "loss": 0.6529,
      "step": 480
    },
    {
      "epoch": 1.0104166666666667,
      "grad_norm": 2.501804828643799,
      "learning_rate": 2.8723935639486985e-05,
      "loss": 0.6449,
      "step": 485
    },
    {
      "epoch": 1.0208333333333333,
      "grad_norm": 3.8182015419006348,
      "learning_rate": 2.8680518299479364e-05,
      "loss": 0.6794,
      "step": 490
    },
    {
      "epoch": 1.03125,
      "grad_norm": 3.0165951251983643,
      "learning_rate": 2.8636408582945263e-05,
      "loss": 0.6764,
      "step": 495
    },
    {
      "epoch": 1.0416666666666667,
      "grad_norm": 3.205742835998535,
      "learning_rate": 2.859160872229516e-05,
      "loss": 0.6624,
      "step": 500
    },
    {
      "epoch": 1.0416666666666667,
      "eval_accuracy": 0.7857720291026677,
      "eval_f1": 0.7578766957348528,
      "eval_loss": 0.6753339767456055,
      "eval_precision": 0.7428925293469905,
      "eval_recall": 0.7857720291026677,
      "eval_runtime": 65.6121,
      "eval_samples_per_second": 113.119,
      "eval_steps_per_second": 1.768,
      "step": 500
    },
    {
      "epoch": 1.0520833333333333,
      "grad_norm": 2.590345621109009,
      "learning_rate": 2.8546120984867992e-05,
      "loss": 0.6599,
      "step": 505
    },
    {
      "epoch": 1.0625,
      "grad_norm": 3.122945785522461,
      "learning_rate": 2.8499947672816424e-05,
      "loss": 0.6101,
      "step": 510
    },
    {
      "epoch": 1.0729166666666667,
      "grad_norm": 2.464625597000122,
      "learning_rate": 2.8453091122990325e-05,
      "loss": 0.6374,
      "step": 515
    },
    {
      "epoch": 1.0833333333333333,
      "grad_norm": 4.042583465576172,
      "learning_rate": 2.8405553706818504e-05,
      "loss": 0.6983,
      "step": 520
    },
    {
      "epoch": 1.09375,
      "grad_norm": 4.181508541107178,
      "learning_rate": 2.8357337830188696e-05,
      "loss": 0.765,
      "step": 525
    },
    {
      "epoch": 1.1041666666666667,
      "grad_norm": 2.876793622970581,
      "learning_rate": 2.8308445933325772e-05,
      "loss": 0.7002,
      "step": 530
    },
    {
      "epoch": 1.1145833333333333,
      "grad_norm": 2.624448776245117,
      "learning_rate": 2.8258880490668284e-05,
      "loss": 0.6322,
      "step": 535
    },
    {
      "epoch": 1.125,
      "grad_norm": 2.773343086242676,
      "learning_rate": 2.820864401074319e-05,
      "loss": 0.6523,
      "step": 540
    },
    {
      "epoch": 1.1354166666666667,
      "grad_norm": 3.6462056636810303,
      "learning_rate": 2.8157739036038917e-05,
      "loss": 0.7166,
      "step": 545
    },
    {
      "epoch": 1.1458333333333333,
      "grad_norm": 3.532557725906372,
      "learning_rate": 2.8106168142876682e-05,
      "loss": 0.6734,
      "step": 550
    },
    {
      "epoch": 1.15625,
      "grad_norm": 3.1640117168426514,
      "learning_rate": 2.8053933941280104e-05,
      "loss": 0.6635,
      "step": 555
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 3.1110782623291016,
      "learning_rate": 2.8001039074843115e-05,
      "loss": 0.6505,
      "step": 560
    },
    {
      "epoch": 1.1770833333333333,
      "grad_norm": 3.2614970207214355,
      "learning_rate": 2.7947486220596143e-05,
      "loss": 0.6774,
      "step": 565
    },
    {
      "epoch": 1.1875,
      "grad_norm": 3.7098958492279053,
      "learning_rate": 2.789327808887066e-05,
      "loss": 0.6267,
      "step": 570
    },
    {
      "epoch": 1.1979166666666667,
      "grad_norm": 3.5529346466064453,
      "learning_rate": 2.783841742316198e-05,
      "loss": 0.6851,
      "step": 575
    },
    {
      "epoch": 1.2083333333333333,
      "grad_norm": 3.6186068058013916,
      "learning_rate": 2.778290699999044e-05,
      "loss": 0.6783,
      "step": 580
    },
    {
      "epoch": 1.21875,
      "grad_norm": 3.64753794670105,
      "learning_rate": 2.772674962876085e-05,
      "loss": 0.622,
      "step": 585
    },
    {
      "epoch": 1.2291666666666667,
      "grad_norm": 3.0052146911621094,
      "learning_rate": 2.766994815162033e-05,
      "loss": 0.6375,
      "step": 590
    },
    {
      "epoch": 1.2395833333333333,
      "grad_norm": 2.9114181995391846,
      "learning_rate": 2.7612505443314446e-05,
      "loss": 0.6737,
      "step": 595
    },
    {
      "epoch": 1.25,
      "grad_norm": 3.0336954593658447,
      "learning_rate": 2.755442441104175e-05,
      "loss": 0.6082,
      "step": 600
    },
    {
      "epoch": 1.2604166666666667,
      "grad_norm": 3.0577619075775146,
      "learning_rate": 2.749570799430661e-05,
      "loss": 0.6229,
      "step": 605
    },
    {
      "epoch": 1.2708333333333333,
      "grad_norm": 3.0447328090667725,
      "learning_rate": 2.743635916477046e-05,
      "loss": 0.5965,
      "step": 610
    },
    {
      "epoch": 1.28125,
      "grad_norm": 3.437899112701416,
      "learning_rate": 2.7376380926101413e-05,
      "loss": 0.6247,
      "step": 615
    },
    {
      "epoch": 1.2916666666666667,
      "grad_norm": 3.2192740440368652,
      "learning_rate": 2.7315776313822212e-05,
      "loss": 0.6004,
      "step": 620
    },
    {
      "epoch": 1.3020833333333333,
      "grad_norm": 3.4110755920410156,
      "learning_rate": 2.7254548395156627e-05,
      "loss": 0.6318,
      "step": 625
    },
    {
      "epoch": 1.3125,
      "grad_norm": 2.8511898517608643,
      "learning_rate": 2.7192700268874232e-05,
      "loss": 0.6149,
      "step": 630
    },
    {
      "epoch": 1.3229166666666667,
      "grad_norm": 3.37019944190979,
      "learning_rate": 2.7130235065133522e-05,
      "loss": 0.541,
      "step": 635
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 3.9978713989257812,
      "learning_rate": 2.706715594532356e-05,
      "loss": 0.6652,
      "step": 640
    },
    {
      "epoch": 1.34375,
      "grad_norm": 3.069230556488037,
      "learning_rate": 2.700346610190394e-05,
      "loss": 0.7429,
      "step": 645
    },
    {
      "epoch": 1.3541666666666667,
      "grad_norm": 4.005884647369385,
      "learning_rate": 2.693916875824321e-05,
      "loss": 0.6592,
      "step": 650
    },
    {
      "epoch": 1.3645833333333333,
      "grad_norm": 3.44464373588562,
      "learning_rate": 2.6874267168455772e-05,
      "loss": 0.5993,
      "step": 655
    },
    {
      "epoch": 1.375,
      "grad_norm": 3.023005962371826,
      "learning_rate": 2.6808764617237155e-05,
      "loss": 0.6002,
      "step": 660
    },
    {
      "epoch": 1.3854166666666667,
      "grad_norm": 3.1900227069854736,
      "learning_rate": 2.674266441969778e-05,
      "loss": 0.5768,
      "step": 665
    },
    {
      "epoch": 1.3958333333333333,
      "grad_norm": 2.7164413928985596,
      "learning_rate": 2.6675969921195204e-05,
      "loss": 0.5787,
      "step": 670
    },
    {
      "epoch": 1.40625,
      "grad_norm": 3.9091062545776367,
      "learning_rate": 2.6608684497164783e-05,
      "loss": 0.7136,
      "step": 675
    },
    {
      "epoch": 1.4166666666666667,
      "grad_norm": 3.9219977855682373,
      "learning_rate": 2.6540811552948856e-05,
      "loss": 0.5607,
      "step": 680
    },
    {
      "epoch": 1.4270833333333333,
      "grad_norm": 3.1547436714172363,
      "learning_rate": 2.647235452362439e-05,
      "loss": 0.6226,
      "step": 685
    },
    {
      "epoch": 1.4375,
      "grad_norm": 3.2700273990631104,
      "learning_rate": 2.6403316873829145e-05,
      "loss": 0.5913,
      "step": 690
    },
    {
      "epoch": 1.4479166666666667,
      "grad_norm": 3.985025405883789,
      "learning_rate": 2.6333702097586304e-05,
      "loss": 0.6722,
      "step": 695
    },
    {
      "epoch": 1.4583333333333333,
      "grad_norm": 2.893080472946167,
      "learning_rate": 2.6263513718127657e-05,
      "loss": 0.4926,
      "step": 700
    },
    {
      "epoch": 1.46875,
      "grad_norm": 3.407822608947754,
      "learning_rate": 2.6192755287715284e-05,
      "loss": 0.5557,
      "step": 705
    },
    {
      "epoch": 1.4791666666666667,
      "grad_norm": 3.169715642929077,
      "learning_rate": 2.612143038746177e-05,
      "loss": 0.608,
      "step": 710
    },
    {
      "epoch": 1.4895833333333333,
      "grad_norm": 3.8564205169677734,
      "learning_rate": 2.6049542627148968e-05,
      "loss": 0.6358,
      "step": 715
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.743952751159668,
      "learning_rate": 2.5977095645045302e-05,
      "loss": 0.5222,
      "step": 720
    },
    {
      "epoch": 1.5104166666666665,
      "grad_norm": 4.179549694061279,
      "learning_rate": 2.5904093107721638e-05,
      "loss": 0.6145,
      "step": 725
    },
    {
      "epoch": 1.5208333333333335,
      "grad_norm": 3.5873751640319824,
      "learning_rate": 2.5830538709865716e-05,
      "loss": 0.6512,
      "step": 730
    },
    {
      "epoch": 1.53125,
      "grad_norm": 3.451427459716797,
      "learning_rate": 2.575643617409516e-05,
      "loss": 0.6115,
      "step": 735
    },
    {
      "epoch": 1.5416666666666665,
      "grad_norm": 3.440810441970825,
      "learning_rate": 2.5681789250769066e-05,
      "loss": 0.6576,
      "step": 740
    },
    {
      "epoch": 1.5520833333333335,
      "grad_norm": 3.5209007263183594,
      "learning_rate": 2.5606601717798212e-05,
      "loss": 0.589,
      "step": 745
    },
    {
      "epoch": 1.5625,
      "grad_norm": 3.2105367183685303,
      "learning_rate": 2.5530877380453847e-05,
      "loss": 0.542,
      "step": 750
    },
    {
      "epoch": 1.5729166666666665,
      "grad_norm": 2.4990975856781006,
      "learning_rate": 2.5454620071175094e-05,
      "loss": 0.5704,
      "step": 755
    },
    {
      "epoch": 1.5833333333333335,
      "grad_norm": 3.4688243865966797,
      "learning_rate": 2.537783364937501e-05,
      "loss": 0.5372,
      "step": 760
    },
    {
      "epoch": 1.59375,
      "grad_norm": 3.4793436527252197,
      "learning_rate": 2.5300522001245253e-05,
      "loss": 0.6051,
      "step": 765
    },
    {
      "epoch": 1.6041666666666665,
      "grad_norm": 3.4781627655029297,
      "learning_rate": 2.5222689039559384e-05,
      "loss": 0.5604,
      "step": 770
    },
    {
      "epoch": 1.6145833333333335,
      "grad_norm": 3.2748799324035645,
      "learning_rate": 2.5144338703474855e-05,
      "loss": 0.6796,
      "step": 775
    },
    {
      "epoch": 1.625,
      "grad_norm": 2.737703323364258,
      "learning_rate": 2.506547495833366e-05,
      "loss": 0.5417,
      "step": 780
    },
    {
      "epoch": 1.6354166666666665,
      "grad_norm": 3.479532480239868,
      "learning_rate": 2.4986101795461608e-05,
      "loss": 0.5709,
      "step": 785
    },
    {
      "epoch": 1.6458333333333335,
      "grad_norm": 3.192643165588379,
      "learning_rate": 2.4906223231966364e-05,
      "loss": 0.5792,
      "step": 790
    },
    {
      "epoch": 1.65625,
      "grad_norm": 3.1639351844787598,
      "learning_rate": 2.482584331053411e-05,
      "loss": 0.6079,
      "step": 795
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 4.1512556076049805,
      "learning_rate": 2.474496609922495e-05,
      "loss": 0.6267,
      "step": 800
    },
    {
      "epoch": 1.6770833333333335,
      "grad_norm": 4.018241882324219,
      "learning_rate": 2.4663595691267046e-05,
      "loss": 0.5342,
      "step": 805
    },
    {
      "epoch": 1.6875,
      "grad_norm": 3.3028879165649414,
      "learning_rate": 2.4581736204849427e-05,
      "loss": 0.5534,
      "step": 810
    },
    {
      "epoch": 1.6979166666666665,
      "grad_norm": 3.131824254989624,
      "learning_rate": 2.4499391782913587e-05,
      "loss": 0.5833,
      "step": 815
    },
    {
      "epoch": 1.7083333333333335,
      "grad_norm": 3.2798943519592285,
      "learning_rate": 2.4416566592943785e-05,
      "loss": 0.5835,
      "step": 820
    },
    {
      "epoch": 1.71875,
      "grad_norm": 3.742685317993164,
      "learning_rate": 2.4333264826756165e-05,
      "loss": 0.5246,
      "step": 825
    },
    {
      "epoch": 1.7291666666666665,
      "grad_norm": 2.709505081176758,
      "learning_rate": 2.4249490700286578e-05,
      "loss": 0.5761,
      "step": 830
    },
    {
      "epoch": 1.7395833333333335,
      "grad_norm": 3.0152900218963623,
      "learning_rate": 2.416524845337721e-05,
      "loss": 0.5829,
      "step": 835
    },
    {
      "epoch": 1.75,
      "grad_norm": 3.867718458175659,
      "learning_rate": 2.408054234956202e-05,
      "loss": 0.5658,
      "step": 840
    },
    {
      "epoch": 1.7604166666666665,
      "grad_norm": 3.0808358192443848,
      "learning_rate": 2.3995376675850948e-05,
      "loss": 0.6101,
      "step": 845
    },
    {
      "epoch": 1.7708333333333335,
      "grad_norm": 3.5606300830841064,
      "learning_rate": 2.3909755742512944e-05,
      "loss": 0.6245,
      "step": 850
    },
    {
      "epoch": 1.78125,
      "grad_norm": 3.40679669380188,
      "learning_rate": 2.3823683882857837e-05,
      "loss": 0.5546,
      "step": 855
    },
    {
      "epoch": 1.7916666666666665,
      "grad_norm": 3.359506607055664,
      "learning_rate": 2.3737165453017033e-05,
      "loss": 0.6038,
      "step": 860
    },
    {
      "epoch": 1.8020833333333335,
      "grad_norm": 2.8603711128234863,
      "learning_rate": 2.365020483172301e-05,
      "loss": 0.5793,
      "step": 865
    },
    {
      "epoch": 1.8125,
      "grad_norm": 2.9927525520324707,
      "learning_rate": 2.3562806420087736e-05,
      "loss": 0.5902,
      "step": 870
    },
    {
      "epoch": 1.8229166666666665,
      "grad_norm": 3.9396262168884277,
      "learning_rate": 2.3474974641379948e-05,
      "loss": 0.589,
      "step": 875
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 3.388766288757324,
      "learning_rate": 2.3386713940801236e-05,
      "loss": 0.5153,
      "step": 880
    },
    {
      "epoch": 1.84375,
      "grad_norm": 3.3024256229400635,
      "learning_rate": 2.3298028785261107e-05,
      "loss": 0.5328,
      "step": 885
    },
    {
      "epoch": 1.8541666666666665,
      "grad_norm": 3.5591013431549072,
      "learning_rate": 2.320892366315092e-05,
      "loss": 0.6063,
      "step": 890
    },
    {
      "epoch": 1.8645833333333335,
      "grad_norm": 3.653662919998169,
      "learning_rate": 2.3119403084116683e-05,
      "loss": 0.528,
      "step": 895
    },
    {
      "epoch": 1.875,
      "grad_norm": 3.3605880737304688,
      "learning_rate": 2.302947157883087e-05,
      "loss": 0.5476,
      "step": 900
    },
    {
      "epoch": 1.8854166666666665,
      "grad_norm": 3.730949878692627,
      "learning_rate": 2.293913369876308e-05,
      "loss": 0.5364,
      "step": 905
    },
    {
      "epoch": 1.8958333333333335,
      "grad_norm": 4.435757160186768,
      "learning_rate": 2.2848394015949722e-05,
      "loss": 0.5431,
      "step": 910
    },
    {
      "epoch": 1.90625,
      "grad_norm": 2.7192821502685547,
      "learning_rate": 2.275725712276259e-05,
      "loss": 0.4695,
      "step": 915
    },
    {
      "epoch": 1.9166666666666665,
      "grad_norm": 4.427341938018799,
      "learning_rate": 2.266572763167645e-05,
      "loss": 0.5594,
      "step": 920
    },
    {
      "epoch": 1.9270833333333335,
      "grad_norm": 3.2589309215545654,
      "learning_rate": 2.2573810175035623e-05,
      "loss": 0.6364,
      "step": 925
    },
    {
      "epoch": 1.9375,
      "grad_norm": 3.652015209197998,
      "learning_rate": 2.2481509404819495e-05,
      "loss": 0.5876,
      "step": 930
    },
    {
      "epoch": 1.9479166666666665,
      "grad_norm": 3.712876081466675,
      "learning_rate": 2.238882999240714e-05,
      "loss": 0.5858,
      "step": 935
    },
    {
      "epoch": 1.9583333333333335,
      "grad_norm": 3.8530917167663574,
      "learning_rate": 2.2295776628340843e-05,
      "loss": 0.571,
      "step": 940
    },
    {
      "epoch": 1.96875,
      "grad_norm": 3.2704858779907227,
      "learning_rate": 2.2202354022088736e-05,
      "loss": 0.5649,
      "step": 945
    },
    {
      "epoch": 1.9791666666666665,
      "grad_norm": 3.2081778049468994,
      "learning_rate": 2.210856690180645e-05,
      "loss": 0.5929,
      "step": 950
    },
    {
      "epoch": 1.9895833333333335,
      "grad_norm": 3.1505792140960693,
      "learning_rate": 2.2014420014097815e-05,
      "loss": 0.5307,
      "step": 955
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.620272636413574,
      "learning_rate": 2.1919918123774633e-05,
      "loss": 0.4887,
      "step": 960
    },
    {
      "epoch": 2.0104166666666665,
      "grad_norm": 3.332247018814087,
      "learning_rate": 2.1825066013615546e-05,
      "loss": 0.4907,
      "step": 965
    },
    {
      "epoch": 2.0208333333333335,
      "grad_norm": 4.510130882263184,
      "learning_rate": 2.172986848412394e-05,
      "loss": 0.4879,
      "step": 970
    },
    {
      "epoch": 2.03125,
      "grad_norm": 3.2008066177368164,
      "learning_rate": 2.163433035328502e-05,
      "loss": 0.4268,
      "step": 975
    },
    {
      "epoch": 2.0416666666666665,
      "grad_norm": 3.7556169033050537,
      "learning_rate": 2.153845645632197e-05,
      "loss": 0.4314,
      "step": 980
    },
    {
      "epoch": 2.0520833333333335,
      "grad_norm": 3.621457099914551,
      "learning_rate": 2.144225164545123e-05,
      "loss": 0.4612,
      "step": 985
    },
    {
      "epoch": 2.0625,
      "grad_norm": 3.804307222366333,
      "learning_rate": 2.1345720789636913e-05,
      "loss": 0.5026,
      "step": 990
    },
    {
      "epoch": 2.0729166666666665,
      "grad_norm": 3.083373546600342,
      "learning_rate": 2.124886877434442e-05,
      "loss": 0.4759,
      "step": 995
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 4.1420512199401855,
      "learning_rate": 2.1151700501293142e-05,
      "loss": 0.4145,
      "step": 1000
    },
    {
      "epoch": 2.0833333333333335,
      "eval_accuracy": 0.8234977095122609,
      "eval_f1": 0.8175286419519977,
      "eval_loss": 0.5490740537643433,
      "eval_precision": 0.8161029518583486,
      "eval_recall": 0.8234977095122609,
      "eval_runtime": 65.5822,
      "eval_samples_per_second": 113.171,
      "eval_steps_per_second": 1.769,
      "step": 1000
    },
    {
      "epoch": 2.09375,
      "grad_norm": 3.869738817214966,
      "learning_rate": 2.1054220888208405e-05,
      "loss": 0.4822,
      "step": 1005
    },
    {
      "epoch": 2.1041666666666665,
      "grad_norm": 3.0370893478393555,
      "learning_rate": 2.0956434868572593e-05,
      "loss": 0.4409,
      "step": 1010
    },
    {
      "epoch": 2.1145833333333335,
      "grad_norm": 4.113424301147461,
      "learning_rate": 2.0858347391375438e-05,
      "loss": 0.4663,
      "step": 1015
    },
    {
      "epoch": 2.125,
      "grad_norm": 4.991008281707764,
      "learning_rate": 2.0759963420863553e-05,
      "loss": 0.5008,
      "step": 1020
    },
    {
      "epoch": 2.1354166666666665,
      "grad_norm": 4.31297492980957,
      "learning_rate": 2.0661287936289216e-05,
      "loss": 0.4854,
      "step": 1025
    },
    {
      "epoch": 2.1458333333333335,
      "grad_norm": 3.8414719104766846,
      "learning_rate": 2.0562325931658342e-05,
      "loss": 0.4392,
      "step": 1030
    },
    {
      "epoch": 2.15625,
      "grad_norm": 3.8043479919433594,
      "learning_rate": 2.0463082415477722e-05,
      "loss": 0.5119,
      "step": 1035
    },
    {
      "epoch": 2.1666666666666665,
      "grad_norm": 3.0781004428863525,
      "learning_rate": 2.036356241050158e-05,
      "loss": 0.4858,
      "step": 1040
    },
    {
      "epoch": 2.1770833333333335,
      "grad_norm": 3.7248101234436035,
      "learning_rate": 2.0263770953477354e-05,
      "loss": 0.4602,
      "step": 1045
    },
    {
      "epoch": 2.1875,
      "grad_norm": 3.1117935180664062,
      "learning_rate": 2.016371309489076e-05,
      "loss": 0.42,
      "step": 1050
    },
    {
      "epoch": 2.1979166666666665,
      "grad_norm": 3.9277963638305664,
      "learning_rate": 2.006339389871022e-05,
      "loss": 0.4665,
      "step": 1055
    },
    {
      "epoch": 2.2083333333333335,
      "grad_norm": 3.2659618854522705,
      "learning_rate": 1.996281844213054e-05,
      "loss": 0.4392,
      "step": 1060
    },
    {
      "epoch": 2.21875,
      "grad_norm": 3.3441226482391357,
      "learning_rate": 1.986199181531599e-05,
      "loss": 0.4594,
      "step": 1065
    },
    {
      "epoch": 2.2291666666666665,
      "grad_norm": 2.595576286315918,
      "learning_rate": 1.9760919121142643e-05,
      "loss": 0.3846,
      "step": 1070
    },
    {
      "epoch": 2.2395833333333335,
      "grad_norm": 3.091625690460205,
      "learning_rate": 1.9659605474940163e-05,
      "loss": 0.4692,
      "step": 1075
    },
    {
      "epoch": 2.25,
      "grad_norm": 4.52020788192749,
      "learning_rate": 1.955805600423287e-05,
      "loss": 0.4408,
      "step": 1080
    },
    {
      "epoch": 2.2604166666666665,
      "grad_norm": 4.338654041290283,
      "learning_rate": 1.945627584848027e-05,
      "loss": 0.4053,
      "step": 1085
    },
    {
      "epoch": 2.2708333333333335,
      "grad_norm": 3.8205623626708984,
      "learning_rate": 1.9354270158816936e-05,
      "loss": 0.463,
      "step": 1090
    },
    {
      "epoch": 2.28125,
      "grad_norm": 2.9788827896118164,
      "learning_rate": 1.9252044097791792e-05,
      "loss": 0.4457,
      "step": 1095
    },
    {
      "epoch": 2.2916666666666665,
      "grad_norm": 3.9684176445007324,
      "learning_rate": 1.914960283910685e-05,
      "loss": 0.4507,
      "step": 1100
    },
    {
      "epoch": 2.3020833333333335,
      "grad_norm": 4.322586536407471,
      "learning_rate": 1.9046951567355362e-05,
      "loss": 0.4661,
      "step": 1105
    },
    {
      "epoch": 2.3125,
      "grad_norm": 3.332106828689575,
      "learning_rate": 1.894409547775943e-05,
      "loss": 0.3704,
      "step": 1110
    },
    {
      "epoch": 2.3229166666666665,
      "grad_norm": 4.265518665313721,
      "learning_rate": 1.884103977590706e-05,
      "loss": 0.4075,
      "step": 1115
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 4.3574676513671875,
      "learning_rate": 1.8737789677488714e-05,
      "loss": 0.5206,
      "step": 1120
    },
    {
      "epoch": 2.34375,
      "grad_norm": 3.931515693664551,
      "learning_rate": 1.8634350408033364e-05,
      "loss": 0.4985,
      "step": 1125
    },
    {
      "epoch": 2.3541666666666665,
      "grad_norm": 4.26662540435791,
      "learning_rate": 1.853072720264397e-05,
      "loss": 0.435,
      "step": 1130
    },
    {
      "epoch": 2.3645833333333335,
      "grad_norm": 3.806845188140869,
      "learning_rate": 1.8426925305732585e-05,
      "loss": 0.4122,
      "step": 1135
    },
    {
      "epoch": 2.375,
      "grad_norm": 3.4609289169311523,
      "learning_rate": 1.832294997075492e-05,
      "loss": 0.421,
      "step": 1140
    },
    {
      "epoch": 2.3854166666666665,
      "grad_norm": 3.3323652744293213,
      "learning_rate": 1.821880645994443e-05,
      "loss": 0.4633,
      "step": 1145
    },
    {
      "epoch": 2.3958333333333335,
      "grad_norm": 3.005038022994995,
      "learning_rate": 1.8114500044046036e-05,
      "loss": 0.4468,
      "step": 1150
    },
    {
      "epoch": 2.40625,
      "grad_norm": 3.6419589519500732,
      "learning_rate": 1.8010036002049353e-05,
      "loss": 0.4819,
      "step": 1155
    },
    {
      "epoch": 2.4166666666666665,
      "grad_norm": 3.866816759109497,
      "learning_rate": 1.7905419620921498e-05,
      "loss": 0.448,
      "step": 1160
    },
    {
      "epoch": 2.4270833333333335,
      "grad_norm": 3.973491907119751,
      "learning_rate": 1.7800656195339543e-05,
      "loss": 0.4014,
      "step": 1165
    },
    {
      "epoch": 2.4375,
      "grad_norm": 4.084470748901367,
      "learning_rate": 1.769575102742255e-05,
      "loss": 0.4955,
      "step": 1170
    },
    {
      "epoch": 2.4479166666666665,
      "grad_norm": 3.2326691150665283,
      "learning_rate": 1.7590709426463195e-05,
      "loss": 0.4242,
      "step": 1175
    },
    {
      "epoch": 2.4583333333333335,
      "grad_norm": 3.3827245235443115,
      "learning_rate": 1.7485536708659103e-05,
      "loss": 0.5078,
      "step": 1180
    },
    {
      "epoch": 2.46875,
      "grad_norm": 3.542229413986206,
      "learning_rate": 1.738023819684377e-05,
      "loss": 0.3848,
      "step": 1185
    },
    {
      "epoch": 2.4791666666666665,
      "grad_norm": 3.2745306491851807,
      "learning_rate": 1.7274819220217177e-05,
      "loss": 0.4261,
      "step": 1190
    },
    {
      "epoch": 2.4895833333333335,
      "grad_norm": 3.926612615585327,
      "learning_rate": 1.7169285114076067e-05,
      "loss": 0.4598,
      "step": 1195
    },
    {
      "epoch": 2.5,
      "grad_norm": 3.3912479877471924,
      "learning_rate": 1.7063641219543956e-05,
      "loss": 0.4737,
      "step": 1200
    },
    {
      "epoch": 2.5104166666666665,
      "grad_norm": 3.76505446434021,
      "learning_rate": 1.6957892883300778e-05,
      "loss": 0.4406,
      "step": 1205
    },
    {
      "epoch": 2.5208333333333335,
      "grad_norm": 3.7545506954193115,
      "learning_rate": 1.68520454573123e-05,
      "loss": 0.4677,
      "step": 1210
    },
    {
      "epoch": 2.53125,
      "grad_norm": 4.593079090118408,
      "learning_rate": 1.6746104298559286e-05,
      "loss": 0.4573,
      "step": 1215
    },
    {
      "epoch": 2.5416666666666665,
      "grad_norm": 3.7693185806274414,
      "learning_rate": 1.664007476876633e-05,
      "loss": 0.458,
      "step": 1220
    },
    {
      "epoch": 2.5520833333333335,
      "grad_norm": 3.0765931606292725,
      "learning_rate": 1.6533962234130512e-05,
      "loss": 0.3747,
      "step": 1225
    },
    {
      "epoch": 2.5625,
      "grad_norm": 3.5096516609191895,
      "learning_rate": 1.6427772065049856e-05,
      "loss": 0.429,
      "step": 1230
    },
    {
      "epoch": 2.5729166666666665,
      "grad_norm": 2.7782936096191406,
      "learning_rate": 1.6321509635851463e-05,
      "loss": 0.4168,
      "step": 1235
    },
    {
      "epoch": 2.5833333333333335,
      "grad_norm": 3.2030630111694336,
      "learning_rate": 1.6215180324519554e-05,
      "loss": 0.459,
      "step": 1240
    },
    {
      "epoch": 2.59375,
      "grad_norm": 4.029175281524658,
      "learning_rate": 1.6108789512423302e-05,
      "loss": 0.3947,
      "step": 1245
    },
    {
      "epoch": 2.6041666666666665,
      "grad_norm": 3.927734851837158,
      "learning_rate": 1.6002342584044432e-05,
      "loss": 0.4248,
      "step": 1250
    },
    {
      "epoch": 2.6145833333333335,
      "grad_norm": 4.7596869468688965,
      "learning_rate": 1.5895844926704746e-05,
      "loss": 0.3883,
      "step": 1255
    },
    {
      "epoch": 2.625,
      "grad_norm": 3.7099876403808594,
      "learning_rate": 1.5789301930293478e-05,
      "loss": 0.4272,
      "step": 1260
    },
    {
      "epoch": 2.6354166666666665,
      "grad_norm": 2.8906173706054688,
      "learning_rate": 1.5682718986994456e-05,
      "loss": 0.3948,
      "step": 1265
    },
    {
      "epoch": 2.6458333333333335,
      "grad_norm": 3.518240451812744,
      "learning_rate": 1.557610149101326e-05,
      "loss": 0.4808,
      "step": 1270
    },
    {
      "epoch": 2.65625,
      "grad_norm": 3.6957006454467773,
      "learning_rate": 1.546945483830419e-05,
      "loss": 0.4169,
      "step": 1275
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 4.2952046394348145,
      "learning_rate": 1.536278442629718e-05,
      "loss": 0.3973,
      "step": 1280
    },
    {
      "epoch": 2.6770833333333335,
      "grad_norm": 3.022895336151123,
      "learning_rate": 1.5256095653624623e-05,
      "loss": 0.4385,
      "step": 1285
    },
    {
      "epoch": 2.6875,
      "grad_norm": 3.962228775024414,
      "learning_rate": 1.5149393919848169e-05,
      "loss": 0.5022,
      "step": 1290
    },
    {
      "epoch": 2.6979166666666665,
      "grad_norm": 3.597813367843628,
      "learning_rate": 1.5042684625185415e-05,
      "loss": 0.4599,
      "step": 1295
    },
    {
      "epoch": 2.7083333333333335,
      "grad_norm": 3.568366289138794,
      "learning_rate": 1.4935973170236636e-05,
      "loss": 0.5205,
      "step": 1300
    },
    {
      "epoch": 2.71875,
      "grad_norm": 3.4070606231689453,
      "learning_rate": 1.4829264955711437e-05,
      "loss": 0.4369,
      "step": 1305
    },
    {
      "epoch": 2.7291666666666665,
      "grad_norm": 4.111550807952881,
      "learning_rate": 1.4722565382155407e-05,
      "loss": 0.451,
      "step": 1310
    },
    {
      "epoch": 2.7395833333333335,
      "grad_norm": 3.9338576793670654,
      "learning_rate": 1.4615879849676831e-05,
      "loss": 0.4372,
      "step": 1315
    },
    {
      "epoch": 2.75,
      "grad_norm": 3.9936351776123047,
      "learning_rate": 1.450921375767336e-05,
      "loss": 0.475,
      "step": 1320
    },
    {
      "epoch": 2.7604166666666665,
      "grad_norm": 3.7527687549591064,
      "learning_rate": 1.440257250455876e-05,
      "loss": 0.4186,
      "step": 1325
    },
    {
      "epoch": 2.7708333333333335,
      "grad_norm": 4.303956985473633,
      "learning_rate": 1.4295961487489677e-05,
      "loss": 0.4309,
      "step": 1330
    },
    {
      "epoch": 2.78125,
      "grad_norm": 4.955887794494629,
      "learning_rate": 1.4189386102092525e-05,
      "loss": 0.3792,
      "step": 1335
    },
    {
      "epoch": 2.7916666666666665,
      "grad_norm": 3.663379669189453,
      "learning_rate": 1.4082851742190363e-05,
      "loss": 0.4754,
      "step": 1340
    },
    {
      "epoch": 2.8020833333333335,
      "grad_norm": 3.618136167526245,
      "learning_rate": 1.3976363799529938e-05,
      "loss": 0.4379,
      "step": 1345
    },
    {
      "epoch": 2.8125,
      "grad_norm": 5.682356834411621,
      "learning_rate": 1.3869927663508803e-05,
      "loss": 0.5003,
      "step": 1350
    },
    {
      "epoch": 2.8229166666666665,
      "grad_norm": 3.9929006099700928,
      "learning_rate": 1.3763548720902561e-05,
      "loss": 0.4501,
      "step": 1355
    },
    {
      "epoch": 2.8333333333333335,
      "grad_norm": 3.655776262283325,
      "learning_rate": 1.3657232355592217e-05,
      "loss": 0.428,
      "step": 1360
    },
    {
      "epoch": 2.84375,
      "grad_norm": 3.520394802093506,
      "learning_rate": 1.3550983948291743e-05,
      "loss": 0.4272,
      "step": 1365
    },
    {
      "epoch": 2.8541666666666665,
      "grad_norm": 4.128672122955322,
      "learning_rate": 1.34448088762757e-05,
      "loss": 0.408,
      "step": 1370
    },
    {
      "epoch": 2.8645833333333335,
      "grad_norm": 3.8139350414276123,
      "learning_rate": 1.3338712513107136e-05,
      "loss": 0.4182,
      "step": 1375
    },
    {
      "epoch": 2.875,
      "grad_norm": 3.9030158519744873,
      "learning_rate": 1.3232700228365606e-05,
      "loss": 0.4075,
      "step": 1380
    },
    {
      "epoch": 2.8854166666666665,
      "grad_norm": 4.010909557342529,
      "learning_rate": 1.3126777387375431e-05,
      "loss": 0.4313,
      "step": 1385
    },
    {
      "epoch": 2.8958333333333335,
      "grad_norm": 4.179037570953369,
      "learning_rate": 1.3020949350934127e-05,
      "loss": 0.4231,
      "step": 1390
    },
    {
      "epoch": 2.90625,
      "grad_norm": 4.152976036071777,
      "learning_rate": 1.291522147504115e-05,
      "loss": 0.4008,
      "step": 1395
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 3.956671953201294,
      "learning_rate": 1.2809599110626756e-05,
      "loss": 0.4441,
      "step": 1400
    },
    {
      "epoch": 2.9270833333333335,
      "grad_norm": 3.796563148498535,
      "learning_rate": 1.2704087603281236e-05,
      "loss": 0.4647,
      "step": 1405
    },
    {
      "epoch": 2.9375,
      "grad_norm": 3.6390767097473145,
      "learning_rate": 1.2598692292984361e-05,
      "loss": 0.374,
      "step": 1410
    },
    {
      "epoch": 2.9479166666666665,
      "grad_norm": 4.512059211730957,
      "learning_rate": 1.2493418513835126e-05,
      "loss": 0.4546,
      "step": 1415
    },
    {
      "epoch": 2.9583333333333335,
      "grad_norm": 3.269153594970703,
      "learning_rate": 1.2388271593781767e-05,
      "loss": 0.3955,
      "step": 1420
    },
    {
      "epoch": 2.96875,
      "grad_norm": 3.7052197456359863,
      "learning_rate": 1.2283256854352162e-05,
      "loss": 0.4648,
      "step": 1425
    },
    {
      "epoch": 2.9791666666666665,
      "grad_norm": 3.807889461517334,
      "learning_rate": 1.2178379610384452e-05,
      "loss": 0.4162,
      "step": 1430
    },
    {
      "epoch": 2.9895833333333335,
      "grad_norm": 5.0151753425598145,
      "learning_rate": 1.2073645169758078e-05,
      "loss": 0.4305,
      "step": 1435
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.7280149459838867,
      "learning_rate": 1.1969058833125151e-05,
      "loss": 0.4562,
      "step": 1440
    },
    {
      "epoch": 3.0104166666666665,
      "grad_norm": 3.6162660121917725,
      "learning_rate": 1.1864625893642187e-05,
      "loss": 0.3746,
      "step": 1445
    },
    {
      "epoch": 3.0208333333333335,
      "grad_norm": 3.955254316329956,
      "learning_rate": 1.1760351636702194e-05,
      "loss": 0.3679,
      "step": 1450
    },
    {
      "epoch": 3.03125,
      "grad_norm": 3.7382540702819824,
      "learning_rate": 1.1656241339667196e-05,
      "loss": 0.3178,
      "step": 1455
    },
    {
      "epoch": 3.0416666666666665,
      "grad_norm": 3.5673348903656006,
      "learning_rate": 1.1552300271601167e-05,
      "loss": 0.3385,
      "step": 1460
    },
    {
      "epoch": 3.0520833333333335,
      "grad_norm": 4.800466060638428,
      "learning_rate": 1.14485336930033e-05,
      "loss": 0.4036,
      "step": 1465
    },
    {
      "epoch": 3.0625,
      "grad_norm": 3.5082385540008545,
      "learning_rate": 1.1344946855541823e-05,
      "loss": 0.3252,
      "step": 1470
    },
    {
      "epoch": 3.0729166666666665,
      "grad_norm": 3.968536376953125,
      "learning_rate": 1.1241545001788187e-05,
      "loss": 0.374,
      "step": 1475
    },
    {
      "epoch": 3.0833333333333335,
      "grad_norm": 3.048720359802246,
      "learning_rate": 1.1138333364951753e-05,
      "loss": 0.3279,
      "step": 1480
    },
    {
      "epoch": 3.09375,
      "grad_norm": 3.3985679149627686,
      "learning_rate": 1.1035317168614904e-05,
      "loss": 0.3289,
      "step": 1485
    },
    {
      "epoch": 3.1041666666666665,
      "grad_norm": 3.3650944232940674,
      "learning_rate": 1.093250162646874e-05,
      "loss": 0.3118,
      "step": 1490
    },
    {
      "epoch": 3.1145833333333335,
      "grad_norm": 3.9324982166290283,
      "learning_rate": 1.0829891942049136e-05,
      "loss": 0.3426,
      "step": 1495
    },
    {
      "epoch": 3.125,
      "grad_norm": 4.731880187988281,
      "learning_rate": 1.0727493308473439e-05,
      "loss": 0.358,
      "step": 1500
    },
    {
      "epoch": 3.125,
      "eval_accuracy": 0.8201293451899757,
      "eval_f1": 0.8197684685585088,
      "eval_loss": 0.5668273568153381,
      "eval_precision": 0.8208574046266796,
      "eval_recall": 0.8201293451899757,
      "eval_runtime": 65.5161,
      "eval_samples_per_second": 113.285,
      "eval_steps_per_second": 1.771,
      "step": 1500
    },
    {
      "epoch": 3.1354166666666665,
      "grad_norm": 3.8050031661987305,
      "learning_rate": 1.0625310908177625e-05,
      "loss": 0.3367,
      "step": 1505
    },
    {
      "epoch": 3.1458333333333335,
      "grad_norm": 3.8859758377075195,
      "learning_rate": 1.0523349912654028e-05,
      "loss": 0.33,
      "step": 1510
    },
    {
      "epoch": 3.15625,
      "grad_norm": 4.330165863037109,
      "learning_rate": 1.0421615482189573e-05,
      "loss": 0.4071,
      "step": 1515
    },
    {
      "epoch": 3.1666666666666665,
      "grad_norm": 3.670168399810791,
      "learning_rate": 1.0320112765604669e-05,
      "loss": 0.3112,
      "step": 1520
    },
    {
      "epoch": 3.1770833333333335,
      "grad_norm": 4.089378833770752,
      "learning_rate": 1.0218846899992563e-05,
      "loss": 0.3756,
      "step": 1525
    },
    {
      "epoch": 3.1875,
      "grad_norm": 3.681405782699585,
      "learning_rate": 1.0117823010459394e-05,
      "loss": 0.3755,
      "step": 1530
    },
    {
      "epoch": 3.1979166666666665,
      "grad_norm": 3.308830499649048,
      "learning_rate": 1.0017046209864795e-05,
      "loss": 0.3809,
      "step": 1535
    },
    {
      "epoch": 3.2083333333333335,
      "grad_norm": 5.036259174346924,
      "learning_rate": 9.916521598563123e-06,
      "loss": 0.3482,
      "step": 1540
    },
    {
      "epoch": 3.21875,
      "grad_norm": 4.433037757873535,
      "learning_rate": 9.816254264145328e-06,
      "loss": 0.3538,
      "step": 1545
    },
    {
      "epoch": 3.2291666666666665,
      "grad_norm": 3.244652032852173,
      "learning_rate": 9.716249281181499e-06,
      "loss": 0.3228,
      "step": 1550
    },
    {
      "epoch": 3.2395833333333335,
      "grad_norm": 3.6601569652557373,
      "learning_rate": 9.616511710963979e-06,
      "loss": 0.3853,
      "step": 1555
    },
    {
      "epoch": 3.25,
      "grad_norm": 3.561739206314087,
      "learning_rate": 9.517046601251269e-06,
      "loss": 0.3016,
      "step": 1560
    },
    {
      "epoch": 3.2604166666666665,
      "grad_norm": 3.814207077026367,
      "learning_rate": 9.417858986012523e-06,
      "loss": 0.3125,
      "step": 1565
    },
    {
      "epoch": 3.2708333333333335,
      "grad_norm": 3.6229615211486816,
      "learning_rate": 9.31895388517279e-06,
      "loss": 0.3665,
      "step": 1570
    },
    {
      "epoch": 3.28125,
      "grad_norm": 3.768630027770996,
      "learning_rate": 9.220336304358956e-06,
      "loss": 0.3301,
      "step": 1575
    },
    {
      "epoch": 3.2916666666666665,
      "grad_norm": 3.874178886413574,
      "learning_rate": 9.12201123464639e-06,
      "loss": 0.3278,
      "step": 1580
    },
    {
      "epoch": 3.3020833333333335,
      "grad_norm": 3.75437331199646,
      "learning_rate": 9.023983652306378e-06,
      "loss": 0.3238,
      "step": 1585
    },
    {
      "epoch": 3.3125,
      "grad_norm": 3.6128146648406982,
      "learning_rate": 8.926258518554237e-06,
      "loss": 0.3179,
      "step": 1590
    },
    {
      "epoch": 3.3229166666666665,
      "grad_norm": 3.788473606109619,
      "learning_rate": 8.828840779298237e-06,
      "loss": 0.3218,
      "step": 1595
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 4.516867637634277,
      "learning_rate": 8.731735364889302e-06,
      "loss": 0.3539,
      "step": 1600
    },
    {
      "epoch": 3.34375,
      "grad_norm": 3.14803409576416,
      "learning_rate": 8.634947189871452e-06,
      "loss": 0.2916,
      "step": 1605
    },
    {
      "epoch": 3.3541666666666665,
      "grad_norm": 3.4791629314422607,
      "learning_rate": 8.538481152733103e-06,
      "loss": 0.3375,
      "step": 1610
    },
    {
      "epoch": 3.3645833333333335,
      "grad_norm": 3.549140214920044,
      "learning_rate": 8.44234213565915e-06,
      "loss": 0.3848,
      "step": 1615
    },
    {
      "epoch": 3.375,
      "grad_norm": 4.595663547515869,
      "learning_rate": 8.346535004283872e-06,
      "loss": 0.3952,
      "step": 1620
    },
    {
      "epoch": 3.3854166666666665,
      "grad_norm": 4.323939323425293,
      "learning_rate": 8.251064607444658e-06,
      "loss": 0.4012,
      "step": 1625
    },
    {
      "epoch": 3.3958333333333335,
      "grad_norm": 4.13346004486084,
      "learning_rate": 8.155935776936651e-06,
      "loss": 0.3707,
      "step": 1630
    },
    {
      "epoch": 3.40625,
      "grad_norm": 4.471806049346924,
      "learning_rate": 8.06115332726817e-06,
      "loss": 0.3096,
      "step": 1635
    },
    {
      "epoch": 3.4166666666666665,
      "grad_norm": 3.0733044147491455,
      "learning_rate": 7.966722055417062e-06,
      "loss": 0.3255,
      "step": 1640
    },
    {
      "epoch": 3.4270833333333335,
      "grad_norm": 3.596834421157837,
      "learning_rate": 7.872646740587944e-06,
      "loss": 0.3439,
      "step": 1645
    },
    {
      "epoch": 3.4375,
      "grad_norm": 3.9217464923858643,
      "learning_rate": 7.778932143970282e-06,
      "loss": 0.364,
      "step": 1650
    },
    {
      "epoch": 3.4479166666666665,
      "grad_norm": 4.791758060455322,
      "learning_rate": 7.685583008497446e-06,
      "loss": 0.3203,
      "step": 1655
    },
    {
      "epoch": 3.4583333333333335,
      "grad_norm": 4.293978214263916,
      "learning_rate": 7.592604058606685e-06,
      "loss": 0.4126,
      "step": 1660
    },
    {
      "epoch": 3.46875,
      "grad_norm": 3.9913458824157715,
      "learning_rate": 7.500000000000004e-06,
      "loss": 0.3593,
      "step": 1665
    },
    {
      "epoch": 3.4791666666666665,
      "grad_norm": 5.22377347946167,
      "learning_rate": 7.407775519406005e-06,
      "loss": 0.3759,
      "step": 1670
    },
    {
      "epoch": 3.4895833333333335,
      "grad_norm": 5.089886665344238,
      "learning_rate": 7.315935284342693e-06,
      "loss": 0.2982,
      "step": 1675
    },
    {
      "epoch": 3.5,
      "grad_norm": 4.4779534339904785,
      "learning_rate": 7.22448394288127e-06,
      "loss": 0.3756,
      "step": 1680
    },
    {
      "epoch": 3.5104166666666665,
      "grad_norm": 4.305549144744873,
      "learning_rate": 7.133426123410848e-06,
      "loss": 0.3117,
      "step": 1685
    },
    {
      "epoch": 3.5208333333333335,
      "grad_norm": 4.1521124839782715,
      "learning_rate": 7.042766434404253e-06,
      "loss": 0.3421,
      "step": 1690
    },
    {
      "epoch": 3.53125,
      "grad_norm": 3.1790106296539307,
      "learning_rate": 6.952509464184763e-06,
      "loss": 0.3425,
      "step": 1695
    },
    {
      "epoch": 3.5416666666666665,
      "grad_norm": 4.193013668060303,
      "learning_rate": 6.862659780693894e-06,
      "loss": 0.343,
      "step": 1700
    },
    {
      "epoch": 3.5520833333333335,
      "grad_norm": 4.356819152832031,
      "learning_rate": 6.773221931260216e-06,
      "loss": 0.3438,
      "step": 1705
    },
    {
      "epoch": 3.5625,
      "grad_norm": 5.121899604797363,
      "learning_rate": 6.684200442369233e-06,
      "loss": 0.3554,
      "step": 1710
    },
    {
| "epoch": 3.5729166666666665, | |
| "grad_norm": 4.9324541091918945, | |
| "learning_rate": 6.595599819434235e-06, | |
| "loss": 0.3183, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 3.5833333333333335, | |
| "grad_norm": 4.787963390350342, | |
| "learning_rate": 6.50742454656835e-06, | |
| "loss": 0.3739, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.59375, | |
| "grad_norm": 4.315763473510742, | |
| "learning_rate": 6.419679086357554e-06, | |
| "loss": 0.376, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 3.6041666666666665, | |
| "grad_norm": 4.592148303985596, | |
| "learning_rate": 6.332367879634836e-06, | |
| "loss": 0.3626, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.6145833333333335, | |
| "grad_norm": 4.4928812980651855, | |
| "learning_rate": 6.245495345255436e-06, | |
| "loss": 0.3263, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 3.625, | |
| "grad_norm": 4.379279136657715, | |
| "learning_rate": 6.159065879873226e-06, | |
| "loss": 0.4041, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.6354166666666665, | |
| "grad_norm": 4.787333011627197, | |
| "learning_rate": 6.073083857718157e-06, | |
| "loss": 0.3452, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 3.6458333333333335, | |
| "grad_norm": 3.5181643962860107, | |
| "learning_rate": 5.987553630374911e-06, | |
| "loss": 0.3447, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.65625, | |
| "grad_norm": 5.4470601081848145, | |
| "learning_rate": 5.902479526562649e-06, | |
| "loss": 0.3294, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 4.063173770904541, | |
| "learning_rate": 5.817865851915943e-06, | |
| "loss": 0.326, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.6770833333333335, | |
| "grad_norm": 4.259072780609131, | |
| "learning_rate": 5.733716888766846e-06, | |
| "loss": 0.3219, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 3.6875, | |
| "grad_norm": 4.095818996429443, | |
| "learning_rate": 5.650036895928197e-06, | |
| "loss": 0.3371, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.6979166666666665, | |
| "grad_norm": 4.348902225494385, | |
| "learning_rate": 5.566830108478046e-06, | |
| "loss": 0.336, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 3.7083333333333335, | |
| "grad_norm": 4.979382514953613, | |
| "learning_rate": 5.484100737545319e-06, | |
| "loss": 0.3292, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.71875, | |
| "grad_norm": 4.148449420928955, | |
| "learning_rate": 5.401852970096719e-06, | |
| "loss": 0.2889, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 3.7291666666666665, | |
| "grad_norm": 4.086822032928467, | |
| "learning_rate": 5.32009096872479e-06, | |
| "loss": 0.2791, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.7395833333333335, | |
| "grad_norm": 3.611013650894165, | |
| "learning_rate": 5.238818871437267e-06, | |
| "loss": 0.3691, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 3.48419451713562, | |
| "learning_rate": 5.1580407914476364e-06, | |
| "loss": 0.3038, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.7604166666666665, | |
| "grad_norm": 4.841831207275391, | |
| "learning_rate": 5.077760816966986e-06, | |
| "loss": 0.2852, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 3.7708333333333335, | |
| "grad_norm": 5.054813385009766, | |
| "learning_rate": 4.99798301099706e-06, | |
| "loss": 0.3712, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.78125, | |
| "grad_norm": 4.303066253662109, | |
| "learning_rate": 4.918711411124666e-06, | |
| "loss": 0.3174, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.7916666666666665, | |
| "grad_norm": 4.433043956756592, | |
| "learning_rate": 4.83995002931732e-06, | |
| "loss": 0.3421, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.8020833333333335, | |
| "grad_norm": 4.922135353088379, | |
| "learning_rate": 4.761702851720191e-06, | |
| "loss": 0.3687, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 3.8125, | |
| "grad_norm": 3.8969547748565674, | |
| "learning_rate": 4.683973838454364e-06, | |
| "loss": 0.3654, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.8229166666666665, | |
| "grad_norm": 5.1149702072143555, | |
| "learning_rate": 4.606766923416437e-06, | |
| "loss": 0.3509, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 4.445886135101318, | |
| "learning_rate": 4.530086014079382e-06, | |
| "loss": 0.3183, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.84375, | |
| "grad_norm": 3.8765218257904053, | |
| "learning_rate": 4.453934991294824e-06, | |
| "loss": 0.3447, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 3.8541666666666665, | |
| "grad_norm": 4.272921562194824, | |
| "learning_rate": 4.378317709096615e-06, | |
| "loss": 0.3288, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.8645833333333335, | |
| "grad_norm": 4.007284641265869, | |
| "learning_rate": 4.3032379945057855e-06, | |
| "loss": 0.3231, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 3.875, | |
| "grad_norm": 4.158379554748535, | |
| "learning_rate": 4.228699647336842e-06, | |
| "loss": 0.3061, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.8854166666666665, | |
| "grad_norm": 3.829052448272705, | |
| "learning_rate": 4.15470644000549e-06, | |
| "loss": 0.2934, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 3.8958333333333335, | |
| "grad_norm": 3.956110954284668, | |
| "learning_rate": 4.081262117337665e-06, | |
| "loss": 0.391, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.90625, | |
| "grad_norm": 4.6542253494262695, | |
| "learning_rate": 4.0083703963800385e-06, | |
| "loss": 0.3053, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 3.9166666666666665, | |
| "grad_norm": 4.672625541687012, | |
| "learning_rate": 3.936034966211892e-06, | |
| "loss": 0.2828, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.9270833333333335, | |
| "grad_norm": 4.2120184898376465, | |
| "learning_rate": 3.8642594877584e-06, | |
| "loss": 0.3176, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 3.9375, | |
| "grad_norm": 4.534182548522949, | |
| "learning_rate": 3.7930475936053555e-06, | |
| "loss": 0.3518, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.9479166666666665, | |
| "grad_norm": 3.582456588745117, | |
| "learning_rate": 3.722402887815341e-06, | |
| "loss": 0.3034, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 3.9583333333333335, | |
| "grad_norm": 4.563004016876221, | |
| "learning_rate": 3.6523289457452785e-06, | |
| "loss": 0.3743, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.96875, | |
| "grad_norm": 3.4178104400634766, | |
| "learning_rate": 3.5828293138655306e-06, | |
| "loss": 0.3135, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 3.9791666666666665, | |
| "grad_norm": 5.349637508392334, | |
| "learning_rate": 3.513907509580383e-06, | |
| "loss": 0.3501, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.9895833333333335, | |
| "grad_norm": 4.109852313995361, | |
| "learning_rate": 3.445567021050035e-06, | |
| "loss": 0.2746, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 4.35547399520874, | |
| "learning_rate": 3.3778113070140664e-06, | |
| "loss": 0.3528, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 4.010416666666667, | |
| "grad_norm": 3.588036060333252, | |
| "learning_rate": 3.3106437966163776e-06, | |
| "loss": 0.2311, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 4.020833333333333, | |
| "grad_norm": 4.849432945251465, | |
| "learning_rate": 3.2440678892316524e-06, | |
| "loss": 0.321, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 4.03125, | |
| "grad_norm": 3.775545597076416, | |
| "learning_rate": 3.178086954293304e-06, | |
| "loss": 0.2871, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 4.041666666666667, | |
| "grad_norm": 4.8619279861450195, | |
| "learning_rate": 3.112704331122957e-06, | |
| "loss": 0.3232, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 4.052083333333333, | |
| "grad_norm": 4.006852149963379, | |
| "learning_rate": 3.0479233287614326e-06, | |
| "loss": 0.2653, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 4.0625, | |
| "grad_norm": 4.46446418762207, | |
| "learning_rate": 2.983747225801282e-06, | |
| "loss": 0.2819, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.072916666666667, | |
| "grad_norm": 3.509814739227295, | |
| "learning_rate": 2.920179270220853e-06, | |
| "loss": 0.2894, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 4.083333333333333, | |
| "grad_norm": 3.8410885334014893, | |
| "learning_rate": 2.8572226792199153e-06, | |
| "loss": 0.3222, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 4.09375, | |
| "grad_norm": 3.6083624362945557, | |
| "learning_rate": 2.7948806390568277e-06, | |
| "loss": 0.2919, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 4.104166666666667, | |
| "grad_norm": 4.186297416687012, | |
| "learning_rate": 2.7331563048872838e-06, | |
| "loss": 0.3145, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 4.114583333333333, | |
| "grad_norm": 3.532168388366699, | |
| "learning_rate": 2.672052800604631e-06, | |
| "loss": 0.2829, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 4.125, | |
| "grad_norm": 4.1477179527282715, | |
| "learning_rate": 2.6115732186817664e-06, | |
| "loss": 0.2592, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 4.135416666666667, | |
| "grad_norm": 3.916287422180176, | |
| "learning_rate": 2.5517206200146243e-06, | |
| "loss": 0.2424, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 4.145833333333333, | |
| "grad_norm": 4.036369323730469, | |
| "learning_rate": 2.4924980337672655e-06, | |
| "loss": 0.2516, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 4.15625, | |
| "grad_norm": 4.0196967124938965, | |
| "learning_rate": 2.43390845721857e-06, | |
| "loss": 0.3066, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 4.879697799682617, | |
| "learning_rate": 2.3759548556105397e-06, | |
| "loss": 0.3167, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "eval_accuracy": 0.8305039073026138, | |
| "eval_f1": 0.8275940579902538, | |
| "eval_loss": 0.5550197958946228, | |
| "eval_precision": 0.8275738513170563, | |
| "eval_recall": 0.8305039073026138, | |
| "eval_runtime": 65.5125, | |
| "eval_samples_per_second": 113.291, | |
| "eval_steps_per_second": 1.771, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.177083333333333, | |
| "grad_norm": 4.092766761779785, | |
| "learning_rate": 2.318640161998234e-06, | |
| "loss": 0.2783, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 4.1875, | |
| "grad_norm": 3.7694523334503174, | |
| "learning_rate": 2.261967277101318e-06, | |
| "loss": 0.2999, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 4.197916666666667, | |
| "grad_norm": 3.900221347808838, | |
| "learning_rate": 2.2059390691572623e-06, | |
| "loss": 0.2688, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 4.208333333333333, | |
| "grad_norm": 4.138686656951904, | |
| "learning_rate": 2.150558373776176e-06, | |
| "loss": 0.2632, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 4.21875, | |
| "grad_norm": 4.359192848205566, | |
| "learning_rate": 2.095827993797298e-06, | |
| "loss": 0.3564, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 4.229166666666667, | |
| "grad_norm": 4.815664291381836, | |
| "learning_rate": 2.0417506991471454e-06, | |
| "loss": 0.266, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 4.239583333333333, | |
| "grad_norm": 4.142850875854492, | |
| "learning_rate": 1.98832922669932e-06, | |
| "loss": 0.3219, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 3.9295759201049805, | |
| "learning_rate": 1.9355662801360045e-06, | |
| "loss": 0.2443, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 4.260416666666667, | |
| "grad_norm": 4.1614766120910645, | |
| "learning_rate": 1.8834645298111164e-06, | |
| "loss": 0.3282, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 4.270833333333333, | |
| "grad_norm": 4.225388526916504, | |
| "learning_rate": 1.8320266126151714e-06, | |
| "loss": 0.2325, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.28125, | |
| "grad_norm": 3.859156847000122, | |
| "learning_rate": 1.7812551318418169e-06, | |
| "loss": 0.2909, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 4.291666666666667, | |
| "grad_norm": 3.560105323791504, | |
| "learning_rate": 1.7311526570560936e-06, | |
| "loss": 0.2697, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 4.302083333333333, | |
| "grad_norm": 2.339308261871338, | |
| "learning_rate": 1.6817217239643718e-06, | |
| "loss": 0.2619, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 4.3125, | |
| "grad_norm": 5.133707046508789, | |
| "learning_rate": 1.6329648342860343e-06, | |
| "loss": 0.3477, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 4.322916666666667, | |
| "grad_norm": 3.572845220565796, | |
| "learning_rate": 1.5848844556268528e-06, | |
| "loss": 0.3461, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 3.7399535179138184, | |
| "learning_rate": 1.537483021354103e-06, | |
| "loss": 0.2693, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 4.34375, | |
| "grad_norm": 3.5088624954223633, | |
| "learning_rate": 1.490762930473416e-06, | |
| "loss": 0.2539, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 4.354166666666667, | |
| "grad_norm": 4.644739627838135, | |
| "learning_rate": 1.4447265475073562e-06, | |
| "loss": 0.3029, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 4.364583333333333, | |
| "grad_norm": 4.528818607330322, | |
| "learning_rate": 1.3993762023757588e-06, | |
| "loss": 0.3181, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 4.375, | |
| "grad_norm": 4.050258159637451, | |
| "learning_rate": 1.3547141902778098e-06, | |
| "loss": 0.3337, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.385416666666667, | |
| "grad_norm": 4.387625217437744, | |
| "learning_rate": 1.3107427715758795e-06, | |
| "loss": 0.2743, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 4.395833333333333, | |
| "grad_norm": 3.5373356342315674, | |
| "learning_rate": 1.2674641716811302e-06, | |
| "loss": 0.2687, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 4.40625, | |
| "grad_norm": 3.2303831577301025, | |
| "learning_rate": 1.2248805809408903e-06, | |
| "loss": 0.2527, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 4.416666666666667, | |
| "grad_norm": 3.294447422027588, | |
| "learning_rate": 1.1829941545277916e-06, | |
| "loss": 0.3154, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 4.427083333333333, | |
| "grad_norm": 4.352384567260742, | |
| "learning_rate": 1.1418070123306989e-06, | |
| "loss": 0.282, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 4.4375, | |
| "grad_norm": 3.544809579849243, | |
| "learning_rate": 1.1013212388474248e-06, | |
| "loss": 0.2886, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 4.447916666666667, | |
| "grad_norm": 4.165502071380615, | |
| "learning_rate": 1.0615388830792277e-06, | |
| "loss": 0.2984, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 4.458333333333333, | |
| "grad_norm": 4.670546531677246, | |
| "learning_rate": 1.0224619584271121e-06, | |
| "loss": 0.2539, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 4.46875, | |
| "grad_norm": 3.987929344177246, | |
| "learning_rate": 9.840924425899345e-07, | |
| "loss": 0.3058, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 4.479166666666667, | |
| "grad_norm": 3.9754199981689453, | |
| "learning_rate": 9.464322774642998e-07, | |
| "loss": 0.2753, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.489583333333333, | |
| "grad_norm": 4.420624256134033, | |
| "learning_rate": 9.094833690462973e-07, | |
| "loss": 0.3667, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 4.443474292755127, | |
| "learning_rate": 8.732475873350193e-07, | |
| "loss": 0.2823, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 4.510416666666667, | |
| "grad_norm": 4.589036464691162, | |
| "learning_rate": 8.377267662379384e-07, | |
| "loss": 0.2847, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 4.520833333333333, | |
| "grad_norm": 3.150341033935547, | |
| "learning_rate": 8.029227034780751e-07, | |
| "loss": 0.3007, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 4.53125, | |
| "grad_norm": 3.597015142440796, | |
| "learning_rate": 7.688371605030287e-07, | |
| "loss": 0.2625, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 4.541666666666667, | |
| "grad_norm": 3.2903940677642822, | |
| "learning_rate": 7.35471862395819e-07, | |
| "loss": 0.2678, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 4.552083333333333, | |
| "grad_norm": 4.422494411468506, | |
| "learning_rate": 7.028284977875876e-07, | |
| "loss": 0.2848, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 4.5625, | |
| "grad_norm": 4.285615921020508, | |
| "learning_rate": 6.709087187721297e-07, | |
| "loss": 0.25, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 4.572916666666667, | |
| "grad_norm": 4.550652980804443, | |
| "learning_rate": 6.397141408222807e-07, | |
| "loss": 0.2596, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 4.583333333333333, | |
| "grad_norm": 4.278378963470459, | |
| "learning_rate": 6.092463427081652e-07, | |
| "loss": 0.3434, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.59375, | |
| "grad_norm": 4.464384078979492, | |
| "learning_rate": 5.795068664172809e-07, | |
| "loss": 0.3174, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 4.604166666666667, | |
| "grad_norm": 3.651949405670166, | |
| "learning_rate": 5.504972170764694e-07, | |
| "loss": 0.2682, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 4.614583333333333, | |
| "grad_norm": 4.239529132843018, | |
| "learning_rate": 5.222188628757401e-07, | |
| "loss": 0.2686, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 4.625, | |
| "grad_norm": 3.5339441299438477, | |
| "learning_rate": 4.946732349939537e-07, | |
| "loss": 0.2734, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 4.635416666666667, | |
| "grad_norm": 4.265580177307129, | |
| "learning_rate": 4.6786172752640575e-07, | |
| "loss": 0.2836, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 4.645833333333333, | |
| "grad_norm": 4.334507942199707, | |
| "learning_rate": 4.417856974142559e-07, | |
| "loss": 0.2955, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 4.65625, | |
| "grad_norm": 4.246450901031494, | |
| "learning_rate": 4.164464643758653e-07, | |
| "loss": 0.2976, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 4.4311323165893555, | |
| "learning_rate": 3.918453108399955e-07, | |
| "loss": 0.2487, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 4.677083333333333, | |
| "grad_norm": 4.169331073760986, | |
| "learning_rate": 3.6798348188090857e-07, | |
| "loss": 0.2548, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 4.6875, | |
| "grad_norm": 4.3604865074157715, | |
| "learning_rate": 3.448621851553557e-07, | |
| "loss": 0.2867, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.697916666666667, | |
| "grad_norm": 3.658160924911499, | |
| "learning_rate": 3.22482590841448e-07, | |
| "loss": 0.2728, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 4.708333333333333, | |
| "grad_norm": 3.9489758014678955, | |
| "learning_rate": 3.0084583157944546e-07, | |
| "loss": 0.2844, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 4.71875, | |
| "grad_norm": 3.8050849437713623, | |
| "learning_rate": 2.799530024144259e-07, | |
| "loss": 0.2236, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 4.729166666666667, | |
| "grad_norm": 3.9371848106384277, | |
| "learning_rate": 2.598051607408647e-07, | |
| "loss": 0.2457, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 4.739583333333333, | |
| "grad_norm": 2.5769946575164795, | |
| "learning_rate": 2.40403326249124e-07, | |
| "loss": 0.2614, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 4.463439464569092, | |
| "learning_rate": 2.2174848087383736e-07, | |
| "loss": 0.3168, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 4.760416666666667, | |
| "grad_norm": 3.5698652267456055, | |
| "learning_rate": 2.03841568744228e-07, | |
| "loss": 0.2737, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 4.770833333333333, | |
| "grad_norm": 3.795032501220703, | |
| "learning_rate": 1.8668349613630674e-07, | |
| "loss": 0.29, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 4.78125, | |
| "grad_norm": 3.9918630123138428, | |
| "learning_rate": 1.70275131427024e-07, | |
| "loss": 0.3302, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 4.791666666666667, | |
| "grad_norm": 4.1846489906311035, | |
| "learning_rate": 1.5461730505030635e-07, | |
| "loss": 0.2761, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.802083333333333, | |
| "grad_norm": 4.343273162841797, | |
| "learning_rate": 1.3971080945503867e-07, | |
| "loss": 0.2767, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 4.8125, | |
| "grad_norm": 3.917971134185791, | |
| "learning_rate": 1.2555639906494944e-07, | |
| "loss": 0.2823, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 4.822916666666667, | |
| "grad_norm": 3.1917574405670166, | |
| "learning_rate": 1.1215479024043462e-07, | |
| "loss": 0.251, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 3.084055185317993, | |
| "learning_rate": 9.950666124229845e-08, | |
| "loss": 0.3036, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 4.84375, | |
| "grad_norm": 4.097941875457764, | |
| "learning_rate": 8.761265219743409e-08, | |
| "loss": 0.3377, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 4.854166666666667, | |
| "grad_norm": 3.777550458908081, | |
| "learning_rate": 7.647336506641289e-08, | |
| "loss": 0.2752, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 4.864583333333333, | |
| "grad_norm": 4.147736072540283, | |
| "learning_rate": 6.608936361303219e-08, | |
| "loss": 0.2782, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 4.875, | |
| "grad_norm": 5.2703118324279785, | |
| "learning_rate": 5.646117337577972e-08, | |
| "loss": 0.3357, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 4.885416666666667, | |
| "grad_norm": 4.2694478034973145, | |
| "learning_rate": 4.7589281641226557e-08, | |
| "loss": 0.2479, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 4.895833333333333, | |
| "grad_norm": 4.704000949859619, | |
| "learning_rate": 3.947413741938022e-08, | |
| "loss": 0.2929, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.90625, | |
| "grad_norm": 3.745163679122925, | |
| "learning_rate": 3.211615142094781e-08, | |
| "loss": 0.3228, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 4.916666666666667, | |
| "grad_norm": 3.8283140659332275, | |
| "learning_rate": 2.5515696036557678e-08, | |
| "loss": 0.2744, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 4.927083333333333, | |
| "grad_norm": 4.659512042999268, | |
| "learning_rate": 1.9673105317906113e-08, | |
| "loss": 0.3067, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 4.9375, | |
| "grad_norm": 3.370952606201172, | |
| "learning_rate": 1.4588674960859249e-08, | |
| "loss": 0.2843, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.947916666666667, | |
| "grad_norm": 4.8212995529174805, | |
| "learning_rate": 1.0262662290476677e-08, | |
| "loss": 0.2667, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 4.958333333333333, | |
| "grad_norm": 3.5038435459136963, | |
| "learning_rate": 6.695286248000198e-09, | |
| "loss": 0.2809, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.96875, | |
| "grad_norm": 4.139862060546875, | |
| "learning_rate": 3.8867273797627e-09, | |
| "loss": 0.3412, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 4.979166666666667, | |
| "grad_norm": 3.801501512527466, | |
| "learning_rate": 1.8371278280571168e-09, | |
| "loss": 0.3109, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.989583333333333, | |
| "grad_norm": 3.0491151809692383, | |
| "learning_rate": 5.465913239388609e-10, | |
| "loss": 0.2954, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 4.403063774108887, | |
| "learning_rate": 1.518318198168167e-11, | |
| "loss": 0.223, | |
| "step": 2400 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.04160352616448e+16, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
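
The block above is the complete `trainer_state.json` that the Hugging Face `Trainer` writes next to each checkpoint: a `log_history` array of per-step training records (loss, gradient norm, learning rate) interleaved with evaluation records, followed by run-level settings. As a minimal sketch of how to consume it (assumptions: the state file lives at the hypothetical path `./results-2/trainer_state.json`, and only the Python standard library is needed), the history can be split into training and evaluation entries and summarized:

```python
import json

# Hypothetical path; point this at the trainer_state.json saved with a checkpoint.
with open("./results-2/trainer_state.json") as f:
    state = json.load(f)

# Training records carry a "loss" key; evaluation records carry "eval_*" keys instead.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{state['num_train_epochs']} epochs, {state['max_steps']} steps, "
      f"train batch size {state['train_batch_size']}")

# Smooth the noisy per-log loss with a mean over the last 10 points
# (logging_steps is 5, so this spans the final 50 optimizer steps).
tail = [e["loss"] for e in train_log[-10:]]
print(f"mean training loss over last {len(tail)} log points: {sum(tail) / len(tail):.4f}")

# Evaluation snapshots recorded in the history.
for e in eval_log:
    print(f"step {e['step']:>4}: eval_loss={e['eval_loss']:.4f}  "
          f"eval_f1={e['eval_f1']:.4f}  eval_accuracy={e['eval_accuracy']:.4f}")
```

One pattern worth noticing in the tail of this log: training loss keeps drifting down through epoch 5 (roughly 0.22 to 0.35 per log point) while the step-2000 evaluation shows eval_loss near 0.555 against eval_f1 near 0.828, a gap that usually signals emerging overfitting and is worth checking before choosing a final checkpoint.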