{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9953917050691246,
  "eval_steps": 500,
  "global_step": 758,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0026333113890717576,
      "grad_norm": 31.375,
      "learning_rate": 3.947368421052631e-06,
      "loss": 2.687,
      "step": 1
    },
    {
      "epoch": 0.013166556945358789,
      "grad_norm": 11.375,
      "learning_rate": 1.9736842105263155e-05,
      "loss": 2.3265,
      "step": 5
    },
    {
      "epoch": 0.026333113890717578,
      "grad_norm": 3.5625,
      "learning_rate": 3.947368421052631e-05,
      "loss": 1.834,
      "step": 10
    },
    {
      "epoch": 0.03949967083607637,
      "grad_norm": 2.828125,
      "learning_rate": 5.921052631578947e-05,
      "loss": 1.6157,
      "step": 15
    },
    {
      "epoch": 0.052666227781435156,
      "grad_norm": 7.34375,
      "learning_rate": 7.894736842105262e-05,
      "loss": 1.5546,
      "step": 20
    },
    {
      "epoch": 0.06583278472679395,
      "grad_norm": 1.6796875,
      "learning_rate": 9.868421052631579e-05,
      "loss": 1.561,
      "step": 25
    },
    {
      "epoch": 0.07899934167215274,
      "grad_norm": 1.875,
      "learning_rate": 0.00011842105263157894,
      "loss": 1.4433,
      "step": 30
    },
    {
      "epoch": 0.09216589861751152,
      "grad_norm": 2.0,
      "learning_rate": 0.0001381578947368421,
      "loss": 1.5053,
      "step": 35
    },
    {
      "epoch": 0.10533245556287031,
      "grad_norm": 4.9375,
      "learning_rate": 0.00015789473684210524,
      "loss": 1.5204,
      "step": 40
    },
    {
      "epoch": 0.1184990125082291,
      "grad_norm": 2.03125,
      "learning_rate": 0.00017763157894736838,
      "loss": 1.5645,
      "step": 45
    },
    {
      "epoch": 0.1316655694535879,
      "grad_norm": 2.140625,
      "learning_rate": 0.00019736842105263157,
      "loss": 1.5742,
      "step": 50
    },
    {
      "epoch": 0.1448321263989467,
      "grad_norm": 1.9765625,
      "learning_rate": 0.00021710526315789472,
      "loss": 1.6198,
      "step": 55
    },
    {
      "epoch": 0.15799868334430547,
      "grad_norm": 2.125,
      "learning_rate": 0.00023684210526315788,
      "loss": 1.6436,
      "step": 60
    },
    {
      "epoch": 0.17116524028966426,
      "grad_norm": 2.125,
      "learning_rate": 0.00025657894736842105,
      "loss": 1.6867,
      "step": 65
    },
    {
      "epoch": 0.18433179723502305,
      "grad_norm": 2.359375,
      "learning_rate": 0.0002763157894736842,
      "loss": 1.7356,
      "step": 70
    },
    {
      "epoch": 0.19749835418038184,
      "grad_norm": 3.859375,
      "learning_rate": 0.00029605263157894733,
      "loss": 1.7819,
      "step": 75
    },
    {
      "epoch": 0.21066491112574062,
      "grad_norm": 5.40625,
      "learning_rate": 0.0002999745375637391,
      "loss": 1.9272,
      "step": 80
    },
    {
      "epoch": 0.2238314680710994,
      "grad_norm": 5.15625,
      "learning_rate": 0.00029987111123173417,
      "loss": 2.0363,
      "step": 85
    },
    {
      "epoch": 0.2369980250164582,
      "grad_norm": 3.109375,
      "learning_rate": 0.00029968818442293417,
      "loss": 1.8288,
      "step": 90
    },
    {
      "epoch": 0.250164581961817,
      "grad_norm": 2.84375,
      "learning_rate": 0.00029942585417250744,
      "loss": 1.8436,
      "step": 95
    },
    {
      "epoch": 0.2633311389071758,
      "grad_norm": 2.40625,
      "learning_rate": 0.00029908425963589115,
      "loss": 1.7724,
      "step": 100
    },
    {
      "epoch": 0.2764976958525346,
      "grad_norm": 1.875,
      "learning_rate": 0.00029866358201497474,
      "loss": 1.7534,
      "step": 105
    },
    {
      "epoch": 0.2896642527978934,
      "grad_norm": 1.9765625,
      "learning_rate": 0.0002981640444619799,
      "loss": 1.7532,
      "step": 110
    },
    {
      "epoch": 0.30283080974325216,
      "grad_norm": 2.796875,
      "learning_rate": 0.00029758591196108743,
      "loss": 1.7545,
      "step": 115
    },
    {
      "epoch": 0.31599736668861095,
      "grad_norm": 2.015625,
      "learning_rate": 0.00029692949118787415,
      "loss": 1.8269,
      "step": 120
    },
    {
      "epoch": 0.32916392363396973,
      "grad_norm": 2.125,
      "learning_rate": 0.0002961951303466338,
      "loss": 1.7823,
      "step": 125
    },
    {
      "epoch": 0.3423304805793285,
      "grad_norm": 2.71875,
      "learning_rate": 0.0002953832189856691,
      "loss": 1.7371,
      "step": 130
    },
    {
      "epoch": 0.3554970375246873,
      "grad_norm": 1.9921875,
      "learning_rate": 0.00029449418779065257,
      "loss": 1.7607,
      "step": 135
    },
    {
      "epoch": 0.3686635944700461,
      "grad_norm": 2.109375,
      "learning_rate": 0.00029352850835616504,
      "loss": 1.7956,
      "step": 140
    },
    {
      "epoch": 0.3818301514154049,
      "grad_norm": 2.0625,
      "learning_rate": 0.00029248669293553437,
      "loss": 1.7176,
      "step": 145
    },
    {
      "epoch": 0.39499670836076367,
      "grad_norm": 1.9453125,
      "learning_rate": 0.0002913692941691059,
      "loss": 1.843,
      "step": 150
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 1.828125,
      "learning_rate": 0.0002901769047910895,
      "loss": 1.7918,
      "step": 155
    },
    {
      "epoch": 0.42132982225148125,
      "grad_norm": 1.5625,
      "learning_rate": 0.0002889101573151384,
      "loss": 1.7714,
      "step": 160
    },
    {
      "epoch": 0.43449637919684003,
      "grad_norm": 2.671875,
      "learning_rate": 0.00028756972369882667,
      "loss": 1.8033,
      "step": 165
    },
    {
      "epoch": 0.4476629361421988,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002861563149872031,
      "loss": 1.8409,
      "step": 170
    },
    {
      "epoch": 0.4608294930875576,
      "grad_norm": 1.8203125,
      "learning_rate": 0.0002846706809356112,
      "loss": 1.8259,
      "step": 175
    },
    {
      "epoch": 0.4739960500329164,
      "grad_norm": 1.8125,
      "learning_rate": 0.0002831136096119747,
      "loss": 1.7612,
      "step": 180
    },
    {
      "epoch": 0.4871626069782752,
      "grad_norm": 1.796875,
      "learning_rate": 0.0002814859269787596,
      "loss": 1.7649,
      "step": 185
    },
    {
      "epoch": 0.500329163923634,
      "grad_norm": 1.8359375,
      "learning_rate": 0.0002797884964548353,
      "loss": 1.7443,
      "step": 190
    },
    {
      "epoch": 0.5134957208689928,
      "grad_norm": 1.5703125,
      "learning_rate": 0.0002780222184574662,
      "loss": 1.7219,
      "step": 195
    },
    {
      "epoch": 0.5266622778143516,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0002761880299246772,
      "loss": 1.7409,
      "step": 200
    },
    {
      "epoch": 0.5398288347597103,
      "grad_norm": 1.59375,
      "learning_rate": 0.00027428690381824637,
      "loss": 1.7043,
      "step": 205
    },
    {
      "epoch": 0.5529953917050692,
      "grad_norm": 1.6171875,
      "learning_rate": 0.00027231984860758907,
      "loss": 1.6709,
      "step": 210
    },
    {
      "epoch": 0.5661619486504279,
      "grad_norm": 2.0,
      "learning_rate": 0.000270287907734806,
      "loss": 1.7417,
      "step": 215
    },
    {
      "epoch": 0.5793285055957867,
      "grad_norm": 1.65625,
      "learning_rate": 0.0002681921590611799,
      "loss": 1.66,
      "step": 220
    },
    {
      "epoch": 0.5924950625411455,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0002660337142954145,
      "loss": 1.732,
      "step": 225
    },
    {
      "epoch": 0.6056616194865043,
      "grad_norm": 1.5234375,
      "learning_rate": 0.0002638137184039186,
      "loss": 1.6964,
      "step": 230
    },
    {
      "epoch": 0.618828176431863,
      "grad_norm": 1.625,
      "learning_rate": 0.00026153334900344853,
      "loss": 1.648,
      "step": 235
    },
    {
      "epoch": 0.6319947333772219,
      "grad_norm": 1.375,
      "learning_rate": 0.0002591938157364303,
      "loss": 1.6197,
      "step": 240
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 1.7109375,
      "learning_rate": 0.00025679635962929455,
      "loss": 1.701,
      "step": 245
    },
    {
      "epoch": 0.6583278472679395,
      "grad_norm": 1.640625,
      "learning_rate": 0.00025434225243416234,
      "loss": 1.7649,
      "step": 250
    },
    {
      "epoch": 0.6714944042132982,
      "grad_norm": 1.6328125,
      "learning_rate": 0.0002518327959542333,
      "loss": 1.712,
      "step": 255
    },
    {
      "epoch": 0.684660961158657,
      "grad_norm": 1.796875,
      "learning_rate": 0.0002492693213532321,
      "loss": 1.6628,
      "step": 260
    },
    {
      "epoch": 0.6978275181040158,
      "grad_norm": 2.015625,
      "learning_rate": 0.0002466531884492808,
      "loss": 1.6714,
      "step": 265
    },
    {
      "epoch": 0.7109940750493746,
      "grad_norm": 1.9921875,
      "learning_rate": 0.0002439857849935712,
      "loss": 1.6833,
      "step": 270
    },
    {
      "epoch": 0.7241606319947334,
      "grad_norm": 1.703125,
      "learning_rate": 0.00024126852593421967,
      "loss": 1.7174,
      "step": 275
    },
    {
      "epoch": 0.7373271889400922,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002385028526656952,
      "loss": 1.6437,
      "step": 280
    },
    {
      "epoch": 0.7504937458854509,
      "grad_norm": 1.7109375,
      "learning_rate": 0.00023569023226421883,
      "loss": 1.6515,
      "step": 285
    },
    {
      "epoch": 0.7636603028308098,
      "grad_norm": 1.6015625,
      "learning_rate": 0.0002328321567095398,
      "loss": 1.6352,
      "step": 290
    },
    {
      "epoch": 0.7768268597761685,
      "grad_norm": 1.625,
      "learning_rate": 0.00022993014209350167,
      "loss": 1.6205,
      "step": 295
    },
    {
      "epoch": 0.7899934167215273,
      "grad_norm": 1.546875,
      "learning_rate": 0.00022698572781581757,
      "loss": 1.6508,
      "step": 300
    },
    {
      "epoch": 0.8031599736668861,
      "grad_norm": 1.453125,
      "learning_rate": 0.0002240004757674819,
      "loss": 1.5989,
      "step": 305
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 1.8046875,
      "learning_rate": 0.00022097596950225134,
      "loss": 1.6176,
      "step": 310
    },
    {
      "epoch": 0.8294930875576036,
      "grad_norm": 1.3671875,
      "learning_rate": 0.00021791381339663423,
      "loss": 1.6204,
      "step": 315
    },
    {
      "epoch": 0.8426596445029625,
      "grad_norm": 1.5390625,
      "learning_rate": 0.00021481563179883502,
      "loss": 1.5592,
      "step": 320
    },
    {
      "epoch": 0.8558262014483212,
      "grad_norm": 1.3125,
      "learning_rate": 0.00021168306816710393,
      "loss": 1.5973,
      "step": 325
    },
    {
      "epoch": 0.8689927583936801,
      "grad_norm": 1.421875,
      "learning_rate": 0.0002085177841979498,
      "loss": 1.5367,
      "step": 330
    },
    {
      "epoch": 0.8821593153390388,
      "grad_norm": 1.6796875,
      "learning_rate": 0.00020532145894467828,
      "loss": 1.5283,
      "step": 335
    },
    {
      "epoch": 0.8953258722843976,
      "grad_norm": 1.46875,
      "learning_rate": 0.000202095787926723,
      "loss": 1.5374,
      "step": 340
    },
    {
      "epoch": 0.9084924292297564,
      "grad_norm": 1.515625,
      "learning_rate": 0.00019884248223024203,
      "loss": 1.5021,
      "step": 345
    },
    {
      "epoch": 0.9216589861751152,
      "grad_norm": 1.3046875,
      "learning_rate": 0.00019556326760045658,
      "loss": 1.5345,
      "step": 350
    },
    {
      "epoch": 0.934825543120474,
      "grad_norm": 1.328125,
      "learning_rate": 0.00019225988352621445,
      "loss": 1.5164,
      "step": 355
    },
    {
      "epoch": 0.9479921000658328,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0001889340823172622,
      "loss": 1.4778,
      "step": 360
    },
    {
      "epoch": 0.9611586570111915,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00018558762817471678,
      "loss": 1.5624,
      "step": 365
    },
    {
      "epoch": 0.9743252139565504,
      "grad_norm": 1.453125,
      "learning_rate": 0.00018222229625522928,
      "loss": 1.527,
      "step": 370
    },
    {
      "epoch": 0.9874917709019092,
      "grad_norm": 1.609375,
      "learning_rate": 0.00017883987172933707,
      "loss": 1.4608,
      "step": 375
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.3671875,
      "learning_rate": 0.0001754421488345041,
      "loss": 1.4084,
      "step": 380
    },
    {
      "epoch": 1.0131665569453587,
      "grad_norm": 1.4375,
      "learning_rate": 0.00017203092992335137,
      "loss": 1.013,
      "step": 385
    },
    {
      "epoch": 1.0263331138907177,
      "grad_norm": 1.3984375,
      "learning_rate": 0.0001686080245075831,
      "loss": 1.0124,
      "step": 390
    },
    {
      "epoch": 1.0394996708360764,
      "grad_norm": 1.53125,
      "learning_rate": 0.0001651752482981148,
      "loss": 1.0275,
      "step": 395
    },
    {
      "epoch": 1.0526662277814351,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00016173442224191309,
      "loss": 0.9538,
      "step": 400
    },
    {
      "epoch": 1.0658327847267939,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00015828737155605804,
      "loss": 0.9683,
      "step": 405
    },
    {
      "epoch": 1.0789993416721528,
      "grad_norm": 1.21875,
      "learning_rate": 0.0001548359247595405,
      "loss": 1.0414,
      "step": 410
    },
    {
      "epoch": 1.0921658986175116,
      "grad_norm": 1.2578125,
      "learning_rate": 0.00015138191270330773,
      "loss": 0.9749,
      "step": 415
    },
    {
      "epoch": 1.1053324555628703,
      "grad_norm": 1.4921875,
      "learning_rate": 0.00014792716759907186,
      "loss": 0.9802,
      "step": 420
    },
    {
      "epoch": 1.118499012508229,
      "grad_norm": 1.34375,
      "learning_rate": 0.00014447352204739712,
      "loss": 0.9399,
      "step": 425
    },
    {
      "epoch": 1.131665569453588,
      "grad_norm": 1.21875,
      "learning_rate": 0.00014102280806558006,
      "loss": 1.0111,
      "step": 430
    },
    {
      "epoch": 1.1448321263989467,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00013757685611583983,
      "loss": 0.9483,
      "step": 435
    },
    {
      "epoch": 1.1579986833443054,
      "grad_norm": 1.15625,
      "learning_rate": 0.00013413749413433273,
      "loss": 0.9546,
      "step": 440
    },
    {
      "epoch": 1.1711652402896642,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0001307065465615073,
      "loss": 0.9294,
      "step": 445
    },
    {
      "epoch": 1.1843317972350231,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00012728583337431353,
      "loss": 0.9498,
      "step": 450
    },
    {
      "epoch": 1.1974983541803819,
      "grad_norm": 1.296875,
      "learning_rate": 0.0001238771691207795,
      "loss": 0.942,
      "step": 455
    },
    {
      "epoch": 1.2106649111257406,
      "grad_norm": 1.4375,
      "learning_rate": 0.00012048236195746822,
      "loss": 0.9069,
      "step": 460
    },
    {
      "epoch": 1.2238314680710993,
      "grad_norm": 1.5078125,
      "learning_rate": 0.00011710321269032502,
      "loss": 0.9452,
      "step": 465
    },
    {
      "epoch": 1.2369980250164583,
      "grad_norm": 1.3984375,
      "learning_rate": 0.00011374151381942327,
      "loss": 0.9533,
      "step": 470
    },
    {
      "epoch": 1.250164581961817,
      "grad_norm": 1.375,
      "learning_rate": 0.00011039904858811712,
      "loss": 0.9229,
      "step": 475
    },
    {
      "epoch": 1.2633311389071757,
      "grad_norm": 1.1015625,
      "learning_rate": 0.00010707759003710384,
      "loss": 0.8528,
      "step": 480
    },
    {
      "epoch": 1.2764976958525347,
      "grad_norm": 1.328125,
      "learning_rate": 0.00010377890006389856,
      "loss": 0.8836,
      "step": 485
    },
    {
      "epoch": 1.2896642527978934,
      "grad_norm": 1.3203125,
      "learning_rate": 0.00010050472848821968,
      "loss": 0.9177,
      "step": 490
    },
    {
      "epoch": 1.3028308097432522,
      "grad_norm": 1.296875,
      "learning_rate": 9.725681212378167e-05,
      "loss": 0.8867,
      "step": 495
    },
    {
      "epoch": 1.315997366688611,
      "grad_norm": 1.2421875,
      "learning_rate": 9.403687385698632e-05,
      "loss": 0.9074,
      "step": 500
    },
    {
      "epoch": 1.3291639236339696,
      "grad_norm": 1.1796875,
      "learning_rate": 9.084662173300223e-05,
      "loss": 0.8652,
      "step": 505
    },
    {
      "epoch": 1.3423304805793286,
      "grad_norm": 1.1796875,
      "learning_rate": 8.768774804971705e-05,
      "loss": 0.8758,
      "step": 510
    },
    {
      "epoch": 1.3554970375246873,
      "grad_norm": 3.953125,
      "learning_rate": 8.456192846004275e-05,
      "loss": 0.8357,
      "step": 515
    },
    {
      "epoch": 1.368663594470046,
      "grad_norm": 1.0703125,
      "learning_rate": 8.147082108305058e-05,
      "loss": 0.8258,
      "step": 520
    },
    {
      "epoch": 1.381830151415405,
      "grad_norm": 1.1015625,
      "learning_rate": 7.84160656244067e-05,
      "loss": 0.906,
      "step": 525
    },
    {
      "epoch": 1.3949967083607637,
      "grad_norm": 1.1484375,
      "learning_rate": 7.539928250657594e-05,
      "loss": 0.809,
      "step": 530
    },
    {
      "epoch": 1.4081632653061225,
      "grad_norm": 1.15625,
      "learning_rate": 7.242207200925383e-05,
      "loss": 0.7685,
      "step": 535
    },
    {
      "epoch": 1.4213298222514812,
      "grad_norm": 1.1171875,
      "learning_rate": 6.948601342048397e-05,
      "loss": 0.8473,
      "step": 540
    },
    {
      "epoch": 1.43449637919684,
      "grad_norm": 1.1015625,
      "learning_rate": 6.65926641989106e-05,
      "loss": 0.8022,
      "step": 545
    },
    {
      "epoch": 1.4476629361421989,
      "grad_norm": 1.15625,
      "learning_rate": 6.374355914761062e-05,
      "loss": 0.7762,
      "step": 550
    },
    {
      "epoch": 1.4608294930875576,
      "grad_norm": 1.109375,
      "learning_rate": 6.094020959994336e-05,
      "loss": 0.862,
      "step": 555
    },
    {
      "epoch": 1.4739960500329163,
      "grad_norm": 1.0703125,
      "learning_rate": 5.818410261785056e-05,
      "loss": 0.793,
      "step": 560
    },
    {
      "epoch": 1.4871626069782753,
      "grad_norm": 1.0625,
      "learning_rate": 5.5476700203030643e-05,
      "loss": 0.7979,
      "step": 565
    },
    {
      "epoch": 1.500329163923634,
      "grad_norm": 1.28125,
      "learning_rate": 5.281943852140697e-05,
      "loss": 0.8223,
      "step": 570
    },
    {
      "epoch": 1.5134957208689928,
      "grad_norm": 1.0078125,
      "learning_rate": 5.021372714130087e-05,
      "loss": 0.84,
      "step": 575
    },
    {
      "epoch": 1.5266622778143515,
      "grad_norm": 1.21875,
      "learning_rate": 4.766094828571313e-05,
      "loss": 0.7897,
      "step": 580
    },
    {
      "epoch": 1.5398288347597102,
      "grad_norm": 1.0859375,
      "learning_rate": 4.516245609911161e-05,
      "loss": 0.7917,
      "step": 585
    },
    {
      "epoch": 1.5529953917050692,
      "grad_norm": 1.1015625,
      "learning_rate": 4.271957592911325e-05,
      "loss": 0.7691,
      "step": 590
    },
    {
      "epoch": 1.566161948650428,
      "grad_norm": 1.1875,
      "learning_rate": 4.033360362344117e-05,
      "loss": 0.8063,
      "step": 595
    },
    {
      "epoch": 1.5793285055957869,
      "grad_norm": 1.1640625,
      "learning_rate": 3.800580484253105e-05,
      "loss": 0.7744,
      "step": 600
    },
    {
      "epoch": 1.5924950625411456,
      "grad_norm": 1.1015625,
      "learning_rate": 3.5737414388149785e-05,
      "loss": 0.7701,
      "step": 605
    },
    {
      "epoch": 1.6056616194865043,
      "grad_norm": 1.046875,
      "learning_rate": 3.352963554838402e-05,
      "loss": 0.7414,
      "step": 610
    },
    {
      "epoch": 1.618828176431863,
      "grad_norm": 1.2109375,
      "learning_rate": 3.138363945934523e-05,
      "loss": 0.7739,
      "step": 615
    },
    {
      "epoch": 1.6319947333772218,
      "grad_norm": 1.0859375,
      "learning_rate": 2.9300564483929852e-05,
      "loss": 0.794,
      "step": 620
    },
    {
      "epoch": 1.6451612903225805,
      "grad_norm": 1.3203125,
      "learning_rate": 2.728151560796454e-05,
      "loss": 0.8121,
      "step": 625
    },
    {
      "epoch": 1.6583278472679395,
      "grad_norm": 1.0546875,
      "learning_rate": 2.5327563854056714e-05,
      "loss": 0.7925,
      "step": 630
    },
    {
      "epoch": 1.6714944042132982,
      "grad_norm": 1.2109375,
      "learning_rate": 2.3439745713460624e-05,
      "loss": 0.8124,
      "step": 635
    },
    {
      "epoch": 1.6846609611586572,
      "grad_norm": 1.109375,
      "learning_rate": 2.1619062596261583e-05,
      "loss": 0.7899,
      "step": 640
    },
    {
      "epoch": 1.6978275181040159,
      "grad_norm": 1.046875,
      "learning_rate": 1.9866480300168885e-05,
      "loss": 0.7489,
      "step": 645
    },
    {
      "epoch": 1.7109940750493746,
      "grad_norm": 1.03125,
      "learning_rate": 1.8182928498199634e-05,
      "loss": 0.7739,
      "step": 650
    },
    {
      "epoch": 1.7241606319947334,
      "grad_norm": 0.99609375,
      "learning_rate": 1.6569300245525457e-05,
      "loss": 0.7311,
      "step": 655
    },
    {
      "epoch": 1.737327188940092,
      "grad_norm": 1.15625,
      "learning_rate": 1.5026451505743408e-05,
      "loss": 0.7321,
      "step": 660
    },
    {
      "epoch": 1.7504937458854508,
      "grad_norm": 1.1171875,
      "learning_rate": 1.3555200696822232e-05,
      "loss": 0.7963,
      "step": 665
    },
    {
      "epoch": 1.7636603028308098,
      "grad_norm": 1.1171875,
      "learning_rate": 1.215632825696541e-05,
      "loss": 0.7587,
      "step": 670
    },
    {
      "epoch": 1.7768268597761685,
      "grad_norm": 1.171875,
      "learning_rate": 1.0830576230620492e-05,
      "loss": 0.7989,
      "step": 675
    },
    {
      "epoch": 1.7899934167215275,
      "grad_norm": 1.078125,
      "learning_rate": 9.578647874855095e-06,
      "loss": 0.8169,
      "step": 680
    },
    {
      "epoch": 1.8031599736668862,
      "grad_norm": 1.09375,
      "learning_rate": 8.401207286307881e-06,
      "loss": 0.7674,
      "step": 685
    },
    {
      "epoch": 1.816326530612245,
      "grad_norm": 1.0234375,
      "learning_rate": 7.2988790489124424e-06,
      "loss": 0.8234,
      "step": 690
    },
    {
      "epoch": 1.8294930875576036,
      "grad_norm": 1.15625,
      "learning_rate": 6.272247902581201e-06,
      "loss": 0.7603,
      "step": 695
    },
    {
      "epoch": 1.8426596445029624,
      "grad_norm": 1.015625,
      "learning_rate": 5.3218584330249e-06,
      "loss": 0.795,
      "step": 700
    },
    {
      "epoch": 1.8558262014483211,
      "grad_norm": 1.0546875,
      "learning_rate": 4.448214782872134e-06,
      "loss": 0.759,
      "step": 705
    },
    {
      "epoch": 1.86899275839368,
      "grad_norm": 1.125,
      "learning_rate": 3.6517803842424474e-06,
      "loss": 0.7344,
      "step": 710
    },
    {
      "epoch": 1.8821593153390388,
      "grad_norm": 1.0546875,
      "learning_rate": 2.932977712914586e-06,
      "loss": 0.7102,
      "step": 715
    },
    {
      "epoch": 1.8953258722843978,
      "grad_norm": 1.046875,
      "learning_rate": 2.292188064220374e-06,
      "loss": 0.7783,
      "step": 720
    },
    {
      "epoch": 1.9084924292297565,
      "grad_norm": 1.125,
      "learning_rate": 1.7297513507832927e-06,
      "loss": 0.7961,
      "step": 725
    },
    {
      "epoch": 1.9216589861751152,
      "grad_norm": 44.75,
      "learning_rate": 1.2459659222086304e-06,
      "loss": 0.7633,
      "step": 730
    },
    {
      "epoch": 1.934825543120474,
      "grad_norm": 1.0859375,
      "learning_rate": 8.410884068213941e-07,
      "loss": 0.7727,
      "step": 735
    },
    {
      "epoch": 1.9479921000658327,
      "grad_norm": 1.09375,
      "learning_rate": 5.153335755354038e-07,
      "loss": 0.7779,
      "step": 740
    },
    {
      "epoch": 1.9611586570111914,
      "grad_norm": 1.0234375,
      "learning_rate": 2.688742279261913e-07,
      "loss": 0.7058,
      "step": 745
    },
    {
      "epoch": 1.9743252139565504,
      "grad_norm": 2.1875,
      "learning_rate": 1.0184110056790651e-07,
      "loss": 0.8194,
      "step": 750
    },
    {
      "epoch": 1.9874917709019093,
      "grad_norm": 1.0546875,
      "learning_rate": 1.432279768290856e-08,
      "loss": 0.7634,
      "step": 755
    },
    {
      "epoch": 1.9953917050691246,
      "step": 758,
      "total_flos": 1.449790274661253e+17,
      "train_loss": 1.2643018703032924,
      "train_runtime": 2142.5178,
      "train_samples_per_second": 11.339,
      "train_steps_per_second": 0.354
    }
  ],
  "logging_steps": 5,
  "max_steps": 758,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.449790274661253e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}